On Mon, Oct 12, 2015 at 02:55:32PM -0700, Kenneth Graunke wrote:
> Signed-off-by: Kenneth Graunke <kenn...@whitecape.org>
A few comments below, but Reviewed-by: Kristian Høgsberg <k...@bitplanet.net> > --- > src/mesa/drivers/dri/i965/brw_fs.cpp | 174 ++++++++++ > src/mesa/drivers/dri/i965/brw_fs.h | 16 +- > src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 378 > ++++++++++++++++++++++ > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 49 ++- > src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp | 21 ++ > 5 files changed, 628 insertions(+), 10 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index dde8c45..778237a 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -43,6 +43,7 @@ > #include "brw_wm.h" > #include "brw_fs.h" > #include "brw_cs.h" > +#include "brw_vec4_gs_visitor.h" > #include "brw_cfg.h" > #include "brw_dead_control_flow.h" > #include "main/uniforms.h" > @@ -1347,6 +1348,47 @@ fs_visitor::emit_discard_jump() > } > > void > +fs_visitor::emit_gs_thread_end() > +{ > + assert(stage == MESA_SHADER_GEOMETRY); > + > + if (gs_compile->control_data_header_size_bits > 0) { > + emit_gs_control_data_bits(this->final_gs_vertex_count); > + } > + > + const fs_builder abld = bld.annotate("thread end"); > + fs_inst *inst; > + > + if (gs_compile->prog_data.static_vertex_count != -1) { > + foreach_in_list_reverse(fs_inst, prev, &this->instructions) { > + if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 || > + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || > + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || > + prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) { > + prev->eot = true; > + return; > + } else if (prev->is_control_flow() || prev->has_side_effects()) { > + break; > + } > + } > + fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), > BRW_REGISTER_TYPE_UD))); > + inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr); > + inst->mlen = 1; > + } else { > + fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2); > + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); > + sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); > + sources[1] = this->final_gs_vertex_count; > + abld.LOAD_PAYLOAD(payload, sources, 2, 2); > + inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); > + inst->mlen = 2; > + } > + inst->eot = true; > + inst->offset = 0; > +} > + > +void > fs_visitor::assign_curb_setup() > { > if (dispatch_width == 8) { > @@ -1550,6 +1592,53 @@ fs_visitor::assign_vs_urb_setup() > } > } > > +void > +fs_visitor::assign_gs_urb_setup() > +{ > + assert(stage == MESA_SHADER_GEOMETRY); > + > + const gl_geometry_program *gp = &gs_compile->gp->program; > + brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data; > + > + first_non_payload_grf += > + 8 * vue_prog_data->urb_read_length * gp->VerticesIn; > + > + const unsigned first_icp_handle = payload.num_regs - > + (vue_prog_data->include_vue_handles ? gp->VerticesIn : 0); > + > + foreach_block_and_inst(block, fs_inst, inst, cfg) { > + /* Lower URB_READ_SIMD8 opcodes into real messages. 
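(Sanity-checking the ICP handle math here, assuming triangles, i.e. VerticesIn == 3, and no primitive ID: setup_gs_payload() below gives payload.num_regs == 2 + 3 == 5 in the pull case, so first_icp_handle == 5 - 3 == 2 and the handles for incoming vertices 0..2 live in g2..g4; the URB_READ_SIMD8 lowering then picks g(2 + vertex) as its message header, if I'm reading setup_gs_payload() right.)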
*/ > + if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) { > + assert(inst->src[0].file == IMM); > + inst->src[0] = retype(brw_vec8_grf(first_icp_handle + > + inst->src[0].fixed_hw_reg.dw1.ud, > + 0), BRW_REGISTER_TYPE_UD); > + /* for now, assume constant - we can do per-slot offsets later */ > + assert(inst->src[1].file == IMM); > + inst->offset = inst->src[1].fixed_hw_reg.dw1.ud; > + inst->src[1] = fs_reg(); > + inst->mlen = 1; > + inst->base_mrf = -1; > + } > + > + /* Rewrite all ATTR file references to a real HW_REG. */ > + for (int i = 0; i < inst->sources; i++) { > + if (inst->src[i].file != ATTR) > + continue; > + > + int grf = payload.num_regs + > + prog_data->curb_read_length + > + inst->src[i].reg + > + inst->src[i].reg_offset; > + > + inst->src[i].file = HW_REG; > + inst->src[i].fixed_hw_reg = > + retype(brw_vec8_grf(grf, 0), inst->src[i].type); We need to do what assign_vs_urb_setup() does when it lowers ATTR file references, that is, account for stride and subreg_offset: inst->src[i].fixed_hw_reg = stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), inst->src[i].subreg_offset), inst->exec_size * inst->src[i].stride, inst->exec_size, inst->src[i].stride); in case we end up with an input as a mul src on BSW: https://bugs.freedesktop.org/show_bug.cgi?id=91970 In fact, it looks like we can move the loop to lower ATTRS to a helper and share between GS and VS. > + } > + } > +} > + > + > /** > * Split large virtual GRFs into separate components if we can. > * > @@ -4733,6 +4822,46 @@ fs_visitor::setup_vs_payload() > * > */ > void > +fs_visitor::setup_gs_payload() > +{ > + assert(stage == MESA_SHADER_GEOMETRY); > + > + const gl_geometry_program *gp = &gs_compile->gp->program; > + struct brw_gs_prog_data *gs_prog_data = > + (struct brw_gs_prog_data *) prog_data; > + struct brw_vue_prog_data *vue_prog_data = > + (struct brw_vue_prog_data *) prog_data; > + > + /* R0: thread header, R1: output URB handles */ > + payload.num_regs = 2; > + > + if (gs_prog_data->include_primitive_id) { > + /* R2: Primitive ID 0..7 */ > + payload.num_regs++; > + } > + > + /* Use a maximum of 32 registers for push-model inputs. */ > + const unsigned max_push_components = 32; > + > + /* If pushing our inputs would take too many registers, reduce the URB > read > + * length (which is in HWords, or 8 registers), and resort to pulling. > + * > + * Note that the GS reads <URB Read Length> HWords for every vertex - so > we > + * have to multiply by VerticesIn to obtain the total storage requirement. > + */ > + if (8 * vue_prog_data->urb_read_length * gp->VerticesIn > > + max_push_components) { > + gs_prog_data->base.include_vue_handles = true; > + > + /* R3..RN: ICP Handles for each incoming vertex (when using pull > model) */ > + payload.num_regs += gp->VerticesIn; > + > + vue_prog_data->urb_read_length = > + ROUND_DOWN_TO(max_push_components / gp->VerticesIn, 8) / 8; > + } > +} > + > +void > fs_visitor::setup_cs_payload() > { > assert(devinfo->gen >= 7); > @@ -4990,6 +5119,51 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) > } > > bool > +fs_visitor::run_gs() > +{ > + assert(stage == MESA_SHADER_GEOMETRY); > + > + setup_gs_payload(); > + > + this->final_gs_vertex_count = vgrf(glsl_type::uint_type); > + > + if (gs_compile->control_data_header_size_bits > 0) { > + /* Create a VGRF to store accumulated control data bits. 
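On moving the ATTR lowering to a shared helper, something along these lines maybe (just a sketch; the helper name is made up, and it folds in the stride/subreg_offset handling from assign_vs_urb_setup() quoted above):

   void
   fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
   {
      /* Rewrite ATTR file references to real HW_REGs, honoring the
       * region (stride/subreg_offset) requested on the source.
       */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = payload.num_regs +
                   prog_data->curb_read_length +
                   inst->src[i].reg +
                   inst->src[i].reg_offset;

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].subreg_offset),
                   inst->exec_size * inst->src[i].stride,
                   inst->exec_size, inst->src[i].stride);
      }
   }

Then assign_vs_urb_setup() and assign_gs_urb_setup() would both just call it from their foreach_block_and_inst loops.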
*/ > + this->control_data_bits = vgrf(glsl_type::uint_type); > + > + /* If we're outputting more than 32 control data bits, then > EmitVertex() > + * will set control_data_bits to 0 after emitting the first vertex. > + * Otherwise, we need to initialize it to 0 here. > + */ > + if (gs_compile->control_data_header_size_bits <= 32) { > + const fs_builder abld = bld.annotate("initialize control data > bits"); > + abld.MOV(this->control_data_bits, fs_reg(0u)); > + } > + } > + > + if (INTEL_DEBUG & DEBUG_SHADER_TIME) > + emit_shader_time_begin(); > + > + emit_nir_code(); We're missing if (shader_time_index >= 0) emit_shader_time_end(); here, right? > + emit_gs_thread_end(); > + > + if (failed) > + return false; > + > + calculate_cfg(); > + > + optimize(); > + > + assign_curb_setup(); > + assign_gs_urb_setup(); > + > + fixup_3src_null_dest(); > + allocate_registers(); > + > + return !failed; > +} > + > +bool > fs_visitor::run_fs(bool do_rep_send) > { > brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data; > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > index e049608..aa5ff70 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -130,18 +130,21 @@ public: > > bool run_fs(bool do_rep_send); > bool run_vs(gl_clip_plane *clip_planes); > + bool run_gs(); > bool run_cs(); > void optimize(); > void allocate_registers(); > void setup_payload_gen4(); > void setup_payload_gen6(); > void setup_vs_payload(); > + void setup_gs_payload(); > void setup_cs_payload(); > void fixup_3src_null_dest(); > void assign_curb_setup(); > void calculate_urb_setup(); > void assign_urb_setup(); > void assign_vs_urb_setup(); > + void assign_gs_urb_setup(); > bool assign_regs(bool allow_spilling); > void assign_regs_trivial(); > void setup_payload_interference(struct ra_graph *g, int payload_reg_count, > @@ -277,7 +280,16 @@ public: > fs_reg color1, fs_reg color2, > fs_reg src0_alpha, unsigned components); > void emit_fb_writes(); > - void emit_urb_writes(); > + void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg()); > + void set_gs_stream_control_data_bits(const fs_reg &vertex_count, > + unsigned stream_id); > + void emit_gs_control_data_bits(const fs_reg &vertex_count); > + void emit_gs_end_primitive(const nir_src &vertex_count_nir_src); > + void emit_gs_vertex(const nir_src &vertex_count_nir_src, > + unsigned stream_id); > + void emit_gs_thread_end(); > + void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src, > + unsigned offset, unsigned num_components); > void emit_cs_terminate(); > fs_reg *emit_cs_local_invocation_id_setup(); > fs_reg *emit_cs_work_group_id_setup(); > @@ -384,6 +396,8 @@ public: > fs_reg delta_xy[BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT]; > fs_reg shader_start_time; > fs_reg userplane[MAX_CLIP_PLANES]; > + fs_reg final_gs_vertex_count; > + fs_reg control_data_bits; > > unsigned grf_used; > bool spilled_any_registers; > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > index 70ddf59..f86645b 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > @@ -28,6 +28,7 @@ > #include "program/prog_to_nir.h" > #include "brw_fs.h" > #include "brw_fs_surface_builder.h" > +#include "brw_vec4_gs_visitor.h" > #include "brw_nir.h" > #include "brw_fs_surface_builder.h" > #include "brw_vec4_gs_visitor.h" > @@ -96,6 +97,7 @@ fs_visitor::nir_setup_outputs() > > switch (stage) { > case MESA_SHADER_VERTEX: 
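Re the shader time question above: I'd expect run_gs() to mirror run_vs() here, roughly (sketch; assuming shader_time_index is valid whenever DEBUG_SHADER_TIME is set):

   emit_nir_code();

   if (shader_time_index >= 0)
      emit_shader_time_end();

   emit_gs_thread_end();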
> + case MESA_SHADER_GEOMETRY: > for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) > / 4; i++) { > int output = var->data.location + i; > this->outputs[output] = offset(reg, bld, 4 * i); > @@ -1187,6 +1189,362 @@ emit_pixel_interpolater_send(const fs_builder &bld, > return inst; > } > > +/** > + * Computes 1 << x, given a D/UD register containing some value x. > + */ > +static fs_reg > +intexp2(const fs_builder &bld, const fs_reg &x) > +{ > + assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); > + > + fs_reg result = bld.vgrf(x.type, 1); > + fs_reg one = bld.vgrf(x.type, 1); > + > + bld.MOV(one, fs_reg(1u)); Do we need to use fs_reg(1) when x.type == BRW_REGISTER_TYPE_UD to avoid confusing constant propagation? Or perhaps don't support D since we only use it for UD. > + bld.SHL(result, one, x); > + return result; > +} > + > +void > +fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) > +{ > + assert(stage == MESA_SHADER_GEOMETRY); > + > + /* We can only do EndPrimitive() functionality when the control data > + * consists of cut bits. Fortunately, the only time it isn't is when the > + * output type is points, in which case EndPrimitive() is a no-op. > + */ > + if (gs_compile->prog_data.control_data_format != > + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { > + return; > + } > + > + /* Cut bits use one bit per vertex. */ > + assert(gs_compile->control_data_bits_per_vertex == 1); > + > + fs_reg vertex_count = get_nir_src(vertex_count_nir_src); > + vertex_count.type = BRW_REGISTER_TYPE_UD; > + > + /* Cut bit n should be set to 1 if EndPrimitive() was called after > emitting > + * vertex n, 0 otherwise. So all we need to do here is mark bit > + * (vertex_count - 1) % 32 in the cut_bits register to indicate that > + * EndPrimitive() was called after emitting vertex (vertex_count - 1); > + * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. > + * > + * Note that if EndPrimitve() is called before emitting any vertices, this EndPrimitve -> EndPrimitive > + * will cause us to set bit 31 of the control_data_bits register to 1. > + * That's fine because: > + * > + * - If max_vertices < 32, then vertex number 31 (zero-based) will never > be > + * output, so the hardware will ignore cut bit 31. > + * > + * - If max_vertices == 32, then vertex number 31 is guaranteed to be the > + * last vertex, so setting cut bit 31 has no effect (since the primitive > + * is automatically ended when the GS terminates). > + * > + * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the > + * control_data_bits register to 0 when the first vertex is emitted. > + */ > + > + const fs_builder abld = bld.annotate("end primitive"); > + > + /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ > + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu)); > + fs_reg mask = intexp2(abld, prev_count); > + /* Note: we're relying on the fact that the GEN SHL instruction only pays > + * attention to the lower 5 bits of its second source argument, so on this > + * architecture, 1 << (vertex_count - 1) is equivalent to 1 << > + * ((vertex_count - 1) % 32). 
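To make the modular arithmetic concrete (worked example only): if EndPrimitive() is called right after the 33rd vertex, vertex_count == 33 and prev_count == 32; since SHL only looks at the low 5 bits of the shift source, the mask is 1 << (32 & 31) == 1 << 0, i.e. cut bit 0 of the current 32-bit batch, which is the bit for vertex 32 (zero-based) because emit_gs_vertex() reset the accumulator when that batch started.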
> + */ > + abld.OR(this->control_data_bits, this->control_data_bits, mask); > +} > + > +void > +fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) > +{ > + assert(stage == MESA_SHADER_GEOMETRY); > + assert(gs_compile->control_data_bits_per_vertex != 0); > + > + const fs_builder abld = bld.annotate("emit control data bits"); > + const fs_builder fwa_bld = bld.exec_all(); > + > + /* We use a single UD register to accumulate control data bits (32 bits > + * for each of the SIMD8 channels). So we need to write a DWord (32 bits) > + * at a time. > + * > + * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) > offsets. > + * We have select a 128-bit group via the Global and Per-Slot Offsets, > then > + * use the Channel Mask phase to enable/disable which DWord within that > + * group to write. (Remember, different SIMD8 channels may have emitted > + * different numbers of vertices, so we may need per-slot offsets.) > + * > + * Channel masking presents an annoying problem: we may have to replicate > + * the data up to 4 times: > + * > + * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. > + * > + * To avoid penalizing shaders that emit a small number of vertices, we > + * can avoid these sometimes: if the size of the control data header is > + * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land > + * land in the same 128-bit group, so we can skip per-slot offsets. > + * > + * Similarly, if the control data header is <= 32 bits, there is only one > + * DWord, so we can skip channel masks. > + */ > + enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; > + > + fs_reg channel_mask, per_slot_offset; > + > + if (gs_compile->control_data_header_size_bits > 32) { > + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; > + channel_mask = vgrf(glsl_type::uint_type); > + } > + > + if (gs_compile->control_data_header_size_bits > 128) { > + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; > + per_slot_offset = vgrf(glsl_type::uint_type); > + } > + > + /* Figure out which DWord we're trying to write to using the formula: > + * > + * dword_index = (vertex_count - 1) * bits_per_vertex / 32 > + * > + * Since bits_per_vertex is a power of two, and is known at compile > + * time, this can be optimized to: > + * > + * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) > + */ > + if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { > + fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + abld.ADD(prev_count, vertex_count, fs_reg(0xffffffffu)); > + unsigned log2_bits_per_vertex = > + _mesa_fls(gs_compile->control_data_bits_per_vertex); > + abld.SHR(dword_index, prev_count, fs_reg(6u - log2_bits_per_vertex)); > + > + if (per_slot_offset.file != BAD_FILE) { > + /* Set the per-slot offset to dword_index / 4, to that we'll write > to to that -> so that > + * the appropriate OWord within the control data header. > + */ > + abld.SHR(per_slot_offset, dword_index, fs_reg(2u)); > + } > + > + /* Set the channel masks to 1 << (dword_index % 4), so that we'll > + * write to the appropriate DWORD within the OWORD. > + */ > + fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + fwa_bld.AND(channel, dword_index, fs_reg(3u)); > + channel_mask = intexp2(fwa_bld, channel); > + /* Then the channel masks need to be in bits 23:16. */ > + fwa_bld.SHL(channel_mask, channel_mask, fs_reg(16u)); > + } > + > + /* Store the control data bits in the message payload and send it. 
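A worked example for the indexing above, assuming stream mode (bits_per_vertex == 2): _mesa_fls(2) == 2, so dword_index == (vertex_count - 1) >> 4, one DWord per 16 vertices, which matches (vertex_count - 1) * 2 / 32. For, say, vertex_count == 23 that gives dword_index == 1, per_slot_offset == 1 >> 2 == 0 (still the first OWord) and channel == 1 & 3 == 1, so channel_mask ends up as (1 << 1) << 16 == 1 << 17, selecting DWord 1 within that OWord.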
*/ > + int mlen = 2; > + if (channel_mask.file != BAD_FILE) > + mlen += 4; /* channel masks, plus 3 extra copies of the data */ > + if (per_slot_offset.file != BAD_FILE) > + mlen++; > + > + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); > + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); > + int i = 0; > + sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); > + if (per_slot_offset.file != BAD_FILE) > + sources[i++] = per_slot_offset; > + if (channel_mask.file != BAD_FILE) > + sources[i++] = channel_mask; > + while (i < mlen) { > + sources[i++] = this->control_data_bits; > + } > + > + abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); > + fs_inst *inst = abld.emit(opcode, reg_undef, payload); > + inst->mlen = mlen; > + /* We need to increment Global Offset by 256-bits to make room for > + * Broadwell's extra "Vertex Count" payload at the beginning of the > + * URB entry. Since this is an OWord message, Global Offset is counted > + * in 128-bit units, so we must set it to 2. > + */ > + if (gs_compile->prog_data.static_vertex_count == -1) > + inst->offset = 2; > +} > + > +void > +fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, > + unsigned stream_id) > +{ > + /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ > + > + /* Note: we are calling this *before* increasing vertex_count, so > + * this->vertex_count == vertex_count - 1 in the formula above. > + */ > + > + /* Stream mode uses 2 bits per vertex */ > + assert(gs_compile->control_data_bits_per_vertex == 2); > + > + /* Must be a valid stream */ > + assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS); > + > + /* Control data bits are initialized to 0 so we don't have to set any > + * bits when sending vertices to stream 0. > + */ > + if (stream_id == 0) > + return; > + > + const fs_builder abld = bld.annotate("set stream control data bits", > NULL); > + > + /* reg::sid = stream_id */ > + fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + abld.MOV(sid, fs_reg(stream_id)); > + > + /* reg:shift_count = 2 * (vertex_count - 1) */ > + fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + abld.SHL(shift_count, vertex_count, fs_reg(1u)); > + > + /* Note: we're relying on the fact that the GEN SHL instruction only pays > + * attention to the lower 5 bits of its second source argument, so on this > + * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to > + * stream_id << ((2 * (vertex_count - 1)) % 32). > + */ > + fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + abld.SHL(mask, sid, shift_count); > + abld.OR(this->control_data_bits, this->control_data_bits, mask); > +} > + > +void > +fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, > + unsigned stream_id) > +{ > + fs_reg vertex_count = get_nir_src(vertex_count_nir_src); > + vertex_count.type = BRW_REGISTER_TYPE_UD; > + > + /* Haswell and later hardware ignores the "Render Stream Select" bits > + * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, > + * and instead sends all primitives down the pipeline for rasterization. > + * If the SOL stage is enabled, "Render Stream Select" is honored and > + * primitives bound to non-zero streams are discarded after stream output. > + * > + * Since the only purpose of primives sent to non-zero streams is to > + * be recorded by transform feedback, we can simply discard all geometry > + * bound to these streams when transform feedback is disabled. 
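On set_gs_stream_control_data_bits() above: since stream mode uses two bits per vertex, the accumulator effectively holds sixteen 2-bit stream IDs, with bits [2n+1:2n] belonging to vertex n of the current batch. For example, sending the third vertex of a batch (n == 2) to stream 2 ORs in 2 << 4, setting bits 5:4 to 0b10.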
> + */ > + if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) > + return; > + > + /* If we're outputting 32 control data bits or less, then we can wait > + * until the shader is over to output them all. Otherwise we need to > + * output them as we go. Now is the time to do it, since we're about to > + * output the vertex_count'th vertex, so it's guaranteed that the > + * control data bits associated with the (vertex_count - 1)th vertex are > + * correct. > + */ > + if (gs_compile->control_data_header_size_bits > 32) { > + const fs_builder abld = > + bld.annotate("emit vertex: emit control data bits"); > + > + /* Only emit control data bits if we've finished accumulating a batch > + * of 32 bits. This is the case when: > + * > + * (vertex_count * bits_per_vertex) % 32 == 0 > + * > + * (in other words, when the last 5 bits of vertex_count * > + * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some > + * integer n (which is always the case, since bits_per_vertex is > + * always 1 or 2), this is equivalent to requiring that the last 5-n > + * bits of vertex_count are 0: > + * > + * vertex_count & (2^(5-n) - 1) == 0 > + * > + * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is > + * equivalent to: > + * > + * vertex_count & (32 / bits_per_vertex - 1) == 0 > + */ > + // XXX: check immediates. > + fs_inst *inst = > + abld.AND(bld.null_reg_d(), vertex_count, > + fs_reg(32u / gs_compile->control_data_bits_per_vertex - > 1u)); > + inst->conditional_mod = BRW_CONDITIONAL_Z; > + > + abld.IF(BRW_PREDICATE_NORMAL); > + /* If vertex_count is 0, then no control data bits have been > + * accumulated yet, so we can skip emitting them. > + */ > + abld.CMP(bld.null_reg_d(), vertex_count, fs_reg(0u), > + BRW_CONDITIONAL_NEQ); > + abld.IF(BRW_PREDICATE_NORMAL); > + emit_gs_control_data_bits(vertex_count); > + abld.emit(BRW_OPCODE_ENDIF); > + > + /* Reset control_data_bits to 0 so we can start accumulating a new > + * batch. > + * > + * Note: in the case where vertex_count == 0, this neutralizes the > + * effect of any call to EndPrimitive() that the shader may have > + * made before outputting its first vertex. > + */ > + inst = abld.MOV(this->control_data_bits, fs_reg(0u)); > + inst->force_writemask_all = true; > + abld.emit(BRW_OPCODE_ENDIF); > + } > + > + emit_urb_writes(vertex_count); > + > + /* In stream mode we have to set control data bits for all vertices > + * unless we have disabled control data bits completely (which we do > + * do for GL_POINTS outputs that don't use streams). > + */ > + if (gs_compile->control_data_header_size_bits > 0 && > + gs_compile->prog_data.control_data_format == > + GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { > + set_gs_stream_control_data_bits(vertex_count, stream_id); > + } > +} > + > +void > +fs_visitor::emit_gs_input_load(const fs_reg &dst, > + const nir_src &vertex_src, > + unsigned input_offset, > + unsigned num_components) > +{ > + const brw_vue_prog_data *vue_prog_data = (const brw_vue_prog_data *) > prog_data; > + const unsigned vertex = nir_src_as_const_value(vertex_src)->u[0]; > + > + const unsigned array_stride = vue_prog_data->urb_read_length * 8; > + > + const bool pushed = 4 * input_offset < array_stride; > + > + if (input_offset == 0) { > + /* This is the VUE header, containing VARYING_SLOT_LAYER [.y], > + * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w]. > + * Only gl_PointSize is available as a GS input, so they must > + * be asking for that input. 
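Two small worked examples for the logic above: the control data flush in emit_gs_vertex() with bits_per_vertex == 2 checks vertex_count & 15 == 0, i.e. a full DWord gets written out every 16 vertices; and for the gl_PointSize load here, assuming urb_read_length == 1 (array_stride == 8), the pushed case reads ATTR register 2 * 8 + 3 == 19 for incoming vertex 2, while the pulled case reads the first four components of that vertex's URB entry and takes component 3.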
> + */ > + if (pushed) { > + bld.MOV(dst, fs_reg(ATTR, array_stride * vertex + 3, dst.type)); > + } else { > + fs_reg tmp = bld.vgrf(dst.type, 4); > + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, > + fs_reg(vertex), fs_reg(0)); > + inst->regs_written = 4; > + bld.MOV(dst, offset(tmp, bld, 3)); > + } > + } else { > + if (pushed) { > + int index = vertex * array_stride + 4 * input_offset; > + for (unsigned i = 0; i < num_components; i++) { > + bld.MOV(offset(dst, bld, i), fs_reg(ATTR, index + i, dst.type)); > + } > + } else { > + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, > + fs_reg(vertex), fs_reg(input_offset)); > + inst->regs_written = num_components; > + } > + } > +} > + > void > fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr > *instr) > { > @@ -1577,6 +1935,14 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, > nir_intrinsic_instr *instr > break; > } > > + case nir_intrinsic_load_per_vertex_input_indirect: > + assert(!"Not allowed"); > + /* fallthrough */ Heh, assert(false) and fallthrough? Maybe just unreachable()? > + case nir_intrinsic_load_per_vertex_input: > + emit_gs_input_load(dest, instr->src[0], instr->const_index[0], > + instr->num_components); > + break; > + > /* Handle ARB_gpu_shader5 interpolation intrinsics > * > * It's worth a quick word of explanation as to why we handle the full > @@ -1933,6 +2299,18 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, > nir_intrinsic_instr *instr > break; > } > > + case nir_intrinsic_emit_vertex_with_counter: > + emit_gs_vertex(instr->src[0], instr->const_index[0]); > + break; > + > + case nir_intrinsic_end_primitive_with_counter: > + emit_gs_end_primitive(instr->src[0]); > + break; > + > + case nir_intrinsic_set_vertex_count: > + bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); > + break; > + > default: > unreachable("unknown intrinsic"); > } > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index dc7fa9d..76f592f 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -866,7 +866,7 @@ void fs_visitor::compute_clip_distance(gl_clip_plane > *clip_planes) > } > > void > -fs_visitor::emit_urb_writes() > +fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) > { > int slot, urb_offset, length; > int starting_urb_offset = 0; > @@ -902,9 +902,13 @@ fs_visitor::emit_urb_writes() > return; > } > > + opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; > + int header_size = 1; > + fs_reg per_slot_offsets; > + > if (stage == MESA_SHADER_GEOMETRY) { > const struct brw_gs_prog_data *gs_prog_data = > - (const struct brw_gs_prog_data *) prog_data; > + (const struct brw_gs_prog_data *) this->prog_data; > > /* We need to increment the Global Offset to skip over the control data > * header and the extra "Vertex Count" field (1 HWord) at the beginning > @@ -913,6 +917,27 @@ fs_visitor::emit_urb_writes() > starting_urb_offset = 2 * > gs_prog_data->control_data_header_size_hwords; > if (gs_prog_data->static_vertex_count == -1) > starting_urb_offset += 2; > + > + /* We also need to use per-slot offsets. The per-slot offset is the > + * Vertex Count. SIMD8 mode processes 8 different primitives at a > + * time; each may output a different number of vertices. 
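A concrete example for the per-slot offsets (assuming output_vertex_size_hwords == 2, i.e. 4 OWords per vertex): a channel that has already emitted 3 vertices gets per_slot_offsets == 3 * 4 == 12 OWords, so its writes land at the start of its own 4th vertex slot, while a neighbouring channel that has emitted nothing yet writes at offset 0.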
> + */ > + opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT; > + header_size++; > + > + /* The URB offset is in 128-bit units, so we need to multiply by 2 */ > + const int output_vertex_size_owords = > + gs_prog_data->output_vertex_size_hwords * 2; > + > + fs_reg offset; > + if (gs_vertex_count.file == IMM) { > + per_slot_offsets = fs_reg(output_vertex_size_owords * > + gs_vertex_count.fixed_hw_reg.dw1.ud); > + } else { > + per_slot_offsets = vgrf(glsl_type::int_type); > + bld.MUL(per_slot_offsets, gs_vertex_count, > + fs_reg(output_vertex_size_owords)); > + } > } > > length = 0; > @@ -1012,19 +1037,25 @@ fs_visitor::emit_urb_writes() > if (length == 8 || last) > flush = true; > if (flush) { > - fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1); > - fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1), > + fs_reg *payload_sources = > + ralloc_array(mem_ctx, fs_reg, length + header_size); > + fs_reg payload = fs_reg(GRF, alloc.allocate(length + header_size), > BRW_REGISTER_TYPE_F); > payload_sources[0] = > fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); > > - memcpy(&payload_sources[1], sources, length * sizeof sources[0]); > - abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1); > + if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT) > + payload_sources[1] = per_slot_offsets; > + > + memcpy(&payload_sources[header_size], sources, > + length * sizeof sources[0]); > + > + abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size, > + header_size); > > - fs_inst *inst = > - abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); > + fs_inst *inst = abld.emit(opcode, reg_undef, payload); > inst->eot = last && stage == MESA_SHADER_VERTEX; > - inst->mlen = length + 1; > + inst->mlen = length + header_size; > inst->offset = urb_offset; > urb_offset = starting_urb_offset + slot + 1; > length = 0; > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp > index 775f64d..246ecff 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp > @@ -29,6 +29,7 @@ > > #include "brw_vec4_gs_visitor.h" > #include "gen6_gs_visitor.h" > +#include "brw_fs.h" > > namespace brw { > > @@ -620,6 +621,26 @@ brw_gs_emit(struct brw_context *brw, > unsigned *final_assembly_size) > { > struct gl_shader *shader = prog->_LinkedShaders[MESA_SHADER_GEOMETRY]; > + const struct brw_compiler *compiler = brw->intelScreen->compiler; > + > + if (compiler->scalar_gs) { Make this if (compiler->scalar_gs && prog_data->invocations <= 1) { for now? Or assert(prog_data->invocations == 1) inside the if to fail more loudly. > + fs_visitor v(compiler, brw, mem_ctx, c, shader->Program->nir); > + if (v.run_gs()) { > + c->prog_data.base.dispatch_mode = DISPATCH_MODE_SIMD8; > + > + fs_generator g(compiler, brw, mem_ctx, &c->key, > + &c->prog_data.base.base, &c->gp->program.Base, > + v.promoted_constants, false, "GS"); > + if (INTEL_DEBUG & DEBUG_GS) { > + char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %d", > + prog->Label ? 
prog->Label : > "unnamed", > + prog->Name); > + g.enable_debug(name); > + } > + g.generate_code(v.cfg, 8); > + return g.get_assembly(final_assembly_size); > + } > + } > > if (brw->gen >= 7) { > /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can > do > -- > 2.6.1 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev