On 2016-04-21 22:32:09, Kenneth Graunke wrote: > Unlike most shader stages, the Hull Shader hardware makes us explicitly > tell it how many threads to dispatch and manually configure the channel > mask. One perk of this is that we have a lot of flexibility - we can > run it in either SIMD4x2 or SIMD8 mode. > > Treating it as SIMD8 means that shaders with 8 or fewer output vertices > (which is overwhelmingly the common case) can be handled by a single > thread. This has several intriguing properties: > > - Accessing input arrays with gl_InvocationID as the index is a simple > SIMD8 URB read with g1 as the header. No indirect addressing required. > - Barriers are no-ops. > - We could potentially do output shadowing to combine writes, as the > concurrency concerns are gone. (We don't do this yet, though.) > > Signed-off-by: Kenneth Graunke <kenn...@whitecape.org> > --- > src/mesa/drivers/dri/i965/brw_compiler.c | 4 +- > src/mesa/drivers/dri/i965/brw_fs.cpp | 97 ++++++++ > src/mesa/drivers/dri/i965/brw_fs.h | 5 + > src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 356 > +++++++++++++++++++++++++++ > src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 + > src/mesa/drivers/dri/i965/brw_tcs.c | 3 +- > src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp | 59 ++++- > 7 files changed, 512 insertions(+), 15 deletions(-) > > Shockingly, this appears to cut instruction counts in Unigine Heaven > (-2.5 to 5.5%), Synmark (-31%), and Tessmark (-37%). It increases > instruction counts in Shadow of Mordor (up to +57%) - but again, this > is running in scalar mode, so larger instruction counts are expected :) > I also have a bunch of optimizations in progress that will help those. > > Cycle counts look pretty good too. > > This patch leaves it off by default because I haven't properly benchmarked > it yet. I fully expect we'll turn it on by default. 
> > diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c > b/src/mesa/drivers/dri/i965/brw_compiler.c > index 4496699..93a30a5 100644 > --- a/src/mesa/drivers/dri/i965/brw_compiler.c > +++ b/src/mesa/drivers/dri/i965/brw_compiler.c > @@ -152,7 +152,8 @@ brw_compiler_create(void *mem_ctx, const struct > brw_device_info *devinfo) > > compiler->scalar_stage[MESA_SHADER_VERTEX] = > devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS); > - compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false; > + compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = > + devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TCS", false); > compiler->scalar_stage[MESA_SHADER_TESS_EVAL] = > devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true); > compiler->scalar_stage[MESA_SHADER_GEOMETRY] = > @@ -194,6 +195,7 @@ brw_compiler_create(void *mem_ctx, const struct > brw_device_info *devinfo) > > > compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = > false; > > compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = > false; > + > compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = > false; > > if (compiler->scalar_stage[MESA_SHADER_GEOMETRY]) > > compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = > false; > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index 5d6a107..be5edb8 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -1758,6 +1758,21 @@ fs_visitor::assign_vs_urb_setup() > } > > void > +fs_visitor::assign_tcs_single_patch_urb_setup() > +{ > + assert(stage == MESA_SHADER_TESS_CTRL); > + > + brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data; > + > + first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
We talked about this bit of code offline. This is just a reminder that you were going to check to see if it is used currently. Patches 3-5 Reviewed-by: Jordan Justen <jordan.l.jus...@intel.com> > + > + /* Rewrite all ATTR file references to HW_REGs. */ > + foreach_block_and_inst(block, fs_inst, inst, cfg) { > + convert_attr_sources_to_hw_regs(inst); > + } > +} > + > +void > fs_visitor::assign_tes_urb_setup() > { > assert(stage == MESA_SHADER_TESS_EVAL); > @@ -5463,6 +5478,88 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes) > } > > bool > +fs_visitor::run_tcs_single_patch() > +{ > + assert(stage == MESA_SHADER_TESS_CTRL); > + > + struct brw_tcs_prog_data *tcs_prog_data = > + (struct brw_tcs_prog_data *) prog_data; > + > + /* r1-r4 contain the ICP handles. */ > + payload.num_regs = 5; > + > + if (shader_time_index >= 0) > + emit_shader_time_begin(); > + > + /* Initialize gl_InvocationID */ > + fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW); > + fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD); > + bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210))); > + bld.MOV(channels_ud, channels_uw); > + > + if (tcs_prog_data->instances == 1) { > + invocation_id = channels_ud; > + } else { > + invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD); > + > + /* Get instance number from g0.2 bits 23:17, and multiply it by 8. 
*/ > + fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD); > + fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD); > + bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)), > + brw_imm_ud(INTEL_MASK(23, 17))); > + bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3)); > + > + bld.ADD(invocation_id, instance_times_8, channels_ud); > + } > + > + /* Fix the disptach mask */ > + if (nir->info.tcs.vertices_out % 8) { > + bld.CMP(bld.null_reg_ud(), invocation_id, > + brw_imm_ud(nir->info.tcs.vertices_out), BRW_CONDITIONAL_L); > + bld.IF(BRW_PREDICATE_NORMAL); > + } > + > + emit_nir_code(); > + > + if (nir->info.tcs.vertices_out % 8) { > + bld.emit(BRW_OPCODE_ENDIF); > + } > + > + /* Emit EOT write; set TR DS Cache bit */ > + fs_reg srcs[3] = { > + fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)), > + fs_reg(brw_imm_ud(WRITEMASK_X << 16)), > + fs_reg(brw_imm_ud(0)), > + }; > + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); > + bld.LOAD_PAYLOAD(payload, srcs, 3, 2); > + > + fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED, > + bld.null_reg_ud(), payload); > + inst->mlen = 3; > + inst->base_mrf = -1; > + inst->eot = true; > + > + if (shader_time_index >= 0) > + emit_shader_time_end(); > + > + if (failed) > + return false; > + > + calculate_cfg(); > + > + optimize(); > + > + assign_curb_setup(); > + assign_tcs_single_patch_urb_setup(); > + > + fixup_3src_null_dest(); > + allocate_registers(); > + > + return !failed; > +} > + > +bool > fs_visitor::run_tes() > { > assert(stage == MESA_SHADER_TESS_EVAL); > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h > b/src/mesa/drivers/dri/i965/brw_fs.h > index bcd2e3e..f24c78a 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -110,6 +110,7 @@ public: > > bool run_fs(bool do_rep_send); > bool run_vs(gl_clip_plane *clip_planes); > + bool run_tcs_single_patch(); > bool run_tes(); > bool run_gs(); > bool run_cs(); > @@ -126,6 +127,7 @@ public: > void 
assign_urb_setup(); > void convert_attr_sources_to_hw_regs(fs_inst *inst); > void assign_vs_urb_setup(); > + void assign_tcs_single_patch_urb_setup(); > void assign_tes_urb_setup(); > void assign_gs_urb_setup(); > bool assign_regs(bool allow_spilling); > @@ -249,6 +251,8 @@ public: > nir_ssa_undef_instr *instr); > void nir_emit_vs_intrinsic(const brw::fs_builder &bld, > nir_intrinsic_instr *instr); > + void nir_emit_tcs_intrinsic(const brw::fs_builder &bld, > + nir_intrinsic_instr *instr); > void nir_emit_gs_intrinsic(const brw::fs_builder &bld, > nir_intrinsic_instr *instr); > void nir_emit_fs_intrinsic(const brw::fs_builder &bld, > @@ -404,6 +408,7 @@ public: > fs_reg userplane[MAX_CLIP_PLANES]; > fs_reg final_gs_vertex_count; > fs_reg control_data_bits; > + fs_reg invocation_id; > > unsigned grf_used; > bool spilled_any_registers; > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > index cf4f782..e617083 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > @@ -114,6 +114,9 @@ fs_visitor::nir_setup_single_output_varying(fs_reg *reg, > void > fs_visitor::nir_setup_outputs() > { > + if (stage == MESA_SHADER_TESS_CTRL) > + return; > + > brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; > > nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs); > @@ -232,6 +235,8 @@ emit_system_values_block(nir_block *block, void > *void_visitor) > break; > > case nir_intrinsic_load_invocation_id: > + if (v->stage == MESA_SHADER_TESS_CTRL) > + break; > assert(v->stage == MESA_SHADER_GEOMETRY); > reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; > if (reg->file == BAD_FILE) { > @@ -452,6 +457,9 @@ fs_visitor::nir_emit_instr(nir_instr *instr) > case MESA_SHADER_VERTEX: > nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); > break; > + case MESA_SHADER_TESS_CTRL: > + nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr)); > + break; > case 
MESA_SHADER_TESS_EVAL: > nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr)); > break; > @@ -1901,6 +1909,354 @@ fs_visitor::nir_emit_vs_intrinsic(const fs_builder > &bld, > } > > void > +fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, > + nir_intrinsic_instr *instr) > +{ > + assert(stage == MESA_SHADER_TESS_CTRL); > + struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; > + struct brw_tcs_prog_data *tcs_prog_data = > + (struct brw_tcs_prog_data *) prog_data; > + > + fs_reg dst; > + if (nir_intrinsic_infos[instr->intrinsic].has_dest) > + dst = get_nir_dest(instr->dest); > + > + switch (instr->intrinsic) { > + case nir_intrinsic_load_primitive_id: > + bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1))); > + break; > + case nir_intrinsic_load_invocation_id: > + bld.MOV(retype(dst, invocation_id.type), invocation_id); > + break; > + case nir_intrinsic_load_patch_vertices_in: > + bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), > + brw_imm_d(tcs_key->input_vertices)); > + break; > + > + case nir_intrinsic_barrier: { > + if (tcs_prog_data->instances == 1) > + break; > + > + fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + fs_reg m0_2 = byte_offset(m0, 2 * sizeof(uint32_t)); > + > + const fs_builder fwa_bld = bld.exec_all(); > + > + /* Zero the message header */ > + fwa_bld.MOV(m0, brw_imm_ud(0u)); > + > + /* Copy "Barrier ID" from r0.2, bits 16:13 */ > + fwa_bld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), > + brw_imm_ud(INTEL_MASK(16, 13))); > + > + /* Shift it up to bits 27:24. 
*/ > + fwa_bld.SHL(m0_2, m0_2, brw_imm_ud(11)); > + > + /* Set the Barrier Count and the enable bit */ > + fwa_bld.OR(m0_2, m0_2, > + brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); > + > + bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); > + break; > + } > + > + case nir_intrinsic_load_input: > + unreachable("nir_lower_io should never give us these."); > + break; > + > + case nir_intrinsic_load_per_vertex_input: { > + fs_reg indirect_offset = get_indirect_offset(instr); > + unsigned imm_offset = instr->const_index[0]; > + > + const nir_src &vertex_src = instr->src[0]; > + nir_const_value *vertex_const = nir_src_as_const_value(vertex_src); > + > + fs_inst *inst; > + > + fs_reg icp_handle; > + > + if (vertex_const) { > + /* Emit a MOV to resolve <0,1,0> regioning. */ > + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + bld.MOV(icp_handle, > + retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3), > + vertex_const->i32[0] & 7), > + BRW_REGISTER_TYPE_UD)); > + } else if (tcs_prog_data->instances == 1 && > + vertex_src.is_ssa && > + vertex_src.ssa->parent_instr->type == > nir_instr_type_intrinsic && > + > nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == > nir_intrinsic_load_invocation_id) { > + /* For the common case of only 1 instance, an array index of > + * gl_InvocationID means reading g1. Skip all the indirect work. > + */ > + icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); > + } else { > + /* The vertex index is non-constant. We need to use indirect > + * addressing to fetch the proper URB handle. > + */ > + icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + > + /* Each ICP handle is a single DWord (4 bytes) */ > + fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + bld.SHL(vertex_offset_bytes, > + retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), > + brw_imm_ud(2u)); > + > + /* Start at g1. We might read up to 4 registers. 
*/ > + bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, > + fs_reg(brw_vec8_grf(1, 0)), vertex_offset_bytes, > + brw_imm_ud(4 * REG_SIZE)); > + } > + > + if (indirect_offset.file == BAD_FILE) { > + /* Constant indexing - use global offset. */ > + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); > + inst->offset = imm_offset; > + inst->mlen = 1; > + inst->base_mrf = -1; > + inst->regs_written = instr->num_components; > + } else { > + /* Indirect indexing - use per-slot offsets as well. */ > + const fs_reg srcs[] = { icp_handle, indirect_offset }; > + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); > + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); > + > + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, > payload); > + inst->offset = imm_offset; > + inst->base_mrf = -1; > + inst->mlen = 2; > + inst->regs_written = instr->num_components; > + } > + > + /* Copy the temporary to the destination to deal with writemasking. > + * > + * Also attempt to deal with gl_PointSize being in the .w component. > + */ > + if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { > + inst->dst = bld.vgrf(dst.type, 4); > + inst->regs_written = 4; > + bld.MOV(dst, offset(inst->dst, bld, 3)); > + } > + break; > + } > + > + case nir_intrinsic_load_output: > + case nir_intrinsic_load_per_vertex_output: { > + fs_reg indirect_offset = get_indirect_offset(instr); > + unsigned imm_offset = instr->const_index[0]; > + > + fs_inst *inst; > + if (indirect_offset.file == BAD_FILE) { > + /* Replicate the patch handle to all enabled channels */ > + fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); > + bld.MOV(patch_handle, > + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); > + > + if (imm_offset == 0) { > + /* This is a read of gl_TessLevelInner[], which lives in the > + * Patch URB header. The layout depends on the domain. 
> + */ > + dst.type = BRW_REGISTER_TYPE_F; > + switch (tcs_key->tes_primitive_mode) { > + case GL_QUADS: { > + /* DWords 3-2 (reversed) */ > + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4); > + > + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, > patch_handle); > + inst->offset = 0; > + inst->mlen = 1; > + inst->base_mrf = -1; > + inst->regs_written = 4; > + > + /* dst.xy = tmp.wz */ > + bld.MOV(dst, offset(tmp, bld, 3)); > + bld.MOV(offset(dst, bld, 1), offset(tmp, bld, 2)); > + break; > + } > + case GL_TRIANGLES: > + /* DWord 4; hardcode offset = 1 and regs_written = 1 */ > + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, > patch_handle); > + inst->offset = 1; > + inst->mlen = 1; > + inst->base_mrf = -1; > + inst->regs_written = 1; > + break; > + case GL_ISOLINES: > + /* All channels are undefined. */ > + break; > + default: > + unreachable("Bogus tessellation domain"); > + } > + } else if (imm_offset == 1) { > + /* This is a read of gl_TessLevelOuter[], which lives in the > + * Patch URB header. The layout depends on the domain. 
> + */ > + dst.type = BRW_REGISTER_TYPE_F; > + > + fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4); > + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, patch_handle); > + inst->offset = 1; > + inst->mlen = 1; > + inst->base_mrf = -1; > + inst->regs_written = 4; > + > + /* Reswizzle: WZYX */ > + fs_reg srcs[4] = { > + offset(tmp, bld, 3), > + offset(tmp, bld, 2), > + offset(tmp, bld, 1), > + offset(tmp, bld, 0), > + }; > + > + unsigned num_components; > + switch (tcs_key->tes_primitive_mode) { > + case GL_QUADS: > + num_components = 4; > + break; > + case GL_TRIANGLES: > + num_components = 3; > + break; > + case GL_ISOLINES: > + /* Isolines are not reversed; swizzle .zw -> .xy */ > + srcs[0] = offset(tmp, bld, 2); > + srcs[1] = offset(tmp, bld, 3); > + num_components = 2; > + break; > + default: > + unreachable("Bogus tessellation domain"); > + } > + bld.LOAD_PAYLOAD(dst, srcs, num_components, 0); > + } else { > + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, patch_handle); > + inst->offset = imm_offset; > + inst->mlen = 1; > + inst->base_mrf = -1; > + inst->regs_written = instr->num_components; > + } > + } else { > + /* Indirect indexing - use per-slot offsets as well. 
*/ > + const fs_reg srcs[] = { > + retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), > + indirect_offset > + }; > + fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); > + bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); > + > + inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, > payload); > + inst->offset = imm_offset; > + inst->mlen = 2; > + inst->base_mrf = -1; > + inst->regs_written = instr->num_components; > + } > + break; > + } > + > + case nir_intrinsic_store_output: > + case nir_intrinsic_store_per_vertex_output: { > + fs_reg value = get_nir_src(instr->src[0]); > + fs_reg indirect_offset = get_indirect_offset(instr); > + unsigned imm_offset = instr->const_index[0]; > + unsigned swiz = BRW_SWIZZLE_XYZW; > + unsigned mask = instr->const_index[1]; > + unsigned header_regs = 0; > + fs_reg srcs[7]; > + srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); > + > + if (indirect_offset.file != BAD_FILE) { > + srcs[header_regs++] = indirect_offset; > + } else if (tcs_key->program_string_id != 0) { > + if (imm_offset == 0) { > + value.type = BRW_REGISTER_TYPE_F; > + > + mask &= (1 << > tesslevel_inner_components(tcs_key->tes_primitive_mode)) - 1; > + > + /* This is a write to gl_TessLevelInner[], which lives in the > + * Patch URB header. The layout depends on the domain. > + */ > + switch (tcs_key->tes_primitive_mode) { > + case GL_QUADS: > + /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed). > + * We use an XXYX swizzle to reverse put .xy in the .wz > + * channels, and use a .zw writemask. > + */ > + mask = writemask_for_backwards_vector(mask); > + swiz = BRW_SWIZZLE4(0, 0, 1, 0); > + break; > + case GL_TRIANGLES: > + /* gl_TessLevelInner[].x lives at DWord 4, so we set the > + * writemask to X and bump the URB offset by 1. > + */ > + imm_offset = 1; > + break; > + case GL_ISOLINES: > + /* Skip; gl_TessLevelInner[] doesn't exist for isolines. 
*/ > + return; > + default: > + unreachable("Bogus tessellation domain"); > + } > + } else if (imm_offset == 1) { > + /* This is a write to gl_TessLevelOuter[] which lives in the > + * Patch URB Header at DWords 4-7. However, it's reversed, so > + * instead of .xyzw we have .wzyx. > + */ > + value.type = BRW_REGISTER_TYPE_F; > + > + mask &= (1 << > tesslevel_outer_components(tcs_key->tes_primitive_mode)) - 1; > + > + if (tcs_key->tes_primitive_mode == GL_ISOLINES) { > + /* Isolines .xy should be stored in .zw, in order. */ > + swiz = BRW_SWIZZLE4(0, 0, 0, 1); > + mask <<= 2; > + } else { > + /* Other domains are reversed; store .wzyx instead of .xyzw */ > + swiz = BRW_SWIZZLE_WZYX; > + mask = writemask_for_backwards_vector(mask); > + } > + } > + } > + > + if (mask == 0) > + break; > + > + unsigned num_components = _mesa_fls(mask); > + enum opcode opcode; > + > + if (mask != WRITEMASK_XYZW) { > + srcs[header_regs++] = brw_imm_ud(mask << 16); > + opcode = indirect_offset.file != BAD_FILE ? > + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : > + SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; > + } else { > + opcode = indirect_offset.file != BAD_FILE ? 
> + SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : > + SHADER_OPCODE_URB_WRITE_SIMD8; > + } > + > + for (unsigned i = 0; i < num_components; i++) { > + if (mask & (1 << i)) > + srcs[header_regs + i] = offset(value, bld, BRW_GET_SWZ(swiz, i)); > + } > + > + unsigned mlen = header_regs + num_components; > + > + fs_reg payload = > + bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); > + bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); > + > + fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); > + inst->offset = imm_offset; > + inst->mlen = mlen; > + inst->base_mrf = -1; > + break; > + } > + > + default: > + nir_emit_intrinsic(bld, instr); > + break; > + } > +} > + > +void > fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, > nir_intrinsic_instr *instr) > { > diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > index daabf70..41a9b12 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp > @@ -1014,6 +1014,9 @@ fs_visitor::init() > case MESA_SHADER_VERTEX: > key_tex = &((const brw_vs_prog_key *) key)->tex; > break; > + case MESA_SHADER_TESS_CTRL: > + key_tex = &((const brw_tcs_prog_key *) key)->tex; > + break; > case MESA_SHADER_TESS_EVAL: > key_tex = &((const brw_tes_prog_key *) key)->tex; > break; > diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c > b/src/mesa/drivers/dri/i965/brw_tcs.c > index 0117ffe..98ed2b2 100644 > --- a/src/mesa/drivers/dri/i965/brw_tcs.c > +++ b/src/mesa/drivers/dri/i965/brw_tcs.c > @@ -214,7 +214,8 @@ brw_codegen_tcs_prog(struct brw_context *brw, > prog_data.base.base.nr_image_params = tcs->NumImages; > > brw_nir_setup_glsl_uniforms(nir, shader_prog, &tcp->program.Base, > - &prog_data.base.base, false); > + &prog_data.base.base, > + > compiler->scalar_stage[MESA_SHADER_TESS_CTRL]); > } else { > /* Upload the Patch URB Header as the first two uniforms. > * Do the annoying scrambling so the shader doesn't have to. 
> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp > index 17e3448..79cf93e 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp > @@ -29,6 +29,7 @@ > > #include "brw_nir.h" > #include "brw_vec4_tcs.h" > +#include "brw_fs.h" > > namespace brw { > > @@ -452,7 +453,10 @@ brw_compile_tcs(const struct brw_compiler *compiler, > brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map); > nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar); > > - prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2); > + if (is_scalar) > + prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 8); > + else > + prog_data->instances = DIV_ROUND_UP(nir->info.tcs.vertices_out, 2); > > /* Compute URB entry size. The maximum allowed URB entry size is 32k. > * That divides up as follows: > @@ -493,20 +497,49 @@ brw_compile_tcs(const struct brw_compiler *compiler, > brw_print_vue_map(stderr, &vue_prog_data->vue_map); > } > > - vec4_tcs_visitor v(compiler, log_data, key, prog_data, > - nir, mem_ctx, shader_time_index, &input_vue_map); > - if (!v.run()) { > - if (error_str) > - *error_str = ralloc_strdup(mem_ctx, v.fail_msg); > - return NULL; > - } > + if (is_scalar) { > + fs_visitor v(compiler, log_data, mem_ctx, (void *) key, > + &prog_data->base.base, NULL, nir, 8, > + shader_time_index, &input_vue_map); > + if (!v.run_tcs_single_patch()) { > + if (error_str) > + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); > + return NULL; > + } > > - if (unlikely(INTEL_DEBUG & DEBUG_TCS)) > - v.dump_instructions(); > + prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8; > + > + fs_generator g(compiler, log_data, mem_ctx, (void *) key, > + &prog_data->base.base, v.promoted_constants, false, > + MESA_SHADER_TESS_CTRL); > + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) { > + g.enable_debug(ralloc_asprintf(mem_ctx, > + "%s tessellation control shader %s", > + nir->info.label ? 
nir->info.label > + : "unnamed", > + nir->info.name)); > + } > + > + g.generate_code(v.cfg, 8); > > - return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir, > - &prog_data->base, v.cfg, > - final_assembly_size); > + return g.get_assembly(final_assembly_size); > + } else { > + vec4_tcs_visitor v(compiler, log_data, key, prog_data, > + nir, mem_ctx, shader_time_index, &input_vue_map); > + if (!v.run()) { > + if (error_str) > + *error_str = ralloc_strdup(mem_ctx, v.fail_msg); > + return NULL; > + } > + > + if (unlikely(INTEL_DEBUG & DEBUG_TCS)) > + v.dump_instructions(); > + > + > + return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir, > + &prog_data->base, v.cfg, > + final_assembly_size); > + } > } > > > -- > 2.8.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev