On Fri, Mar 24, 2017 at 5:53 AM, Iago Toral <ito...@igalia.com> wrote:
> On Wed, 2017-03-22 at 21:01 -0700, Jason Ekstrand wrote:
> ---
>  src/intel/vulkan/anv_private.h     |   6 ++
>  src/intel/vulkan/genX_cmd_buffer.c | 141 ++++++++++++++++++++++++++++++++++++-
>  src/intel/vulkan/genX_pipeline.c   |  10 ++-
>  3 files changed, 152 insertions(+), 5 deletions(-)
>
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index b402bc3..253dce2 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1980,6 +1980,12 @@ struct anv_subpass {
>     bool has_resolve;
>  };
>
> +static inline unsigned
> +anv_subpass_view_count(const struct anv_subpass *subpass)
> +{
> +   return MAX2(1, _mesa_bitcount(subpass->view_mask));
> +}
> +
>  enum anv_subpass_usage {
>     ANV_SUBPASS_USAGE_DRAW = (1 << 0),
>     ANV_SUBPASS_USAGE_INPUT = (1 << 1),
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> index aafe7fd..8c21c33 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -26,6 +26,7 @@
>
>  #include "anv_private.h"
>  #include "vk_format_info.h"
> +#include "util/vk_util.h"
>
>  #include "common/gen_l3_config.h"
>  #include "genxml/gen_macros.h"
> @@ -50,6 +51,17 @@ emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
>     }
>  }
>
> +#if GEN_IS_HASWELL || GEN_GEN >= 8
> +static void
> +emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src)
> +{
> +   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
> +      lrr.SourceRegisterAddress = src;
> +      lrr.DestinationRegisterAddress = dst;
> +   }
> +}
> +#endif
> +
>  void
>  genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
>  {
> @@ -1525,7 +1537,13 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
>              .MemoryObjectControlState = GENX(MOCS),
>  #else
>              .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
> -            .InstanceDataStepRate = 1,
> +            /* Our implementation of VK_KHR_multiview uses instancing to draw
> +             * the different views.  If the client asks for instancing, we
> +             * need to use the Instance Data Step Rate to ensure that we
> +             * repeat the client's per-instance data once for each view.
> +             */
> +            .InstanceDataStepRate =
> +               _mesa_bitcount(pipeline->subpass->view_mask),
>
> mmm... shouldn't this be:
>
> .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
>
> so that we set this to 1 when multiview is not in use? (view_mask == 0)
>

Good call! You're absolutely right.  This line is older than
anv_subpass_view_count, I think.
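
To make the instancing trick concrete, here's a tiny stand-alone
illustration of the bookkeeping (not ANV code, just made-up names),
assuming two views and a client draw of three instances:

   #include <stdio.h>

   int main(void)
   {
      unsigned view_mask    = 0x3;  /* 2 views */
      unsigned views        = view_mask ? __builtin_popcount(view_mask) : 1;
      unsigned client_count = 3;    /* instanceCount passed to vkCmdDraw */

      /* The hardware draws one instance per (client instance, view) pair. */
      unsigned hw_instances = client_count * views;  /* 6 */

      /* A step rate equal to the view count makes each of the client's
       * per-instance vertex elements stick around for 'views' consecutive
       * hardware instances, i.e. once per view.
       */
      unsigned step_rate = views;  /* 2 */

      printf("views=%u hw_instances=%u step_rate=%u\n",
             views, hw_instances, step_rate);
      return 0;
   }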
>              .VertexBufferMemoryObjectControlState = GENX(MOCS),
>  #endif
>
> @@ -1715,6 +1733,11 @@ void genX(CmdDraw)(
>     if (vs_prog_data->uses_drawid)
>        emit_draw_index(cmd_buffer, 0);
>
> +   /* Our implementation of VK_KHR_multiview uses instancing to draw the
> +    * different views.  We need to multiply instanceCount by the view count.
> +    */
> +   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
> +
>     anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
>        prim.VertexAccessType         = SEQUENTIAL;
>        prim.PrimitiveTopologyType    = pipeline->topology;
> @@ -1748,6 +1771,11 @@ void genX(CmdDrawIndexed)(
>     if (vs_prog_data->uses_drawid)
>        emit_draw_index(cmd_buffer, 0);
>
> +   /* Our implementation of VK_KHR_multiview uses instancing to draw the
> +    * different views.  We need to multiply instanceCount by the view count.
> +    */
> +   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
> +
>     anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
>        prim.VertexAccessType         = RANDOM;
>        prim.PrimitiveTopologyType    = pipeline->topology;
> @@ -1767,6 +1795,90 @@ void genX(CmdDrawIndexed)(
>  #define GEN7_3DPRIM_START_INSTANCE      0x243C
>  #define GEN7_3DPRIM_BASE_VERTEX         0x2440
>
> +/* MI_MATH only exists on Haswell+ */
> +#if GEN_IS_HASWELL || GEN_GEN >= 8
> +
> +#define alu_opcode(v)   __gen_uint((v), 20, 31)
> +#define alu_operand1(v) __gen_uint((v), 10, 19)
> +#define alu_operand2(v) __gen_uint((v), 0, 9)
> +#define alu(opcode, operand1, operand2) \
> +   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)
> +
> +#define OPCODE_NOOP      0x000
> +#define OPCODE_LOAD      0x080
> +#define OPCODE_LOADINV   0x480
> +#define OPCODE_LOAD0     0x081
> +#define OPCODE_LOAD1     0x481
> +#define OPCODE_ADD       0x100
> +#define OPCODE_SUB       0x101
> +#define OPCODE_AND       0x102
> +#define OPCODE_OR        0x103
> +#define OPCODE_XOR       0x104
> +#define OPCODE_STORE     0x180
> +#define OPCODE_STOREINV  0x580
> +
> +#define OPERAND_R0   0x00
> +#define OPERAND_R1   0x01
> +#define OPERAND_R2   0x02
> +#define OPERAND_R3   0x03
> +#define OPERAND_R4   0x04
> +#define OPERAND_SRCA 0x20
> +#define OPERAND_SRCB 0x21
> +#define OPERAND_ACCU 0x31
> +#define OPERAND_ZF   0x32
> +#define OPERAND_CF   0x33
> +
> +#define CS_GPR(n) (0x2600 + (n) * 8)
> +
> +/* Emit dwords to multiply GPR0 by N */
> +static void
> +build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
> +{
> +   VK_OUTARRAY_MAKE(out, dw, dw_count);
> +
> +#define append_alu(opcode, operand1, operand2) \
> +   vk_outarray_append(&out, alu_dw) *alu_dw = alu(opcode, operand1, operand2)
> +
> +   assert(N > 0);
> +   unsigned top_bit = 31 - __builtin_clz(N);
> +   for (int i = top_bit - 1; i >= 0; i--) {
> +      /* We get our initial data in GPR0 and we write the final data out to
> +       * GPR0 but we use GPR1 as our scratch register.
> +       */
> +      unsigned src_reg = i == top_bit - 1 ? OPERAND_R0 : OPERAND_R1;
> +      unsigned dst_reg = i == 0 ? OPERAND_R0 : OPERAND_R1;
> +
> +      /* Shift the current value left by 1 */
> +      append_alu(OPCODE_LOAD, OPERAND_SRCA, src_reg);
> +      append_alu(OPCODE_LOAD, OPERAND_SRCB, src_reg);
> +      append_alu(OPCODE_ADD, 0, 0);
> +
> +      if (N & (1 << i)) {
> +         /* Store ACCU to R1 and add R0 to R1 */
> +         append_alu(OPCODE_STORE, OPERAND_R1, OPERAND_ACCU);
> +         append_alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
> +         append_alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
> +         append_alu(OPCODE_ADD, 0, 0);
> +      }
> +
> +      append_alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
> +   }
> +
> +#undef append_alu
> +}
> +
> +static void
> +emit_mul_gpr0(struct anv_batch *batch, uint32_t N)
> +{
> +   uint32_t num_dwords;
> +   build_alu_multiply_gpr0(NULL, &num_dwords, N);
>
> I don't get this, it seems that the expectation of this code is
> that build_alu_multiply_gpr0() fills in num_dwords when we call it
> with NULL in the first argument, but I don't see where it does that,
> what am I missing?
>

build_alu_multiply_gpr0 has the same semantics as the Vulkan query
functions that return arrays.  If you call it with NULL, it reports the
number of dwords required through num_dwords.  Otherwise, it assumes
num_dwords is the number of dwords it has available to write.  I made use
of vk_outarray to get this behavior.  Yeah, it's a bit deceptive, but it
worked out really well. :-)  Feel free to tell me that it's a bit too
clever and that I should make it simpler.
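
In case the pattern is unfamiliar, here's the same "query, then fill"
idiom spelled out as a stand-alone sketch (a hypothetical helper, not
real Mesa/ANV API):

   #include <stdint.h>
   #include <string.h>

   /* Same contract as build_alu_multiply_gpr0(): with dw == NULL it only
    * reports the required size through *count; otherwise *count is the
    * capacity on entry and the number of dwords written on exit.
    */
   static void
   get_magic_dwords(uint32_t *dw, uint32_t *count)
   {
      static const uint32_t magic[] = { 0x11, 0x22, 0x33 };
      const uint32_t needed = sizeof(magic) / sizeof(magic[0]);

      if (dw == NULL) {
         *count = needed;
         return;
      }

      const uint32_t n = *count < needed ? *count : needed;
      memcpy(dw, magic, n * sizeof(uint32_t));
      *count = n;
   }

The caller does exactly what emit_mul_gpr0() does below: one call with
NULL to size the MI_MATH packet, then a second call to fill the dwords
that anv_batch_emitn() hands back.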
> +
> +   uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH));
> +   build_alu_multiply_gpr0(dw, &num_dwords, N);
> +}
> +
> +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
> +
>  static void
>  load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
>                           struct anv_buffer *buffer, uint64_t offset,
> @@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
>     uint32_t bo_offset = buffer->offset + offset;
>
>     emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
> -   emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> +
> +   unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
> +   if (view_count > 1) {
> +#if GEN_IS_HASWELL || GEN_GEN >= 8
> +      emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
> +      emit_mul_gpr0(batch, view_count);
> +      emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
> +#else
> +      anv_finishme("Multiview + indirect draw requires MI_MATH\n"
> +                   "MI_MATH is not supported on Ivy Bridge");
> +      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> +#endif
> +   } else {
> +      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> +   }
> +
>     emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
>
>     if (indexed) {
> @@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
>
>     cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
>
> +   /* Our implementation of VK_KHR_multiview uses instancing to draw the
> +    * different views.  If the client asks for instancing, we need to use the
> +    * Instance Data Step Rate to ensure that we repeat the client's
> +    * per-instance data once for each view.  Since this bit is in
> +    * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top
> +    * of each subpass.
> +    */
> +   if (GEN_GEN == 7)
> +      cmd_buffer->state.vb_dirty |= ~0;
> +
>
> Shouldn't we do:
>
> if (GEN_GEN == 7 && subpass->view_mask != 0)
>    cmd_buffer->state.vb_dirty |= ~0;
>
> So this doesn't affect cases that don't use multiview?

Yes, we could do that because we're guaranteed that, for each render pass,
either all subpass view masks are 0 or all are non-zero.  This seems safer
though and I doubt flushing vertex buffers at the subpass boundary will
hurt anyone.

>     /* Perform transitions to the subpass layout before any writes have
>      * occurred.
>      */
> diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
> index 85a9e4f..9aaa8a0 100644
> --- a/src/intel/vulkan/genX_pipeline.c
> +++ b/src/intel/vulkan/genX_pipeline.c
> @@ -155,9 +155,13 @@ emit_vertex_input(struct anv_pipeline *pipeline,
>        anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
>           vfi.InstancingEnable = pipeline->instancing_enable[desc->binding];
>           vfi.VertexElementIndex = slot;
> -         /* Vulkan so far doesn't have an instance divisor, so
> -          * this is always 1 (ignored if not instancing). */
> -         vfi.InstanceDataStepRate = 1;
> +         /* Our implementation of VK_KHX_multiview uses instancing to draw
> +          * the different views.  If the client asks for instancing, we
> +          * need to use the Instance Data Step Rate to ensure that we
> +          * repeat the client's per-instance data once for each view.
> +          */
> +         vfi.InstanceDataStepRate =
> +            _mesa_bitcount(pipeline->subpass->view_mask);
>
> Same comment as above, shouldn't this be:
>
> vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
>

Yup.  Thanks!  I've fixed both locally.
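
For the record, in both spots that's just the line you suggested, along
the lines of (a sketch of the local fix, not the final committed patch):

   .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass),     /* genX_cmd_buffer.c */
   vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);  /* genX_pipeline.c */

which falls back to a step rate of 1 whenever view_mask is 0.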