On Fri, 2017-03-24 at 16:57 -0700, Jason Ekstrand wrote:
> On Fri, Mar 24, 2017 at 5:53 AM, Iago Toral <ito...@igalia.com> wrote:
> > On Wed, 2017-03-22 at 21:01 -0700, Jason Ekstrand wrote:
> > ---
> >  src/intel/vulkan/anv_private.h     |   6 ++
> >  src/intel/vulkan/genX_cmd_buffer.c | 141 ++++++++++++++++++++++++++++++++++++-
> >  src/intel/vulkan/genX_pipeline.c   |  10 ++-
> >  3 files changed, 152 insertions(+), 5 deletions(-)
> >
> > diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> > index b402bc3..253dce2 100644
> > --- a/src/intel/vulkan/anv_private.h
> > +++ b/src/intel/vulkan/anv_private.h
> > @@ -1980,6 +1980,12 @@ struct anv_subpass {
> >     bool has_resolve;
> >  };
> >
> > +static inline unsigned
> > +anv_subpass_view_count(const struct anv_subpass *subpass)
> > +{
> > +   return MAX2(1, _mesa_bitcount(subpass->view_mask));
> > +}
> > +
> >  enum anv_subpass_usage {
> >     ANV_SUBPASS_USAGE_DRAW = (1 << 0),
> >     ANV_SUBPASS_USAGE_INPUT = (1 << 1),
> > diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> > index aafe7fd..8c21c33 100644
> > --- a/src/intel/vulkan/genX_cmd_buffer.c
> > +++ b/src/intel/vulkan/genX_cmd_buffer.c
> > @@ -26,6 +26,7 @@
> >
> >  #include "anv_private.h"
> >  #include "vk_format_info.h"
> > +#include "util/vk_util.h"
> >
> >  #include "common/gen_l3_config.h"
> >  #include "genxml/gen_macros.h"
> > @@ -50,6 +51,17 @@ emit_lri(struct anv_batch *batch, uint32_t reg, uint32_t imm)
> >     }
> >  }
> >
> > +#if GEN_IS_HASWELL || GEN_GEN >= 8
> > +static void
> > +emit_lrr(struct anv_batch *batch, uint32_t dst, uint32_t src)
> > +{
> > +   anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
> > +      lrr.SourceRegisterAddress = src;
> > +      lrr.DestinationRegisterAddress = dst;
> > +   }
> > +}
> > +#endif
> > +
> >  void
> >  genX(cmd_buffer_emit_state_base_address)(struct anv_cmd_buffer *cmd_buffer)
> >  {
> > @@ -1525,7 +1537,13 @@ genX(cmd_buffer_flush_state)(struct anv_cmd_buffer *cmd_buffer)
> >              .MemoryObjectControlState = GENX(MOCS),
> >  #else
> >              .BufferAccessType = pipeline->instancing_enable[vb] ? INSTANCEDATA : VERTEXDATA,
> > -            .InstanceDataStepRate = 1,
> > +            /* Our implementation of VK_KHR_multiview uses instancing to draw
> > +             * the different views.  If the client asks for instancing, we
> > +             * need to use the Instance Data Step Rate to ensure that we
> > +             * repeat the client's per-instance data once for each view.
> > +             */
> > +            .InstanceDataStepRate =
> > +               _mesa_bitcount(pipeline->subpass->view_mask),
> >
> > mmm... shouldn't this be:
> >
> > .InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);
> >
> > so that we set this to 1 when multiview is not in use? (view_mask == 0)
>
> Good call!  You're absolutely right.  This line is older than
> anv_subpass_view_count, I think.
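
(As an aside, an illustration of that point, not part of the patch: the two
expressions only differ when multiview is disabled for the subpass, i.e. when
view_mask == 0.)

   /* Illustration only, not part of the patch. */
   uint32_t view_mask = 0;                              /* multiview not in use */
   unsigned raw   = _mesa_bitcount(view_mask);          /* 0: step rate would be 0 */
   unsigned views = MAX2(1, _mesa_bitcount(view_mask)); /* anv_subpass_view_count(): 1 */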
> >
> >              .VertexBufferMemoryObjectControlState = GENX(MOCS),
> >  #endif
> >
> > @@ -1715,6 +1733,11 @@ void genX(CmdDraw)(
> >     if (vs_prog_data->uses_drawid)
> >        emit_draw_index(cmd_buffer, 0);
> >
> > +   /* Our implementation of VK_KHR_multiview uses instancing to draw the
> > +    * different views.  We need to multiply instanceCount by the view count.
> > +    */
> > +   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
> > +
> >     anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
> >        prim.VertexAccessType         = SEQUENTIAL;
> >        prim.PrimitiveTopologyType    = pipeline->topology;
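
(A concrete illustration of that multiplication, not part of the patch: with a
view mask of 0x3, i.e. two views, a vkCmdDraw with instanceCount == 4 becomes 8
hardware instances, and an Instance Data Step Rate of 2 repeats each client
instance's per-instance data once for each view.)

   /* Illustration only, not part of the patch. */
   uint32_t view_mask = 0x3;                            /* two views        */
   unsigned views = MAX2(1, _mesa_bitcount(view_mask)); /* 2                */

   uint32_t instanceCount = 4;                          /* from the client  */
   instanceCount *= views;                              /* 8 HW instances   */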
> >
> > @@ -1748,6 +1771,11 @@ void genX(CmdDrawIndexed)(
> >     if (vs_prog_data->uses_drawid)
> >        emit_draw_index(cmd_buffer, 0);
> >
> > +   /* Our implementation of VK_KHR_multiview uses instancing to draw the
> > +    * different views.  We need to multiply instanceCount by the view count.
> > +    */
> > +   instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
> > +
> >     anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
> >        prim.VertexAccessType         = RANDOM;
> >        prim.PrimitiveTopologyType    = pipeline->topology;
> > @@ -1767,6 +1795,90 @@ void genX(CmdDrawIndexed)(
> >  #define GEN7_3DPRIM_START_INSTANCE     0x243C
> >  #define GEN7_3DPRIM_BASE_VERTEX        0x2440
> >
> > +/* MI_MATH only exists on Haswell+ */
> > +#if GEN_IS_HASWELL || GEN_GEN >= 8
> > +
> > +#define alu_opcode(v)   __gen_uint((v), 20, 31)
> > +#define alu_operand1(v) __gen_uint((v), 10, 19)
> > +#define alu_operand2(v) __gen_uint((v), 0, 9)
> > +#define alu(opcode, operand1, operand2) \
> > +   alu_opcode(opcode) | alu_operand1(operand1) | alu_operand2(operand2)
> > +
> > +#define OPCODE_NOOP      0x000
> > +#define OPCODE_LOAD      0x080
> > +#define OPCODE_LOADINV   0x480
> > +#define OPCODE_LOAD0     0x081
> > +#define OPCODE_LOAD1     0x481
> > +#define OPCODE_ADD       0x100
> > +#define OPCODE_SUB       0x101
> > +#define OPCODE_AND       0x102
> > +#define OPCODE_OR        0x103
> > +#define OPCODE_XOR       0x104
> > +#define OPCODE_STORE     0x180
> > +#define OPCODE_STOREINV  0x580
> > +
> > +#define OPERAND_R0   0x00
> > +#define OPERAND_R1   0x01
> > +#define OPERAND_R2   0x02
> > +#define OPERAND_R3   0x03
> > +#define OPERAND_R4   0x04
> > +#define OPERAND_SRCA 0x20
> > +#define OPERAND_SRCB 0x21
> > +#define OPERAND_ACCU 0x31
> > +#define OPERAND_ZF   0x32
> > +#define OPERAND_CF   0x33
> > +
> > +#define CS_GPR(n) (0x2600 + (n) * 8)
> > +
> > +/* Emit dwords to multiply GPR0 by N */
> > +static void
> > +build_alu_multiply_gpr0(uint32_t *dw, unsigned *dw_count, uint32_t N)
> > +{
> > +   VK_OUTARRAY_MAKE(out, dw, dw_count);
> > +
> > +#define append_alu(opcode, operand1, operand2) \
> > +   vk_outarray_append(&out, alu_dw) *alu_dw = alu(opcode, operand1, operand2)
> > +
> > +   assert(N > 0);
> > +   unsigned top_bit = 31 - __builtin_clz(N);
> > +   for (int i = top_bit - 1; i >= 0; i--) {
> > +      /* We get our initial data in GPR0 and we write the final data out to
> > +       * GPR0 but we use GPR1 as our scratch register.
> > +       */
> > +      unsigned src_reg = i == top_bit - 1 ? OPERAND_R0 : OPERAND_R1;
> > +      unsigned dst_reg = i == 0 ? OPERAND_R0 : OPERAND_R1;
> > +
> > +      /* Shift the current value left by 1 */
> > +      append_alu(OPCODE_LOAD, OPERAND_SRCA, src_reg);
> > +      append_alu(OPCODE_LOAD, OPERAND_SRCB, src_reg);
> > +      append_alu(OPCODE_ADD, 0, 0);
> > +
> > +      if (N & (1 << i)) {
> > +         /* Store ACCU to R1 and add R0 to R1 */
> > +         append_alu(OPCODE_STORE, OPERAND_R1, OPERAND_ACCU);
> > +         append_alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0);
> > +         append_alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1);
> > +         append_alu(OPCODE_ADD, 0, 0);
> > +      }
> > +
> > +      append_alu(OPCODE_STORE, dst_reg, OPERAND_ACCU);
> > +   }
> > +
> > +#undef append_alu
> > +}
> > +
> > +static void
> > +emit_mul_gpr0(struct anv_batch *batch, uint32_t N)
> > +{
> > +   uint32_t num_dwords;
> > +   build_alu_multiply_gpr0(NULL, &num_dwords, N);
> >
> > I don't get this, it seems that the expectation of this code is that
> > build_alu_multiply_gpr0() fills in num_dwords when we call it with NULL
> > in the first argument, but I don't see where it does that, what am I
> > missing?
>
> build_alu_multiply_gpr0 has the same semantics as the Vulkan query
> functions that return arrays.  If you call it with NULL, it returns the
> number of dwords required.  Otherwise, it assumes num_dwords is the
> number of dwords that it has available to write.  I made use of
> vk_outarray to get this behavior.  Yeah, it's a bit deceptive, but it
> worked out really well. :-)  Feel free to tell me that it's a bit too
> clever and that I should make it simpler.

No, it is okay. I understood what the intended behavior was, but somehow I
didn't see clearly how the macros in vk_util achieved that and got confused.
I looked at it more carefully now and I think it is clear and makes the
implementation easier too.
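
(As a reading aid, not part of the patch: the ALU stream that
build_alu_multiply_gpr0() emits is the classic shift-and-add multiply.  A
plain-C model of what it computes, under that reading, would be:)

   /* Illustrative model only, not part of the patch; needs <stdint.h> and <assert.h>. */
   static uint32_t
   mul_gpr0_model(uint32_t gpr0, uint32_t N)
   {
      assert(N > 0);
      unsigned top_bit = 31 - __builtin_clz(N);

      /* The top set bit of N contributes the initial copy of GPR0. */
      uint32_t acc = gpr0;

      for (int i = top_bit - 1; i >= 0; i--) {
         /* "Shift the current value left by 1" == add it to itself. */
         acc = acc + acc;

         /* If this bit of N is set, add in the original GPR0 value. */
         if (N & (1u << i))
            acc = acc + gpr0;
      }

      return acc;   /* == gpr0 * N (mod 2^32) */
   }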
> > > > +
> > > > +   uint32_t *dw = anv_batch_emitn(batch, 1 + num_dwords, GENX(MI_MATH));
> > > > +   build_alu_multiply_gpr0(dw, &num_dwords, N);
> > > > +}
> > > > +
> > > > +#endif /* GEN_IS_HASWELL || GEN_GEN >= 8 */
> > > > +
> > > >  static void
> > > >  load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
> > > >                           struct anv_buffer *buffer, uint64_t offset,
> > > > @@ -1777,7 +1889,22 @@ load_indirect_parameters(struct anv_cmd_buffer *cmd_buffer,
> > > >     uint32_t bo_offset = buffer->offset + offset;
> > > >
> > > >     emit_lrm(batch, GEN7_3DPRIM_VERTEX_COUNT, bo, bo_offset);
> > > > -   emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> > > > +
> > > > +   unsigned view_count = anv_subpass_view_count(cmd_buffer->state.subpass);
> > > > +   if (view_count > 1) {
> > > > +#if GEN_IS_HASWELL || GEN_GEN >= 8
> > > > +      emit_lrm(batch, CS_GPR(0), bo, bo_offset + 4);
> > > > +      emit_mul_gpr0(batch, view_count);
> > > > +      emit_lrr(batch, GEN7_3DPRIM_INSTANCE_COUNT, CS_GPR(0));
> > > > +#else
> > > > +      anv_finishme("Multiview + indirect draw requires MI_MATH\n"
> > > > +                   "MI_MATH is not supported on Ivy Bridge");
> > > > +      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> > > > +#endif
> > > > +   } else {
> > > > +      emit_lrm(batch, GEN7_3DPRIM_INSTANCE_COUNT, bo, bo_offset + 4);
> > > > +   }
> > > > +
> > > >     emit_lrm(batch, GEN7_3DPRIM_START_VERTEX, bo, bo_offset + 8);
> > > >
> > > >     if (indexed) {
> > > > @@ -2531,6 +2658,16 @@ genX(cmd_buffer_set_subpass)(struct anv_cmd_buffer *cmd_buffer,
> > > >
> > > >     cmd_buffer->state.dirty |= ANV_CMD_DIRTY_RENDER_TARGETS;
> > > >
> > > > +   /* Our implementation of VK_KHR_multiview uses instancing to draw the
> > > > +    * different views.  If the client asks for instancing, we need to use the
> > > > +    * Instance Data Step Rate to ensure that we repeat the client's
> > > > +    * per-instance data once for each view.  Since this bit is in
> > > > +    * VERTEX_BUFFER_STATE on gen7, we need to dirty vertex buffers at the top
> > > > +    * of each subpass.
> > > > +    */
> > > > +   if (GEN_GEN == 7)
> > > > +      cmd_buffer->state.vb_dirty |= ~0;
> > > > +
> > >
> > > Shouldn't we do:
> > >
> > > if (GEN_GEN == 7 && subpass->view_mask != 0)
> > >    cmd_buffer->state.vb_dirty |= ~0;
> > >
> > > So this doesn't affect cases that don't use multiview?
> >
> > Yes, we could do that because we're guaranteed that, for each render
> > pass, either all subpass view masks are 0 or all are non-zero.  This
> > seems safer though and I doubt flushing vertex buffers at the subpass
> > boundary will hurt anyone.

Right, makes sense.

Reviewed-by: Iago Toral Quiroga <ito...@igalia.com>

> > > >     /* Perform transitions to the subpass layout before any writes have
> > > >      * occurred.
> > > >      */
> > > > diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
> > > > index 85a9e4f..9aaa8a0 100644
> > > > --- a/src/intel/vulkan/genX_pipeline.c
> > > > +++ b/src/intel/vulkan/genX_pipeline.c
> > > > @@ -155,9 +155,13 @@ emit_vertex_input(struct anv_pipeline *pipeline,
> > > >        anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
> > > >           vfi.InstancingEnable = pipeline->instancing_enable[desc->binding];
> > > >           vfi.VertexElementIndex = slot;
> > > > -         /* Vulkan so far doesn't have an instance divisor, so
> > > > -          * this is always 1 (ignored if not instancing). */
> > > > -         vfi.InstanceDataStepRate = 1;
> > > > +         /* Our implementation of VK_KHX_multiview uses instancing to draw
> > > > +          * the different views.  If the client asks for instancing, we
> > > > +          * need to use the Instance Data Step Rate to ensure that we
> > > > +          * repeat the client's per-instance data once for each view.
> > > > +          */
> > > > +         vfi.InstanceDataStepRate =
> > > > +            _mesa_bitcount(pipeline->subpass->view_mask);
> > >
> > > Same comment as above, shouldn't this be:
> > >
> > > vfi.InstanceDataStepRate = anv_subpass_view_count(pipeline->subpass);