On Wed, Oct 17, 2018 at 6:59 AM Danylo Piliaiev <danylo.pilia...@gmail.com>
wrote:
Conditional rendering affects the following functions:
- vkCmdDraw, vkCmdDrawIndexed, vkCmdDrawIndirect, vkCmdDrawIndexedIndirect
- vkCmdDrawIndirectCountKHR, vkCmdDrawIndexedIndirectCountKHR
- vkCmdDispatch, vkCmdDispatchIndirect, vkCmdDispatchBase
- vkCmdClearAttachments
To reduce memory reads, the result of the condition is calculated once
and stored in the designated register MI_ALU_REG15.
In the current implementation, the affected functions expect
MI_PREDICATE_RESULT to be set before they are called, so any code that
changes the predicate should restore it with
restore_conditional_render_predicate.
An alternative would be to restore MI_PREDICATE_RESULT at the beginning
of every affected function.
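
Concretely, the pattern is just two register-to-register loads (a
sketch using the emit_lrr helper and the register #defines from this
patch):

   /* At vkCmdBeginConditionalRenderingEXT: after MI_PREDICATE has
    * evaluated the condition, latch the result into a general-purpose
    * register so it can be reused without re-reading the condition
    * buffer.
    */
   emit_lrr(&cmd_buffer->batch, CS_GPR(MI_ALU_REG15), MI_PREDICATE_RESULT);

   /* Whenever other code (e.g. draw-count predication) clobbers
    * MI_PREDICATE_RESULT, put the saved condition back:
    */
   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));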
Signed-off-by: Danylo Piliaiev <danylo.pilia...@globallogic.com>
---
src/intel/vulkan/anv_blorp.c | 7 +-
src/intel/vulkan/anv_device.c | 12 ++
src/intel/vulkan/anv_extensions.py | 1 +
src/intel/vulkan/anv_private.h | 2 +
src/intel/vulkan/genX_cmd_buffer.c | 192 ++++++++++++++++++++++++++++-
5 files changed, 209 insertions(+), 5 deletions(-)
diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c
index 478b8e7a3d..157875d16f 100644
--- a/src/intel/vulkan/anv_blorp.c
+++ b/src/intel/vulkan/anv_blorp.c
@@ -1144,8 +1144,11 @@ void anv_CmdClearAttachments(
* trash our depth and stencil buffers.
*/
struct blorp_batch batch;
-   blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer,
-                    BLORP_BATCH_NO_EMIT_DEPTH_STENCIL);
+   enum blorp_batch_flags flags = BLORP_BATCH_NO_EMIT_DEPTH_STENCIL;
+   if (cmd_buffer->state.conditional_render_enabled) {
+      flags |= BLORP_BATCH_PREDICATE_ENABLE;
+   }
+   blorp_batch_init(&cmd_buffer->device->blorp, &batch, cmd_buffer, flags);
    for (uint32_t a = 0; a < attachmentCount; ++a) {
       if (pAttachments[a].aspectMask & VK_IMAGE_ASPECT_ANY_COLOR_BIT_ANV) {
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index a2551452eb..930a192c25 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -957,6 +957,18 @@ void anv_GetPhysicalDeviceFeatures2(
break;
}
+      case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_CONDITIONAL_RENDERING_FEATURES_EXT: {
+         VkPhysicalDeviceConditionalRenderingFeaturesEXT *features =
+            (VkPhysicalDeviceConditionalRenderingFeaturesEXT*)ext;
+         ANV_FROM_HANDLE(anv_physical_device, pdevice, physicalDevice);
+
+         features->conditionalRendering = pdevice->info.gen >= 8 ||
+                                          pdevice->info.is_haswell;
+         features->inheritedConditionalRendering = pdevice->info.gen >= 8 ||
+                                                   pdevice->info.is_haswell;
+         break;
+      }
+
default:
anv_debug_ignored_stype(ext->sType);
break;
diff --git a/src/intel/vulkan/anv_extensions.py b/src/intel/vulkan/anv_extensions.py
index c13ce531ee..2ef7a52d01 100644
--- a/src/intel/vulkan/anv_extensions.py
+++ b/src/intel/vulkan/anv_extensions.py
@@ -127,6 +127,7 @@ EXTENSIONS = [
     Extension('VK_EXT_vertex_attribute_divisor',          3, True),
     Extension('VK_EXT_post_depth_coverage',               1, 'device->info.gen >= 9'),
     Extension('VK_EXT_sampler_filter_minmax',             1, 'device->info.gen >= 9'),
+    Extension('VK_EXT_conditional_rendering',             1, 'device->info.gen >= 8 || device->info.is_haswell'),
]
class VkVersion:
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 599b903f25..108da51a59 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -2032,6 +2032,8 @@ struct anv_cmd_state {
*/
bool hiz_enabled;
+   bool                                         conditional_render_enabled;
+
    /**
     * Array length is anv_cmd_state::pass::attachment_count. Array content is
     * valid only when recording a render pass instance.
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index f07a6aa7c9..87abc443b6 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -479,8 +479,9 @@ transition_depth_buffer(struct anv_cmd_buffer *cmd_buffer,
                        0, 0, 1, hiz_op);
}
-#define MI_PREDICATE_SRC0    0x2400
-#define MI_PREDICATE_SRC1    0x2408
+#define MI_PREDICATE_SRC0    0x2400
+#define MI_PREDICATE_SRC1    0x2408
+#define MI_PREDICATE_RESULT  0x2418
static void
set_image_compressed_bit(struct anv_cmd_buffer *cmd_buffer,
@@ -545,6 +546,14 @@ mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2)
#define CS_GPR(n) (0x2600 + (n) * 8)
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+static void
+restore_conditional_render_predicate(struct anv_cmd_buffer *cmd_buffer)
+{
+   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));
+}
+#endif
Does this work? Is it sufficient to just set MI_PREDICATE_RESULT or do we
actually need to use an MI_PREDICATE? I genuinely don't know and this
strikes me as odd.
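
If a plain LRR turns out not to be enough, a fallback (untested sketch,
reusing the emit_* helpers from this patch) would be to reload the
saved condition into MI_PREDICATE_SRC0 and re-run MI_PREDICATE,
mirroring what CmdBeginConditionalRenderingEXT does below:

   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_SRC0, CS_GPR(MI_ALU_REG15));
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0);
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1, 0);
   emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);

   /* The saved condition is 1 when rendering should proceed, so we
    * want (SRC0 != 0), i.e. the inverse of SRCS_EQUAL against zero.
    */
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }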
+ break;
+ }
+ default:
+ anv_debug_ignored_stype(s->sType);
+ break;
+ }
+ }
+ }
+#endif
+
return result;
}
@@ -1501,6 +1536,20 @@ genX(CmdExecuteCommands)(
assert(secondary->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
assert(!anv_batch_has_error(&secondary->batch));
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+      if (secondary->state.conditional_render_enabled) {
+         /* The secondary buffer is constructed as if it will be executed
+          * with conditional rendering, so we must satisfy this dependency
+          * regardless of whether conditional rendering is enabled in the
+          * primary.
+          */
+         if (!primary->state.conditional_render_enabled) {
+            emit_lri(&primary->batch, CS_GPR(MI_ALU_REG15), 1);
+            emit_lri(&primary->batch, CS_GPR(MI_ALU_REG15) + 4, 0);
+            emit_lrr(&primary->batch, MI_PREDICATE_RESULT, CS_GPR(MI_ALU_REG15));
+         }
+      }
+#endif
+
       if (secondary->usage_flags &
           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
/* If we're continuing a render pass from the primary, we need to
@@ -2761,6 +2810,7 @@ void genX(CmdDraw)(
instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = SEQUENTIAL;
prim.PrimitiveTopologyType = pipeline->topology;
prim.VertexCountPerInstance = vertexCount;
@@ -2800,6 +2850,7 @@ void genX(CmdDrawIndexed)(
instanceCount *= anv_subpass_view_count(cmd_buffer->state.subpass);
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
+      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = RANDOM;
prim.PrimitiveTopologyType = pipeline->topology;
prim.VertexCountPerInstance = indexCount;
@@ -2935,6 +2986,7 @@ void genX(CmdDrawIndirect)(
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.IndirectParameterEnable = true;
+      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = SEQUENTIAL;
prim.PrimitiveTopologyType = pipeline->topology;
}
@@ -2974,6 +3026,7 @@ void genX(CmdDrawIndexedIndirect)(
anv_batch_emit(&cmd_buffer->batch, GENX(3DPRIMITIVE), prim) {
prim.IndirectParameterEnable = true;
+      prim.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
prim.VertexAccessType = RANDOM;
prim.PrimitiveTopologyType = pipeline->topology;
}
@@ -3024,6 +3077,42 @@ emit_draw_count_predicate(struct anv_cmd_buffer *cmd_buffer,
}
}
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+static void
+emit_draw_count_predicate_with_conditional_render(
+ struct anv_cmd_buffer *cmd_buffer,
+ struct anv_address count_address,
+ uint32_t draw_index)
+{
+ const int draw_index_reg = MI_ALU_REG0;
+ const int draw_count_reg = MI_ALU_REG14;
+ const int condition_reg = MI_ALU_REG15;
+ const int tmp_result_reg = MI_ALU_REG1;
+
+ emit_lri(&cmd_buffer->batch, CS_GPR(draw_index_reg), draw_index);
+ emit_lri(&cmd_buffer->batch, CS_GPR(draw_index_reg) + 4, 0);
+
+ uint32_t *dw;
+   /* Compute (draw_index < draw_count).
+    * We do this by subtracting and storing the carry bit.
+    */
+ dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
+ dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, draw_index_reg);
+ dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, draw_count_reg);
+ dw[3] = mi_alu(MI_ALU_SUB, 0, 0);
+ dw[4] = mi_alu(MI_ALU_STORE, tmp_result_reg, MI_ALU_CF);
+
+ /* & condition */
+ dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
+ dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, tmp_result_reg);
+ dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, condition_reg);
+ dw[3] = mi_alu(MI_ALU_AND, 0, 0);
+ dw[4] = mi_alu(MI_ALU_STORE, tmp_result_reg, MI_ALU_ACCU);
+
+   emit_lrr(&cmd_buffer->batch, MI_PREDICATE_RESULT, CS_GPR(tmp_result_reg));
Again, is this sufficient? Maybe I'm missing something.
+}
+#endif
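
As an aside, the subtract/carry idiom above is the usual unsigned
compare; in C terms the two MI_MATH packets compute roughly the
following (my reading of the docs, not verified against hardware):

   /* First packet: unsigned compare via subtraction. CF is the borrow
    * out of (draw_index - draw_count), i.e. it is set exactly when
    * draw_index < draw_count.
    */
   bool in_range = draw_index < draw_count;   /* STORE tmp_result, CF */

   /* Second packet: AND with the saved condition from
    * vkCmdBeginConditionalRenderingEXT, so the draw runs only when it
    * is within draw_count AND the conditional-render predicate passed.
    */
   bool predicate = in_range && condition;    /* STORE tmp_result, ACCU */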
+
void genX(CmdDrawIndirectCountKHR)(
VkCommandBuffer commandBuffer,
VkBuffer _buffer,
@@ -3063,7 +3152,15 @@ void genX(CmdDrawIndirectCountKHR)(
for (uint32_t i = 0; i < maxDrawCount; i++) {
struct anv_address draw = anv_address_add(buffer->address, offset);
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+ if (cmd_state->conditional_render_enabled) {
+         emit_draw_count_predicate_with_conditional_render(
+            cmd_buffer, count_address, i);
+ } else {
+ emit_draw_count_predicate(cmd_buffer, count_address, i);
+ }
+#else
emit_draw_count_predicate(cmd_buffer, count_address, i);
+#endif
if (vs_prog_data->uses_firstvertex ||
vs_prog_data->uses_baseinstance)
@@ -3082,6 +3179,12 @@ void genX(CmdDrawIndirectCountKHR)(
offset += stride;
}
+
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+ if (cmd_state->conditional_render_enabled) {
+ restore_conditional_render_predicate(cmd_buffer);
+ }
+#endif
}
void genX(CmdDrawIndexedIndirectCountKHR)(
@@ -3123,7 +3226,15 @@ void genX(CmdDrawIndexedIndirectCountKHR)(
for (uint32_t i = 0; i < maxDrawCount; i++) {
struct anv_address draw = anv_address_add(buffer->address, offset);
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+ if (cmd_state->conditional_render_enabled) {
+         emit_draw_count_predicate_with_conditional_render(
+            cmd_buffer, count_address, i);
+ } else {
+ emit_draw_count_predicate(cmd_buffer, count_address, i);
+ }
+#else
emit_draw_count_predicate(cmd_buffer, count_address, i);
+#endif
/* TODO: We need to stomp base vertex to 0 somehow */
if (vs_prog_data->uses_firstvertex ||
@@ -3143,6 +3254,12 @@ void genX(CmdDrawIndexedIndirectCountKHR)(
offset += stride;
}
+
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+ if (cmd_state->conditional_render_enabled) {
+ restore_conditional_render_predicate(cmd_buffer);
+ }
+#endif
}
static VkResult
@@ -3351,6 +3468,7 @@ void genX(CmdDispatchBase)(
genX(cmd_buffer_flush_compute_state)(cmd_buffer);
anv_batch_emit(&cmd_buffer->batch, GENX(GPGPU_WALKER), ggw) {
+      ggw.PredicateEnable = cmd_buffer->state.conditional_render_enabled;
ggw.SIMDSize = prog_data->simd_size / 16;
ggw.ThreadDepthCounterMaximum = 0;
ggw.ThreadHeightCounterMaximum = 0;
@@ -3448,7 +3566,8 @@ void genX(CmdDispatchIndirect)(
anv_batch_emit(batch, GENX(GPGPU_WALKER), ggw) {
ggw.IndirectParameterEnable = true;
-      ggw.PredicateEnable = GEN_GEN <= 7;
+      ggw.PredicateEnable = GEN_GEN <= 7 ||
+                            cmd_buffer->state.conditional_render_enabled;
ggw.SIMDSize = prog_data->simd_size / 16;
ggw.ThreadDepthCounterMaximum = 0;
ggw.ThreadHeightCounterMaximum = 0;
@@ -4158,3 +4277,70 @@ void genX(CmdEndRenderPass2KHR)(
{
genX(CmdEndRenderPass)(commandBuffer);
}
+
+#if GEN_GEN >= 8 || GEN_IS_HASWELL
+void genX(CmdBeginConditionalRenderingEXT)(
+ VkCommandBuffer commandBuffer,
+    const VkConditionalRenderingBeginInfoEXT*   pConditionalRenderingBegin)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+   ANV_FROM_HANDLE(anv_buffer, buffer, pConditionalRenderingBegin->buffer);
+ struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+   struct anv_address value_address =
+      anv_address_add(buffer->address, pConditionalRenderingBegin->offset);
+
+ const bool inverted = pConditionalRenderingBegin->flags &
+ VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT;
+
+ cmd_state->conditional_render_enabled = true;
+
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+    * command when loading the values into the predicate source registers.
+    */
+ anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
+ pc.PipeControlFlushEnable = true;
+ }
+
+ /* Section 19.4 of the Vulkan 1.1.85 spec says:
+ *
+ * If the value of the predicate in buffer memory changes
+ * while conditional rendering is active, the rendering commands
+ * may be discarded in an implementation-dependent way.
+ * Some implementations may latch the value of the predicate
+ * upon beginning conditional rendering while others
+ * may read it before every rendering command.
+ *
+    * So it's perfectly fine to read the value from the buffer only once.
+ */
+
+ emit_lrm(&cmd_buffer->batch, MI_PREDICATE_SRC0, value_address);
+ /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
+ emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC0 + 4, 0);
+ emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1, 0);
+ emit_lri(&cmd_buffer->batch, MI_PREDICATE_SRC1 + 4, 0);
+
+ anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
+ mip.LoadOperation = inverted ? LOAD_LOAD : LOAD_LOADINV;
+ mip.CombineOperation = COMBINE_SET;
+ mip.CompareOperation = COMPARE_SRCS_EQUAL;
+ }
+
+   /* Calculate the predicate result once and store it in MI_ALU_REG15 so
+    * that it does not have to be recalculated when interacting with
+    * VK_KHR_draw_indirect_count, which also uses predicates. This is also
+    * the only way to support conditional rendering of secondary buffers,
+    * since they are recorded before we know whether conditional rendering
+    * will be enabled.
+    */
+   emit_lrr(&cmd_buffer->batch, CS_GPR(MI_ALU_REG15), MI_PREDICATE_RESULT);
+}
+
+void genX(CmdEndConditionalRenderingEXT)(
+ VkCommandBuffer commandBuffer)
+{
+ ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
+ struct anv_cmd_state *cmd_state = &cmd_buffer->state;
+
+ cmd_state->conditional_render_enabled = false;
+}
+#endif
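
For anyone wanting to poke at this, the application-side usage it
implements looks roughly like the following (hypothetical handles; per
the spec, the condition is a 32-bit value read from buffer memory, and
rendering is discarded when it is zero):

   /* `cmd_buffer` and `condition_buffer` are placeholder handles. */
   VkConditionalRenderingBeginInfoEXT begin_info = {
      .sType = VK_STRUCTURE_TYPE_CONDITIONAL_RENDERING_BEGIN_INFO_EXT,
      .buffer = condition_buffer,  /* holds the 32-bit condition value */
      .offset = 0,                 /* where that value lives */
      .flags = 0,  /* or VK_CONDITIONAL_RENDERING_INVERTED_BIT_EXT */
   };

   vkCmdBeginConditionalRenderingEXT(cmd_buffer, &begin_info);
   vkCmdDraw(cmd_buffer, 3, 1, 0, 0);   /* skipped if the value is 0 */
   vkCmdEndConditionalRenderingEXT(cmd_buffer);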
--
2.18.0