On Wed, Nov 23, 2016 at 1:52 PM, Ilia Mirkin <imir...@alum.mit.edu> wrote:
> The strategy is to just keep n anv_query_pool_slot entries per query
> instead of one. The available bit is only valid in the last one.
>
> Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu>
> ---
>
> This was lightly tested with this patch to the vulkan loader cube.c:
>
> http://pastie.org/private/fe92867hwhxu7fzjw9pvzq
>
> Flipping it to TYPE_OCCLUSION makes it show the same number of frag
> invocations.
>
> Without the giant stall, I didn't get any clipping primitives or frag
> invocations. I'm sure some of the bits are wholly unnecessary, but as I don't
> know anything about the architecture, I'm not inclined to guess which ones are
> and which ones aren't.
>
> No testing was done on the vkCmdCopyQueryPoolResults path.
>
>  src/intel/vulkan/anv_device.c      |   2 +-
>  src/intel/vulkan/anv_private.h     |   3 +
>  src/intel/vulkan/anv_query.c       |  87 +++++++++++++-----
>  src/intel/vulkan/genX_cmd_buffer.c | 184 +++++++++++++++++++++++++++++++------
>  4 files changed, 222 insertions(+), 54 deletions(-)
>
> diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
> index 7cce92c..fc8482c 100644
> --- a/src/intel/vulkan/anv_device.c
> +++ b/src/intel/vulkan/anv_device.c
> @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures(
>        .textureCompressionASTC_LDR               = pdevice->info.gen >= 9, /* FINISHME CHV */
>        .textureCompressionBC                     = true,
>        .occlusionQueryPrecise                    = true,
> -      .pipelineStatisticsQuery                  = false,
> +      .pipelineStatisticsQuery                  = true,
>        .fragmentStoresAndAtomics                 = true,
>        .shaderTessellationAndGeometryPointSize   = true,
>        .shaderImageGatherExtended                = false,
> diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
> index 2fc543d..f033a8c 100644
> --- a/src/intel/vulkan/anv_private.h
> +++ b/src/intel/vulkan/anv_private.h
> @@ -1763,6 +1763,8 @@ struct anv_render_pass {
>     struct anv_subpass                           subpasses[0];
>  };
>
> +#define ANV_PIPELINE_STATISTICS_COUNT 11
> +
>  struct anv_query_pool_slot {
>     uint64_t begin;
>     uint64_t end;
> @@ -1772,6 +1774,7 @@ struct anv_query_pool_slot {
>  struct anv_query_pool {
>     VkQueryType                                  type;
>     uint32_t                                     slots;
> +   uint32_t                                     pipeline_statistics;
>     struct anv_bo                                bo;
>  };
>
> diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c
> index 293257b..46e9d9a 100644
> --- a/src/intel/vulkan/anv_query.c
> +++ b/src/intel/vulkan/anv_query.c
> @@ -38,8 +38,8 @@ VkResult anv_CreateQueryPool(
>     ANV_FROM_HANDLE(anv_device, device, _device);
>     struct anv_query_pool *pool;
>     VkResult result;
> -   uint32_t slot_size;
> -   uint64_t size;
> +   uint32_t slot_size = sizeof(struct anv_query_pool_slot);
> +   uint64_t size = pCreateInfo->queryCount * slot_size;
>
>     assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
>
> @@ -48,12 +48,13 @@ VkResult anv_CreateQueryPool(
>     case VK_QUERY_TYPE_TIMESTAMP:
>        break;
>     case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> -      return VK_ERROR_INCOMPATIBLE_DRIVER;
> +      size *= _mesa_bitcount(pCreateInfo->pipelineStatistics);
> +      break;
>     default:
>        assert(!"Invalid query type");
> +      return VK_ERROR_INCOMPATIBLE_DRIVER;
>     }
>
> -   slot_size = sizeof(struct anv_query_pool_slot);
>     pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
>                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
>     if (pool == NULL)
> @@ -61,8 +62,8 @@ VkResult anv_CreateQueryPool(
>     pool->type = pCreateInfo->queryType;
>     pool->slots = pCreateInfo->queryCount;
> +   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
>
> -   size = pCreateInfo->queryCount * slot_size;
>     result = anv_bo_init_new(&pool->bo, device, size);
>     if (result != VK_SUCCESS)
>        goto fail;
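
To spell out the layout the commit message describes: with N enabled
statistics, each query owns N consecutive anv_query_pool_slot entries, and
only the last of those carries a meaningful available field. Roughly
(illustrative helper only, not something the patch adds):

   /* Illustration only -- stat_slot_index() is not part of the patch.
    * j counts the *enabled* statistics, in bit order of
    * pCreateInfo->pipelineStatistics.
    */
   static uint32_t
   stat_slot_index(uint32_t query, uint32_t enabled_count, uint32_t j)
   {
      return query * enabled_count + j;
   }

So with, say, 3 statistics enabled, query 2 owns slots 6, 7 and 8, and its
availability lives in slot 8.
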
> @@ -95,6 +96,27 @@ void anv_DestroyQueryPool(
>     vk_free2(&device->alloc, pAllocator, pool);
>  }
>
> +static void *
> +store_query_result(void *pData, VkQueryResultFlags flags,
> +                   uint64_t result, uint64_t available)
> +{
> +   if (flags & VK_QUERY_RESULT_64_BIT) {
> +      uint64_t *dst = pData;
> +      *dst++ = result;
> +      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> +         *dst++ = available;
> +      return dst;
> +   } else {
> +      uint32_t *dst = pData;
> +      if (result > UINT32_MAX)
> +         result = UINT32_MAX;
> +      *dst++ = result;
> +      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> +         *dst++ = available;
> +      return dst;
> +   }
> +}
> +
>  VkResult anv_GetQueryPoolResults(
>     VkDevice                                    _device,
>     VkQueryPool                                 queryPool,
> @@ -112,6 +134,7 @@ VkResult anv_GetQueryPoolResults(
>     int ret;
>
>     assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
> +          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
>            pool->type == VK_QUERY_TYPE_TIMESTAMP);
>
>     if (pData == NULL)
> @@ -129,14 +152,35 @@ VkResult anv_GetQueryPoolResults(
>     void *data_end = pData + dataSize;
>     struct anv_query_pool_slot *slot = pool->bo.map;
>
> -   for (uint32_t i = 0; i < queryCount; i++) {
> +   for (uint32_t i = 0; i < queryCount && pData < data_end;
> +        i++, pData += stride) {
> +      if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
> +         VkQueryResultFlags f = flags & ~VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
> +         void *pos = pData;
> +         uint32_t queries = _mesa_bitcount(pool->pipeline_statistics);
> +         struct anv_query_pool_slot *base = &slot[(firstQuery + i) * queries];
> +
> +         for (uint32_t stat = 0; stat < ANV_PIPELINE_STATISTICS_COUNT; stat++) {
> +            if (pool->pipeline_statistics & (1 << stat)) {
> +               pos = store_query_result(pos, f, base->end - base->begin, 0);
> +               base++;
> +            }
> +         }
> +         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
> +            base--;
> +            if (flags & VK_QUERY_RESULT_64_BIT)
> +               *(uint64_t *)pos = base->available;
> +            else
> +               *(uint32_t *)pos = base->available;
> +         }
> +         continue;
> +      }
> +
>        switch (pool->type) {
>        case VK_QUERY_TYPE_OCCLUSION: {
>           result = slot[firstQuery + i].end - slot[firstQuery + i].begin;
>           break;
>        }
> -      case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> -         unreachable("pipeline stats not supported");
>        case VK_QUERY_TYPE_TIMESTAMP: {
>           result = slot[firstQuery + i].begin;
>           break;
> @@ -145,23 +189,7 @@ VkResult anv_GetQueryPoolResults(
>           unreachable("invalid pool type");
>        }
>
> -      if (flags & VK_QUERY_RESULT_64_BIT) {
> -         uint64_t *dst = pData;
> -         dst[0] = result;
> -         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> -            dst[1] = slot[firstQuery + i].available;
> -      } else {
> -         uint32_t *dst = pData;
> -         if (result > UINT32_MAX)
> -            result = UINT32_MAX;
> -         dst[0] = result;
> -         if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
> -            dst[1] = slot[firstQuery + i].available;
> -      }
> -
> -      pData += stride;
> -      if (pData >= data_end)
> -         break;
> +      store_query_result(pData, flags, result, slot[firstQuery + i].available);
>     }
>
>     return VK_SUCCESS;
> @@ -183,6 +211,15 @@ void anv_CmdResetQueryPool(
>           slot[firstQuery + i].available = 0;
>           break;
>        }
> +      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
> +         struct anv_query_pool_slot *slot = pool->bo.map;
> +         uint32_t queries = _mesa_bitcount(pool->pipeline_statistics);
> +
> +         slot = &slot[(firstQuery + i) * queries];
> +         for (uint32_t j = 0; j < queries; j++)
> +            slot[j].available = 0;
> +         break;
> +      }
>        default:
>           assert(!"Invalid query type");
>        }
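
For what it's worth, the packing GetQueryPoolResults produces above is meant
to follow the usual layout: one counter per enabled statistic, in bit order,
then the availability value once at the end if requested. From the app side
that looks roughly like this (illustrative fragment; assumes an
already-created device and a pool with two statistics enabled):

   /* Pool created with pipelineStatistics =
    *    VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT |
    *    VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT
    */
   uint64_t results[3]; /* 2 counters + availability */
   vkGetQueryPoolResults(device, pool, 0, 1,
                         sizeof(results), results, sizeof(results),
                         VK_QUERY_RESULT_64_BIT |
                         VK_QUERY_RESULT_WITH_AVAILABILITY_BIT);
   /* results[0] = IA vertices, results[1] = clipping invocations
    * (enabled statistics come back in bit order), results[2] = availability.
    */
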
> diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
> index a965cd6..34c079c 100644
> --- a/src/intel/vulkan/genX_cmd_buffer.c
> +++ b/src/intel/vulkan/genX_cmd_buffer.c
> @@ -2272,6 +2272,55 @@ emit_query_availability(struct anv_cmd_buffer *cmd_buffer,
>     }
>  }
>
> +#define IA_VERTICES_COUNT   0x2310
> +#define IA_PRIMITIVES_COUNT 0x2318
> +#define VS_INVOCATION_COUNT 0x2320
> +#define HS_INVOCATION_COUNT 0x2300
> +#define DS_INVOCATION_COUNT 0x2308
> +#define GS_INVOCATION_COUNT 0x2328
> +#define GS_PRIMITIVES_COUNT 0x2330
> +#define CL_INVOCATION_COUNT 0x2338
> +#define CL_PRIMITIVES_COUNT 0x2340
> +#define PS_INVOCATION_COUNT 0x2348
> +#define CS_INVOCATION_COUNT 0x2290
> +
> +static const uint32_t PIPELINE_STAT_TO_REG[] = {
> +   IA_VERTICES_COUNT,
> +   IA_PRIMITIVES_COUNT,
> +   VS_INVOCATION_COUNT,
> +   GS_INVOCATION_COUNT,
> +   GS_PRIMITIVES_COUNT,
> +   CL_INVOCATION_COUNT,
> +   CL_PRIMITIVES_COUNT,
> +   PS_INVOCATION_COUNT,
> +   HS_INVOCATION_COUNT,
> +   DS_INVOCATION_COUNT,
> +   CS_INVOCATION_COUNT
> +};
> +
> +static void
> +emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat,
> +                   struct anv_bo *bo, uint32_t offset) {
> +   STATIC_ASSERT(ARRAY_SIZE(PIPELINE_STAT_TO_REG) ==
> +                 ANV_PIPELINE_STATISTICS_COUNT);
> +
> +   uint32_t reg = PIPELINE_STAT_TO_REG[stat];
> +
> +   /* TODO: Implement WaDividePSInvocationCountBy4:HSW,BDW
> +    * NOTE: Experimentally, PS_INVOCATION_COUNT returns the same thing as
> +    * the occlusion query on SKL.
> +    */
> +
> +   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
> +      lrm.RegisterAddress = reg,
> +      lrm.MemoryAddress = (struct anv_address) { bo, offset };
> +   }
> +   anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) {
> +      lrm.RegisterAddress = reg + 4,
> +      lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
> +   }
> +}
> +
>  void genX(CmdBeginQuery)(
>     VkCommandBuffer                             commandBuffer,
>     VkQueryPool                                 queryPool,
> @@ -2301,7 +2350,31 @@ void genX(CmdBeginQuery)(
>                            query * sizeof(struct anv_query_pool_slot));
>        break;
>
> -   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> +   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
> +      uint32_t queries = _mesa_bitcount(pool->pipeline_statistics);
> +      uint32_t q = 0;
> +
> +      /* gen6_queryobj has this before emitting stats. Do we need it?
> +      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +         pc.PostSyncOperation = NoWrite;
> +         pc.RenderTargetCacheFlushEnable = true;
> +         pc.InstructionCacheInvalidateEnable = true;
> +         pc.DepthCacheFlushEnable = true;
> +         pc.VFCacheInvalidationEnable = true;
> +         pc.TextureCacheInvalidationEnable = true;
> +         pc.CommandStreamerStallEnable = true;
> +      }
> +      */

Urgh, sorry - That was meant to terminate at the top of the comment and do
the PIPE_CONTROL emit. Forgot to fold my latest bit in. Same for
CmdEndQuery.
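
Concretely, the block above was supposed to keep only the question as a
one-line comment, with the PIPE_CONTROL actually emitted, i.e. something
along these lines:

   /* gen6_queryobj has this before emitting stats. Do we need it? */
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.PostSyncOperation = NoWrite;
      pc.RenderTargetCacheFlushEnable = true;
      pc.InstructionCacheInvalidateEnable = true;
      pc.DepthCacheFlushEnable = true;
      pc.VFCacheInvalidationEnable = true;
      pc.TextureCacheInvalidationEnable = true;
      pc.CommandStreamerStallEnable = true;
   }
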
> +      for (uint32_t stat = 0; stat < ANV_PIPELINE_STATISTICS_COUNT; stat++) {
> +         if (pool->pipeline_statistics & (1 << stat)) {
> +            emit_pipeline_stat(cmd_buffer, stat, &pool->bo,
> +                               (query * queries + q) *
> +                               sizeof(struct anv_query_pool_slot));
> +            q++;
> +         }
> +      }
> +      break;
> +   }
>     default:
>        unreachable("");
>     }
> @@ -2324,7 +2397,37 @@ void genX(CmdEndQuery)(
>                            query * sizeof(struct anv_query_pool_slot) + 16);
>        break;
>
> -   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
> +   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
> +      uint32_t queries = _mesa_bitcount(pool->pipeline_statistics);
> +      uint32_t q = 0;
> +
> +      /* gen6_queryobj has this before emitting stats. Do we need it?
> +      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
> +         pc.PostSyncOperation = NoWrite;
> +         pc.RenderTargetCacheFlushEnable = true;
> +         pc.InstructionCacheInvalidateEnable = true;
> +         pc.DepthCacheFlushEnable = true;
> +         pc.VFCacheInvalidationEnable = true;
> +         pc.TextureCacheInvalidationEnable = true;
> +         pc.CommandStreamerStallEnable = true;
> +      }
> +      */
> +
> +      for (uint32_t stat = 0; stat < ANV_PIPELINE_STATISTICS_COUNT; stat++) {
> +         if (pool->pipeline_statistics & (1 << stat)) {
> +            emit_pipeline_stat(cmd_buffer, stat, &pool->bo,
> +                               (query * queries + q) *
> +                               sizeof(struct anv_query_pool_slot) + 8);
> +            q++;
> +         }
> +      }
> +
> +      emit_query_availability(cmd_buffer, &pool->bo,
> +                              (query * queries + q - 1) *
> +                              sizeof(struct anv_query_pool_slot) + 16);
> +      break;
> +   }
> +
>     default:
>        unreachable("");
>     }
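
To make the availability placement above concrete (illustration only;
assumes available is a uint64_t, i.e. a 24-byte slot):

   uint32_t queries = _mesa_bitcount(pool->pipeline_statistics); /* say 3 */
   uint32_t last_slot = query * queries + (queries - 1);  /* query 2 -> slot 8 */
   uint32_t avail_offset =
      last_slot * sizeof(struct anv_query_pool_slot) + 16; /* 8 * 24 + 16 = 208 */

which is the same offset the emit_query_availability() call computes once
the loop has advanced q to the number of enabled statistics.
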
> @@ -2420,7 +2523,7 @@ emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg,
>     }
>  }
>
> -static void
> +static uint32_t
>  store_query_result(struct anv_batch *batch, uint32_t reg,
>                     struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags)
>  {
> @@ -2434,7 +2537,27 @@ store_query_result(struct anv_batch *batch, uint32_t reg,
>           srm.RegisterAddress = reg + 4;
>           srm.MemoryAddress = (struct anv_address) { bo, offset + 4 };
>        }
> +
> +      return offset + 8;
>     }
> +
> +   return offset + 4;
> +}
> +
> +static void
> +compute_query_result(struct anv_batch *batch, struct anv_bo *bo,
> +                     uint32_t offset)
> +{
> +   emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset);
> +   emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8);
> +
> +   /* FIXME: We need to clamp the result for 32 bit. */
> +
> +   uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH));
> +   dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
> +   dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
> +   dw[3] = alu(OPCODE_SUB, 0, 0);
> +   dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
>  }
>
>  void genX(CmdCopyQueryPoolResults)(
> @@ -2459,50 +2582,55 @@ void genX(CmdCopyQueryPoolResults)(
>        }
>     }
>
> -   dst_offset = buffer->offset + destOffset;
>     for (uint32_t i = 0; i < queryCount; i++) {
> -
> +      dst_offset = buffer->offset + destOffset + destStride * i;
>        slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot);
>        switch (pool->type) {
>        case VK_QUERY_TYPE_OCCLUSION:
> -         emit_load_alu_reg_u64(&cmd_buffer->batch,
> -                               CS_GPR(0), &pool->bo, slot_offset);
> -         emit_load_alu_reg_u64(&cmd_buffer->batch,
> -                               CS_GPR(1), &pool->bo, slot_offset + 8);
> -
> -         /* FIXME: We need to clamp the result for 32 bit. */
> -
> -         uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH));
> -         dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1);
> -         dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0);
> -         dw[3] = alu(OPCODE_SUB, 0, 0);
> -         dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU);
> +         compute_query_result(&cmd_buffer->batch, &pool->bo, slot_offset);
> +         dst_offset = store_query_result(
> +            &cmd_buffer->batch,
> +            CS_GPR(2), buffer->bo, dst_offset, flags);
>           break;
>
>        case VK_QUERY_TYPE_TIMESTAMP:
>           emit_load_alu_reg_u64(&cmd_buffer->batch,
>                                 CS_GPR(2), &pool->bo, slot_offset);
> +         dst_offset = store_query_result(
> +            &cmd_buffer->batch,
> +            CS_GPR(2), buffer->bo, dst_offset, flags);
>           break;
>
> +      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
> +         uint32_t queries = _mesa_bitcount(pool->pipeline_statistics);
> +
> +         slot_offset *= queries;
> +         for (uint32_t slot = 0; slot < ANV_PIPELINE_STATISTICS_COUNT; slot++) {
> +            if (pool->pipeline_statistics & (1 << slot)) {
> +               compute_query_result(&cmd_buffer->batch, &pool->bo, slot_offset);
> +               dst_offset = store_query_result(
> +                  &cmd_buffer->batch,
> +                  CS_GPR(2), buffer->bo, dst_offset, flags);
> +               slot_offset += sizeof(struct anv_query_pool_slot);
> +            }
> +         }
> +
> +         /* Get the slot offset to where it's supposed to be for the
> +          * availability bit.
> +          */
> +         slot_offset -= sizeof(struct anv_query_pool_slot);
> +         break;
> +      }
>        default:
>           unreachable("unhandled query type");
>        }
>
> -      store_query_result(&cmd_buffer->batch,
> -                         CS_GPR(2), buffer->bo, dst_offset, flags);
> -
>        if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
>           emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0),
>                                 &pool->bo, slot_offset + 16);
> -         if (flags & VK_QUERY_RESULT_64_BIT)
> -            store_query_result(&cmd_buffer->batch,
> -                               CS_GPR(0), buffer->bo, dst_offset + 8, flags);
> -         else
> -            store_query_result(&cmd_buffer->batch,
> -                               CS_GPR(0), buffer->bo, dst_offset + 4, flags);
> +         store_query_result(&cmd_buffer->batch,
> +                            CS_GPR(0), buffer->bo, dst_offset, flags);
>        }
> -
> -      dst_offset += destStride;
>     }
>  }
>
> --
> 2.7.3
>
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev