The strategy is to keep n anv_query_pool_slot entries per query instead of one — one slot per enabled pipeline statistic. The "available" bit is only valid in the last of those slots.
Signed-off-by: Ilia Mirkin <imir...@alum.mit.edu> --- I think this is in a pretty good state now. I've tested both the direct and buffer paths with a hacked up cube application, and I'm seeing non-ridiculous values for the various counters, although I haven't 100% verified them for accuracy. This also implements the hsw/bdw workaround for dividing frag invocations by 4, copied from hsw_queryobj. I tested this on SKL and it seem to divide the values as expected. The cube patch I've been testing with is at http://paste.debian.net/899374/ You can flip between copying to a buffer and explicit retrieval by commenting out the relevant function calls. src/intel/vulkan/anv_device.c | 2 +- src/intel/vulkan/anv_private.h | 4 + src/intel/vulkan/anv_query.c | 99 ++++++++++---- src/intel/vulkan/genX_cmd_buffer.c | 260 ++++++++++++++++++++++++++++++++----- 4 files changed, 308 insertions(+), 57 deletions(-) diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index 99eb73c..7ad1970 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -427,7 +427,7 @@ void anv_GetPhysicalDeviceFeatures( .textureCompressionASTC_LDR = pdevice->info.gen >= 9, /* FINISHME CHV */ .textureCompressionBC = true, .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = false, + .pipelineStatisticsQuery = true, .fragmentStoresAndAtomics = true, .shaderTessellationAndGeometryPointSize = true, .shaderImageGatherExtended = false, diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index 2fc543d..7271609 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -1763,6 +1763,8 @@ struct anv_render_pass { struct anv_subpass subpasses[0]; }; +#define ANV_PIPELINE_STATISTICS_COUNT 11 + struct anv_query_pool_slot { uint64_t begin; uint64_t end; @@ -1772,6 +1774,8 @@ struct anv_query_pool_slot { struct anv_query_pool { VkQueryType type; uint32_t slots; + uint32_t pipeline_statistics; + uint32_t slot_stride; 
struct anv_bo bo; }; diff --git a/src/intel/vulkan/anv_query.c b/src/intel/vulkan/anv_query.c index 293257b..dc00859 100644 --- a/src/intel/vulkan/anv_query.c +++ b/src/intel/vulkan/anv_query.c @@ -38,8 +38,10 @@ VkResult anv_CreateQueryPool( ANV_FROM_HANDLE(anv_device, device, _device); struct anv_query_pool *pool; VkResult result; - uint32_t slot_size; - uint64_t size; + uint32_t slot_size = sizeof(struct anv_query_pool_slot); + uint32_t slot_stride = 1; + uint64_t size = pCreateInfo->queryCount * slot_size; + uint32_t pipeline_statistics = 0; assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); @@ -48,12 +50,16 @@ VkResult anv_CreateQueryPool( case VK_QUERY_TYPE_TIMESTAMP: break; case VK_QUERY_TYPE_PIPELINE_STATISTICS: - return VK_ERROR_INCOMPATIBLE_DRIVER; + pipeline_statistics = pCreateInfo->pipelineStatistics & + ((1 << ANV_PIPELINE_STATISTICS_COUNT) - 1); + slot_stride = _mesa_bitcount(pipeline_statistics); + size *= slot_stride; + break; default: assert(!"Invalid query type"); + return VK_ERROR_INCOMPATIBLE_DRIVER; } - slot_size = sizeof(struct anv_query_pool_slot); pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8, VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); if (pool == NULL) @@ -61,8 +67,9 @@ VkResult anv_CreateQueryPool( pool->type = pCreateInfo->queryType; pool->slots = pCreateInfo->queryCount; + pool->pipeline_statistics = pipeline_statistics; + pool->slot_stride = slot_stride; - size = pCreateInfo->queryCount * slot_size; result = anv_bo_init_new(&pool->bo, device, size); if (result != VK_SUCCESS) goto fail; @@ -95,6 +102,27 @@ void anv_DestroyQueryPool( vk_free2(&device->alloc, pAllocator, pool); } +static void * +store_query_result(void *pData, VkQueryResultFlags flags, + uint64_t result, uint64_t available) +{ + if (flags & VK_QUERY_RESULT_64_BIT) { + uint64_t *dst = pData; + *dst++ = result; + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + *dst++ = available; + return dst; + } else { + uint32_t *dst = pData; + if 
(result > UINT32_MAX) + result = UINT32_MAX; + *dst++ = result; + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) + *dst++ = available; + return dst; + } +} + VkResult anv_GetQueryPoolResults( VkDevice _device, VkQueryPool queryPool, @@ -112,6 +140,7 @@ VkResult anv_GetQueryPoolResults( int ret; assert(pool->type == VK_QUERY_TYPE_OCCLUSION || + pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || pool->type == VK_QUERY_TYPE_TIMESTAMP); if (pData == NULL) @@ -129,14 +158,42 @@ VkResult anv_GetQueryPoolResults( void *data_end = pData + dataSize; struct anv_query_pool_slot *slot = pool->bo.map; - for (uint32_t i = 0; i < queryCount; i++) { + for (uint32_t i = 0; i < queryCount && pData < data_end; + i++, pData += stride) { + if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) { + VkQueryResultFlags f = flags & ~VK_QUERY_RESULT_WITH_AVAILABILITY_BIT; + void *pos = pData; + uint32_t pipeline_statistics = pool->pipeline_statistics; + struct anv_query_pool_slot *base = + &slot[(firstQuery + i) * pool->slot_stride]; + + while (pipeline_statistics) { + uint32_t stat = u_bit_scan(&pipeline_statistics); + uint64_t result = base->end - base->begin; + + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if ((device->info.gen == 8 || device->info.is_haswell) && + (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) + result >>= 2; + + pos = store_query_result(pos, f, result, 0); + base++; + } + if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { + base--; + if (flags & VK_QUERY_RESULT_64_BIT) + *(uint64_t *)pos = base->available; + else + *(uint32_t *)pos = base->available; + } + continue; + } + switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: { result = slot[firstQuery + i].end - slot[firstQuery + i].begin; break; } - case VK_QUERY_TYPE_PIPELINE_STATISTICS: - unreachable("pipeline stats not supported"); case VK_QUERY_TYPE_TIMESTAMP: { result = slot[firstQuery + i].begin; break; @@ -145,23 +202,7 @@ VkResult anv_GetQueryPoolResults( unreachable("invalid 
pool type"); } - if (flags & VK_QUERY_RESULT_64_BIT) { - uint64_t *dst = pData; - dst[0] = result; - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) - dst[1] = slot[firstQuery + i].available; - } else { - uint32_t *dst = pData; - if (result > UINT32_MAX) - result = UINT32_MAX; - dst[0] = result; - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) - dst[1] = slot[firstQuery + i].available; - } - - pData += stride; - if (pData >= data_end) - break; + store_query_result(pData, flags, result, slot[firstQuery + i].available); } return VK_SUCCESS; @@ -183,6 +224,14 @@ void anv_CmdResetQueryPool( slot[firstQuery + i].available = 0; break; } + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + struct anv_query_pool_slot *slot = pool->bo.map; + + slot = &slot[(firstQuery + i) * pool->slot_stride]; + for (uint32_t j = 0; j < pool->slot_stride; j++) + slot[j].available = 0; + break; + } default: assert(!"Invalid query type"); } diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index a965cd6..1369ac2 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -2272,6 +2272,50 @@ emit_query_availability(struct anv_cmd_buffer *cmd_buffer, } } +#define IA_VERTICES_COUNT 0x2310 +#define IA_PRIMITIVES_COUNT 0x2318 +#define VS_INVOCATION_COUNT 0x2320 +#define HS_INVOCATION_COUNT 0x2300 +#define DS_INVOCATION_COUNT 0x2308 +#define GS_INVOCATION_COUNT 0x2328 +#define GS_PRIMITIVES_COUNT 0x2330 +#define CL_INVOCATION_COUNT 0x2338 +#define CL_PRIMITIVES_COUNT 0x2340 +#define PS_INVOCATION_COUNT 0x2348 +#define CS_INVOCATION_COUNT 0x2290 + +static const uint32_t PIPELINE_STAT_TO_REG[] = { + IA_VERTICES_COUNT, + IA_PRIMITIVES_COUNT, + VS_INVOCATION_COUNT, + GS_INVOCATION_COUNT, + GS_PRIMITIVES_COUNT, + CL_INVOCATION_COUNT, + CL_PRIMITIVES_COUNT, + PS_INVOCATION_COUNT, + HS_INVOCATION_COUNT, + DS_INVOCATION_COUNT, + CS_INVOCATION_COUNT +}; + +static void +emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat, + 
struct anv_bo *bo, uint32_t offset) { + STATIC_ASSERT(ARRAY_SIZE(PIPELINE_STAT_TO_REG) == + ANV_PIPELINE_STATISTICS_COUNT); + + uint32_t reg = PIPELINE_STAT_TO_REG[stat]; + + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) { + lrm.RegisterAddress = reg, + lrm.MemoryAddress = (struct anv_address) { bo, offset }; + } + anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_REGISTER_MEM), lrm) { + lrm.RegisterAddress = reg + 4, + lrm.MemoryAddress = (struct anv_address) { bo, offset + 4 }; + } +} + void genX(CmdBeginQuery)( VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -2301,7 +2345,25 @@ void genX(CmdBeginQuery)( query * sizeof(struct anv_query_pool_slot)); break; - case VK_QUERY_TYPE_PIPELINE_STATISTICS: + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint32_t pipeline_statistics = pool->pipeline_statistics; + uint32_t slot_offset = query * pool->slot_stride * + sizeof(struct anv_query_pool_slot); + + /* TODO: This might only be necessary for certain stats */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + while (pipeline_statistics) { + uint32_t stat = u_bit_scan(&pipeline_statistics); + + emit_pipeline_stat(cmd_buffer, stat, &pool->bo, slot_offset); + slot_offset += sizeof(struct anv_query_pool_slot); + } + break; + } default: unreachable(""); } @@ -2314,17 +2376,35 @@ void genX(CmdEndQuery)( { ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); + uint32_t slot_offset = query * pool->slot_stride * + sizeof(struct anv_query_pool_slot); switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: - emit_ps_depth_count(cmd_buffer, &pool->bo, - query * sizeof(struct anv_query_pool_slot) + 8); + emit_ps_depth_count(cmd_buffer, &pool->bo, slot_offset + 8); + emit_query_availability(cmd_buffer, &pool->bo, slot_offset + 16); + break; + + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint32_t 
pipeline_statistics = pool->pipeline_statistics; + /* TODO: This might only be necessary for certain stats */ + anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { + pc.CommandStreamerStallEnable = true; + pc.StallAtPixelScoreboard = true; + } + + while (pipeline_statistics) { + uint32_t stat = u_bit_scan(&pipeline_statistics); - emit_query_availability(cmd_buffer, &pool->bo, - query * sizeof(struct anv_query_pool_slot) + 16); + emit_pipeline_stat(cmd_buffer, stat, &pool->bo, slot_offset + 8); + slot_offset += sizeof(struct anv_query_pool_slot); + } + + slot_offset -= sizeof(struct anv_query_pool_slot); + emit_query_availability(cmd_buffer, &pool->bo, slot_offset + 16); break; + } - case VK_QUERY_TYPE_PIPELINE_STATISTICS: default: unreachable(""); } @@ -2421,6 +2501,31 @@ emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg, } static void +emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm) +{ + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { + lri.RegisterOffset = reg; + lri.DataDWord = imm; + } +} + +static void +emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm) +{ + emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm); + emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32)); +} + +static void +emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst) +{ + anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { + lrr.SourceRegisterAddress = src; + lrr.DestinationRegisterAddress = dst; + } +} + +static uint32_t store_query_result(struct anv_batch *batch, uint32_t reg, struct anv_bo *bo, uint32_t offset, VkQueryResultFlags flags) { @@ -2434,9 +2539,88 @@ store_query_result(struct anv_batch *batch, uint32_t reg, srm.RegisterAddress = reg + 4; srm.MemoryAddress = (struct anv_address) { bo, offset + 4 }; } + + return offset + 8; + } + + return offset + 4; +} + +/* + * GPR0 = GPR0 & ((1ull << n) - 1); + */ +static void +keep_gpr0_lower_n_bits(struct anv_batch *batch, 
uint32_t n) +{ + assert(n < 64); + emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1); + + uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH)); + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0); + dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R1); + dw[3] = alu(OPCODE_AND, 0, 0); + dw[4] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU); +} + +/* + * GPR0 = GPR0 << 30; + */ +static void +shl_gpr0_by_30_bits(struct anv_batch *batch) +{ + /* First we mask 34 bits of GPR0 to prevent overflow */ + keep_gpr0_lower_n_bits(batch, 34); + + const uint32_t outer_count = 5; + const uint32_t inner_count = 6; + STATIC_ASSERT(outer_count * inner_count == 30); + const uint32_t cmd_len = 1 + inner_count * 4; + + /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of + * 30 left shifts. + */ + for (int o = 0; o < outer_count; o++) { + /* Submit one MI_MATH to shift left by 6 bits */ + uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH)); + dw++; + for (int i = 0; i < inner_count; i++, dw += 4) { + dw[0] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R0); + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); + dw[2] = alu(OPCODE_ADD, 0, 0); + dw[3] = alu(OPCODE_STORE, OPERAND_R0, OPERAND_ACCU); + } } } +/* + * GPR0 = GPR0 >> 2; + * + * Note that the upper 30 bits of GPR are lost! + */ +static void +shr_gpr0_by_2_bits(struct anv_batch *batch) +{ + shl_gpr0_by_30_bits(batch); + emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0)); + emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0); +} + +static void +compute_query_result(struct anv_batch *batch, struct anv_bo *bo, + uint32_t dst_reg, uint32_t offset) +{ + emit_load_alu_reg_u64(batch, CS_GPR(0), bo, offset); + emit_load_alu_reg_u64(batch, CS_GPR(1), bo, offset + 8); + + /* FIXME: We need to clamp the result for 32 bit. 
*/ + + uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH)); + dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1); + dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); + dw[3] = alu(OPCODE_SUB, 0, 0); + dw[4] = alu(OPCODE_STORE, dst_reg, OPERAND_ACCU); +} + void genX(CmdCopyQueryPoolResults)( VkCommandBuffer commandBuffer, VkQueryPool queryPool, @@ -2459,50 +2643,64 @@ void genX(CmdCopyQueryPoolResults)( } } - dst_offset = buffer->offset + destOffset; for (uint32_t i = 0; i < queryCount; i++) { - + dst_offset = buffer->offset + destOffset + destStride * i; slot_offset = (firstQuery + i) * sizeof(struct anv_query_pool_slot); switch (pool->type) { case VK_QUERY_TYPE_OCCLUSION: - emit_load_alu_reg_u64(&cmd_buffer->batch, - CS_GPR(0), &pool->bo, slot_offset); - emit_load_alu_reg_u64(&cmd_buffer->batch, - CS_GPR(1), &pool->bo, slot_offset + 8); - - /* FIXME: We need to clamp the result for 32 bit. */ - - uint32_t *dw = anv_batch_emitn(&cmd_buffer->batch, 5, GENX(MI_MATH)); - dw[1] = alu(OPCODE_LOAD, OPERAND_SRCA, OPERAND_R1); - dw[2] = alu(OPCODE_LOAD, OPERAND_SRCB, OPERAND_R0); - dw[3] = alu(OPCODE_SUB, 0, 0); - dw[4] = alu(OPCODE_STORE, OPERAND_R2, OPERAND_ACCU); + compute_query_result(&cmd_buffer->batch, &pool->bo, OPERAND_R2, + slot_offset); + dst_offset = store_query_result( + &cmd_buffer->batch, + CS_GPR(2), buffer->bo, dst_offset, flags); break; case VK_QUERY_TYPE_TIMESTAMP: emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(2), &pool->bo, slot_offset); + dst_offset = store_query_result( + &cmd_buffer->batch, + CS_GPR(2), buffer->bo, dst_offset, flags); break; + case VK_QUERY_TYPE_PIPELINE_STATISTICS: { + uint32_t pipeline_statistics = pool->pipeline_statistics; + + slot_offset *= pool->slot_stride; + while (pipeline_statistics) { + uint32_t stat = u_bit_scan(&pipeline_statistics); + + compute_query_result(&cmd_buffer->batch, &pool->bo, OPERAND_R0, + slot_offset); + + /* WaDividePSInvocationCountBy4:HSW,BDW */ + if ((cmd_buffer->device->info.gen == 8 || + 
cmd_buffer->device->info.is_haswell) && + (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) { + shr_gpr0_by_2_bits(&cmd_buffer->batch); + } + dst_offset = store_query_result( + &cmd_buffer->batch, + CS_GPR(0), buffer->bo, dst_offset, flags); + slot_offset += sizeof(struct anv_query_pool_slot); + } + + /* Get the slot offset to where it's supposed to be for the + * availability bit. + */ + slot_offset -= sizeof(struct anv_query_pool_slot); + break; + } default: unreachable("unhandled query type"); } - store_query_result(&cmd_buffer->batch, - CS_GPR(2), buffer->bo, dst_offset, flags); - if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), &pool->bo, slot_offset + 16); - if (flags & VK_QUERY_RESULT_64_BIT) - store_query_result(&cmd_buffer->batch, - CS_GPR(0), buffer->bo, dst_offset + 8, flags); - else - store_query_result(&cmd_buffer->batch, - CS_GPR(0), buffer->bo, dst_offset + 4, flags); + store_query_result(&cmd_buffer->batch, + CS_GPR(0), buffer->bo, dst_offset, flags); } - - dst_offset += destStride; } } -- 2.7.3 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev