break;
}
@@ -300,6 +340,10 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
if (q->state != NV50_QUERY_STATE_READY)
nv50_query_update(q);
+ if (q->type >= NV50_HW_PM_QUERY(0) && q->type <= NV50_HW_PM_QUERY_LAST) {
+ return nv50_hw_pm_query_result(nv50, q, wait, result);
+ }
+
if (q->state != NV50_QUERY_STATE_READY) {
if (!wait) {
/* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
@@ -476,6 +520,1018 @@ nva0_so_target_save_offset(struct pipe_context *pipe,
nv50_query_end(pipe, targ->pq);
}
+/* === HARDWARE GLOBAL PERFORMANCE COUNTERS for NV50 === */
+
+struct nv50_hw_pm_source_cfg
+{
+ const char *name;
+ uint64_t value;
+};
+
+struct nv50_hw_pm_signal_cfg
+{
+ const char *name;
+ const struct nv50_hw_pm_source_cfg src[8];
+};
+
+struct nv50_hw_pm_counter_cfg
+{
+ uint16_t logic_op;
+ const struct nv50_hw_pm_signal_cfg sig[4];
+};
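+
+/* The logic_op appears to be a 16-bit truth table over the (up to) four
+ * input signals of a counter slot: e.g. 0xaaaa selects signal 0 alone,
+ * and 0x8888 is signal 0 AND signal 1. This interpretation follows the
+ * usual PCOUNTER logic-op encoding and is an editorial note, not part
+ * of the original patch. */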
+
+enum nv50_hw_pm_query_display
+{
+ NV50_HW_PM_EVENT_DISPLAY_RAW,
+ NV50_HW_PM_EVENT_DISPLAY_RATIO,
+};
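+
+/* How an event is displayed: RAW is the plain counter value, RATIO a
+ * percentage of time (presumably busy cycles vs. total cycles, computed
+ * when the result is read back). */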
+
+enum nv50_hw_pm_query_count
+{
+ NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ NV50_HW_PM_EVENT_COUNT_B4,
+ NV50_HW_PM_EVENT_COUNT_B6,
+};
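+
+/* Counting mode: SIMPLE uses a single counter slot, while B4/B6 appear
+ * to combine four or six signals across two slots (see the queries
+ * below that fill both ctr[1] and ctr[2]). */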
+
+struct nv50_hw_pm_event_cfg
+{
+ const char *name;
+ const char *desc;
+ enum nv50_hw_pm_query_display display;
+ enum nv50_hw_pm_query_count count;
+ uint8_t domain;
+};
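+
+/* Editorial note: in the configurations below, events with domain 1 use
+ * pc01_* signals and events with domain 2 use pc02_* signals, so the
+ * domain field seems to select the PCOUNTER domain the signals live in. */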
+
+struct nv50_hw_pm_query_cfg
+{
+ const struct nv50_hw_pm_event_cfg *event;
+ const struct nv50_hw_pm_counter_cfg ctr[4];
+};
+
+#define SRC(name, val) { name, val }
+#define SIG(name, ...) { name, { __VA_ARGS__ } }
+#define CTR(func, ...) { func, { __VA_ARGS__ } }
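+
+/* For example,
+ *    CTR(0xaaaa, SIG("pc01_gr_idle"))
+ * expands to the initializer
+ *    { 0xaaaa, { { "pc01_gr_idle", {} } } },
+ * i.e. one counter slot driven by a single signal with no source muxes. */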
+
+/*
+ * GPU
+ */
+/* gpu_idle */
+static const struct nv50_hw_pm_event_cfg
+nv50_gpu_idle_event =
+{
+ .name = "gpu_idle",
+ .desc = "The percentage of time the GPU is idle/busy since the
last "
+ "call. Having the GPU idle at all is a waste of
valuable "
+ "resources. You want to balance the GPU and CPU
workloads so "
+ "that no one processor is starved for work. Time
management or "
+ "using multithreading in your application can help
balance CPU "
+ "based tasks (world management, etc.) with the
rendering "
+ "pipeline.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_gpu_idle_query =
+{
+ .event = &nv50_gpu_idle_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc01_gr_idle")),
+};
+
+/*
+ * INPUT ASSEMBLER
+ */
+/* input_assembler_busy */
+static const struct nv50_hw_pm_event_cfg
+nv50_ia_busy_event =
+{
+ .name = "input_assembler_busy",
+ .desc = "The percentage of time the input assembler unit is
busy. This "
+ "is mainly impacted by both the number of vertices
processed as "
+ "well as the size of the attributes on those vertices.
You can "
+ "optimize this by reducing vertex size as much as
possible and "
+ "using indexed primitives to take advantage of the
vertex cache.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_ia_busy_query =
+{
+ .event = &nv50_ia_busy_event,
+ .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_18",
+ SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
+ SIG("pc01_vfetch_17"),
+ SIG("pc01_vfetch_03"),
+ SIG("pc01_vfetch_02")),
+};
+
+static const struct nv50_hw_pm_query_cfg
+nva0_ia_busy_query =
+{
+ .event = &nv50_ia_busy_event,
+ .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_15",
+ SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
+ SIG("pc01_vfetch_14"),
+ SIG("pc01_vfetch_03"),
+ SIG("pc01_vfetch_02")),
+};
+
+/* input_assembler_waits_for_fb */
+static const struct nv50_hw_pm_event_cfg
+nv50_ia_waits_for_fb_event =
+{
+ .name = "input_assembler_waits_for_fb",
+ .desc = "This is the amount of time the input assembler unit
was "
+ "waiting for data from the frame buffer unit.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_ia_waits_for_fb_query =
+{
+ .event = &nv50_ia_waits_for_fb_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc01_vfetch_0e",
+ SRC("pgraph_vfetch_unk0c_unk0", 0x1))),
+};
+
+static const struct nv50_hw_pm_query_cfg
+nva0_ia_waits_for_fb_query =
+{
+ .event = &nv50_ia_waits_for_fb_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc01_vfetch_0b",
+ SRC("pgraph_vfetch_unk0c_unk0", 0x1))),
+};
+
+/* vertex_attribute_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_vertex_attr_count_event =
+{
+ .name = "vertex_attribute_count",
+ .desc = "The number of vertex attributes that are fetched and
passed to "
+ "the geometry unit is returned in this counter. A
large number "
+ "of attributes (or unaligned vertices) can hurt vertex
cache "
+ "performance and reduce the overall vertex processing "
+ "capabilities of the pipeline.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_vertex_attr_count_query =
+{
+ .event = &nv50_vertex_attr_count_event,
+ .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_18",
+ SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
+ SIG("pc01_vfetch_17"),
+ SIG("pc01_vfetch_03"),
+ SIG("pc01_vfetch_02")),
+};
+
+static const struct nv50_hw_pm_query_cfg
+nva0_vertex_attr_count_query =
+{
+ .event = &nv50_vertex_attr_count_event,
+ .ctr[0] = CTR(0xf888, SIG("pc01_vfetch_15",
+ SRC("pgraph_vfetch_unk0c_unk0", 0x1)),
+ SIG("pc01_vfetch_14"),
+ SIG("pc01_vfetch_03"),
+ SIG("pc01_vfetch_02")),
+};
+
+/*
+ * GEOM
+ */
+/* geom_vertex_in_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_geom_vertex_in_count_event =
+{
+ .name = "geom_vertex_in_count",
+ .desc = "The number of vertices input to the geom unit.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_B4,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_geom_vertex_in_count_query =
+{
+ .event = &nv50_geom_vertex_in_count_event,
+ .ctr[1] = CTR(0xffff, SIG("pc01_vfetch_0e",
+ SRC("pgraph_vfetch_unk0c_unk0", 0x0)),
+ SIG("pc01_vfetch_0f"),
+ SIG("pc01_vfetch_10"),
+ SIG("pc01_trailer")),
+ .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
+ SIG("pc01_trailer"),
+ SIG("pc01_trailer"),
+ SIG("pc01_trailer")),
+};
+
+/* geom_vertex_out_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_geom_vertex_out_count_event =
+{
+ .name = "geom_vertex_out_count",
+ .desc = "The number of vertices coming out of the geom unit
after any "
+ "geometry shader expansion.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_geom_vertex_out_count_query =
+{
+ .event = &nv50_geom_vertex_out_count_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc01_vattr_01")),
+};
+
+/* geom_primitive_in_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_geom_primitive_in_count_event =
+{
+ .name = "geom_primitive_in_count",
+ .desc = "The number of primitives input to the geom unit.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_geom_primitive_in_count_query =
+{
+ .event = &nv50_geom_primitive_in_count_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc01_vfetch_08",
+ SRC("pgraph_vfetch_unk0c_unk0", 0x0))),
+};
+
+/* geom_primitive_out_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_geom_primitive_out_count_event =
+{
+ .name = "geom_primitive_out_count",
+ .desc = "The number of primitives coming out the geom unit
after any "
+ "geometry shader expansion.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_geom_primitive_out_count_query =
+{
+ .event = &nv50_geom_primitive_out_count_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc01_vattr_00")),
+};
+
+/*
+ * STREAM OUT
+ */
+/* stream_out_busy */
+static const struct nv50_hw_pm_event_cfg
+nv50_so_busy_event =
+{
+ .name = "stream_out_busy",
+ .desc = "This unit manages the writing of vertices to the
frame buffer "
+ "when using stream out. If a significant number of
vertices are "
+ "written, this can become a bottleneck.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_so_busy_query =
+{
+ .event = &nv50_so_busy_event,
+ .ctr[0] = CTR(0x8888, SIG("pc01_strmout_00"),
+ SIG("pc01_strmout_01")),
+};
+
+/*
+ * SETUP
+ */
+/* setup_primitive_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_setup_primitive_count_event =
+{
+ .name = "setup_primitive_count",
+ .desc = "Returns the number of primitives processed in the
geometry "
+ "subsystem. This experiments counts points, lines and
triangles. "
+ "To count only triangles, use the setup_triangle_count
counter. "
+ "Balance these counts with the number of pixels being
drawn to "
+ "see if you could simplify your geometry and use "
+ "bump/displacement maps, for example.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_setup_primitive_count_query =
+{
+ .event = &nv50_setup_primitive_count_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc01_trast_00")),
+};
+
+/* setup_point_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_setup_point_count_event =
+{
+ .name = "setup_point_count",
+ .desc = "The number of points seen by the primitive setup unit
(just "
+ "before rasterization).",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_setup_point_count_query =
+{
+ .event = &nv50_setup_point_count_event,
+ .ctr[0] = CTR(0x8080, SIG("pc01_trast_01"),
+ SIG("pc01_trast_04"),
+ SIG("pc01_trast_05")),
+};
+
+/* setup_line_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_setup_line_count_event =
+{
+ .name = "setup_line_count",
+ .desc = "The number of lines seen by the primitive setup unit
(just "
+ "before rasterization).",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_setup_line_count_query =
+{
+ .event = &nv50_setup_line_count_event,
+ .ctr[0] = CTR(0x8080, SIG("pc01_trast_02"),
+ SIG("pc01_trast_04"),
+ SIG("pc01_trast_05")),
+};
+
+/* setup_triangle_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_setup_triangle_count_event =
+{
+ .name = "setup_triangle_count",
+ .desc = "Returns the number of triangles processed in the
geometry "
+ "subsystem.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_setup_triangle_count_query =
+{
+ .event = &nv50_setup_triangle_count_event,
+ .ctr[0] = CTR(0x8080, SIG("pc01_trast_03"),
+ SIG("pc01_trast_04"),
+ SIG("pc01_trast_05")),
+};
+
+/* setup_primitive_culled_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_setup_primitive_culled_count_event =
+{
+ .name = "setup_primitive_culled_count",
+ .desc = "Returns the number of primitives culled in primitive
setup. If "
+ "you are performing viewport culling, this gives you an "
+ "indication of the accuracy of the algorithm being
used, and can "
+ "give you and idea if you need to improves this
culling. This "
+ "includes primitives culled when using backface
culling. Drawing "
+ "a fully visible sphere on the screen should cull half
of the "
+ "triangles if backface culling is turned on and all the "
+ "triangles are ordered consistently (CW or CCW).",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_setup_primitive_culled_count_query =
+{
+ .event = &nv50_setup_primitive_culled_count_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc01_unk00")),
+};
+
+/*
+ * RASTERIZER
+ */
+/* rast_tiles_killed_by_zcull_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_rast_tiles_killed_by_zcull_event =
+{
+ .name = "rasterizer_tiles_killed_by_zcull_count",
+ .desc = "The number of pixels killed by the zcull unit in the
rasterizer.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_B6,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_rast_tiles_killed_by_zcull_query =
+{
+ .event = &nv50_rast_tiles_killed_by_zcull_event,
+ .ctr[1] = CTR(0xffff, SIG("pc01_zcull_00",
+ SRC("pgraph_zcull_pm_unka4_unk0", 0x7)),
+ SIG("pc01_zcull_01"),
+ SIG("pc01_zcull_02"),
+ SIG("pc01_zcull_03")),
+ .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
+ SIG("pc01_trailer"),
+ SIG("pc01_zcull_04"),
+ SIG("pc01_zcull_05")),
+};
+
+/* rast_tiles_in_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_rast_tiles_in_count_event =
+{
+ .name = "rasterizer_tiles_in_count",
+ .desc = "Count of tiles (each of which contain 1-8 pixels)
seen by the "
+ "rasterizer stage.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_B6,
+ .domain = 1,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_rast_tiles_in_count_query =
+{
+ .event = &nv50_rast_tiles_in_count_event,
+ .ctr[1] = CTR(0xffff, SIG("pc01_zcull_00",
+ SRC("pgraph_zcull_pm_unka4_unk0", 0x0)),
+ SIG("pc01_zcull_01"),
+ SIG("pc01_zcull_02"),
+ SIG("pc01_zcull_03")),
+ .ctr[2] = CTR(0x5555, SIG("pc01_trailer"),
+ SIG("pc01_trailer"),
+ SIG("pc01_zcull_04"),
+ SIG("pc01_zcull_05")),
+};
+
+/*
+ * ROP
+ */
+/* rop_busy */
+static const struct nv50_hw_pm_event_cfg
+nv50_rop_busy_event =
+{
+ .name = "rop_busy",
+ .desc = "Percentage of time that the ROP unit is actively
doing work. "
+ "This can be high if alpha blending is turned on, of
overdraw "
+ "is high, etc.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 2,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_rop_busy_query =
+{
+ .event = &nv50_rop_busy_event,
+ .ctr[0] = CTR(0xf888, SIG("pc02_prop_02",
+ SRC("pgraph_tpc0_prop_pm_mux_sel", 0x0)),
+ SIG("pc02_prop_03"),
+ SIG("pc02_prop_04"),
+ SIG("pc02_prop_05")),
+};
+
+/* rop_waits_for_fb */
+static const struct nv50_hw_pm_event_cfg
+nv50_rop_waits_for_fb_event =
+{
+ .name = "rop_waits_for_fb",
+ .desc = "The amount of time the blending unit spent waiting
for data "
+ "from the frame buffer unit. If blending is enabled
and there "
+ "is a lot of traffic here (since this is a
read/modify/write "
+ "operation) this can become a bottleneck.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 2,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_rop_waits_for_fb_query =
+{
+ .event = &nv50_rop_waits_for_fb_event,
+ .ctr[0] = CTR(0x22f2, SIG("pc02_crop_03",
+ SRC("pgraph_rop0_crop_pm_mux_sel0", 0x0)),
+ SIG("pc02_crop_02"),
+ SIG("pc02_zrop_03",
+ SRC("pgraph_rop0_zrop_pm_mux_sel0", 0x0)),
+ SIG("pc02_zrop_02")),
+};
+
+/* rop_waits_for_shader */
+static const struct nv50_hw_pm_event_cfg
+nv50_rop_waits_for_shader_event =
+{
+ .name = "rop_waits_for_shader",
+ .desc = "This is a measurement of how often the blending unit
was "
+ "waiting on new work (fragments to be placed into the
render "
+ "target). If the pixel shaders are particularly
expensive, the "
+ "ROP unit could be starved waiting for results.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 2,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_rop_waits_for_shader_query =
+{
+ .event = &nv50_rop_waits_for_shader_event,
+ .ctr[0] = CTR(0x2222, SIG("pc02_prop_06",
+ SRC("pgraph_tpc0_prop_pm_mux_sel", 0x0)),
+ SIG("pc02_prop_07")),
+};
+
+/* rop_samples_killed_by_earlyz_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_rop_samples_killed_by_earlyz_event =
+{
+ .name = "rop_samples_killed_by_earlyz_count",
+ .desc = "This returns the number of pixels that were killed in
the "
+ "earlyZ hardware. This signal will give you an idea
of, for "
+ "instance, a Z only pass was successful in setting up
the depth "
+ "buffer.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_B6,
+ .domain = 2,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_rop_samples_killed_by_earlyz_query =
+{
+ .event = &nv50_rop_samples_killed_by_earlyz_event,
+ .ctr[1] = CTR(0xffff, SIG("pc02_prop_00",
+ SRC("pgraph_tpc0_prop_pm_mux_sel", 0x1a)),
+ SIG("pc02_prop_01"),
+ SIG("pc02_prop_02"),
+ SIG("pc02_prop_03")),
+ .ctr[2] = CTR(0x5555, SIG("pc02_prop_07"),
+ SIG("pc02_trailer"),
+ SIG("pc02_prop_04"),
+ SIG("pc02_prop_05")),
+};
+
+/* rop_samples_killed_by_latez_count */
+static const struct nv50_hw_pm_event_cfg
+nv50_rop_samples_killed_by_latez_event =
+{
+ .name = "rop_samples_killed_by_latez_count",
+ .desc = "This returns the number of pixels that were killed
after the "
+ "pixel shader ran. This can happen if the early Z is
unable to "
+ "cull the pixel because of an API setup issue like
changing the "
+ "Z direction or modifying Z in the pixel shader.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_B6,
+ .domain = 2,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_rop_samples_killed_by_latez_query =
+{
+ .event = &nv50_rop_samples_killed_by_latez_event,
+ .ctr[1] = CTR(0xffff, SIG("pc02_prop_00",
+ SRC("pgraph_tpc0_prop_pm_mux_sel", 0x1b)),
+ SIG("pc02_prop_01"),
+ SIG("pc02_prop_02"),
+ SIG("pc02_prop_03")),
+ .ctr[2] = CTR(0x5555, SIG("pc02_prop_07"),
+ SIG("pc02_trailer"),
+ SIG("pc02_prop_04"),
+ SIG("pc02_prop_05")),
+};
+
+/*
+ * TEXTURE
+ */
+/* tex_cache_miss */
+static const struct nv50_hw_pm_event_cfg
+nv50_tex_cache_miss_event =
+{
+ .name = "tex_cache_miss",
+ .desc = "Number of texture cache misses.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 2,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_tex_cache_miss_query =
+{
+ .event = &nv50_tex_cache_miss_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_04",
+ SRC("pgraph_tpc0_tex_unk08_unk0",
0x200))),
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv84_tex_cache_miss_query =
+{
+ .event = &nv50_tex_cache_miss_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_04",
+ SRC("pgraph_tpc0_tex_unk08_unk0",
0x800))),
+};
+
+/* tex_cache_hit */
+static const struct nv50_hw_pm_event_cfg
+nv50_tex_cache_hit_event =
+{
+ .name = "tex_cache_hit",
+ .desc = "Number of texture cache hits.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RAW,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 2,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_tex_cache_hit_query =
+{
+ .event = &nv50_tex_cache_hit_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_05",
+ SRC("pgraph_tpc0_tex_unk08_unk0",
0x200))),
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv84_tex_cache_hit_query =
+{
+ .event = &nv50_tex_cache_hit_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_05",
+ SRC("pgraph_tpc0_tex_unk08_unk0",
0x800))),
+};
+
+/* tex_waits_for_fb */
+static const struct nv50_hw_pm_event_cfg
+nv50_tex_waits_for_fb_event =
+{
+ .name = "tex_waits_for_fb",
+ .desc = "This is the amount of time the texture unit spent
waiting on "
+ "samples to return from the frame buffer unit. It is a
potential "
+ "indication of poor texture cache utilization.",
+ .display = NV50_HW_PM_EVENT_DISPLAY_RATIO,
+ .count = NV50_HW_PM_EVENT_COUNT_SIMPLE,
+ .domain = 2,
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv50_tex_waits_for_fb_query =
+{
+ .event = &nv50_tex_waits_for_fb_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_06",
+ SRC("pgraph_tpc0_tex_unk08_unk0",
0x200))),
+};
+
+static const struct nv50_hw_pm_query_cfg
+nv84_tex_waits_for_fb_query =
+{
+ .event = &nv50_tex_waits_for_fb_event,
+ .ctr[0] = CTR(0xaaaa, SIG("pc02_tex_06",
+ SRC("pgraph_tpc0_tex_unk08_unk0",
0x800))),
+};
+
+static const struct nv50_hw_pm_query_cfg
+*nv50_hw_pm_queries[NV50_HW_PM_QUERY_COUNT];
+
+#define _Q(n, q) nv50_hw_pm_queries[NV50_HW_PM_QUERY_##n] = &q;
+
+static void
+nv50_identify_events(struct nv50_screen *screen)
+{
+ _Q(GPU_IDLE, nv50_gpu_idle_query);
+ _Q(IA_BUSY, nv50_ia_busy_query);
+ _Q(IA_WAITS_FOR_FB, nv50_ia_waits_for_fb_query);
+ _Q(VERTEX_ATTR_COUNT, nv50_vertex_attr_count_query);
+ _Q(GEOM_VERTEX_IN_COUNT, nv50_geom_vertex_in_count_query);
+ _Q(GEOM_VERTEX_OUT_COUNT, nv50_geom_vertex_out_count_query);
+ _Q(GEOM_PRIMITIVE_IN_COUNT, nv50_geom_primitive_in_count_query);
+ _Q(GEOM_PRIMITIVE_OUT_COUNT, nv50_geom_primitive_out_count_query);
+ _Q(SO_BUSY, nv50_so_busy_query);
+ _Q(SETUP_PRIMITIVE_COUNT, nv50_setup_primitive_count_query);
+ _Q(SETUP_POINT_COUNT, nv50_setup_point_count_query);
+ _Q(SETUP_LINE_COUNT, nv50_setup_line_count_query);
+ _Q(SETUP_TRIANGLE_COUNT, nv50_setup_triangle_count_query);
+ _Q(SETUP_PRIMITIVE_CULLED_COUNT, nv50_setup_primitive_culled_count_query);
+ _Q(RAST_TILES_KILLED_BY_ZCULL, nv50_rast_tiles_killed_by_zcull_query);
+ _Q(RAST_TILES_IN_COUNT, nv50_rast_tiles_in_count_query);
+ _Q(ROP_BUSY, nv50_rop_busy_query);
+ _Q(ROP_WAITS_FOR_FB, nv50_rop_waits_for_fb_query);
+ _Q(ROP_WAITS_FOR_SHADER, nv50_rop_waits_for_shader_query);
+ _Q(ROP_SAMPLES_KILLED_BY_EARLYZ, nv50_rop_samples_killed_by_earlyz_query);
+ _Q(ROP_SAMPLES_KILLED_BY_LATEZ, nv50_rop_samples_killed_by_latez_query);
+ _Q(TEX_CACHE_MISS, nv50_tex_cache_miss_query);
+ _Q(TEX_CACHE_HIT, nv50_tex_cache_hit_query);
+ _Q(TEX_WAITS_FOR_FB, nv50_tex_waits_for_fb_query);
+
+ if (screen->base.class_3d >= NV84_3D_CLASS) {
+ /* Variants for NV84+ */
+ _Q(TEX_CACHE_MISS, nv84_tex_cache_miss_query);
+ _Q(TEX_CACHE_HIT, nv84_tex_cache_hit_query);
+ _Q(TEX_WAITS_FOR_FB, nv84_tex_waits_for_fb_query);
+ }
+
+ if (screen->base.class_3d >= NVA0_3D_CLASS) {
+ /* Variants for NVA0+ */
+ _Q(IA_BUSY, nva0_ia_busy_query);
+ _Q(IA_WAITS_FOR_FB, nva0_ia_waits_for_fb_query);
+ _Q(VERTEX_ATTR_COUNT, nva0_vertex_attr_count_query);
+ }
+}
+
+#undef _Q
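+
+/*
+ * Hypothetical helper (not part of this series) showing how the table
+ * above would be consulted, assuming NV50_HW_PM_QUERY(0) maps to the
+ * first entry of nv50_hw_pm_queries:
+ *
+ *    static const struct nv50_hw_pm_query_cfg *
+ *    nv50_hw_pm_query_get_cfg(struct nv50_query *q)
+ *    {
+ *       return nv50_hw_pm_queries[q->type - NV50_HW_PM_QUERY(0)];
+ *    }
+ */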
+
+#ifdef DEBUG