...to directly read from the VPM, saving a handful of QPU cycles.
The order of reads is preserved.

Signed-off-by: Varad Gautam <varadgau...@gmail.com>
 src/gallium/drivers/vc4/vc4_opt_vpm.c | 74 ++++++++++++++++++++++++++++++++---
 src/gallium/drivers/vc4/vc4_qir.c     |  2 +-
 src/gallium/drivers/vc4/vc4_qir.h     |  2 +-
 3 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm.c 
index 0fcf1e5..277b345 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c
@@ -24,22 +24,26 @@
  * @file vc4_opt_vpm.c
- * This modifies instructions that generate the value consumed by a VPM write
- * to write directly into the VPM.
+ * This modifies instructions that:
+ * 1. exclusively consume a value read from the VPM to directly read the VPM if
+ *    other operands allow it.
+ * 2. generate the value consumed by a VPM write to write directly into the VPM.
 #include "vc4_qir.h"
-qir_opt_vpm_writes(struct vc4_compile *c)
+qir_opt_vpm(struct vc4_compile *c)
         if (c->stage == QSTAGE_FRAG)
                 return false;
         bool progress = false;
         struct qinst *vpm_writes[64] = { 0 };
+        struct qinst *vpm_reads[64] = { 0 };
         uint32_t use_count[c->num_temps];
         uint32_t vpm_write_count = 0;
+        uint32_t vpm_read_count = 0;
         memset(&use_count, 0, sizeof(use_count));
         list_for_each_entry(struct qinst, inst, &c->instructions, link) {
@@ -52,8 +56,68 @@ qir_opt_vpm_writes(struct vc4_compile *c)
                 for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
-                        if (inst->src[i].file == QFILE_TEMP)
-                                use_count[inst->src[i].index]++;
+                        if (inst->src[i].file == QFILE_TEMP) {
+                                uint32_t temp = inst->src[i].index;
+                                use_count[temp]++;
+                                struct qinst *mov = c->defs[temp];
+                                if (!mov ||
+                                    (mov->op != QOP_MOV &&
+                                    mov->op != QOP_FMOV &&
+                                    mov->op != QOP_MMOV)) {
+                                        continue;
+                                }
+                                if (mov->src[0].file == QFILE_VPM)
+                                        vpm_reads[vpm_read_count++] = inst;
+                        }
+                }
+        }
+        for (int i = 0; i < vpm_read_count; i++) {
+                struct qinst *inst = vpm_reads[i];
+                if (!inst || qir_is_multi_instruction(inst))
+                        continue;
+                if (qir_depends_on_flags(inst) || inst->sf)
+                        continue;
+                if (qir_has_side_effects(c, inst) ||
+                    qir_has_side_effect_reads(c, inst))
+                        continue;
+                for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) {
+                        if(inst->src[j].file != QFILE_TEMP)
+                                continue;
+                        uint32_t temp = inst->src[j].index;
+                        if (use_count[temp] != 1)
+                                continue;
+                        struct qinst *mov = c->defs[temp];
+                        if (mov->src[0].file != QFILE_VPM)
+                                continue;
+                        uint32_t temps = 0;
+                        for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) {
+                                if (inst->src[k].file == QFILE_TEMP)
+                                        temps++;
+                        }
+                        /* The instruction is safe to reorder if its other
+                         * sources are independent of previous instructions
+                         */
+                        if (temps == 1 ) {
+                                list_del(&inst->link);
+                                inst->src[j] = mov->src[0];
+                                list_replace(&mov->link, &inst->link);
+                                c->defs[temp] = NULL;
+                                free(mov);
+                        }
+                        progress = true;
diff --git a/src/gallium/drivers/vc4/vc4_qir.c 
index f9eb0e1..65f0067 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -526,7 +526,7 @@ qir_optimize(struct vc4_compile *c)
-                OPTPASS(qir_opt_vpm_writes);
+                OPTPASS(qir_opt_vpm);
                 if (!progress)
diff --git a/src/gallium/drivers/vc4/vc4_qir.h 
index bae3176..4f39d72 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -484,7 +484,7 @@ bool qir_opt_copy_propagation(struct vc4_compile *c);
 bool qir_opt_cse(struct vc4_compile *c);
 bool qir_opt_dead_code(struct vc4_compile *c);
 bool qir_opt_small_immediates(struct vc4_compile *c);
-bool qir_opt_vpm_writes(struct vc4_compile *c);
+bool qir_opt_vpm(struct vc4_compile *c);
 void vc4_nir_lower_blend(struct vc4_compile *c);
 void vc4_nir_lower_io(struct vc4_compile *c);
 nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b,

mesa-dev mailing list

Reply via email to