From: "Juan A. Suarez Romero" <jasua...@igalia.com>

On IVB and VLV, both the regioning parameters and the execution size of
instructions with DF operands are expressed in units of 32-bit floats, not
doubles.

So when we have something like:

mov(8) g2<1>DF g3<4,4,1>DF

We are not actually moving 8 doubles (our intention), but 4 doubles.

We need to double the regioning parameters and the execution size to cope
with this issue.
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 47 ++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 0710be9..90ee7c1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -54,13 +54,14 @@ brw_file_from_reg(fs_reg *reg)
 }
 
 static struct brw_reg
-brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed)
+brw_reg_from_fs_reg(const struct brw_compiler *compiler, fs_inst *inst,
+                    fs_reg *reg, bool compressed)
 {
    struct brw_reg brw_reg;
 
    switch (reg->file) {
    case MRF:
-      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen));
+      assert((reg->nr & ~BRW_MRF_COMPR4) < 
BRW_MAX_MRF(compiler->devinfo->gen));
       /* Fallthrough */
    case VGRF:
       if (reg->stride == 0) {
@@ -93,6 +94,37 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned 
gen, bool compressed)
          const unsigned width = MIN2(reg_width, phys_width);
          brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
          brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
+         /* From the Ivy PRM (EU Changes by Processor Generation, page 13):
+          *  "Each DF (Double Float) operand uses an element size of 4 rather
+          *  than 8 and all regioning parameters are twice what the values
+          *  would be based on the true element size: ExecSize, Width,
+          *  HorzStride, and VertStride. Each DF operand uses a pair of
+          *  channels and all masking and swizzing should be adjusted
+          *  appropriately."
+          *
+          * From the Ivy PRM (Special Requirements for Handling Double
+          * Precision Data Types, page 71):
+          *  "In Align1 mode, all regioning parameters like stride, execution
+          *  size, and width must use the syntax of a pair of packed
+          *  floats. The offsets for these data types must be 64-bit
+          *  aligned. The execution size and regioning parameters are in terms
+          *  of floats."
+          *
+          * Taken together, these paragraphs say that on Ivy, when handling DF,
+          * exec_size, width and vertstride must be doubled, and horzstride
+          * must be doubled when it is greater than 1.
+          *
+          * It applies to Valleyview too.
+          */
+         if (compiler->devinfo->gen == 7 &&
+             !compiler->devinfo->is_haswell &&
+             type_sz(reg->type) == 8) {
+            brw_reg.width++;
+            if (brw_reg.vstride > 0)
+               brw_reg.vstride++;
+            if (brw_reg.hstride > 1)
+               brw_reg.hstride++;
+         }
       }
 
       brw_reg = retype(brw_reg, reg->type);
@@ -1546,6 +1578,11 @@ fs_generator::generate_code(const cfg_t *cfg, int 
dispatch_width)
       unsigned int last_insn_offset = p->next_insn_offset;
       bool multiple_instructions_emitted = false;
 
+      if (devinfo->gen == 7 && !devinfo->is_haswell &&
+          (inst->exec_data_size() == 8 || type_sz(inst->dst.type) == 8)) {
+        inst->exec_size *= 2;
+      }
+
       /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
        * "Register Region Restrictions" section: for BDW, SKL:
        *
@@ -1586,9 +1623,7 @@ fs_generator::generate_code(const cfg_t *cfg, int 
dispatch_width)
       brw_set_default_group(p, inst->group);
 
       for (unsigned int i = 0; i < inst->sources; i++) {
-         src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
-                                      compressed);
-
+         src[i] = brw_reg_from_fs_reg(compiler, inst, &inst->src[i], 
compressed);
         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
@@ -1599,7 +1634,7 @@ fs_generator::generate_code(const cfg_t *cfg, int 
dispatch_width)
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
       }
-      dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed);
+      dst = brw_reg_from_fs_reg(compiler, inst, &inst->dst, compressed);
 
       brw_set_default_access_mode(p, BRW_ALIGN_1);
       brw_set_default_predicate_control(p, inst->predicate);
-- 
2.9.3

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to