|    1 
|   29 
|   90 +++
|  141 ++++
|  162 +++++
|   34 +
|  127 ++++
|   84 ++
|   36 +
|  293 ++++++++++
|   37 +
|  258 ++++++++
|   37 +
|   57 +
|   58 +
|   35 +
|   27 
|  127 ++++
|  209 +++++++
|   20 
 20 files changed, 1862 insertions(+)

New commits:
commit a6f3122dcf667b178a73c0db6eabc14089f4e334
Author: Timo Aaltonen <>
Date:   Mon Apr 1 09:06:24 2013 +0300

    Added a bunch of patches to fix slow blur on intel.

diff --git a/debian/changelog b/debian/changelog
index 6e7686f..26a916b 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -3,6 +3,7 @@ mesa (9.1.1-0ubuntu1) UNRELEASED; urgency=low
   [ Timo Aaltonen ]
   * Merge from unreleased debian git
     - new upstream release (LP: #1112147)
+  * Added a bunch of patches to fix slow blur on intel.
  -- Timo Aaltonen <>  Wed, 27 Feb 2013 11:12:43 +0200
diff --git 
new file mode 100644
index 0000000..3368dff
--- /dev/null
@@ -0,0 +1,29 @@
+From e3bb00afd8528eab84e57ce3004aa65358d8c34f Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Wed, 13 Mar 2013 12:17:25 -0700
+Subject: [PATCH 01/12] i965/fs: Remove creation of a MOV instruction that's
+ never used.
+We weren't inserting it into the list, so it did nothing.  This line was
+replaced by the MOV/MUL block above.
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_fs.cpp | 1 -
+ 1 file changed, 1 deletion(-)
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
+index 5a5bfeb..2fb8989 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
+@@ -253,7 +253,6 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg 
+       } else {
+          instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
+       }
+-      inst = MOV(mrf, offset);
+       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
+                                   dst, surf_index);
+       inst->header_present = header_present;
diff --git 
new file mode 100644
index 0000000..e706bb1
--- /dev/null
@@ -0,0 +1,90 @@
+From 88a5caea52f56aab5641fddfd23732cb3ecfaf13 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Wed, 13 Mar 2013 12:27:17 -0700
+Subject: [PATCH 02/12] i965/fs: Move varying uniform offset computation into
+ the helper func.
+I'm going to want to change the math for gen7 using sampler LD
+instructions in a way that gets CSE to occur like we'd hope.
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_fs.cpp         | 16 +++++++++-------
+ src/mesa/drivers/dri/i965/brw_fs.h           |  3 ++-
+ src/mesa/drivers/dri/i965/brw_fs_visitor.cpp |  5 ++---
+ 3 files changed, 13 insertions(+), 11 deletions(-)
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
+index 2fb8989..89b08e8 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
+@@ -229,11 +229,15 @@ fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
+ exec_list
+ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
+-                                       fs_reg offset)
++                                       fs_reg varying_offset,
++                                       uint32_t const_offset)
+ {
+    exec_list instructions;
+    fs_inst *inst;
++   fs_reg offset = fs_reg(this, glsl_type::uint_type);
++   instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
+    if (intel->gen >= 7) {
+       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
+                                   dst, surf_index, offset);
+@@ -1625,15 +1629,13 @@ 
+          base_ir = inst->ir;
+          current_annotation = inst->annotation;
+-         fs_reg offset = fs_reg(this, glsl_type::int_type);
+-         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
+-                                 fs_reg(pull_constant_loc[uniform] +
+-                                        inst->src[i].reg_offset)));
+          fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
+          fs_reg temp = fs_reg(this, glsl_type::float_type);
+          exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
+-                                                     surf_index, offset);
++                                                     surf_index,
++                                                     *inst->src[i].reladdr,
++                                                     pull_constant_loc[uniform] +
++                                                     inst->src[i].reg_offset);
+          inst->insert_before(&list);
+          inst->src[i].file = temp.file;
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
+index 254a534..76130b1 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.h
++++ b/src/mesa/drivers/dri/i965/brw_fs.h
+@@ -294,7 +294,8 @@ public:
+                                          fs_reg reg);
+    exec_list VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
+-                                        fs_reg offset);
++                                        fs_reg varying_offset,
++                                        uint32_t const_offset);
+    bool run();
+    void setup_payload_gen4();
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
+index 735a33d..6b6af8d 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+@@ -650,9 +650,8 @@ fs_visitor::visit(ir_expression *ir)
+          emit(SHR(base_offset, op[1], fs_reg(2)));
+          for (int i = 0; i < ir->type->vector_elements; i++) {
+-            fs_reg offset = fs_reg(this, glsl_type::int_type);
+-            emit(ADD(offset, base_offset, fs_reg(i)));
+-            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index, offset));
++            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
++                                            base_offset, i));
+             if (ir->type->base_type == GLSL_TYPE_BOOL)
+                emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
diff --git 
new file mode 100644
index 0000000..17d834e
--- /dev/null
@@ -0,0 +1,141 @@
+From 406b0516036273010399ac7a520a765de66df610 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Wed, 20 Mar 2013 10:46:20 -0700
+Subject: [PATCH 03/12] i965: Make the constant surface interface take a normal
+ byte size.
+This puts the rounding-up logic into the function itself instead of all
+the callers having to manage it.  Also drop an "unused" comment in gen4,
+as the stride *is* used for texbos (and will be for uniforms soon).
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_vs_surface_state.c  |  7 +++----
+ src/mesa/drivers/dri/i965/brw_wm_surface_state.c  | 16 +++++++---------
+ src/mesa/drivers/dri/i965/gen7_wm_surface_state.c |  8 +++++---
+ src/mesa/drivers/dri/intel/intel_context.h        |  2 +-
+ 4 files changed, 16 insertions(+), 17 deletions(-)
+diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c 
+index 2aefc0c..6c0b690 100644
+--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
++++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+@@ -68,9 +68,9 @@ brw_upload_vs_pull_constants(struct brw_context *brw)
+    drm_intel_bo_unreference(brw->vs.const_bo);
++   uint32_t size = brw->vs.prog_data->nr_pull_params * 4;
+    brw->vs.const_bo = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
+-                                       brw->vs.prog_data->nr_pull_params * 4,
+-                                       64);
++                                       size, 64);
+    drm_intel_gem_bo_map_gtt(brw->vs.const_bo);
+    for (i = 0; i < brw->vs.prog_data->nr_pull_params; i++) {
+@@ -90,8 +90,7 @@ brw_upload_vs_pull_constants(struct brw_context *brw)
+    drm_intel_gem_bo_unmap_gtt(brw->vs.const_bo);
+    const int surf = SURF_INDEX_VERT_CONST_BUFFER;
+-   intel->vtbl.create_constant_surface(brw, brw->vs.const_bo, 0,
+-                                     ALIGN(brw->vs.prog_data->nr_pull_params, 4) / 4,
++   intel->vtbl.create_constant_surface(brw, brw->vs.const_bo, 0, size,
+                                      &brw->vs.surf_offset[surf]);
+    brw->state.dirty.brw |= BRW_NEW_VS_CONSTBUF;
+diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c 
+index 932e472..98eed15 100644
+--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
++++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+@@ -916,11 +916,13 @@ void
+ brw_create_constant_surface(struct brw_context *brw,
+                           drm_intel_bo *bo,
+                           uint32_t offset,
+-                          int width,
++                          uint32_t size,
+                           uint32_t *out_offset)
+ {
+    struct intel_context *intel = &brw->intel;
+-   const GLint w = width - 1;
++   uint32_t stride = 16;
++   uint32_t elements = ALIGN(size, stride) / stride;
++   const GLint w = elements - 1;
+    uint32_t *surf;
+    surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
+@@ -939,7 +941,7 @@ brw_create_constant_surface(struct brw_context *brw,
+             ((w >> 7) & 0x1fff) << BRW_SURFACE_HEIGHT_SHIFT);
+    surf[3] = (((w >> 20) & 0x7f) << BRW_SURFACE_DEPTH_SHIFT |
+-            (16 - 1) << BRW_SURFACE_PITCH_SHIFT); /* ignored */
++            (stride - 1) << BRW_SURFACE_PITCH_SHIFT);
+    surf[4] = 0;
+    surf[5] = 0;
+@@ -1086,8 +1088,7 @@ brw_upload_wm_pull_constants(struct brw_context *brw)
+    }
+    drm_intel_gem_bo_unmap_gtt(brw->wm.const_bo);
+-   intel->vtbl.create_constant_surface(brw, brw->wm.const_bo, 0,
+-                                     ALIGN(brw->wm.prog_data->nr_pull_params, 4) / 4,
++   intel->vtbl.create_constant_surface(brw, brw->wm.const_bo, 0, size,
+                                      &brw->wm.surf_offset[surf_index]);
+    brw->state.dirty.brw |= BRW_NEW_SURFACES;
+@@ -1439,11 +1440,8 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
+        * glBindBufferRange case is undefined, we can just bind the whole buffer
+        * glBindBufferBase wants and be a correct implementation.
+        */
+-      int size = bo->size - binding->Offset;
+-      size = ALIGN(size, 16) / 16; /* The interface takes a number of vec4s */
+       intel->vtbl.create_constant_surface(brw, bo, binding->Offset,
+-                                        size,
++                                        bo->size - binding->Offset,
+                                         &surf_offsets[i]);
+    }
+diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c 
+index db04253..484afcd 100644
+--- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
++++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
+@@ -383,11 +383,13 @@ static void
+ gen7_create_constant_surface(struct brw_context *brw,
+                            drm_intel_bo *bo,
+                            uint32_t offset,
+-                           int width,
++                           uint32_t size,
+                            uint32_t *out_offset)
+ {
+    struct intel_context *intel = &brw->intel;
+-   const GLint w = width - 1;
++   uint32_t stride = 16;
++   uint32_t elements = ALIGN(size, stride) / stride;
++   const GLint w = elements - 1;
+    uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
+                                     8 * 4, 32, out_offset);
+@@ -403,7 +405,7 @@ gen7_create_constant_surface(struct brw_context *brw,
+    surf[2] = SET_FIELD(w & 0x7f, GEN7_SURFACE_WIDTH) |
+              SET_FIELD((w >> 7) & 0x1fff, GEN7_SURFACE_HEIGHT);
+    surf[3] = SET_FIELD((w >> 20) & 0x7f, BRW_SURFACE_DEPTH) |
+-             (16 - 1); /* stride between samples */
++             (stride - 1);
+    if (intel->is_haswell) {
+       surf[7] = SET_FIELD(HSW_SCS_RED,   GEN7_SURFACE_SCS_R) |
+diff --git a/src/mesa/drivers/dri/intel/intel_context.h 
+index 2df15d4..bb21f55 100644
+--- a/src/mesa/drivers/dri/intel/intel_context.h
++++ b/src/mesa/drivers/dri/intel/intel_context.h
+@@ -202,7 +202,7 @@ struct intel_context
+       void (*create_constant_surface)(struct brw_context *brw,
+                                     drm_intel_bo *bo,
+                                     uint32_t offset,
+-                                    int width,
++                                    uint32_t size,
+                                     uint32_t *out_offset);
+       /** \} */
+    } vtbl;
diff --git 
new file mode 100644
index 0000000..9bbcd0c
--- /dev/null
@@ -0,0 +1,162 @@
+From 4921232ba622d327f238731874c36a288e605515 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Thu, 14 Mar 2013 14:41:37 -0700
+Subject: [PATCH 04/12] i965: Make the fragment shader pull constants index by
+ dwords, not vec4s.
+We want to load vec4s, since loading a vec4 instead of a dword is
+basically no increased latency.  But for variable indexed access, the
+previous requirement of aligned vec4s for a sampler LD was hard to satisfy.
+Note that this change only affects those messages that use the surface
+format, like sampler LDs, but not to the untyped data cache loads we've
+used in other cases.
+No significant performance difference on my GLSL demo with uniforms forced
+to take the varying pull constants path (n=4).
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_fs.cpp              |  5 ++++-
+ src/mesa/drivers/dri/i965/brw_state.h             |  5 -----
+ src/mesa/drivers/dri/i965/brw_vs_surface_state.c  |  2 +-
+ src/mesa/drivers/dri/i965/brw_wm_surface_state.c  | 13 ++++++++-----
+ src/mesa/drivers/dri/i965/gen7_wm_surface_state.c |  5 +++--
+ src/mesa/drivers/dri/intel/intel_context.h        |  5 +++--
+ 6 files changed, 19 insertions(+), 16 deletions(-)
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
+index 89b08e8..fbe9e3a 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
+@@ -2483,10 +2483,13 @@ fs_visitor::lower_uniform_pull_constant_loads()
+          continue;
+       if (intel->gen >= 7) {
++         /* The offset arg before was a vec4-aligned byte offset.  We need to
++          * turn it into a dword offset.
++          */
+          fs_reg const_offset_reg = inst->src[1];
+          assert(const_offset_reg.file == IMM &&
+                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
+-         const_offset_reg.imm.u /= 16;
++         const_offset_reg.imm.u /= 4;
+          fs_reg payload = fs_reg(this, glsl_type::uint_type);
+          /* This is actually going to be a MOV, but since only the first dword
+diff --git a/src/mesa/drivers/dri/i965/brw_state.h 
+index 02ce57b..29ec276 100644
+--- a/src/mesa/drivers/dri/i965/brw_state.h
++++ b/src/mesa/drivers/dri/i965/brw_state.h
+@@ -187,11 +187,6 @@ void *brw_state_batch(struct brw_context *brw,
+ void gen4_init_vtable_surface_functions(struct brw_context *brw);
+ uint32_t brw_get_surface_tiling_bits(uint32_t tiling);
+ uint32_t brw_get_surface_num_multisamples(unsigned num_samples);
+-void brw_create_constant_surface(struct brw_context *brw,
+-                               drm_intel_bo *bo,
+-                               uint32_t offset,
+-                               int width,
+-                               uint32_t *out_offset);
+ uint32_t brw_format_for_mesa_format(gl_format mesa_format);
+diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c 
+index 6c0b690..675a84c 100644
+--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
++++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+@@ -91,7 +91,7 @@ brw_upload_vs_pull_constants(struct brw_context *brw)
+    const int surf = SURF_INDEX_VERT_CONST_BUFFER;
+    intel->vtbl.create_constant_surface(brw, brw->vs.const_bo, 0, size,
+-                                     &brw->vs.surf_offset[surf]);
++                                     &brw->vs.surf_offset[surf], false);
+    brw->state.dirty.brw |= BRW_NEW_VS_CONSTBUF;
+ }
+diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c 
+index 98eed15..506ddf0 100644
+--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
++++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+@@ -912,15 +912,16 @@ brw_update_texture_surface(struct gl_context *ctx,
+  * Create the constant buffer surface.  Vertex/fragment shader constants will be
+  * read from this buffer with Data Port Read instructions/messages.
+  */
++static void
+ brw_create_constant_surface(struct brw_context *brw,
+                           drm_intel_bo *bo,
+                           uint32_t offset,
+                           uint32_t size,
+-                          uint32_t *out_offset)
++                          uint32_t *out_offset,
++                            bool dword_pitch)
+ {
+    struct intel_context *intel = &brw->intel;
+-   uint32_t stride = 16;
++   uint32_t stride = dword_pitch ? 4 : 16;
+    uint32_t elements = ALIGN(size, stride) / stride;
+    const GLint w = elements - 1;
+    uint32_t *surf;
+@@ -1089,7 +1090,8 @@ brw_upload_wm_pull_constants(struct brw_context *brw)
+    drm_intel_gem_bo_unmap_gtt(brw->wm.const_bo);
+    intel->vtbl.create_constant_surface(brw, brw->wm.const_bo, 0, size,
+-                                     &brw->wm.surf_offset[surf_index]);
++                                     &brw->wm.surf_offset[surf_index],
++                                       true);
+    brw->state.dirty.brw |= BRW_NEW_SURFACES;
+ }
+@@ -1442,7 +1444,8 @@ brw_upload_ubo_surfaces(struct brw_context *brw,
+        */
+       intel->vtbl.create_constant_surface(brw, bo, binding->Offset,
+                                         bo->size - binding->Offset,
+-                                        &surf_offsets[i]);
++                                        &surf_offsets[i],
++                                          shader->Type == GL_FRAGMENT_SHADER);
+    }
+    if (shader->NumUniformBlocks)
+diff --git a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c 
+index 484afcd..2c12be3 100644
+--- a/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
++++ b/src/mesa/drivers/dri/i965/gen7_wm_surface_state.c
+@@ -384,10 +384,11 @@ gen7_create_constant_surface(struct brw_context *brw,
+                            drm_intel_bo *bo,
+                            uint32_t offset,
+                            uint32_t size,
+-                           uint32_t *out_offset)
++                           uint32_t *out_offset,
++                             bool dword_pitch)
+ {
+    struct intel_context *intel = &brw->intel;
+-   uint32_t stride = 16;
++   uint32_t stride = dword_pitch ? 4 : 16;
+    uint32_t elements = ALIGN(size, stride) / stride;
+    const GLint w = elements - 1;
+diff --git a/src/mesa/drivers/dri/intel/intel_context.h 
+index bb21f55..7bec10f 100644
+--- a/src/mesa/drivers/dri/intel/intel_context.h
++++ b/src/mesa/drivers/dri/intel/intel_context.h
+@@ -203,13 +203,14 @@ struct intel_context
+                                     drm_intel_bo *bo,
+                                     uint32_t offset,
+                                     uint32_t size,
+-                                    uint32_t *out_offset);
++                                    uint32_t *out_offset,
++                                      bool dword_pitch);
+       /** \} */
+    } vtbl;
+    GLbitfield Fallback;  /**< mask of INTEL_FALLBACK_x bits */
+    GLuint NewGLState;
+    dri_bufmgr *bufmgr;
+    unsigned int maxBatchSize;
diff --git 
new file mode 100644
index 0000000..5e1a22d
--- /dev/null
@@ -0,0 +1,34 @@
+From 0eb070fa7d4a7d4494c70407fc953adc9429edeb Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Fri, 15 Mar 2013 14:31:46 -0700
+Subject: [PATCH 05/12] i965/fs: Avoid inappropriate optimization with
+ regs_written > 1.
+Right now we don't have anything with regs_written() > 1 and !inst->mlen,
+but that's about to change.
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_fs.cpp | 6 ++++++
+ 1 file changed, 6 insertions(+)
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
+index fbe9e3a..f1b0789 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
+@@ -2087,6 +2087,12 @@ fs_visitor::compute_to_mrf()
+              break;
+           }
++            /* Things returning more than one register would need us to
++             * understand coalescing out more than one MOV at a time.
++             */
++            if (scan_inst->regs_written() > 1)
++               break;
+           /* SEND instructions can't have MRF as a destination. */
+           if (scan_inst->mlen)
+              break;
diff --git 
new file mode 100644
index 0000000..131c379
--- /dev/null
@@ -0,0 +1,127 @@
+From 2f4d09235849e206e2807146bb8c8e724ab6ff26 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Wed, 13 Mar 2013 14:48:55 -0700
+Subject: [PATCH 06/12] i965/fs: Improve performance of varying-index uniform
+ loads on IVB.
+Like we have done for the VS and for constant-index uniform loads, we use
+the sampler engine to get caching in front of the L3 to avoid tickling the
+IVB L3 bug.  This is also a bit of a functional change, as we're now
+loading a vec4 instead of a single dword, though we're not taking
+advantage of the other 3 components of the vec4 (yet).
+With the driver hacked to always take the varying-index path for all
+uniforms, improves performance of my old GLSL demo by 315% +/- 2% (n=4).
+This is a major fix for some blur shaders in compositors from the
+varying-index uniforms support I introduced in 9.1.
+v2: Move old offset computation into the pre-gen7 path.
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_fs.cpp      | 29 ++++++++++++++++++++++++-----
+ src/mesa/drivers/dri/i965/brw_fs_emit.cpp | 27 ++++++++++++++-------------
+ 2 files changed, 38 insertions(+), 18 deletions(-)
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
+index f1b0789..f4aa9f7 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
+@@ -235,14 +235,33 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
+    exec_list instructions;
+    fs_inst *inst;
+-   fs_reg offset = fs_reg(this, glsl_type::uint_type);
+-   instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
+    if (intel->gen >= 7) {
++      /* We have our constant surface use a pitch of 4 bytes, so our index can
++       * be any component of a vector, and then we load 4 contiguous
++       * components starting from that.
++       *
++       * We break down the const_offset to a portion added to the variable
++       * offset and a portion done using reg_offset, which means that if you
++       * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
++       * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
++       * CSE can later notice that those loads are all the same and eliminate
++       * the redundant ones.
++       */
++      fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
++      instructions.push_tail(ADD(vec4_offset,
++                                 varying_offset, const_offset & ~3));
++      fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
+       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
+-                                  dst, surf_index, offset);
++                                  vec4_result, surf_index, vec4_offset);
+       instructions.push_tail(inst);
++      vec4_result.reg_offset += const_offset & 3;
++      instructions.push_tail(MOV(dst, vec4_result));
+    } else {
++      fs_reg offset = fs_reg(this, glsl_type::uint_type);
++      instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
+       int base_mrf = 13;
+       bool header_present = true;
+@@ -313,7 +332,7 @@ fs_inst::equals(fs_inst *inst)
+ int
+ fs_inst::regs_written()
+ {
+-   if (is_tex())
++   if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7)
+       return 4;
+    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp 
+index 712fef6..4b3c43f 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_emit.cpp
+@@ -737,28 +737,29 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
+         index.type == BRW_REGISTER_TYPE_UD);
+    uint32_t surf_index = index.dw1.ud;
+-   uint32_t msg_control, rlen, mlen;
++   uint32_t simd_mode, rlen, mlen;
+    if (dispatch_width == 16) {
+-      mlen = rlen = 2;
++      mlen = 2;
++      rlen = 8;
++      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+    } else {
+-      mlen = rlen = 1;
++      mlen = 1;
++      rlen = 4;
++      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+    }
+    struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
+    brw_set_dest(p, send, dst);
+    brw_set_src0(p, send, offset);
+-   if (intel->gen < 6)
+-      send->header.destreg__conditionalmod = inst->base_mrf;
+-   brw_set_dp_read_message(p, send,
++   brw_set_sampler_message(p, send,
+                            surf_index,
+-                           msg_control,
+-                           GEN7_DATAPORT_DC_DWORD_SCATTERED_READ,
+-                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
++                           0, /* LD message ignores sampler unit */
++                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
++                           rlen,
+                            mlen,
+-                           inst->header_present,
+-                           rlen);
++                           false, /* no header */
++                           simd_mode,
++                           0);
+ }
+ /**
diff --git 
new file mode 100644
index 0000000..b5cfe0e
--- /dev/null
@@ -0,0 +1,84 @@
+From b5f8ad54c7bfd624209e4ae7d36abac0093ddb9a Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Fri, 15 Mar 2013 14:43:28 -0700
+Subject: [PATCH 07/12] i965/fs: Do CSE on gen7's varying-index pull constant
+ loads.
+This is our first CSE on a regs_written() > 1 instruction, so it takes a
+bit of extra fixup.  Reduces the number of loads on kwin's Lanczos shader
+from 12 to 2.
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_fs_cse.cpp | 43 ++++++++++++++++++++++++--------
+ 1 file changed, 32 insertions(+), 11 deletions(-)
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp 
+index 02642c9..c89da36 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+@@ -68,6 +68,7 @@ is_expression(const fs_inst *const inst)
+    case BRW_OPCODE_MAD:
+    case BRW_OPCODE_LRP:
+       return true;
+@@ -129,21 +130,41 @@ fs_visitor::opt_cse_local(bblock_t *block, exec_list 
+            */
+           bool no_existing_temp = entry->tmp.file == BAD_FILE;
+           if (no_existing_temp) {
+-             entry->tmp = fs_reg(this, glsl_type::float_type);
+-             entry->tmp.type = inst->dst.type;
+-             fs_inst *copy = new(ralloc_parent(inst))
+-                fs_inst(BRW_OPCODE_MOV, entry->generator->dst, entry->tmp);
+-             entry->generator->insert_after(copy);
+-             entry->generator->dst = entry->tmp;
++               int written = entry->generator->regs_written();
++               fs_reg orig_dst = entry->generator->dst;
++               fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written),
++                                   orig_dst.type);
++               entry->tmp = tmp;
++               entry->generator->dst = tmp;
++               for (int i = 0; i < written; i++) {
++                  fs_inst *copy = MOV(orig_dst, tmp);
++                  copy->force_writemask_all =
++                     entry->generator->force_writemask_all;
++                  entry->generator->insert_after(copy);
++                  orig_dst.reg_offset++;
++                  tmp.reg_offset++;
++               }
+           }
+           /* dest <- temp */
++            int written = inst->regs_written();
++            assert(written == entry->generator->regs_written());
+             assert(inst->dst.type == entry->tmp.type);
+-          fs_inst *copy = new(ralloc_parent(inst))
+-             fs_inst(BRW_OPCODE_MOV, inst->dst, entry->tmp);
+-            copy->force_writemask_all = inst->force_writemask_all;
+-          inst->replace_with(copy);
++            fs_reg dst = inst->dst;
++            fs_reg tmp = entry->tmp;
++            fs_inst *copy;
++            for (int i = 0; i < written; i++) {
++               copy = MOV(dst, tmp);
++               copy->force_writemask_all = inst->force_writemask_all;
++               inst->insert_before(copy);
++               dst.reg_offset++;
++               tmp.reg_offset++;
++            }
++            inst->remove();
+           /* Appending an instruction may have changed our bblock end. */
+           if (inst == block->end) {
diff --git 
new file mode 100644
index 0000000..4827a9c
--- /dev/null
@@ -0,0 +1,36 @@
+From 2a0a69e2169eee805b6068f930c3b3049b362a91 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Mon, 18 Mar 2013 11:26:17 -0700
+Subject: [PATCH 08/12] i965/fs: Clean up the setup of gen4 simd16 message
+ destinations.
+I think this makes it much more obvious what's going on here.
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
+index 6b6af8d..48c6df3 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+@@ -916,11 +916,10 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
+        * this weirdness around to the expected layout.
+        */
+       orig_dst = dst;
+-      const glsl_type *vec_type =
+-       glsl_type::get_instance(ir->type->base_type, 4, 1);
+-      dst = fs_reg(this, glsl_type::get_array_instance(vec_type, 2));
+-      dst.type = intel->is_g4x ? brw_type_for_base_type(ir->type)
+-                             : BRW_REGISTER_TYPE_F;
++      dst = fs_reg(GRF, virtual_grf_alloc(8),
++                   (intel->is_g4x ?
++                    brw_type_for_base_type(ir->type) :
++                    BRW_REGISTER_TYPE_F));
+    }
+    fs_inst *inst = NULL;
diff --git 
new file mode 100644
index 0000000..3a430d1
--- /dev/null
@@ -0,0 +1,293 @@
+From bb1d21826152370209fd64b9abffd8a59d3ec5f4 Mon Sep 17 00:00:00 2001
+From: Eric Anholt <>
+Date: Mon, 18 Mar 2013 11:30:57 -0700
+Subject: [PATCH 09/12] i965/fs: Bake regs_written into the IR instead of
+ recomputing it later.
+For sampler messages, it depends on the target gen, and on gen4
+SIMD16-sampler-on-SIMD8-execution we were returning 4 instead of 8 like we
+should.
+NOTE: This is a candidate for the 9.1 branch.
+ src/mesa/drivers/dri/i965/brw_fs.cpp               | 29 ++++++++--------------
+ src/mesa/drivers/dri/i965/brw_fs.h                 |  2 +-
+ src/mesa/drivers/dri/i965/brw_fs_cse.cpp           |  6 ++---
+ .../drivers/dri/i965/brw_fs_live_variables.cpp     |  2 +-
+ src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp  |  8 +++---
+ .../dri/i965/brw_fs_schedule_instructions.cpp      |  6 ++---
+ src/mesa/drivers/dri/i965/brw_fs_visitor.cpp       |  7 ++++--
+ 7 files changed, 27 insertions(+), 33 deletions(-)
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
+index f4aa9f7..c128175 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
+@@ -60,6 +60,9 @@ fs_inst::init()
+    this->src[0] = reg_undef;
+    this->src[1] = reg_undef;
+    this->src[2] = reg_undef;
++   /* This will be the case for almost all instructions. */
++   this->regs_written = 1;
+ }
+ fs_inst::fs_inst()
+@@ -254,6 +257,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg 
+       fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
+       inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
+                                   vec4_result, surf_index, vec4_offset);
++      inst->regs_written = 4;
+       instructions.push_tail(inst);
+       vec4_result.reg_offset += const_offset & 3;
+@@ -329,26 +333,13 @@ fs_inst::equals(fs_inst *inst)
+            offset == inst->offset);
+ }
+-   if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7)
+-      return 4;
+-   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
+-    * but we don't currently use them...nor do we have an opcode for them.
+-    */
+-   return 1;
+ bool
+ fs_inst::overwrites_reg(const fs_reg &reg)
+ {
+    return (reg.file == dst.file &&
+            reg.reg == dst.reg &&
+            reg.reg_offset >= dst.reg_offset  &&
+-           reg.reg_offset < dst.reg_offset + regs_written());
++           reg.reg_offset < dst.reg_offset + regs_written);
+ }
+ bool
+@@ -1388,7 +1379,7 @@ fs_visitor::split_virtual_grfs()
+       /* If there's a SEND message that requires contiguous destination
+        * registers, no splitting is allowed.
+        */
+-      if (inst->regs_written() > 1) {
++      if (inst->regs_written > 1) {
+        split_grf[inst->dst.reg] = false;
+       }
+    }
+@@ -2109,7 +2100,7 @@ fs_visitor::compute_to_mrf()
+             /* Things returning more than one register would need us to
+              * understand coalescing out more than one MOV at a time.
+              */
+-            if (scan_inst->regs_written() > 1)
++            if (scan_inst->regs_written > 1)
+                break;
+           /* SEND instructions can't have MRF as a destination. */
+@@ -2326,7 +2317,7 @@ void
+ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
+ {
+    int reg_size = dispatch_width / 8;
+-   int write_len = inst->regs_written() * reg_size;
++   int write_len = inst->regs_written * reg_size;
+    int first_write_grf = inst->dst.reg;
+    bool needs_dep[BRW_MAX_MRF];
+    assert(write_len < (int)sizeof(needs_dep) - 1);
+@@ -2366,7 +2357,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
+        * dependency has more latency than a MOV.
+        */
+       if (scan_inst->dst.file == GRF) {
+-         for (int i = 0; i < scan_inst->regs_written(); i++) {
++         for (int i = 0; i < scan_inst->regs_written; i++) {
+             int reg = scan_inst->dst.reg + i * reg_size;
+             if (reg >= first_write_grf &&
+@@ -2405,7 +2396,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
+ void
+ fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
+ {
+-   int write_len = inst->regs_written() * dispatch_width / 8;
++   int write_len = inst->regs_written * dispatch_width / 8;
+    int first_write_grf = inst->dst.reg;
+    bool needs_dep[BRW_MAX_MRF];
+    assert(write_len < (int)sizeof(needs_dep) - 1);
+diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
+index 76130b1..0c5aad1 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs.h
++++ b/src/mesa/drivers/dri/i965/brw_fs.h
+@@ -174,7 +174,6 @@ public:
+            fs_reg src0, fs_reg src1,fs_reg src2);
+    bool equals(fs_inst *inst);
+-   int regs_written();
+    bool overwrites_reg(const fs_reg &reg);
+    bool is_tex();
+    bool is_math();
+@@ -192,6 +191,7 @@ public:
+    uint8_t flag_subreg;
+    int mlen; /**< SEND message length */
++   int regs_written; /**< Number of vgrfs written by a SEND message, or 1 */
+    int base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
+    uint32_t texture_offset; /**< Texture offset bitfield */
+    int sampler;
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp 
+index c89da36..01a64d2 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+@@ -130,7 +130,7 @@ fs_visitor::opt_cse_local(bblock_t *block, exec_list *aeb)
+            */
+           bool no_existing_temp = entry->tmp.file == BAD_FILE;
+           if (no_existing_temp) {
+-               int written = entry->generator->regs_written();
++               int written = entry->generator->regs_written;
+                fs_reg orig_dst = entry->generator->dst;
+                fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written),
+@@ -150,8 +150,8 @@ fs_visitor::opt_cse_local(bblock_t *block, exec_list *aeb)
+           }
+           /* dest <- temp */
+-            int written = inst->regs_written();
+-            assert(written == entry->generator->regs_written());
++            int written = inst->regs_written;
++            assert(written == entry->generator->regs_written);
+             assert(inst->dst.type == entry->tmp.type);
+             fs_reg dst = inst->dst;
+             fs_reg tmp = entry->tmp;
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp 
+index 63af148..373aa2d 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp
+@@ -77,7 +77,7 @@ fs_live_variables::setup_def_use()
+         * variable, and thus qualify for being in def[].
+         */
+        if (inst->dst.file == GRF &&
+-           inst->regs_written() == v->virtual_grf_sizes[inst->dst.reg] &&
++           inst->regs_written == v->virtual_grf_sizes[inst->dst.reg] &&
+            !inst->predicate &&
+            !inst->force_uncompressed &&
+            !inst->force_sechalf) {
+diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp 
+index b8936dc..4ee7bbc 100644
+--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
++++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+@@ -553,7 +553,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
+       }
+       if (inst->dst.file == GRF) {
+-       spill_costs[inst->dst.reg] += inst->regs_written() * loop_scale;
++       spill_costs[inst->dst.reg] += inst->regs_written * loop_scale;
+          if (inst->dst.smear >= 0) {
+             no_spill[inst->dst.reg] = true;
+@@ -622,7 +622,7 @@ fs_visitor::spill_reg(int spill_reg)
+         inst->dst.reg == spill_reg) {
+          int subset_spill_offset = (spill_offset +
+                                     REG_SIZE * inst->dst.reg_offset);
+-         inst->dst.reg = virtual_grf_alloc(inst->regs_written());
++         inst->dst.reg = virtual_grf_alloc(inst->regs_written);
+          inst->dst.reg_offset = 0;
+        /* If our write is going to affect just part of the
+@@ -631,7 +631,7 @@ fs_visitor::spill_reg(int spill_reg)
+         */
+       if (inst->predicate || inst->force_uncompressed || inst->force_sechalf) {
+             fs_reg unspill_reg = inst->dst;
+-            for (int chan = 0; chan < inst->regs_written(); chan++) {
++            for (int chan = 0; chan < inst->regs_written; chan++) {
+                emit_unspill(inst, unspill_reg,

To UNSUBSCRIBE, email to
with a subject of "unsubscribe". Trouble? Contact

Reply via email to