Re: [Mesa-dev] [PATCH] i965/cs: Use exec all for CS terminate

2015-06-12 Thread Francisco Jerez
Jordan Justen  writes:

> This prevents an assertion from being hit with SIMD16:
>
> Assertion `inst->exec_size == dispatch_width() || force_writemask_all' failed.
>
> Signed-off-by: Jordan Justen 
> Cc: Francisco Jerez 
> ---
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index c41284b..588966b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -1948,7 +1948,8 @@ fs_visitor::emit_cs_terminate()
> bld.exec_all().MOV(payload, g0);
>  
> /* Send a message to the thread spawner to terminate the thread. */
> -   fs_inst *inst = bld.emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
> +   fs_inst *inst = bld.exec_all()
> +  .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
> inst->eot = true;
>  }
>  
> -- 
> 2.1.4

Reviewed-by: Francisco Jerez 


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 07/17] i965/fs: Move offset() and half() to the fs_builder

2015-06-23 Thread Francisco Jerez
Jason Ekstrand  writes:

> We want to move these into the builder so that they know the current
> builder's dispatch width.  This will be needed by a later commit.

I very much like the idea of this series, but why do you need to move
these register manipulators into the builder?  The builder is an object
you can use to:
 - Manipulate and query parameters affecting code generation.
 - Create instructions into the program (::emit and friends).
 - Allocate virtual registers from the program (::vgrf and friends).

offset() and half() logically perform an action on a given register
object (or rather, compute a function of a given register object), not
on a builder object; the builder is only required as an auxiliary
parameter.  Is there any reason you didn't just pass it as a third
parameter?

As offset() and half() don't require access to any private details of
the builder, that would actually improve encapsulation, and would avoid
the dubious overloading of fs_builder::half() with two methods with
completely different semantics.

Thanks.

> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp |  52 ++
>  src/mesa/drivers/dri/i965/brw_fs_builder.h   |  46 +
>  src/mesa/drivers/dri/i965/brw_fs_cse.cpp |   2 +-
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp |  60 +--
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 149 
> ++-
>  src/mesa/drivers/dri/i965/brw_ir_fs.h|  51 -
>  6 files changed, 182 insertions(+), 178 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 4f98d63..c13ac7d 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -267,7 +267,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
> &bld,
>   inst->mlen = 1 + dispatch_width / 8;
> }
>  
> -   bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
> +   bld.MOV(dst, bld.offset(vec4_result, (const_offset & 3) * scale));
>  }
>  
>  /**
> @@ -361,7 +361,12 @@ fs_inst::is_copy_payload(const brw::simple_allocator 
> &grf_alloc) const
>reg.width = this->src[i].width;
>if (!this->src[i].equals(reg))
>   return false;
> -  reg = ::offset(reg, 1);
> +
> +  if (i < this->header_size) {
> + reg.reg_offset += 1;
> +  } else {
> + reg.reg_offset += this->exec_size / 8;
> +  }
> }
>  
> return true;
> @@ -963,7 +968,7 @@ fs_visitor::emit_fragcoord_interpolation(bool 
> pixel_center_integer,
> } else {
>bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
> }
> -   wpos = offset(wpos, 1);
> +   wpos = bld.offset(wpos, 1);
>  
> /* gl_FragCoord.y */
> if (!flip && pixel_center_integer) {
> @@ -979,7 +984,7 @@ fs_visitor::emit_fragcoord_interpolation(bool 
> pixel_center_integer,
>  
>bld.ADD(wpos, pixel_y, fs_reg(offset));
> }
> -   wpos = offset(wpos, 1);
> +   wpos = bld.offset(wpos, 1);
>  
> /* gl_FragCoord.z */
> if (devinfo->gen >= 6) {
> @@ -989,7 +994,7 @@ fs_visitor::emit_fragcoord_interpolation(bool 
> pixel_center_integer,
> this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> interp_reg(VARYING_SLOT_POS, 2));
> }
> -   wpos = offset(wpos, 1);
> +   wpos = bld.offset(wpos, 1);
>  
> /* gl_FragCoord.w: Already set up in emit_interpolation */
> bld.MOV(wpos, this->wpos_w);
> @@ -1072,7 +1077,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, 
> const char *name,
>   /* If there's no incoming setup data for this slot, don't
>* emit interpolation for it.
>*/
> - attr = offset(attr, type->vector_elements);
> + attr = bld.offset(attr, type->vector_elements);
>   location++;
>   continue;
>}
> @@ -1087,7 +1092,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, 
> const char *name,
>  interp = suboffset(interp, 3);
> interp.type = attr.type;
> bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
> -attr = offset(attr, 1);
> +attr = bld.offset(attr, 1);
>   }
>} else {
>   /* Smooth/noperspective interpolation case. */
> @@ -1125,7 +1130,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, 
> const char *name,
> if (devinfo->gen < 6 && interpolation_mode == 
> INTERP_QUALIFIER_SMOOTH) {
>bld.MUL(attr, attr, this->pixel_w);
> }
> -attr = offset(attr, 1);
> +attr = bld.offset(attr, 1);
>   }
>  
>}
> @@ -1227,19 +1232,19 @@ fs_visitor::emit_samplepos_setup()
> if (dispatch_width == 8) {
>abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
> } else {
> -  abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
> -  abld.half(1).MOV(half(int_sample_x, 1),
> +  abld.half(0).MOV(abld.half(int_sample_x, 0), fs_reg(sample_pos_reg))

Re: [Mesa-dev] [PATCH 07/17] i965/fs: Move offset() and half() to the fs_builder

2015-06-24 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Tue, Jun 23, 2015 at 9:22 AM, Francisco Jerez  
> wrote:
>> Jason Ekstrand  writes:
>>
>>> We want to move these into the builder so that they know the current
>>> builder's dispatch width.  This will be needed by a later commit.
>>
>> I very much like the idea of this series, but, why do you need to move
>> these register manipulators into the builder?  The builder is an object
>> you can use to:
>>  - Manipulate and query parameters affecting code generation.
>>  - Create instructions into the program (::emit and friends).
>>  - Allocate virtual registers from the program (::vgrf and friends).
>>
>> offset() and half() logically perform an action on a given register
>> object (or rather, compute a function of a given register object), not
>> on a builder object, the builder is only required as an auxiliary
>> parameter -- Any reason you didn't just pass it as a third parameter?
>
> What's required as a third parameter is the current execution size.  I
> could have passed that directly, but I figured that, especially for
> half(), it would get messed up.  I could pass the builder in but I
> don't see a whole lot of difference between that and what I'm doing
> right now.

Assembly-wise there's no difference, but it seems inconsistent with both
the remaining register manipulators and remaining builder methods, and
IMHO it's kind of an anti-pattern to make something a method that
doesn't need access to any internal details of the object.

> As is, it's not entirely obvious whether you should call
> half(reg) on the half-width or full-width builder.  I'm not 100% sure
> what to do about that.
>
Actually, does half() really need to know about the builder?  AFAICT it
only needs it because of dispatch_width(), and before doing anything
useful with it, it asserts that it's equal to 16, which suggests that the
parameter is redundant.  By convention a "half" is a group of 8
channels (we may want to revise this convention when we implement SIMD32
-- e.g. make half a group of 16 channels and quarter a group of 8
channels), so 'half(reg, i)' could simply be implemented as
"horiz_offset(reg, 8 * i)" without any dependency on the builder.  As
additional paranoia to catch half() being called on a non-16-aligned
register you could assert that either 'stride == 0' or 16 divides
'(REG_SIZE * reg_offset + subreg_offset) / (stride * type_size)' (why
don't we have a reg_offset already in bytes again?) -- that would also
catch cases in which the register and builder "widths" get out of sync,
e.g. if half() is called on an already halved register but the builder
used happens to be of the correct exec_size.

>> As offset() and half() don't require access to any private details of
>> the builder, that would actually improve encapsulation, and would avoid
>> the dubious overloading of fs_builder::half() with two methods with
>> completely different semantics.
>
> Yeah, I don't really like that either.  I just couldn't come up with
> anything better at the time.
>
> Suggestions are very much welcome.  But I would like to settle on
> whatever we do fairly quickly so as to limit the amount of
> refactoring.
> --Jason
>
>> Thanks.
>>
>>> ---
>>>  src/mesa/drivers/dri/i965/brw_fs.cpp |  52 ++
>>>  src/mesa/drivers/dri/i965/brw_fs_builder.h   |  46 +
>>>  src/mesa/drivers/dri/i965/brw_fs_cse.cpp |   2 +-
>>>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp |  60 +--
>>>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 149 
>>> ++-
>>>  src/mesa/drivers/dri/i965/brw_ir_fs.h|  51 -
>>>  6 files changed, 182 insertions(+), 178 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>>> b/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> index 4f98d63..c13ac7d 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> @@ -267,7 +267,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
>>> &bld,
>>>   inst->mlen = 1 + dispatch_width / 8;
>>> }
>>>
>>> -   bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
>>> +   bld.MOV(dst, bld.offset(vec4_result, (const_offset & 3) * scale));
>>>  }
>>>
>>>  /**
>>> @@ -361,7 +361,12 @@ fs_inst::is_copy_payload(const brw::simple_allocator 
>>> &grf_alloc) const
>>>reg.width = this->src[i].width;
>>>  

Re: [Mesa-dev] [PATCH v2 23/82] glsl: Do not do CSE for expressions involving SSBO loads

2015-06-24 Thread Francisco Jerez
Iago Toral  writes:

> On Wed, 2015-06-17 at 17:20 -0700, Jordan Justen wrote:
>> I wanted to question whether this was required, based on this text
>> from the extension spec:
>> 
>> "The ability to write to buffer objects creates the potential for
>>  multiple independent shader invocations to read and write the same
>>  underlying memory. The same issue exists with the
>>  ARB_shader_image_load_store extension provided in OpenGL 4.2, which
>>  can write to texture objects and buffers. In both cases, the
>>  specification makes few guarantees related to the relative order of
>>  memory reads and writes performed by the shader invocations."
>> 
>> But I'm not sure if we can reconcile CSE with 'memoryBarrier' and
>> 'barrier'. curro, any thoughts from image load/store?
>
> I think the problem is within the same thread, that text above talks
> about multiple invocations reading from and writing to the same
> location, but within the same invocation, the order of reads and writes
> must be preserved:
>
> "Buffer variable memory reads and writes within a single shader
> invocation are processed in order.  However, the order of reads and
> writes performed in one invocation relative to those performed by
> another invocation is largely undefined."
>
> For example, if X is a shader storage buffer variable and we have code
> like this with just one invocation:
>
> ssbo_store(X, 1);
> a = ssbo_load(X) + 1  // a = 2
> ssbo_store(X, 2);
> b = ssbo_load(X) + 1; // b = 3
>
> CSE could mess it up like this:
>
> ssbo_store(X, 1);
> tmp = ssbo_load(X) + 1  // tmp = 2
> a = tmp;
> ssbo_store(X, 2);
> b = tmp;
>
> which would be incorrect. I think I wrote this patch after seeing
> something like this happening. The CSE pass clearly states that it does
> not support write variables after all.
>
> Also, notice the same would apply if there are multiple invocations but
> the shader code used something like gl_VertexID or gl_FragCoord to make
> each invocation read from/write to a different address within the SSBO
> buffer (I imagine this is the usual way to operate with SSBOs). In these
> cases, even if we have multiple invocations, keeping the relative order
> of reads and writes within each one is necessary.
>

AFAICT the reason why this (and many of the other changes in GLSL
optimization passes) is needed is that SSBO loads have been
implemented as ir_expression nodes instead of being lowered into
intrinsics (as is done for other side-effectful operations, like those
of ARB_shader_image_load_store and ARB_shader_atomic_counters).  This
surely broke the assumption of a number of optimization passes that
ir_expression nodes behave as pure functions.  I guess the reason why
you've done it this way is that UBO loads were already being
represented as expressions, so I see why you may have wanted to use the
same approach for SSBOs even though there is a fundamental difference
between the two: UBO loads have no side effects and are constant for a
given set of arguments and a given shader execution, whereas that is
not true of SSBO loads and stores.  SSBO stores couldn't be accommodated
into the same framework so easily, and you decided to create a separate
ir node for them, which seems inconsistent with loads.  Intrinsics would
probably have been a good fit for both loads and stores, and would have
made all these optimization changes unnecessary...

P.S.: Sorry for the late reply, I was on vacation when I was CC'ed.

> Iago
>
>> -Jordan
>> 
>> On 2015-06-03 00:01:13, Iago Toral Quiroga wrote:
>> > SSBOs are read/write and this CSE pass only handles read-only variables.
>> > ---
>> >  src/glsl/opt_cse.cpp | 33 -
>> >  1 file changed, 32 insertions(+), 1 deletion(-)
>> > 
>> > diff --git a/src/glsl/opt_cse.cpp b/src/glsl/opt_cse.cpp
>> > index 4b8e9a0..a05ab46 100644
>> > --- a/src/glsl/opt_cse.cpp
>> > +++ b/src/glsl/opt_cse.cpp
>> > @@ -245,6 +245,28 @@ contains_rvalue(ir_rvalue *haystack, ir_rvalue 
>> > *needle)
>> >  }
>> >  
>> >  static bool
>> > +expression_contains_ssbo_load(ir_expression *expr)
>> > +{
>> > +   if (expr->operation == ir_binop_ssbo_load)
>> > +  return true;
>> > +
>> > +   for (unsigned i = 0; i < expr->get_num_operands(); i++) {
>> > +  ir_rvalue *op = expr->operands[i];
>> > +  if (op->ir_type == ir_type_expression &&
>> > +  expression_contains_ssbo_load(op->as_expression())) {
>> > + return true;
>> > +  } else if (op->ir_type == ir_type_swizzle) {
>> > + ir_swizzle *swizzle = op->as_swizzle();
>> > + ir_expression *val = swizzle->val->as_expression();
>> > + if (val && expression_contains_ssbo_load(val))
>> > +return true;
>> > +  }
>> > +   }
>> > +
>> > +   return false;
>> > +}
>> > +
>> > +static bool
>> >  is_cse_candidate(ir_rvalue *ir)
>> >  {
>> > /* Our temporary variable assignment generation isn't ready to handle
>> > @@ -260,7 +282,16 @@ is_cse_candidate(ir_rvalue *ir)
>> >  * to variable-index array dereferenc

Re: [Mesa-dev] [PATCH] clover: Implement image attribute getters

2015-06-24 Thread Francisco Jerez
Zoltan Gilian  writes:

> Image attributes are passed to the kernel as hidden parameters after the
> image attribute itself. An llvm pass replaces the getter builtins to
> the appropriate parameters.

This seems to be doing essentially the same thing as v1?  Is it the
right patch?

> ---
>  src/gallium/state_trackers/clover/core/kernel.cpp  | 26 +++
>  src/gallium/state_trackers/clover/core/kernel.hpp  | 13 ++--
>  src/gallium/state_trackers/clover/core/memory.cpp  |  2 +-
>  .../state_trackers/clover/llvm/invocation.cpp  | 81 
> +-
>  4 files changed, 116 insertions(+), 6 deletions(-)
>
> diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp 
> b/src/gallium/state_trackers/clover/core/kernel.cpp
> index 0756f06..291c799 100644
> --- a/src/gallium/state_trackers/clover/core/kernel.cpp
> +++ b/src/gallium/state_trackers/clover/core/kernel.cpp
> @@ -185,6 +185,13 @@ kernel::exec_context::bind(intrusive_ptr 
> _q,
>}
> }
>  
> +   // Bind image attribute args.
> +   for (const auto& arg: kern._args) {
> +  if (auto img_arg = dynamic_cast(arg.get())) {
> + img_arg->bind_attributes(*this);
> +  }
> +   }
> +
> // Create a new compute state if anything changed.
> if (!st || q != _q ||
> cs.req_local_mem != mem_local ||
> @@ -465,6 +472,25 @@ kernel::constant_argument::unbind(exec_context &ctx) {
>  }
>  
>  void
> +kernel::image_argument::bind_attributes(exec_context &ctx) {
> +   cl_image_format format = img->format();
> +   cl_uint attributes[] = {
> + static_cast(img->width()),
> + static_cast(img->height()),
> + static_cast(img->depth()),
> + format.image_channel_data_type,
> + format.image_channel_order};
> +   for (unsigned i = 0; i < 5; ++i) {
> +  auto v = bytes(attributes[i]);
> +
> +  extend(v, module::argument::zero_ext, sizeof(cl_uint));
> +  byteswap(v, ctx.q->device().endianness());
> +  align(ctx.input, sizeof(cl_uint));
> +  insert(ctx.input, v);
> +   }
> +}
> +
> +void
>  kernel::image_rd_argument::set(size_t size, const void *value) {
> if (size != sizeof(cl_mem))
>throw error(CL_INVALID_ARG_SIZE);
> diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp 
> b/src/gallium/state_trackers/clover/core/kernel.hpp
> index d6432a4..8c15b2f 100644
> --- a/src/gallium/state_trackers/clover/core/kernel.hpp
> +++ b/src/gallium/state_trackers/clover/core/kernel.hpp
> @@ -190,7 +190,14 @@ namespace clover {
>   pipe_surface *st;
>};
>  
> -  class image_rd_argument : public argument {
> +  class image_argument : public argument {
> +  public:
> + void bind_attributes(exec_context &ctx);
> +  protected:
> + image *img;
> +  };
> +
> +  class image_rd_argument : public image_argument {
>public:
>   virtual void set(size_t size, const void *value);
>   virtual void bind(exec_context &ctx,
> @@ -198,11 +205,10 @@ namespace clover {
>   virtual void unbind(exec_context &ctx);
>  
>private:
> - image *img;
>   pipe_sampler_view *st;
>};
>  
> -  class image_wr_argument : public argument {
> +  class image_wr_argument : public image_argument {
>public:
>   virtual void set(size_t size, const void *value);
>   virtual void bind(exec_context &ctx,
> @@ -210,7 +216,6 @@ namespace clover {
>   virtual void unbind(exec_context &ctx);
>  
>private:
> - image *img;
>   pipe_surface *st;
>};
>  
> diff --git a/src/gallium/state_trackers/clover/core/memory.cpp 
> b/src/gallium/state_trackers/clover/core/memory.cpp
> index 055336a..b852e68 100644
> --- a/src/gallium/state_trackers/clover/core/memory.cpp
> +++ b/src/gallium/state_trackers/clover/core/memory.cpp
> @@ -189,7 +189,7 @@ image2d::image2d(clover::context &ctx, cl_mem_flags flags,
>   const cl_image_format *format, size_t width,
>   size_t height, size_t row_pitch,
>   void *host_ptr) :
> -   image(ctx, flags, format, width, height, 0,
> +   image(ctx, flags, format, width, height, 1,
>   row_pitch, 0, height * row_pitch, host_ptr) {
>  }
>  
> diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp 
> b/src/gallium/state_trackers/clover/llvm/invocation.cpp
> index 9b91fee..a33d450 100644
> --- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
> +++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
> @@ -80,6 +80,7 @@
>  using namespace clover;
>  
>  namespace {
> +
>  #if 0
> void
> build_binary(const std::string &source, const std::string &target,
> @@ -340,17 +341,65 @@ namespace {
>PM.run(*mod);
> }
>  
> +   const llvm::MDNode *
> +   get_kernel_metadata(const llvm::Function *kernel_func) {
> +  auto mod = kernel_func->getParent();
> +  auto kernels_node = mod->getNamedMetadata("opencl.kernels");
> +  i

Re: [Mesa-dev] [PATCH 07/17] i965/fs: Move offset() and half() to the fs_builder

2015-06-24 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Jun 24, 2015 4:29 AM, "Francisco Jerez"  wrote:
>>
>> Jason Ekstrand  writes:
>>
>> > On Tue, Jun 23, 2015 at 9:22 AM, Francisco Jerez 
> wrote:
>> >> Jason Ekstrand  writes:
>> >>
>> >>> We want to move these into the builder so that they know the current
>> >>> builder's dispatch width.  This will be needed by a later commit.
>> >>
>> >> I very much like the idea of this series, but, why do you need to move
>> >> these register manipulators into the builder?  The builder is an object
>> >> you can use to:
>> >>  - Manipulate and query parameters affecting code generation.
>> >>  - Create instructions into the program (::emit and friends).
>> >>  - Allocate virtual registers from the program (::vgrf and friends).
>> >>
>> >> offset() and half() logically perform an action on a given register
>> >> object (or rather, compute a function of a given register object), not
>> >> on a builder object, the builder is only required as an auxiliary
>> >> parameter -- Any reason you didn't just pass it as a third parameter?
>> >
>> > What's required as a third parameter is the current execution size.  I
>> > could have passed that directly, but I figured that, especially for
>> > half(), it would get messed up.  I could pass the builder in but I
>> > don't see a whole lot of difference between that and what I'm doing
>> > right now.
>>
>> Assembly-wise there's no difference, but it seems inconsistent with both
>> the remaining register manipulators and remaining builder methods, and
>> IMHO it's kind of an anti-pattern to make something a method that
>> doesn't need access to any internal details of the object.
>>
>> > As is, it's not entirely obvious whether you should call
>> > half(reg) on the half-width or full-width builder.  I'm not 100% sure
>> > what to do about that.
>> >
>> Actually, does half() really need to know about the builder?  AFAICT it
>> only needs it because of dispatch_width(), and before doing anything
>> useful with it it asserts that it's equal to 16, what points at the
>> parameter being redundant.  By convention a "half" is a group of 8
>> channels (we may want to revise this convention when we implement SIMD32
>> -- E.g. make half a group of 16 channels and quarter a group of 8
>> channels), so 'half(reg)' could simply be implemented as
>> "horiz_offset(reg, 8 * i)" without any dependency on the builder.  As
>> additional paranoia to catch half() being called on a non-16-aligned
>> register you could assert that either 'stride == 0' or 16 divides
>> '(REG_SIZE * reg_offset + subreg_offset) / (stride * type_size)' (why
>> don't we have a reg_offset already in bytes again?) -- That would also
>> catch cases in which the register and builder "widths" get out of sync,
>> e.g. if half is called in an already halved register but the builder
>> used happens to be of the correct exec_size.
>
> OK, fine, we can pull half() back out.  Should offset() stay in the
> builder? If not, where should it get its dispatch width.
>
I'm for leaving it as a stand-alone function (like all other register
manipulators), and adding a third argument to pass the 'fs_builder' it
can take the dispatch width from.

>> >> As offset() and half() don't require access to any private details of
>> >> the builder, that would actually improve encapsulation, and would avoid
>> >> the dubious overloading of fs_builder::half() with two methods with
>> >> completely different semantics.
>> >
>> > Yeah, I don't really like that either.  I just couldn't come up with
>> > anything better at the time.
>> >
>> > Suggestions are very much welcome.  But I would like to settle on
>> > whatever we do fairly quickly so as to limit the amount of
>> > refactoring.
>> > --Jason
>> >
>> >> Thanks.
>> >>
>> >>> ---
>> >>>  src/mesa/drivers/dri/i965/brw_fs.cpp |  52 ++
>> >>>  src/mesa/drivers/dri/i965/brw_fs_builder.h   |  46 +
>> >>>  src/mesa/drivers/dri/i965/brw_fs_cse.cpp |   2 +-
>> >>>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp |  60 +--
>> >>>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 149
> ++

Re: [Mesa-dev] [PATCH 07/17] i965/fs: Move offset() and half() to the fs_builder

2015-06-24 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Jun 24, 2015 6:29 AM, "Francisco Jerez"  wrote:
>>
>> Jason Ekstrand  writes:
>>
>> > On Jun 24, 2015 4:29 AM, "Francisco Jerez" 
> wrote:
>> >>
>> >> Jason Ekstrand  writes:
>> >>
>> >> > On Tue, Jun 23, 2015 at 9:22 AM, Francisco Jerez <
> curroje...@riseup.net>
>> > wrote:
>> >> >> Jason Ekstrand  writes:
>> >> >>
>> >> >>> We want to move these into the builder so that they know the
> current
>> >> >>> builder's dispatch width.  This will be needed by a later commit.
>> >> >>
>> >> >> I very much like the idea of this series, but, why do you need to
> move
>> >> >> these register manipulators into the builder?  The builder is an
> object
>> >> >> you can use to:
>> >> >>  - Manipulate and query parameters affecting code generation.
>> >> >>  - Create instructions into the program (::emit and friends).
>> >> >>  - Allocate virtual registers from the program (::vgrf and friends).
>> >> >>
>> >> >> offset() and half() logically perform an action on a given register
>> >> >> object (or rather, compute a function of a given register object),
> not
>> >> >> on a builder object, the builder is only required as an auxiliary
>> >> >> parameter -- Any reason you didn't just pass it as a third
> parameter?
>> >> >
>> >> > What's required as a third parameter is the current execution size.
> I
>> >> > could have passed that directly, but I figured that, especially for
>> >> > half(), it would get messed up.  I could pass the builder in but I
>> >> > don't see a whole lot of difference between that and what I'm doing
>> >> > right now.
>> >>
>> >> Assembly-wise there's no difference, but it seems inconsistent with
> both
>> >> the remaining register manipulators and remaining builder methods, and
>> >> IMHO it's kind of an anti-pattern to make something a method that
>> >> doesn't need access to any internal details of the object.
>> >>
>> >> > As is, it's not entirely obvious whether you should call
>> >> > half(reg) on the half-width or full-width builder.  I'm not 100% sure
>> >> > what to do about that.
>> >> >
>> >> Actually, does half() really need to know about the builder?  AFAICT it
>> >> only needs it because of dispatch_width(), and before doing anything
>> >> useful with it it asserts that it's equal to 16, what points at the
>> >> parameter being redundant.  By convention a "half" is a group of 8
>> >> channels (we may want to revise this convention when we implement
> SIMD32
>> >> -- E.g. make half a group of 16 channels and quarter a group of 8
>> >> channels), so 'half(reg)' could simply be implemented as
>> >> "horiz_offset(reg, 8 * i)" without any dependency on the builder.  As
>> >> additional paranoia to catch half() being called on a non-16-aligned
>> >> register you could assert that either 'stride == 0' or 16 divides
>> >> '(REG_SIZE * reg_offset + subreg_offset) / (stride * type_size)' (why
>> >> don't we have a reg_offset already in bytes again?) -- That would also
>> >> catch cases in which the register and builder "widths" get out of sync,
>> >> e.g. if half is called in an already halved register but the builder
>> >> used happens to be of the correct exec_size.
>> >
>> > OK, fine, we can pull half() back out.  Should offset() stay in the
>> > builder? If not, where should it get its dispatch width.
>> >
>> I'm for leaving it as a stand-alone function (like all other register
>> manipulators), and add a third argument to pass the 'fs_builder' it can
>> take the dispatch width from?
>
> I'm not a big fan.  However, in the interest of keeping the builder clean,

It also keeps the register interface consistent IMHO.  Why do you say
you're not a big fan?

> I'm willing to go with that.
> --Jason
>
>> >> >> As offset() and half() don't require access to any private details
> of
>> >> >> the builder, that would actually improve encapsulation, and would
> avoid
>> &

Re: [Mesa-dev] [PATCH 07/17] i965/fs: Move offset() and half() to the fs_builder

2015-06-24 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Wed, Jun 24, 2015 at 6:44 AM, Francisco Jerez  
> wrote:
>> Jason Ekstrand  writes:
>>
>>> On Jun 24, 2015 6:29 AM, "Francisco Jerez"  wrote:
>>>>
>>>> Jason Ekstrand  writes:
>>>>
>>>> > On Jun 24, 2015 4:29 AM, "Francisco Jerez" 
>>> wrote:
>>>> >>
>>>> >> Jason Ekstrand  writes:
>>>> >>
>>>> >> > On Tue, Jun 23, 2015 at 9:22 AM, Francisco Jerez <
>>> curroje...@riseup.net>
>>>> > wrote:
>>>> >> >> Jason Ekstrand  writes:
>>>> >> >>
>>>> >> >>> We want to move these into the builder so that they know the
>>> current
>>>> >> >>> builder's dispatch width.  This will be needed by a later commit.
>>>> >> >>
>>>> >> >> I very much like the idea of this series, but, why do you need to
>>> move
>>>> >> >> these register manipulators into the builder?  The builder is an
>>> object
>>>> >> >> you can use to:
>>>> >> >>  - Manipulate and query parameters affecting code generation.
>>>> >> >>  - Create instructions into the program (::emit and friends).
>>>> >> >>  - Allocate virtual registers from the program (::vgrf and friends).
>>>> >> >>
>>>> >> >> offset() and half() logically perform an action on a given register
>>>> >> >> object (or rather, compute a function of a given register object),
>>> not
>>>> >> >> on a builder object, the builder is only required as an auxiliary
>>>> >> >> parameter -- Any reason you didn't just pass it as a third
>>> parameter?
>>>> >> >
>>>> >> > What's required as a third parameter is the current execution size.
>>> I
>>>> >> > could have passed that directly, but I figured that, especially for
>>>> >> > half(), it would get messed up.  I could pass the builder in but I
>>>> >> > don't see a whole lot of difference between that and what I'm doing
>>>> >> > right now.
>>>> >>
>>>> >> Assembly-wise there's no difference, but it seems inconsistent with
>>> both
>>>> >> the remaining register manipulators and remaining builder methods, and
>>>> >> IMHO it's kind of an anti-pattern to make something a method that
>>>> >> doesn't need access to any internal details of the object.
>>>> >>
>>>> >> > As is, it's not entirely obvious whether you should call
>>>> >> > half(reg) on the half-width or full-width builder.  I'm not 100% sure
>>>> >> > what to do about that.
>>>> >> >
>>>> >> Actually, does half() really need to know about the builder?  AFAICT it
>>>> >> only needs it because of dispatch_width(), and before doing anything
>>>> >> useful with it it asserts that it's equal to 16, what points at the
>>>> >> parameter being redundant.  By convention a "half" is a group of 8
>>>> >> channels (we may want to revise this convention when we implement
>>> SIMD32
>>>> >> -- E.g. make half a group of 16 channels and quarter a group of 8
>>>> >> channels), so 'half(reg)' could simply be implemented as
>>>> >> "horiz_offset(reg, 8 * i)" without any dependency on the builder.  As
>>>> >> additional paranoia to catch half() being called on a non-16-aligned
>>>> >> register you could assert that either 'stride == 0' or 16 divides
>>>> >> '(REG_SIZE * reg_offset + subreg_offset) / (stride * type_size)' (why
>>>> >> don't we have a reg_offset already in bytes again?) -- That would also
>>>> >> catch cases in which the register and builder "widths" get out of sync,
>>>> >> e.g. if half is called in an already halved register but the builder
>>>> >> used happens to be of the correct exec_size.
>>>> >
>>>> > OK, fine, we can pull half() back out.  Should offset() stay in the
>>>> > builder? If not, where should it get its dispatch width.
>>>> >
>>&

Re: [Mesa-dev] [PATCH v2] glsls: Modify exec_list to avoid strict-aliasing violations

2015-06-26 Thread Francisco Jerez
Davin McCall  writes:

> On 26/06/15 11:08, Erik Faye-Lund wrote:
>> On Thu, Jun 25, 2015 at 1:48 AM, Davin McCall  wrote:
>>> This is an alternative to my earlier patch [1] (and it is now constructed
>>> properly using git format-patch).
>>>
>>> Quick background:
>>> There is a problem in exec_list due to it directly including a trio
>>> of 'struct exec_node *' members to implement two overlapping sentinel
>>> nodes. The sentinel nodes do not exist as exec_node objects and so
>>> should not be accessed as such, according to C99 6.5 paragraph 7.
>>> When this strict aliasing rule is violated the compiler may re-order
>>> reads and writes in unexpected ways, such as demonstrated in another
>>> email [2].
>>>
>>> The problem only manifests if compiling without -fno-strict-aliasing.
>>>
>>> This patch addresses the issue by introducing some new methods for
>>> setting the 'next' and 'prev' members of the exec_node structure, which
>>> avoid the aliasing restrictions by way of casting the exec_node pointer
>>> (to an exec_node-pointer-pointer) whenever it may actually point to a
>>> sentinel node. Essentially an exec_node can be seen as an array of two
>>> exec_node pointers, and this view is compatible with the sentinel
>>> structure in exec_list.
>>>
>>> Compared to the previous patch, this patch is much less intrusive, and
>>> does not increase the storage requirements of the exec_list structure.
>>>
>>> While I'm not proposing that -fno-strict-aliasing no longer be used for
>>> Mesa builds, this patch represents a step in that direction. With this
>>> patch applied, a working Mesa library can be built, although bugs may
>>> be present (and could be triggered only when using particular compiler
>>> versions and options). FWIW file sizes with and without strict aliasing:
>>>
>>> (gcc 4.8.4, -O3 -fomit-frame-pointer -march=corei7).
>>>
>>>            -fno-strict-aliasing:   with strict aliasing:
>>> libGL.so   699188                  699188   (no change)
>>> *_dri.so   9575876                 9563104  (-2772)
>>>
>>> (dri bundle includes r300, r600, kms_swrast and swrast).
>>>
>>> So, not a huge win, size-wise. Dave Airlie reports a 30K difference in
>>> his r600_dri.so build however [3].
>>>
>>> In terms of performance:
>>>
>>> (export LIBGL_ALWAYS_SOFTWARE=1; time glmark2)
>>>
>>> -fno-strict-aliasing:
>>>
>>> glmark2 Score: 244
>>> real    5m34.707s
>>> user    11m36.192s
>>> sys     0m29.596s
>>>
>>> with strict aliasing:
>>>
>>> glmark2 Score: 247
>>> real    5m34.438s
>>> user    11m29.904s
>>> sys     0m29.556s
>>>
>>> Again, only a very small improvement when strict aliasing is enabled.
>>>
>>> With the above in mind it is reasonable to question whether this patch
>>> is worthwhile. However, it's done, and it seems to work, so I offer it
>>> for review.
>>>
>>>
>>> [1] http://lists.freedesktop.org/archives/mesa-dev/2015-June/087179.html
>>> [2] http://lists.freedesktop.org/archives/mesa-dev/2015-June/087246.html
>>> [3] http://lists.freedesktop.org/archives/mesa-dev/2015-June/087206.html
>>> ---
>>>   src/glsl/list.h | 123
>>> 
>>>   1 file changed, 80 insertions(+), 43 deletions(-)
>>>
>>> diff --git a/src/glsl/list.h b/src/glsl/list.h
>>> index 15fcd4a..cfbe5a9 100644
>>> --- a/src/glsl/list.h
>>> +++ b/src/glsl/list.h
>>> @@ -76,6 +76,12 @@
>>>   #include "util/ralloc.h"
>>>
>>>   struct exec_node {
>>> +   /**
>>> +* Accessing these fields directly may be ill-advised; if the
>>> 'exec_node'
>>> +* is actually a sentinel node embedded in the exec_list structure, it
>>> may
>>> +* be a strict-aliasing violation (C99 6.5 paragraph 7). Use the methods
>>> +* provided instead.
>>> +*/
>>>  struct exec_node *next;
>>>  struct exec_node *prev;
>>>
>>> @@ -140,35 +146,55 @@ exec_node_init(struct exec_node *n)
>>>  n->prev = NULL;
>>>   }
>>>
>>> +/**
>>> + * Strict-aliasing safe method for setting the next pointer for any
>>> + * node, including sentinel nodes.
>>> + */
>>> +static inline void
>>> +exec_node_set_next(struct exec_node *n, struct exec_node *next)
>>> +{
>>> +   ((struct exec_node **)n)[0] = next;
>>> +}
>>> +
>>> +/**
>>> + * Strict-aliasing safe method for setting the prev pointer for any
>>> + * node, including sentinel nodes.
>>> + */
>>> +static inline void
>>> +exec_node_set_prev(struct exec_node *n, struct exec_node *next)
>>> +{
>>> +   ((struct exec_node **)n)[1] = next;
>>> +}
>>> +
>>>   static inline const struct exec_node *
>>>   exec_node_get_next_const(const struct exec_node *n)
>>>   {
>>> -   return n->next;
>>> +   return ((const struct exec_node **)n)[0];
>>>   }
>> How exactly is this supposed to be strict-aliasing safe?
>>
>> Here's the wording from the C++14 spec:
>>
>> "If a program attempts to access the stored value of an object through
>> a glvalue of other than one of the following types the behavior is
>> undefined:
>> * the dynamic type of the object,
>> * a

Re: [Mesa-dev] [PATCH v2] glsls: Modify exec_list to avoid strict-aliasing violations

2015-06-26 Thread Francisco Jerez
Davin McCall  writes:

> On 26/06/15 13:18, Francisco Jerez wrote:
>> Davin McCall  writes:
>>
>>> On 26/06/15 11:08, Erik Faye-Lund wrote:
>>>> On Thu, Jun 25, 2015 at 1:48 AM, Davin McCall  wrote:
>>>>> This is an alternative to my earlier patch [1] (and it is now constructed
>>>>> properly using git format-patch).
>>>>>
>>>>> Quick background:
>>>>> There is a problem in exec_list due to it directly including a trio
>>>>> of 'struct exec_node *' members to implement two overlapping sentinel
>>>>> nodes. The sentinel nodes do not exist as exec_node objects and so
>>>>> should not be accessed as such, according to C99 6.5 paragraph 7.
>>>>> When this strict aliasing rule is violated the compiler may re-order
>>>>> reads and writes in unexpected ways, such as demonstrated in another
>>>>> email [2].
>>>>>
>>>>> The problem only manifests if compiling without -fno-strict-aliasing.
>>>>>
>>>>> This patch addresses the issue by introducing some new methods for
>>>>> setting the 'next' and 'prev' members of the exec_node structure, which
>>>>> avoid the aliasing restrictions by way of casting the exec_node pointer
>>>>> (to an exec_node-pointer-pointer) whenever it may actually point to a
>>>>> sentinel node. Essentially an exec_node can be seen as an array of two
>>>>> exec_node pointers, and this view is compatible with the sentinel
>>>>> structure in exec_list.
>>>>>
>>>>> Compared to the previous patch, this patch is much less intrusive, and
>>>>> does not increase the storage requirements of the exec_list structure.
>>>>>
>>>>> While I'm not proposing that -fno-strict-aliasing no longer be used for
>>>>> Mesa builds, this patch represents a step in that direction. With this
>>>>> patch applied, a working Mesa library can be built, although bugs may
>>>>> be present (and could be triggered only when using particular compiler
>>>>> versions and options). FWIW file sizes with and without strict aliasing:
>>>>>
>>>>> (gcc 4.8.4, -O3 -fomit-frame-pointer -march=corei7).
>>>>>
>>>>>            -fno-strict-aliasing:   with strict aliasing:
>>>>> libGL.so   699188                  699188   (no change)
>>>>> *_dri.so   9575876                 9563104  (-2772)
>>>>>
>>>>> (dri bundle includes r300, r600, kms_swrast and swrast).
>>>>>
>>>>> So, not a huge win, size-wise. Dave Airlie reports a 30K difference in
>>>>> his r600_dri.so build however [3].
>>>>>
>>>>> In terms of performance:
>>>>>
>>>>> (export LIBGL_ALWAYS_SOFTWARE=1; time glmark2)
>>>>>
>>>>> -fno-strict-aliasing:
>>>>>
>>>>> glmark2 Score: 244
>>>>> real    5m34.707s
>>>>> user    11m36.192s
>>>>> sys     0m29.596s
>>>>>
>>>>> with strict aliasing:
>>>>>
>>>>> glmark2 Score: 247
>>>>> real    5m34.438s
>>>>> user    11m29.904s
>>>>> sys     0m29.556s
>>>>>
>>>>> Again, only a very small improvement when strict aliasing is enabled.
>>>>>
>>>>> With the above in mind it is reasonable to question whether this patch
>>>>> is worthwhile. However, it's done, and it seems to work, so I offer it
>>>>> for review.
>>>>>
>>>>>
>>>>> [1] http://lists.freedesktop.org/archives/mesa-dev/2015-June/087179.html
>>>>> [2] http://lists.freedesktop.org/archives/mesa-dev/2015-June/087246.html
>>>>> [3] http://lists.freedesktop.org/archives/mesa-dev/2015-June/087206.html
>>>>> ---
>>>>>src/glsl/list.h | 123
>>>>> 
>>>>>1 file changed, 80 insertions(+), 43 deletions(-)
>>>>>
>>>>> diff --git a/src/glsl/list.h b/src/glsl/list.h
>>>>> index 15fcd4a..cfbe5a9 100644
>>>>> --- a/src/glsl/list.h
>>>>> +++ b/src/glsl/list.h
>>>>> @@ -76,6 +76,12 @@
>>>>>#include "util/ralloc.h"
>>>>>
>>>>

Re: [Mesa-dev] [PATCH v2] glsls: Modify exec_list to avoid strict-aliasing violations

2015-06-26 Thread Francisco Jerez
Davin McCall  writes:

> On 26/06/15 14:31, Eirik Byrkjeflot Anonsen wrote:
>> Erik Faye-Lund  writes:
>>
>>> On Fri, Jun 26, 2015 at 1:23 PM, Davin McCall  wrote:
>>>> On 26/06/15 12:03, Davin McCall wrote:
>>>>> ... The stored value of 'n' is not accessed by any other type than the
>>>>> type of n itself. This value is then cast to a different pointer type. You
>>>>> are mistaken if you think that the cast accesses the stored value of n.
>>>>> The
>>>>> other "stored value" access that occurs in that expression is to the
>>>>> object pointed at by the result of the cast. [...]:
>>>>
>>>> I'm sorry, I think that was phrased somewhat abrasively, which I did not
>>>> intend. Let me try this part again. If we break up the expression in
>>>> order of evaluation:
>>>>
>>>> From:
>>>> return ((const struct exec_node **)n)[0]
>>>>
>>>> In order of evaluation:
>>>>
>>>> n
>>>> - which accesses the stored value of n, i.e. a value of type 'struct exec
>>>> node *', via n, which is obviously of that type.
>>>>
>>>> (const struct exec_node **)n
>>>>   - which casts that value, after it has been retrieved, to another type.
>>>> If
>>>> this were an aliasing violation, then casting any pointer variable to
>>>> another type would be an aliasing violation; this is clearly not the case.
>>>>
>>>> ((const struct exec_node **)n)[0]
>>>> - which de-references the result of the above cast, thereby accessing a
>>>> stored value of type 'exec node *' using a glvalue of type 'exec node *'.
>>> I think breaking this up is a mistake, because the strict-aliasing
>>> rules are explicitly about the *combination* of these two things.
>>>
>>> You *are* accessing the underlying memory of 'n' through a different
>>> type, and this is what strict aliasing is all about. But it takes two
>>> steps, a single step isn't enough to do so.
>>>
>>> Those other spec-quotes don't undo the strict-aliasing definitions;
>>> knowing how things are laid out in memory doesn't mean the compiler
>>> cannot assume two differently typed variables don't overlap.
>> So basically, you're saying that e.g.:
>>
>> p->next = a;
>> q = exec_node_get_next_const(p);
>>
>> is equivalent to:
>>
>> exec_node * p1 = p;
>> exec_node ** p2 = (exec_node**)p;
>> p1->next = a;
>> q = p2[0];
>
> It is, once the patch is applied (or if strict aliasing is disabled).
>
>> And at this point p1 and p2 are different types, so the compiler can
>> freely assume that p1 and p2 are non-overlapping.
>
> p1 and p2 are two separate variables and of course they are 
> non-overlapping, but *p1 and **p2 are the same type and so may overlap.
>
Also note that even *p1 and *p2 are allowed to overlap even though they
are of different types because of section 6.5 of C99:

| 7 An object shall have its stored value accessed only by an lvalue
|   expression that has one of the following types:
|[...]
|- an aggregate or union type that includes one of the aforementioned
|  types among its members (including, recursively, a member of a
|  subaggregate or contained union)[...]

>>   Thus the two
>> assignments can be "safely" reordered. Sounds plausible to me.
>
> The assignments are to 'p1->next' and 'p2[0]', which *are* the same type 
> (struct exec_node *) and therefore the assignments *cannot* be 
> reordered. It is exactly this that I rely on in my patch to resolve the 
> aliasing issue with the current code.
>
>> And note that casting via void* won't help. "p == (void*)p" compares a
>> variable of type "exec_node*" to a variable of type "void*", and thus
>> there's no strict-aliasing problem. But "p == (exec_node**)(void*)p"
>> compares an "exec_node*" to an "exec_node**" and thus the compiler can
>> assume that they are not the same.
>
> The compiler cannot assume pointers are not the same based on their 
> type, unless the pointers are de-referenced, which they are not in the 
> example you just gave. Strictly speaking C99 doesn't even allow the 
> comparison of 'p == (exec_node**)(void*)p', but all the compilers I know 
> of allow it, and AFAIK give the same result as if one operand were cast 
> to the same type as the other.
>
> Davin
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2] glsls: Modify exec_list to avoid strict-aliasing violations

2015-06-26 Thread Francisco Jerez
Erik Faye-Lund  writes:

> On Fri, Jun 26, 2015 at 4:16 PM, Davin McCall  wrote:
>> On 26/06/15 14:53, Erik Faye-Lund wrote:
>>>
>>> On Fri, Jun 26, 2015 at 3:05 PM, Davin McCall  wrote:

>>>> On 26/06/15 12:55, Erik Faye-Lund wrote:
>>>>
>>>> On Fri, Jun 26, 2015 at 1:23 PM, Davin McCall  wrote:
>>>>
>>>> On 26/06/15 12:03, Davin McCall wrote:
>>>>
>>>> ... The stored value of 'n' is not accessed by any other type than the
>>>> type of n itself. This value is then cast to a different pointer type.
>>>> You
>>>> are mistaken if you think that the cast accesses the stored value of n.
>>>> The
>>>> other "stored value" access that occurs in that expression is to the
>>>> object pointed at by the result of the cast. [...]:
>>>>
>>>> I'm sorry, I think that was phrased somewhat abrasively, which I did not
>>>> intend. Let me try this part again. If we break up the expression in
>>>> order of evaluation:
>>>>
>>>> From:
>>>> return ((const struct exec_node **)n)[0]
>>>>
>>>> In order of evaluation:
>>>>
>>>> n
>>>> - which accesses the stored value of n, i.e. a value of type 'struct exec
>>>> node *', via n, which is obviously of that type.
>>>>
>>>> (const struct exec_node **)n
>>>>   - which casts that value, after it has been retrieved, to another type.
>>>> If
>>>> this were an aliasing violation, then casting any pointer variable to
>>>> another type would be an aliasing violation; this is clearly not the
>>>> case.
>>>>
>>>> ((const struct exec_node **)n)[0]
>>>> - which de-references the result of the above cast, thereby accessing a
>>>> stored value of type 'exec node *' using a glvalue of type 'exec node *'.
>>>>
>>>> I think breaking this up is a mistake, because the strict-aliasing
>>>> rules are explicitly about the *combination* of these two things.
>>>>
>>>>
>>>> It is not a mistake, and the strict aliasing rules are not about the
>>>> combination of these two things.
>>>
>>> It is. In fact, it's not even possible to violate strict-aliasing
>>> without doing at least two operations. You cannot validate operations
>>> in a vacuum, because that's not how strict-aliasing is defined.
>>
>>
>> Any pointer dereference can violate strict aliasing - that's one operation.
>> If you mean that it's first necessary to construct a pointer value in such a
>> way that de-referencing it will be an aliasing violation, then yes, I agree
>> with this statement.
>>
>
> Yes, I mean exactly the latter. You cannot look at one operation in
> isolation, you need to look at the whole program.
>
>>>
>>>> As I have pointed out, with your reading,
>>>> pretty much any pointer cast constitutes an aliasing violation.

>>> No, only those violating the strict aliasing rules I posted before.
>>
>>
>> ... which would only allow changing const/volatile qualifiers, not the
>> pointed-to type.
>>
>
> You can change the pointed to type in terms of signedness, you can
> cast it to a compatible type, you can cast a void-pointer or
> char-pointer to any type. But you need to make sure you don't violate
> the strict-aliasing rules in some other way while doing the latter.
>
> Aliasing *is* hard. But let's not go shopping for that reason.
>
>> Your reading also disallows casting an 'int' variable to type 'long',
>> because that isn't on the list.
>>
>>>
>>>> The strict aliasing rules specify what kind of reference you can use to
>>>> access an object of a particular type. They say nothing about how that
>>>> reference is obtained.
>>>
>>> Which means that it applies regardless of how you obtain it.
>>
>>
>> Yes.
>>
>>> "If a program attempts to access the stored value of an object through
>>> a glvalue of other than one of the following types the behavior is
>>> undefined"
>>>
>>> It says "if a *program* attempts", not "if a *statement* attempts" or
>>> "if an *operation* attempts". This is a whole-program deal, not
>>> limited to one operation in isolation.
>>
>>
>> The key part of the wording is "through a glvalue":
>>
>> "If a program attempts to access the stored value of an object *through
>> a glvalue* of other than one of the following types ..."
>
> This is exactly what makes this invalid AFAICT, see below.
>
>> Going back to the original example:
>>
>>return ((const struct exec_node **)n)[0]
>>
>> The glvalue used to access the object in n is n itself. (I do not think that
>> '(const struct exec_node **)n' is even a glvalue).
>
> But 'n' *is* an lvalue, which also makes it a glvalue (for reference,
> a glvalue is a "generalized lvalue", which means that it's either an
> lvalue or an xvalue). You can write stuff like:
>

"n" is indeed an lvalue (which in no way aliases the storage of any
exec_node or exec_list object), the result of the cast expression is
not, and the result of the subscript expression is again an lvalue but
of a type (exec_node *) which may legitimately alias an exec_node or
exec_list object (because of the text from C99 6.5/7 I quoted earlier),
so this code seems valid to me (though admittedl

Re: [Mesa-dev] [PATCH v2 00/19] i965/fs: Remove the width field from fs_reg

2015-06-26 Thread Francisco Jerez
Jason Ekstrand  writes:

> This is a re-send of the series I did a week or two ago to remove the width
> field from the fs_reg class.  I really didn't want to do a re-send but
> there have been enough fixes since then that I thought it was worth
> re-sending.  Most of these patches have already been reviewed but not all.
>
> 02: New.  Needs to be reviewed by someone familiar with SKL
>
> 03: Needs re-review.  This one is affected by 02.
>
> 05: Needs re-review.  This one went through a lot of changes to actually
> get it right.  It should be the way we want now.
>
> 08: New.  It's just moving code around so it should be trivial.
>
> 09: New.  This is a complete replacement of patch 07 from the previous
> series.
>
> Cc: Topi Pohjolainen 
> Cc: Iago Toral Quiroga 
> Cc: Francisco Jerez 
> Cc: Neil Roberts 
>
Since you seem to have R-b tags from Iago or Topi on all patches already
and from a quick look it seemed generally reasonable, the series is:

Acked-by: Francisco Jerez 

Thanks.

> Jason Ekstrand (19):
>   i965/fs: Use a switch statement in fs_inst::regs_read()
>   i965/fs: Actually set/use the mlen for gen7 uniform pull constant
> loads
>   i965/fs: Fix fs_inst::regs_read() for uniform pull constant loads
>   i965/fs: Report the right value in fs_inst::regs_read() for PIXEL_X/Y
>   i965/fs: Explicitly set the exec_size on the add(32) in interpolation
> setup
>   i965/fs: Set the builder group for emitting FB-write stencil/AA alpha
>   i965/blorp: Explicitly set execution sizes for new'd instructions
>   i965/fs: Move offset(fs_reg, unsigned) to brw_fs.h
>   i965/fs: Add a builder argument to offset()
>   i965/fs: Make better use of the builder in shader_time
>   i965/fs: Remove fs_inst constructors that don't take an explicit
> exec_size
>   i965/fs: Use exec_size for determining regs read/written and partial
> writes
>   i965/fs_builder: Use the dispatch width for setting exec sizes
>   i965/fs: Remove exec_size guessing from fs_inst::init()
>   i965/fs: Use the builder dispatch width instead of dst.width for pull
> constants
>   i965/fs: Use the builder dispatch_width for computing register offsets
>   i965/fs: Use exec_size instead of dst.width for computing component
> size
>   i965/fs_generator: Use inst->exec_size for determining hardware reg
> widths
>   i965/fs: Remove the width field from fs_reg
>
>  src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp|   9 +-
>  src/mesa/drivers/dri/i965/brw_fs.cpp   | 266 
> -
>  src/mesa/drivers/dri/i965/brw_fs.h |  21 ++
>  src/mesa/drivers/dri/i965/brw_fs_builder.h |  37 ++-
>  .../drivers/dri/i965/brw_fs_copy_propagation.cpp   |   4 -
>  src/mesa/drivers/dri/i965/brw_fs_cse.cpp   |  10 +-
>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp |  23 +-
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp   |  64 ++---
>  src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp  |   4 +-
>  .../drivers/dri/i965/brw_fs_register_coalesce.cpp  |   3 +-
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp   | 183 +++---
>  src/mesa/drivers/dri/i965/brw_ir_fs.h  |  45 +---
>  .../drivers/dri/i965/brw_schedule_instructions.cpp |   4 +-
>  13 files changed, 287 insertions(+), 386 deletions(-)
>
> -- 
> 2.4.3


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2] glsls: Modify exec_list to avoid strict-aliasing violations

2015-06-26 Thread Francisco Jerez
Erik Faye-Lund  writes:

> On Fri, Jun 26, 2015 at 4:53 PM, Francisco Jerez  
> wrote:
>> Erik Faye-Lund  writes:
>>
>>> On Fri, Jun 26, 2015 at 4:16 PM, Davin McCall  wrote:
>>>> On 26/06/15 14:53, Erik Faye-Lund wrote:
>>>>>
>>>>> On Fri, Jun 26, 2015 at 3:05 PM, Davin McCall  wrote:
>>>>>>
>>>>>> On 26/06/15 12:55, Erik Faye-Lund wrote:
>>>>>>
>>>>>> On Fri, Jun 26, 2015 at 1:23 PM, Davin McCall  wrote:
>>>>>>
>>>>>> On 26/06/15 12:03, Davin McCall wrote:
>>>>>>
>>>>>> ... The stored value of 'n' is not accessed by any other type than the
>>>>>> type of n itself. This value is then cast to a different pointer type.
>>>>>> You
>>>>>> are mistaken if you think that the cast accesses the stored value of n.
>>>>>> The
>>>>>> other "stored value" access that occurs in that expression is to the
>>>>>> object pointed at by the result of the cast. [...]:
>>>>>>
>>>>>> I'm sorry, I think that was phrased somewhat abrasively, which I did not
>>>>>> intend. Let me try this part again. If we break up the expression in
>>>>>> order of evaluation:
>>>>>>
>>>>>> From:
>>>>>> return ((const struct exec_node **)n)[0]
>>>>>>
>>>>>> In order of evaluation:
>>>>>>
>>>>>> n
>>>>>> - which accesses the stored value of n, i.e. a value of type 'struct exec
>>>>>> node *', via n, which is obviously of that type.
>>>>>>
>>>>>> (const struct exec_node **)n
>>>>>>   - which casts that value, after it has been retrieved, to another type.
>>>>>> If
>>>>>> this were an aliasing violation, then casting any pointer variable to
>>>>>> another type would be an aliasing violation; this is clearly not the
>>>>>> case.
>>>>>>
>>>>>> ((const struct exec_node **)n)[0]
>>>>>> - which de-references the result of the above cast, thereby accessing a
>>>>>> stored value of type 'exec node *' using a glvalue of type 'exec node *'.
>>>>>>
>>>>>> I think breaking this up is a mistake, because the strict-aliasing
>>>>>> rules are explicitly about the *combination* of these two things.
>>>>>>
>>>>>>
>>>>>> It is not a mistake, and the strict aliasing rules are not about the
>>>>>> combination of these two things.
>>>>>
>>>>> It is. In fact, it's not even possible to violate strict-aliasing
>>>>> without doing at least two operations. You cannot validate operations
>>>>> in a vacuum, because that's not how strict-aliasing is defined.
>>>>
>>>>
>>>> Any pointer dereference can violate strict aliasing - that's one operation.
>>>> If you mean that it's first necessary to construct a pointer value in such 
>>>> a
>>>> way that de-referencing it will be an aliasing violation, then yes, I agree
>>>> with this statement.
>>>>
>>>
>>> Yes, I mean exactly the latter. You cannot look at one operation in
>>> isolation, you need to look at the whole program.
>>>
>>>>>
>>>>>> As I have pointed out, with your reading,
>>>>>> pretty much any pointer cast constitutes an aliasing violation.
>>>>>>
>>>>> No, only those violating the strict aliasing rules I posted before.
>>>>
>>>>
>>>> ... which would only allow changing const/volatile qualifiers, not the
>>>> pointed-to type.
>>>>
>>>
>>> You can change the pointed to type in terms of signedness, you can
>>> cast it to a compatible type, you can cast a void-pointer or
>>> char-pointer to any type. But you need to make sure you don't violate
>>> the strict-aliasing rules in some other way while doing the latter.
>>>
>>> Aliasing *is* hard. But let's not go shopping for that reason.
>>>
>>>> Your reading also disallows casting an 'int' variable to type 'long',
>>>> because that isn't on the list.
>>>>
>

Re: [Mesa-dev] [PATCH v2] glsls: Modify exec_list to avoid strict-aliasing violations

2015-06-26 Thread Francisco Jerez
Erik Faye-Lund  writes:

> On Fri, Jun 26, 2015 at 4:01 PM, Francisco Jerez  
> wrote:
>> Davin McCall  writes:
>>
>>> On 26/06/15 14:31, Eirik Byrkjeflot Anonsen wrote:
>>>> Erik Faye-Lund  writes:
>>>>
>>>>> On Fri, Jun 26, 2015 at 1:23 PM, Davin McCall  wrote:
>>>>>> On 26/06/15 12:03, Davin McCall wrote:
>>>>>>> ... The stored value of 'n' is not accessed by any other type than the
>>>>>>> type of n itself. This value is then cast to a different pointer type. 
>>>>>>> You
>>>>>>> are mistaken if you think that the cast accesses the stored value of n. 
>>>>>>> The
>>>>>>> other "stored value" access that occurs in that expression is to the
>>>>>>> object pointed at by the result of the cast. [...]:
>>>>>>
>>>>>> I'm sorry, I think that was phrased somewhat abrasively, which I did not
>>>>>> intend. Let me try this part again. If we break up the expression in
>>>>>> order of evaluation:
>>>>>>
>>>>>> From:
>>>>>> return ((const struct exec_node **)n)[0]
>>>>>>
>>>>>> In order of evaluation:
>>>>>>
>>>>>> n
>>>>>> - which accesses the stored value of n, i.e. a value of type 'struct exec
>>>>>> node *', via n, which is obviously of that type.
>>>>>>
>>>>>> (const struct exec_node **)n
>>>>>>   - which casts that value, after it has been retrieved, to another 
>>>>>> type. If
>>>>>> this were an aliasing violation, then casting any pointer variable to
>>>>>> another type would be an aliasing violation; this is clearly not the 
>>>>>> case.
>>>>>>
>>>>>> ((const struct exec_node **)n)[0]
>>>>>> - which de-references the result of the above cast, thereby accessing a
>>>>>> stored value of type 'exec node *' using a glvalue of type 'exec node *'.
>>>>> I think breaking this up is a mistake, because the strict-aliasing
>>>>> rules are explicitly about the *combination* of these two things.
>>>>>
>>>>> You *are* accessing the underlying memory of 'n' through a different
>>>>> type, and this is what strict aliasing is all about. But it takes two
>>>>> steps, a single step isn't enough to do so.
>>>>>
>>>>> Those other spec-quotes don't undo the strict-aliasing definitions;
>>>>> knowing how things are laid out in memory doesn't mean the compiler
>>>>> cannot assume two differently typed variables don't overlap.
>>>> So basically, you're saying that e.g.:
>>>>
>>>> p->next = a;
>>>> q = exec_node_get_next_const(p);
>>>>
>>>> is equivalent to:
>>>>
>>>> exec_node * p1 = p;
>>>> exec_node ** p2 = (exec_node**)p;
>>>> p1->next = a;
>>>> q = p2[0];
>>>
>>> It is, once the patch is applied (or if strict aliasing is disabled).
>>>
>>>> And at this point p1 and p2 are different types, so the compiler can
>>>> freely assume that p1 and p2 are non-overlapping.
>>>
>>> p1 and p2 are two separate variables and of course they are
>>> non-overlapping, but *p1 and **p2 are the same type and so may overlap.
>>>
>> Also note that even *p1 and *p2 are allowed to overlap even though they
>> are of different types because of section 6.5 of C99:
>>
>> | 7 An object shall have its stored value accessed only by an lvalue
>> |   expression that has one of the following types:
>> |[...]
>> |- an aggregate or union type that includes one of the aforementioned
>> |  types among its members (including, recursively, a member of a
>> |  subaggregate or contained union)[...]
>
> I don't see how this wording legitimates the code above. There's no
> unions involved, so that part is irrelevant.

Yeah, only the "aggregate" part is relevant.

> And "n" isn't an aggregate type, it's a pointer type that happens to
> point to an aggregate type, no? But even if it were, it needs to
> include one of the "aforementioned" types among its members, which I
> cannot see that it does either.
>
> Care to explain?

In the example this was replying to, *p1 was an lvalue of aggregate type
(struct exec_node), and *p2 (or equivalently p2[0]) was an lvalue of
type "struct exec_node *".  The latter is "a type compatible with the
effective type of the object", the object being the "next" member of an
aggregate object of type "struct exec_node", hence the text quoted above
applies and *p1 and *p2 may legitimately alias the same object.


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 16/19] i965/fs: Use the builder dispatch_width for computing register offsets

2015-06-26 Thread Francisco Jerez
Jason Ekstrand  writes:

> Reviewed-by: Topi Pohjolainen 
> ---
>  src/mesa/drivers/dri/i965/brw_fs.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
> b/src/mesa/drivers/dri/i965/brw_fs.h
> index d4cc43d..d94a842 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -72,7 +72,7 @@ offset(fs_reg reg, const brw::fs_builder& bld, unsigned 
> delta)
> case MRF:
> case ATTR:
>return byte_offset(reg,
> - delta * MAX2(reg.width * reg.stride, 1) *
> + delta * bld.dispatch_width() * reg.stride *

Er...  This doesn't look right for stride == 0.  If you keep the
MAX2(.., 1) expression this patch is:

Reviewed-by: Francisco Jerez 

>   type_sz(reg.type));
> case UNIFORM:
>reg.reg_offset += delta;
> -- 
> 2.4.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] clover: implement CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE

2015-06-26 Thread Francisco Jerez
Grigori Goronzy  writes:

> On 2015-05-28 13:04, Grigori Goronzy wrote:
>> Work-group size should always be aligned to subgroup size; this is a
>> basic requirement, otherwise some work-items will be no-operation.
>> 
>> It might make sense to refine the value according to a kernel's
>> resource usage, but that's a possible optimization for the future.
>
> Ping?
>
> This is rather simple, but I'd like an Rb, if possible. That also goes 
> for the Gallium support patch.
>

For this patch:
Reviewed-by: Francisco Jerez 

Thanks.

> Grigori
>
>> ---
>>  src/gallium/state_trackers/clover/api/kernel.cpp  | 2 +-
>>  src/gallium/state_trackers/clover/core/device.cpp | 5 +
>>  src/gallium/state_trackers/clover/core/device.hpp | 1 +
>>  3 files changed, 7 insertions(+), 1 deletion(-)
>> 
>> diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp
>> b/src/gallium/state_trackers/clover/api/kernel.cpp
>> index 05cc392..857a152 100644
>> --- a/src/gallium/state_trackers/clover/api/kernel.cpp
>> +++ b/src/gallium/state_trackers/clover/api/kernel.cpp
>> @@ -169,7 +169,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern,
>> cl_device_id d_dev,
>>break;
>> 
>> case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
>> -  buf.as_scalar() = 1;
>> +  buf.as_scalar() = dev.subgroup_size();
>>break;
>> 
>> case CL_KERNEL_PRIVATE_MEM_SIZE:
>> diff --git a/src/gallium/state_trackers/clover/core/device.cpp
>> b/src/gallium/state_trackers/clover/core/device.cpp
>> index 42b45b7..c42d1d2 100644
>> --- a/src/gallium/state_trackers/clover/core/device.cpp
>> +++ b/src/gallium/state_trackers/clover/core/device.cpp
>> @@ -185,6 +185,11 @@ device::max_block_size() const {
>> return { v.begin(), v.end() };
>>  }
>> 
>> +cl_uint
>> +device::subgroup_size() const {
>> +   return get_compute_param(pipe, 
>> PIPE_COMPUTE_CAP_SUBGROUP_SIZE)[0];
>> +}
>> +
>>  std::string
>>  device::device_name() const {
>> return pipe->get_name(pipe);
>> diff --git a/src/gallium/state_trackers/clover/core/device.hpp
>> b/src/gallium/state_trackers/clover/core/device.hpp
>> index de5fc6b..2857847 100644
>> --- a/src/gallium/state_trackers/clover/core/device.hpp
>> +++ b/src/gallium/state_trackers/clover/core/device.hpp
>> @@ -67,6 +67,7 @@ namespace clover {
>>bool has_doubles() const;
>> 
>>std::vector max_block_size() const;
>> +  cl_uint subgroup_size() const;
>>std::string device_name() const;
>>std::string vendor_name() const;
>>enum pipe_shader_ir ir_format() const;
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/2] clover: fix event handling of buffer operations

2015-06-26 Thread Francisco Jerez
Grigori Goronzy  writes:

> On 2015-06-09 22:52, Francisco Jerez wrote:
>>> +
>>> +   if (blocking)
>>> +  hev().wait();
>>> +
>> 
>> hard_event::wait() may fail, so this should probably be done before the
>> ret_object() call to avoid leaks.
>
> Alright... C++ exceptions are a minefield. :)
>
So is virtually any approach to error handling :P, it's not really more
difficult to write exception-safe code than it is to write equivalent
returned-error-code-safe code, in fact it's IME less difficult as long
as you stick to RAII (which is a healthy practice on its own).

>> Is there any reason you didn't make
>> the same change in clEnqueueReadBuffer() and clEnqueueWriteBuffer()?
>> 
>
> Must be an oversight. I think I did that, or at least I intended to do 
> so.
>
>> Same comment as above.  Also note that this is being more strict than
>> the spec requires (which I believe is what Tom was referring to).  From
>> the CL 1.2 spec:
>> 
>> | If blocking_write is CL_TRUE, the OpenCL implementation copies the 
>> data
>> | referred to by ptr and enqueues the write operation in the
>> | command-queue. The memory pointed to by ptr can be reused by the
>> | application after the clEnqueueWriteBufferRect call returns.
>> 
>> The spec is giving you no guarantee that the write to the actual memory
>> object will be complete by the time the clEnqueueWriteBufferRect call
>> returns -- Only that your data will have been buffered somewhere and 
>> the
>> memory pointed to by the argument can be reused immediately by the
>> application.  The reason why I was reluctant to make this change last
>> time it came up was that it's likely to hurt performance unnecessarily
>> because the wait() call blocks until *all* previous commands in the 
>> same
>> queue have completed execution, even though in the most common case the
>> copy is performed synchronously using soft_copy_op(), so the wait() 
>> call
>> is redundant even for blocking copies.
>> 
>
> OK, maybe we could drop the wait completely for all of the "write" 
> calls.
>

I think those should also call event::wait_signalled() just to make sure
that the event action has been executed already -- It may not have
executed immediately if there were any user events in the dependency
graph.

>> The case with blocking reads is similar, the copy is handled
>> synchronously using soft_copy_op() when no user events are present in
>> the list of dependencies, so calling wait() on the event is unnecessary
>> to guarantee that the execution of the read has completed, and will
>> cause a pipe_context flush and wait until the most recent fence is
>> signalled.
>> 
>
> I think it's reasonable to expect that the event is ready for profile 
> queries after a blocking read has finished. That was the initial 
> motivation for this patch. Other implementations behave like that. I 
> didn't expect wait() to completely flush everything. Won't that cause a 
> lot of needless flushing with event wait lists?
>
hard_event::wait() flushes the command queue, which in turn attaches a
fence to the event object marking the end of the execution of the last
batch of commands, which arguably contains whatever operations were
performed by the event action.  This assumption breaks in the case of
soft_copy_op() because it doesn't submit any actual commands to the GPU,
so calling hard_event::wait() is sub-optimal (it will wait for commands
which are completely unrelated to the copy operation).  This can be fixed
by using the weaker version of wait() that doesn't care about the GPU
being already done with the work.

Thanks.

>> Ideally we would have a weaker variant of event::wait()
>> (e.g. wait_signalled()) that doesn't flush and just waits for the
>> associated action call-back to have been executed without giving any
>> guarantees about the corresponding GPU command.  The event interface
>> doesn't expose such a functionality right now, I'm attaching two
>> (completely untested) patches implementing it, you should be able to 
>> use
>> them as starting point to fix blocking transfers.
>> 
>
> Thanks, I'll look into that later when I get some free time.
>
> Grigori


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 16/19] i965/fs: Use the builder dispatch_width for computing register offsets

2015-06-26 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Fri, Jun 26, 2015 at 8:52 AM, Francisco Jerez  
> wrote:
>> Jason Ekstrand  writes:
>>
>>> Reviewed-by: Topi Pohjolainen 
>>> ---
>>>  src/mesa/drivers/dri/i965/brw_fs.h | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
>>> b/src/mesa/drivers/dri/i965/brw_fs.h
>>> index d4cc43d..d94a842 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs.h
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
>>> @@ -72,7 +72,7 @@ offset(fs_reg reg, const brw::fs_builder& bld, unsigned 
>>> delta)
>>> case MRF:
>>> case ATTR:
>>>return byte_offset(reg,
>>> - delta * MAX2(reg.width * reg.stride, 1) *
>>> + delta * bld.dispatch_width() * reg.stride *
>>
>> Er...  This doesn't look right for stride == 0.  If you keep the
>> MAX2(.., 1) expression this patch is:
>
> I don't think offset() even makes sense for something with stride ==
> 0.  I added "assert(stride != 0)" right above the byte_offset() call
> and it passed Jenkins.  Would that be an acceptable alternative?

stride == 0 implies that each logical component of your vector takes 1
scalar component (because all the N channels of your SIMDN value are one
and the same scalar in your register file), that means that logically
independent components of a vector or array are stored 1 scalar apart,
and the previous code was doing the right thing.

> --Jason
>
>> Reviewed-by: Francisco Jerez 
>>
>>>   type_sz(reg.type));
>>> case UNIFORM:
>>>reg.reg_offset += delta;
>>> --
>>> 2.4.3
>>>
>>> ___
>>> mesa-dev mailing list
>>> mesa-dev@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] nir: Make C++ more happy with NIR_SRC_INIT and NIR_DEST_INIT

2015-06-26 Thread Francisco Jerez
Jason Ekstrand  writes:

> In C, if you partially initialize a structure, the rest of the struct gets
> set to 0.  C++, however, does not have this rule so GCC throws warnings
> whenever NIR_SRC_INIT or NIR_DEST_INIT is used in C++.

I don't think that's right, in C++ initializers missing from an
aggregate initializer list are also defined to be initialized
(value-initialized to be more precise, which would set them to zero in
this case just like in C).

> Since nir.h contains a static inline that uses NIR_SRC_INIT, every C++
> file that includes nir.h complains about this.
>
I suspect the reason why this causes a warning may be that you're using
compound literals? (which are a C99-specific feature and not part of C++)

> This patch adds a small static inline function that makes a struct,
> memsets it to 0, and returns it.  NIR_SRC_INIT and NIR_DEST_INIT are then
> wrappers around this function.

In C++ you could just call the implicitly defined default constructor
for nir_src or nir_dest, like 'nir_src()'.

> ---
>  src/glsl/nir/nir.h | 22 ++
>  1 file changed, 22 insertions(+)
>
> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
> index c666d93..3634f30 100644
> --- a/src/glsl/nir/nir.h
> +++ b/src/glsl/nir/nir.h
> @@ -511,7 +511,18 @@ typedef struct nir_src {
> bool is_ssa;
>  } nir_src;
>  
> +#ifdef __cplusplus
> +static inline nir_src
> +__nir_src_init(void)
> +{
> +   nir_src src;
> +   memset(&src, 0, sizeof(src));
> +   return src;
> +}
> +#define NIR_SRC_INIT (__nir_src_init())
> +#else
>  #define NIR_SRC_INIT (nir_src) { { NULL } }
> +#endif
>  
>  #define nir_foreach_use(reg_or_ssa_def, src) \
> list_for_each_entry(nir_src, src, &(reg_or_ssa_def)->uses, use_link)
> @@ -534,7 +545,18 @@ typedef struct {
> bool is_ssa;
>  } nir_dest;
>  
> +#ifdef __cplusplus
> +static inline nir_dest
> +__nir_dest_init(void)
> +{
> +   nir_dest dest;
> +   memset(&dest, 0, sizeof(dest));
> +   return dest;
> +}
> +#define NIR_DEST_INIT (__nir_dest_init())
> +#else
>  #define NIR_DEST_INIT (nir_dest) { { { NULL } } }
> +#endif
>  
>  #define nir_foreach_def(reg, dest) \
> list_for_each_entry(nir_dest, dest, &(reg)->defs, reg.def_link)
> -- 
> 2.4.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] nir: Make C++ more happy with NIR_SRC_INIT and NIR_DEST_INIT

2015-06-26 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Fri, Jun 26, 2015 at 12:08 PM, Francisco Jerez  
> wrote:
>> Jason Ekstrand  writes:
>>
>>> In C, if you partially initialize a structure, the rest of the struct gets
>>> set to 0.  C++, however, does not have this rule so GCC throws warnings
>>> whenever NIR_SRC_INIT or NIR_DEST_INIT is used in C++.
>>
>> I don't think that's right, in C++ initializers missing from an
>> aggregate initializer list are also defined to be initialized
>> (value-initialized to be more precise, which would set them to zero in
>> this case just like in C).
>
> Yes, that is correct.  I just did a second attempt that, instead,
> defines a static const variable named NIR_SRC_INIT with a partial
> initializer.  C++ still gets grumpy and gives me a pile of "missing
> initializer" warnings.
>
That's likely related to the warning flags you have enabled in CXXFLAGS,
not to C++ itself.  Maybe you have -Wmissing-field-initializers enabled
for C++ only?

>>> Since nir.h contains a static inline that uses NIR_SRC_INIT, every C++
>>> file that includes nir.h complains about this.
>>>
>> I suspect the reason why this causes a warning may be that you're using
>> compound literals? (which are a C99-specific feature and not part of C++)
>>
>>> This patch adds a small static inline function that makes a struct,
>>> memsets it to 0, and returns it.  NIR_SRC_INIT and NIR_DEST_INIT are then
>>> wrappers around this function.
>>
>> In C++ you could just call the implicitly defined default constructor
>> for nir_src or nir_dest, like 'nir_src()'.
>
> The implicitly defined default constructor does nothing to POD types,
> so doing so would explicitly *not* perform the desired action of
> zeroing out the data.
>

Indeed, but 'nir_src()' doesn't only call the implicitly-defined trivial
default constructor, it value-initializes the object (See section 8.5/8
of the C++14 spec) which for POD types causes all members to be
zero-initialized.

>>> ---
>>>  src/glsl/nir/nir.h | 22 ++
>>>  1 file changed, 22 insertions(+)
>>>
>>> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
>>> index c666d93..3634f30 100644
>>> --- a/src/glsl/nir/nir.h
>>> +++ b/src/glsl/nir/nir.h
>>> @@ -511,7 +511,18 @@ typedef struct nir_src {
>>> bool is_ssa;
>>>  } nir_src;
>>>
>>> +#ifdef __cplusplus
>>> +static inline nir_src
>>> +__nir_src_init(void)
>>> +{
>>> +   nir_src src;
>>> +   memset(&src, 0, sizeof(src));
>>> +   return src;
>>> +}
>>> +#define NIR_SRC_INIT (__nir_src_init())
>>> +#else
>>>  #define NIR_SRC_INIT (nir_src) { { NULL } }
>>> +#endif
>>>
>>>  #define nir_foreach_use(reg_or_ssa_def, src) \
>>> list_for_each_entry(nir_src, src, &(reg_or_ssa_def)->uses, use_link)
>>> @@ -534,7 +545,18 @@ typedef struct {
>>> bool is_ssa;
>>>  } nir_dest;
>>>
>>> +#ifdef __cplusplus
>>> +static inline nir_dest
>>> +__nir_dest_init(void)
>>> +{
>>> +   nir_dest dest;
>>> +   memset(&dest, 0, sizeof(dest));
>>> +   return dest;
>>> +}
>>> +#define NIR_DEST_INIT (__nir_dest_init())
>>> +#else
>>>  #define NIR_DEST_INIT (nir_dest) { { { NULL } } }
>>> +#endif
>>>
>>>  #define nir_foreach_def(reg, dest) \
>>> list_for_each_entry(nir_dest, dest, &(reg)->defs, reg.def_link)
>>> --
>>> 2.4.3
>>>
>>> ___
>>> mesa-dev mailing list
>>> mesa-dev@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] nir: Make C++ more happy with NIR_SRC_INIT and NIR_DEST_INIT

2015-06-26 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Fri, Jun 26, 2015 at 3:03 PM, Francisco Jerez  
> wrote:
>> Jason Ekstrand  writes:
>>
>>> On Fri, Jun 26, 2015 at 12:08 PM, Francisco Jerez  
>>> wrote:
>>>> Jason Ekstrand  writes:
>>>>
>>>>> In C, if you partially initialize a structure, the rest of the struct gets
>>>>> set to 0.  C++, however, does not have this rule so GCC throws warnings
>>>>> whenever NIR_SRC_INIT or NIR_DEST_INIT is used in C++.
>>>>
>>>> I don't think that's right, in C++ initializers missing from an
>>>> aggregate initializer list are also defined to be initialized
>>>> (value-initialized to be more precise, which would set them to zero in
>>>> this case just like in C).
>>>
>>> Yes, that is correct.  I just did a second attempt that, instead,
>>> defines a static const variable named NIR_SRC_INIT with a partial
>>> initializer.  C++ still gets grumpy and gives me a pile of "missing
>>> initializer" warnings.
>>>
>> That's likely related to the warning flags you have enabled in CXXFLAGS,
>> not to C++ itself.  Maybe you have -Wmissing-field-initializers enabled
>> for C++ only?
>>
>>>>> Since nir.h contains a static inline that uses NIR_SRC_INIT, every C++
>>>>> file that includes nir.h complains about this.
>>>>>
>>>> I suspect the reason why this causes a warning may be that you're using
>>>> compound literals? (which are a C99-specific feature and not part of C++)
>>>>
>>>>> This patch adds a small static inline function that makes a struct,
>>>>> memsets it to 0, and returns it.  NIR_SRC_INIT and NIR_DEST_INIT are then
>>>>> wrappers around this function.
>>>>
>>>> In C++ you could just call the implicitly defined default constructor
>>>> for nir_src or nir_dest, like 'nir_src()'.
>>>
>>> The implicitly defined default constructor does nothing to POD types,
>>> so doing so would explicitly *not* perform the desired action of
>>> zeroing out the data.
>>>
>>
>> Indeed, but 'nir_src()' doesn't only call the implicitly-defined trivial
>> default constructor, it value-initializes the object (See section 8.5/8
>> of the C++14 spec) which for POD types causes all members to be
>> zero-initialized.
>
> It looks like this greatly depends on your C++ version.  If it's C++11
> or above, I believe it does get zero-initialized.  If it's earlier
> than C++11, it doesn't.  At least that's the way I read this:
>
> http://en.cppreference.com/w/cpp/language/value_initialization

Not really, it will get zero-initialized back to C++98.  AFAICT what the
article is trying to say is that in C++98 what is now referred to as
value-initialization used to be called default-initialization in the
spec, but still it had the effect of zero-initializing the structure.


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] nir: Make C++ more happy with NIR_SRC_INIT and NIR_DEST_INIT

2015-06-26 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Fri, Jun 26, 2015 at 3:34 PM, Francisco Jerez  
> wrote:
>> Jason Ekstrand  writes:
>>
>>> On Fri, Jun 26, 2015 at 3:03 PM, Francisco Jerez  
>>> wrote:
>>>> Jason Ekstrand  writes:
>>>>
>>>>> On Fri, Jun 26, 2015 at 12:08 PM, Francisco Jerez  
>>>>> wrote:
>>>>>> Jason Ekstrand  writes:
>>>>>>
>>>>>>> In C, if you partially initialize a structure, the rest of the struct 
>>>>>>> gets
>>>>>>> set to 0.  C++, however, does not have this rule so GCC throws warnings
>>>>>>> whenever NIR_SRC_INIT or NIR_DEST_INIT is used in C++.
>>>>>>
>>>>>> I don't think that's right, in C++ initializers missing from an
>>>>>> aggregate initializer list are also defined to be initialized
>>>>>> (value-initialized to be more precise, which would set them to zero in
>>>>>> this case just like in C).
>>>>>
>>>>> Yes, that is correct.  I just did a second attempt that, instead,
>>>>> defines a static const variable named NIR_SRC_INIT with a partial
>>>>> initializer.  C++ still gets grumpy and gives me a pile of "missing
>>>>> initializer" warnings.
>>>>>
>>>> That's likely related to the warning flags you have enabled in CXXFLAGS,
>>>> not to C++ itself.  Maybe you have -Wmissing-field-initializers enabled
>>>> for C++ only?
>>>>
>>>>>>> Since nir.h contains a static inline that uses NIR_SRC_INIT, every C++
>>>>>>> file that includes nir.h complains about this.
>>>>>>>
>>>>>> I suspect the reason why this causes a warning may be that you're using
>>>>>> compound literals? (which are a C99-specific feature and not part of C++)
>>>>>>
>>>>>>> This patch adds a small static inline function that makes a struct,
>>>>>>> memsets it to 0, and returns it.  NIR_SRC_INIT and NIR_DEST_INIT are 
>>>>>>> then
>>>>>>> wrappers around this function.
>>>>>>
>>>>>> In C++ you could just call the implicitly defined default constructor
>>>>>> for nir_src or nir_dest, like 'nir_src()'.
>>>>>
>>>>> The implicitly defined default constructor does nothing to POD types,
>>>>> so doing so would explicitly *not* perform the desired action of
>>>>> zeroing out the data.
>>>>>
>>>>
>>>> Indeed, but 'nir_src()' doesn't only call the implicitly-defined trivial
>>>> default constructor, it value-initializes the object (See section 8.5/8
>>>> of the C++14 spec) which for POD types causes all members to be
>>>> zero-initialized.
>>>
>>> It looks like this greatly depends on your C++ version.  If it's C++11
>>> or above, I believe it does get zero-initialized.  If it's earlier
>>> than C++11, it doesn't.  At least that's the way I read this:
>>>
>>> http://en.cppreference.com/w/cpp/language/value_initialization
>>
>> Not really, it will get zero-initialized back to C++98.  AFAICT what the
>> article is trying to say is that in C++98 what is now referred to as
>> value-initialization used to be called default-initialization in the
>> spec, but still it had the effect of zero-initializing the structure.
>
> Ok, I did some more reading and I think I'm convinced now.  Figuring
> out what "nir_src src = nir_src()" actually does should *not* take
> this much research.  I'll send an updated patch on Monday.
> --Jason

Then again this is likely to only be required because you somehow end up
with different warning options in C and C++.  Omitting members in an
aggregate initializer is valid C++, and has the same effect as
value-initializing the missing members, see section 8.5.1 "Aggregates"
of the C++14 spec:

| 7 If there are fewer initializer-clauses in the list than there are
|   members in the aggregate, then each member not explicitly
|   initialized shall be initialized from its brace-or-equal-initializer
|   or, if there is no brace-or-equal-initializer, from an empty
|   initializer list (8.5.4).

And 8.5.4 "List-initialization":

| 3 List-initialization of an object or reference of type T is defined
|   as follows:
|[..]
|— Otherwise, if the initializer list has no elements, the object is
|  value-initialized.

That said, doing 'nir_src()' does have the advantage that you avoid
using non-standard compound literals in C++, but you could definitely
avoid them in some other way (e.g. just omit the '(nir_src)' type?) that
doesn't involve using separate preprocessor paths for C and C++.


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/2] gallium: add PIPE_COMPUTE_CAP_SUBGROUP_SIZE

2015-06-27 Thread Francisco Jerez
Grigori Goronzy  writes:

> We need this to implement OpenCL's
> CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE.

Reviewed-by: Francisco Jerez 

Thanks.

> ---
>  src/gallium/docs/source/screen.rst |  2 ++
>  src/gallium/drivers/ilo/ilo_screen.c   |  8 
>  src/gallium/drivers/nouveau/nvc0/nvc0_screen.c |  4 
>  src/gallium/drivers/radeon/r600_pipe_common.c  |  6 ++
>  src/gallium/drivers/radeon/r600_pipe_common.h  | 20 
>  src/gallium/include/pipe/p_defines.h   |  3 ++-
>  6 files changed, 42 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/docs/source/screen.rst 
> b/src/gallium/docs/source/screen.rst
> index 416ef2d..32c1e87 100644
> --- a/src/gallium/docs/source/screen.rst
> +++ b/src/gallium/docs/source/screen.rst
> @@ -382,6 +382,8 @@ pipe_screen::get_compute_param.
>Value type: ``uint32_t``
>  * ``PIPE_COMPUTE_CAP_IMAGES_SUPPORTED``: Whether images are supported
>non-zero means yes, zero means no. Value type: ``uint32_t``
> +* ``PIPE_COMPUTE_CAP_SUBGROUP_SIZE``: The size of a basic execution unit in
> +  threads. Also known as wavefront size, warp size or SIMD width.
>  
>  .. _pipe_bind:
>  
> diff --git a/src/gallium/drivers/ilo/ilo_screen.c 
> b/src/gallium/drivers/ilo/ilo_screen.c
> index b0fed73..f2a18b2 100644
> --- a/src/gallium/drivers/ilo/ilo_screen.c
> +++ b/src/gallium/drivers/ilo/ilo_screen.c
> @@ -195,6 +195,7 @@ ilo_get_compute_param(struct pipe_screen *screen,
>uint32_t max_clock_frequency;
>uint32_t max_compute_units;
>uint32_t images_supported;
> +  uint32_t subgroup_size;
> } val;
> const void *ptr;
> int size;
> @@ -286,6 +287,13 @@ ilo_get_compute_param(struct pipe_screen *screen,
>ptr = &val.images_supported;
>size = sizeof(val.images_supported);
>break;
> +   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
> +  /* best case is SIMD32 */
> +  val.subgroup_size = 32;
> +
> +  ptr = &val.subgroup_size;
> +  size = sizeof(val.subgroup_size);
> +  break;
> default:
>ptr = NULL;
>size = 0;
> diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 
> b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> index 1ca997a..f6bef83 100644
> --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
> @@ -340,6 +340,7 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
>enum pipe_compute_cap param, void *data)
>  {
> uint64_t *data64 = (uint64_t *)data;
> +   uint32_t *data32 = (uint32_t *)data;
> const uint16_t obj_class = nvc0_screen(pscreen)->compute->oclass;
>  
> switch (param) {
> @@ -371,6 +372,9 @@ nvc0_screen_get_compute_param(struct pipe_screen *pscreen,
> case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
>data64[0] = 4096;
>return 8;
> +   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
> +  data32[0] = 32;
> +  return 4;
> default:
>return 0;
> }
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c 
> b/src/gallium/drivers/radeon/r600_pipe_common.c
> index 42e681d..5494cb3 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.c
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.c
> @@ -637,6 +637,12 @@ static int r600_get_compute_param(struct pipe_screen 
> *screen,
>   return sizeof(uint32_t);
>   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE:
>   break; /* unused */
> + case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
> + if (ret) {
> + uint32_t *subgroup_size = ret;
> + *subgroup_size = r600_wavefront_size(rscreen->family);
> + }
> + return sizeof(uint32_t);
>   }
>  
>  fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param);
> diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h 
> b/src/gallium/drivers/radeon/r600_pipe_common.h
> index 6ce81d3..51fd016 100644
> --- a/src/gallium/drivers/radeon/r600_pipe_common.h
> +++ b/src/gallium/drivers/radeon/r600_pipe_common.h
> @@ -570,6 +570,26 @@ static inline unsigned r600_tex_aniso_filter(unsigned 
> filter)
>/* else */return 4;
>  }
>  
> +static inline unsigned r600_wavefront_size(enum radeon_family family)
> +{
> + switch (family) {
> + case CHIP_RV610:
> + case CHIP_RS780:
> + case CHIP_RV620:
> + case CHIP_RS880:
> + return 16;
> + case CHIP_RV630:
> + case CHIP_RV635:
> + case CHIP_RV730:
> + case CHIP_RV710:
> + case CHIP_PALM:
> + case

Re: [Mesa-dev] [PATCH 0/3] additions to loop unroll patchset

2015-06-29 Thread Francisco Jerez
Tapani Pälli  writes:

> Hi;
>
> Here's additions to patches I sent earlier (reviewed by curro), these just
> set EmitNoIndirectSampler on when there's no ARB_gpu_shader5 support, as
> discussed.
>
> Original mail thread:
> http://lists.freedesktop.org/archives/mesa-dev/2015-June/086049.html
>
> All patches applied on master:
> http://cgit.freedesktop.org/~tpalli/mesa/log/?h=unroll_loops
>

Looks good to me, for the series:
Reviewed-by: Francisco Jerez 

> Thanks;
>
> Tapani Pälli (3):
>   i965: use EmitNoIndirectSampler for gen < 7
>   i915: use EmitNoIndirectSampler
>   mesa/st: use EmitNoIndirectSampler if !ARB_gpu_shader5
>
>  src/mesa/drivers/dri/i915/i915_context.c | 3 +++
>  src/mesa/drivers/dri/i965/brw_shader.cpp | 4 
>  src/mesa/state_tracker/st_context.c  | 5 +
>  3 files changed, 12 insertions(+)
>
> -- 
> 2.1.0


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2] glsls: Modify exec_list to avoid strict-aliasing violations

2015-06-29 Thread Francisco Jerez
Davin McCall  writes:

> On 26/06/15 14:53, Francisco Jerez wrote:
>
>> [...]
>>
>> Your first approach seemed quite reasonable IMHO.  Were you able to
>> measure any performance regression from it?
>>
>> Thanks.
>>
>
> When I run an apitrace replay of a Dota 2 trace [1] with 
> LIBGL_ALWAYS_SOFTWARE and without the patch I get (averaged over 5 runs):
>
>  Maximum Resident Set Size (kbytes): 4509696
>  FPS: .9044752
>  user time: 2467.306
>
> ("Maximum Resident Set Size" and user time are given by GNU "time". I'm 
> not sure what it's really measuring, because this is a 32-bit system and 
> I don't see how the maximum resident set could be > 4GB; "top" shows 
> virt+res capping out at about 2.3GB. However I assume MRSS is at least 
> giving some relative indication of memory use; the deviation wasn't too 
> high).
>
> With the patch (again averaged over 5 runs):
>
>  Maximum Resident Set Size: 4523622.4
>  FPS: 0.9068524
>  user time: 2457.506
>
> So, "MRSS" has gone up a bit, but nothing else has changed 
> significantly. I think that means memory use has slightly increased, but 
> performance hasn't really changed.
>
> I wanted to test with the Intel driver using INTEL_NO_HW, but I get a 
> segfault when the patch is applied. Having checked over the patch 
> several times, I think this might mean that it triggers a latent bug 
> elsewhere, but I am still investigating that. V2 of the patch does not 
> trigger this crash.
>

Most likely some assumption left to fix in the i965 back-end?  Have you
shared your changes to the i965 driver already?  They don't seem to be
part of your v1.

Thanks.

>
> [1]  http://people.freedesktop.org/~anholt/dota_linux.trace


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2] glsls: Modify exec_list to avoid strict-aliasing violations

2015-06-29 Thread Francisco Jerez
Davin McCall  writes:

> On 29/06/15 10:40, Francisco Jerez wrote:
>> Davin McCall  writes:
>>
>>> On 26/06/15 14:53, Francisco Jerez wrote:
>>>
>>>> [...]
>>>>
>>>> Your first approach seemed quite reasonable IMHO.  Were you able to
>>>> measure any performance regression from it?
>>>>
>>>> Thanks.
>>>>
>>> When I run an apitrace replay of a Dota 2 trace [1] with
>>> LIBGL_ALWAYS_SOFTWARE and without the patch I get (averaged over 5 runs):
>>>
>>>   Maximum Resident Set Size (kbytes): 4509696
>>>   FPS: .9044752
>>>   user time: 2467.306
>>>
>>> ("Maximum Resident Set Size" and user time are given by GNU "time". I'm
>>> not sure what it's really measuring, because this is a 32-bit system and
>>> I don't see how the maximum resident set could be > 4GB; "top" shows
>>> virt+res capping out at about 2.3GB. However I assume MRSS is at least
>>> giving some relative indication of memory use; the deviation wasn't too
>>> high).
>>>
>>> With the patch (again averaged over 5 runs):
>>>
>>>   Maximum Resident Set Size: 4523622.4
>>>   FPS: 0.9068524
>>>   user time: 2457.506
>>>
>>> So, "MRSS" has gone up a bit, but nothing else has changed
>>> significantly. I think that means memory use has slightly increased, but
>>> performance hasn't really changed.
>>>
>>> I wanted to test with the Intel driver using INTEL_NO_HW, but I get a
>>> segfault when the patch is applied. Having checked over the patch
>>> several times, I think this might mean that it triggers a latent bug
>>> elsewhere, but I am still investigating that. V2 of the patch does not
>>> trigger this crash.
>>>
>> Most likely some assumption left to fix in the i965 back-end?
>
> That's what I thought. However, I've just tried (after reverting the 
> patch) inserting a single field in the exec_list structure to emulate 
> the data layout that should occur when the patch is applied - but I 
> can't then reproduce the problem. So I guess there is something wrong 
> with the patch itself, but I'm blind to it :(
>
>>Have you
>> shared your changes to the i965 driver already?  They don't seem to be
>> part of your v1.
>
> No, I've not shared them previously, but I've included them now (below).
>
> Davin
>
>
>
>  From 2b6ebbb7787a78d55ba46de2f78eb2ba20b9fe58 Mon Sep 17 00:00:00 2001
> From: Davin McCall 
> Date: Sat, 27 Jun 2015 13:48:41 +0100
> Subject: [PATCH] glsl: fix some strict aliasing issues in exec_list
>
> There is a problem in exec_list due to it directly including a trio
> of 'struct exec_node *' members to implement two overlapping sentinel
> nodes. The sentinel nodes do not exist as exec_node objects and so
> should not be accessed as such, according to C99 6.5 paragraph 7.
> When this strict aliasing rule is violated the compiler may re-order
> reads and writes in unexpected ways. The problem only manifests if
> compiling without -fno-strict-aliasing, since that option allows
> breaking the strict aliasing rules.
>
> This patch addresses the issue by including explicit head and tail
> sentinel nodes into the exec_list structure, which do not overlap.
> This adds a single word of storage to the size of the exec_list structure.
>
> While I'm not proposing that -fno-strict-aliasing no longer be used for
> Mesa builds, this patch represents a step in that direction. With this
> patch applied, a working Mesa library can be built, although bugs may
> be present (and could be triggered only when using particular compiler
> versions and options on particular platforms).
> ---
>   src/glsl/ast_function.cpp  |  20 +++--
>   src/glsl/ast_to_hir.cpp|   9 +-
>   src/glsl/glsl_parser_extras.cpp|   6 +-
>   src/glsl/ir.cpp|   8 +-
>   src/glsl/ir_clone.cpp  |   2 +-
>   src/glsl/ir_constant_expression.cpp|   3 +-
>   src/glsl/ir_function.cpp   |  14 ++--
>   src/glsl/ir_reader.cpp |   5 +-
>   src/glsl/ir_validate.cpp   |   5 +-
>   src/glsl/list.h| 130 
> +
>   src/glsl/lower_clip_distance.cpp   |   6 +-
>   src/glsl/lower_jumps.cpp   |   2 +-
>   src/glsl/lower_packed_varyings.cpp |   8 +-
>   src/

Re: [Mesa-dev] [PATCH v2 04/19] i965/fs: Report the right value in fs_inst::regs_read() for PIXEL_X/Y

2015-06-29 Thread Francisco Jerez
Jason Ekstrand  writes:

> Reviewed-by: Iago Toral Quiroga 
> Reviewed-by: Topi Pohjolainen 
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp | 6 ++
>  1 file changed, 6 insertions(+)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 589b74c..6cf9e96 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -726,6 +726,12 @@ fs_inst::regs_read(int arg) const
>   return exec_size / 4;
>break;
>  
> +   case FS_OPCODE_PIXEL_X:
> +   case FS_OPCODE_PIXEL_Y:
> +  if (arg == 0)
> + return 2;
> +  break;
> +

This doesn't look right.  AFAICT PIXEL_X/Y take two exec_size-wide
components of UW type (interleaved for each subspan, but that doesn't
matter here), i.e. two registers in SIMD16 mode but only one register in
SIMD8 mode.

> default:
>if (is_tex() && arg == 0 && src[0].file == GRF)
>   return mlen;
> -- 
> 2.4.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 12/19] i965/fs: Use exec_size for determining regs read/written and partial writes

2015-06-30 Thread Francisco Jerez
Jason Ekstrand  writes:

> Reviewed-by: Topi Pohjolainen 
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index d1e253a..4f56865 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -101,7 +101,7 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, 
> const fs_reg &dst,
> case MRF:
> case ATTR:
>this->regs_written =
> - DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 
> 32);
> + DIV_ROUND_UP(MAX2(exec_size * dst.stride, 1) * type_sz(dst.type), 
> 32);
>break;
> case BAD_FILE:
>this->regs_written = 0;
> @@ -675,7 +675,7 @@ bool
>  fs_inst::is_partial_write() const
>  {
> return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
> -   (this->dst.width * type_sz(this->dst.type)) < 32 ||
> +   (this->exec_size * type_sz(this->dst.type)) < 32 ||
> !this->dst.is_contiguous());
>  }
>  
> @@ -729,8 +729,8 @@ fs_inst::regs_read(int arg) const
>if (src[arg].stride == 0) {
>   return 1;
>} else {
> - int size = src[arg].width * src[arg].stride * 
> type_sz(src[arg].type);
> - return (size + 31) / 32;
> + int size = this->exec_size * src[arg].stride * 
> type_sz(src[arg].type);
> + return DIV_ROUND_UP(size, 32);

This seems to be wrong for header sources of LOAD_PAYLOAD, since they
are 8-wide regardless of the instruction execution size.

>}
> case MRF:
>unreachable("MRF registers are not allowed as sources");
> -- 
> 2.4.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 09/19] i965/fs: Add a builder argument to offset()

2015-06-30 Thread Francisco Jerez
Jason Ekstrand  writes:

> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp |  42 
>  src/mesa/drivers/dri/i965/brw_fs.h   |   2 +-
>  src/mesa/drivers/dri/i965/brw_fs_cse.cpp |   2 +-
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp |  58 +--
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 143 
> ++-

You should probably also update test_fs_cmod_propagation and
test_fs_saturate_propagation to avoid breaking make check.

>  5 files changed, 128 insertions(+), 119 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 6cf9e96..9855bfb 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -267,7 +267,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder 
> &bld,
>   inst->mlen = 1 + dispatch_width / 8;
> }
>  
> -   bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
> +   bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
>  }
>  
>  /**
> @@ -361,7 +361,12 @@ fs_inst::is_copy_payload(const brw::simple_allocator 
> &grf_alloc) const
>reg.width = this->src[i].width;
>if (!this->src[i].equals(reg))
>   return false;
> -  reg = ::offset(reg, 1);
> +
> +  if (i < this->header_size) {
> + reg.reg_offset += 1;
> +  } else {
> + reg.reg_offset += this->exec_size / 8;
> +  }
> }
>  
> return true;
> @@ -920,7 +925,7 @@ fs_visitor::emit_fragcoord_interpolation(bool 
> pixel_center_integer,
> } else {
>bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
> }
> -   wpos = offset(wpos, 1);
> +   wpos = offset(wpos, bld, 1);
>  
> /* gl_FragCoord.y */
> if (!flip && pixel_center_integer) {
> @@ -936,7 +941,7 @@ fs_visitor::emit_fragcoord_interpolation(bool 
> pixel_center_integer,
>  
>bld.ADD(wpos, pixel_y, fs_reg(offset));
> }
> -   wpos = offset(wpos, 1);
> +   wpos = offset(wpos, bld, 1);
>  
> /* gl_FragCoord.z */
> if (devinfo->gen >= 6) {
> @@ -946,7 +951,7 @@ fs_visitor::emit_fragcoord_interpolation(bool 
> pixel_center_integer,
> this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
> interp_reg(VARYING_SLOT_POS, 2));
> }
> -   wpos = offset(wpos, 1);
> +   wpos = offset(wpos, bld, 1);
>  
> /* gl_FragCoord.w: Already set up in emit_interpolation */
> bld.MOV(wpos, this->wpos_w);
> @@ -1029,7 +1034,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, 
> const char *name,
>   /* If there's no incoming setup data for this slot, don't
>* emit interpolation for it.
>*/
> - attr = offset(attr, type->vector_elements);
> + attr = offset(attr, bld, type->vector_elements);
>   location++;
>   continue;
>}
> @@ -1044,7 +1049,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, 
> const char *name,
>  interp = suboffset(interp, 3);
> interp.type = attr.type;
> bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
> -attr = offset(attr, 1);
> +attr = offset(attr, bld, 1);
>   }
>} else {
>   /* Smooth/noperspective interpolation case. */
> @@ -1082,7 +1087,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, 
> const char *name,
> if (devinfo->gen < 6 && interpolation_mode == 
> INTERP_QUALIFIER_SMOOTH) {
>bld.MUL(attr, attr, this->pixel_w);
> }
> -attr = offset(attr, 1);
> +attr = offset(attr, bld, 1);
>   }
>  
>}
> @@ -1190,7 +1195,7 @@ fs_visitor::emit_samplepos_setup()
> }
> /* Compute gl_SamplePosition.x */
> compute_sample_position(pos, int_sample_x);
> -   pos = offset(pos, 1);
> +   pos = offset(pos, abld, 1);
> if (dispatch_width == 8) {
>abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
> } else {
> @@ -2980,10 +2985,6 @@ fs_visitor::lower_load_payload()
>  
>assert(inst->dst.file == MRF || inst->dst.file == GRF);
>assert(inst->saturate == false);
> -
> -  const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
> - .exec_all(inst->force_writemask_all)
> - .at(block, inst);
>fs_reg dst = inst->dst;
>  
>/* Get rid of COMPR4.  We'll add it back in if we need it */
> @@ -2991,17 +2992,23 @@ fs_visitor::lower_load_payload()
>   dst.reg = dst.reg & ~BRW_MRF_COMPR4;
>  
>dst.width = 8;
> +  const fs_builder hbld = bld.group(8, 0).exec_all().at(block, inst);
> +
>for (uint8_t i = 0; i < inst->header_size; i++) {
>   if (inst->src[i].file != BAD_FILE) {
>  fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
>  fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
>  mov_src.width = 8;
> -

Re: [Mesa-dev] [PATCH v2 05/19] i965/fs: Explicitly set the exec_size on the add(32) in interpolation setup

2015-06-30 Thread Francisco Jerez
Jason Ekstrand  writes:

> Soon we will start using the builder to explicitly set all the execution
> sizes.  We could make a 32-wide builder, but the builder asserts that we
> never grow it which is usually a reasonable assumption.  Since this one
> instruction is a bit of an odd-ball, we just set the exec_size explicitly.
>
> Reviewed-by: Iago Toral Quiroga 
>
> v2: Explicitly new the fs_inst instead of using the builder and setting
> exec_size after the fact.
>
> v3: Set force_writemask_all with the builder instead of directly.  The
> builder over-writes it if we set it manually.  Also, if we don't have
> force_writemask_all in the builder it will assert-fail on SIMD32.

It seems to me that it would be useful to be able to create instructions
using the builder with execution size higher than the default as long as
force_writemask_all is set, as this isn't the first time I've found
myself wanting that feature -- In fact before your series it was
possible to do it by using a destination register of the correct width.

How about we change the builder to allow it?  (Patch attached)

> ---
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 10 ++
>  1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 9a4bad6..8976c25 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -1357,10 +1357,12 @@ fs_visitor::emit_interpolation_setup_gen6()
> */
>fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
>BRW_REGISTER_TYPE_UW, dispatch_width * 2);
> -  abld.exec_all()
> -  .ADD(int_pixel_xy,
> -   fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
> -   fs_reg(brw_imm_v(0x11001010)));
> +  fs_inst *add =
> + new (mem_ctx) fs_inst(BRW_OPCODE_ADD, dispatch_width * 2,
> +   int_pixel_xy,
> +   fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
> +   fs_reg(brw_imm_v(0x11001010)));
> +  abld.exec_all().emit(add);
>  
>this->pixel_x = vgrf(glsl_type::float_type);
>this->pixel_y = vgrf(glsl_type::float_type);
> -- 
> 2.4.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev

From 09f6cb08cd9951d8618dea7360aa7619cc806988 Mon Sep 17 00:00:00 2001
From: Francisco Jerez 
Date: Tue, 30 Jun 2015 15:15:44 +0300
Subject: [PATCH] i965/fs: Relax fs_builder channel group assertion when
 force_writemask_all is on.

This assertion was meant to catch code inadvertently escaping the
control flow jail determined by the group of channel enable signals
selected by some caller, however it seems useful to be able to
increase the default execution size as long as force_writemask_all is
enabled, because force_writemask_all is an explicit indication that
there is no longer a one-to-one correspondence between channels and
SIMD components so the restriction doesn't apply.

In addition, reorder the calls to fs_builder::group and ::exec_all in a
couple of places to make sure that we don't temporarily break this
invariant in the future for instructions with exec_size higher than
the dispatch width.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp   | 4 ++--
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 4 ++--
 src/mesa/drivers/dri/i965/brw_fs_cse.cpp   | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8658554..430710c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2963,8 +2963,8 @@ fs_visitor::lower_load_payload()
   assert(inst->dst.file == MRF || inst->dst.file == GRF);
   assert(inst->saturate == false);
 
-  const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
- .exec_all(inst->force_writemask_all)
+  const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
+ .group(inst->exec_size, inst->force_sechalf)
  .at(block, inst);
   fs_reg dst = inst->dst;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 58ac598..012180f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -99,8 +99,8 @@ namespace brw {
   fs_builder
   group(unsigned n, unsigned i) const
   {
- assert(n <= dispatch_width() &&
-i < dispatch_width(

[Mesa-dev] [PATCH] i965/gen9: Use custom MOCS entries set up by the kernel.

2015-06-30 Thread Francisco Jerez
Instead of relying on hardware defaults the i915 kernel driver is
going to program custom MOCS tables system-wide on Gen9 hardware.  The
"WT" entry previously used for renderbuffers had a number of problems:
It disabled caching on eLLC, it used a reserved L3 cacheability
setting, and it used to override the PTE controls making renderbuffers
always WT on LLC regardless of the kernel's setting.  Instead use an
entry from the new MOCS tables with parameters: TC=LLC/eLLC, LeCC=PTE,
L3CC=WB.

Even though the corresponding kernel change is in a way an ABI break
it doesn't seem necessary to check that the kernel is recent enough
because the change should only affect Gen9 which is still unreleased
hardware.
---
Note that this change is based on Ville's "[PATCH 1/2] i965: House
MOCS settings in brw_context/brw_device_info":

http://lists.freedesktop.org/archives/mesa-dev/2015-June/086665.html

 src/mesa/drivers/dri/i965/brw_defines.h | 16 +++-
 src/mesa/drivers/dri/i965/brw_device_info.c |  5 +++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 497da9c..2889118 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2499,12 +2499,18 @@ enum brw_wm_barycentric_interp_mode {
  */
 #define CHV_MOCS_L3 0x78
 
-/* Skylake: MOCS is now an index into an array of 64 different configurable
- * cache settings.  We still use only either write-back or write-through; and
- * rely on the documented default values.
+/* Skylake: MOCS is now an index into an array of 64 different caching
+ * configurations programmed by the kernel.
  */
-#define SKL_MOCS_WB (0b001001 << 1)
-#define SKL_MOCS_WT (0b000101 << 1)
+/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
+#define SKL_MOCS_WB  (1 << 1)
+/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
+#define SKL_MOCS_PTE (9 << 1)
+
+/* Broxton: As for Skylake this should match the tables set up by the kernel.
+ */
+/* L3CC=WB */
+#define BXT_MOCS_L3  (9 << 1)
 
 #define MEDIA_VFE_STATE 0x7000
 /* GEN7 DW2, GEN8+ DW3 */
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c 
b/src/mesa/drivers/dri/i965/brw_device_info.c
index 167ecb5..d5133e0 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -305,7 +305,6 @@ static const struct brw_device_info brw_device_info_chv = {
 };
 
 /* Thread counts and URB limits are placeholders, and may not be accurate. */
-/* FINISHME: Use PTE MOCS on Skylake. */
 #define GEN9_FEATURES   \
.gen = 9,\
.has_hiz_and_separate_stencil = true,\
@@ -315,7 +314,7 @@ static const struct brw_device_info brw_device_info_chv = {
.max_vs_threads = 280,   \
.max_gs_threads = 256,   \
.max_wm_threads = 408,   \
-   .mocs_pte = SKL_MOCS_WT, \
+   .mocs_pte = SKL_MOCS_PTE,\
.mocs_wb = SKL_MOCS_WB,  \
.urb = { \
   .size = 128,  \
@@ -352,6 +351,8 @@ static const struct brw_device_info brw_device_info_bxt = {
.max_vs_threads = 112,
.max_gs_threads = 112,
.max_wm_threads = 32,
+   .mocs_pte = BXT_MOCS_L3,
+   .mocs_wb = BXT_MOCS_L3,
.urb = {
   .size = 64,
   .min_vs_entries = 34,
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965/gen9: Use custom MOCS entries set up by the kernel.

2015-06-30 Thread Francisco Jerez
Ben Widawsky  writes:

> On Tue, Jun 30, 2015 at 11:25:42PM +0300, Francisco Jerez wrote:
>> Instead of relying on hardware defaults the i915 kernel driver is
>> going to program custom MOCS tables system-wide on Gen9 hardware.  The
>> "WT" entry previously used for renderbuffers had a number of problems:
>> It disabled caching on eLLC, it used a reserved L3 cacheability
>> setting, and it used to override the PTE controls making renderbuffers
>> always WT on LLC regardless of the kernel's setting.  Instead use an
>> entry from the new MOCS tables with parameters: TC=LLC/eLLC, LeCC=PTE,
>> L3CC=WB.
>> 
>> Even though the corresponding kernel change is in a way an ABI break
>> it doesn't seem necessary to check that the kernel is recent enough
>> because the change should only affect Gen9 which is still unreleased
>> hardware.
>
> I think the commit message is a bit confusing. You correctly mention the 
> WT->PTE
> fix, but then the reasoning for the WB change isn't clear [to me].
>
Right, I probably didn't mention it because the meaning of the WB define
didn't change at all: the index into the new MOCS table is different, but
it should have the same semantics.

> In any case, I think it makes a lot more sense to fix the PTE setting as one
> patch for the old table, then a patch to update both WB and WT to the new 
> table
> settings.

I tried to split up the patch that way originally, but unfortunately
there's no entry in the default MOCS table equivalent to the new PTE
setting, and there is also no equivalent to the old WT setting in the
custom MOCS table (and it probably doesn't make sense to add one just
for the sake of having a nice git history), so it doesn't seem easily
possible to do it backwards either (first update to the new table, then
switch to the PTE MOCS setting).

> Also, we do have customers (Canonical) that want to make this work on
> mesa 10.5, and with an older kernel. Therefore I think the two separate 
> patches,
> and doing it without the dependency on Ville's patch (which I like FWIW) make
> the lives of everyone easiest. Then Ville can rebase his patch on top of this
> for mesa 10.7 time.
>
The problem is that an equivalent patch not based on Ville's refactor
would involve a considerable amount of churn because the BXT and SKL WB
entries (which are used in many different places) don't match (sigh).
It may not be suitable for stable either way, unless we drop BXT support
or are OK with adding a bunch of ternary operators, basically anywhere
SKL_MOCS_WB is used.

> I did think of it, but never broached the subject of whether we want to send
> both my MOCS patch and the PTE version of this patch to stable.
>
> Anyway, the concept here is definitely
> Acked-by: Ben Widawsky 
>
>> ---
>> Note that this change is based on Ville's "[PATCH 1/2] i965: House
>> MOCS settings in brw_context/brw_device_info":
>> 
>> http://lists.freedesktop.org/archives/mesa-dev/2015-June/086665.html
>
> Could you include a reference to the kernel patch too if you end up resending?

Ah, sure, here it is FTR, I didn't notice I hadn't included the link
until it was too late:

http://lists.freedesktop.org/archives/intel-gfx/2015-June/070244.html

>
>> 
>>  src/mesa/drivers/dri/i965/brw_defines.h | 16 +++-
>>  src/mesa/drivers/dri/i965/brw_device_info.c |  5 +++--
>>  2 files changed, 14 insertions(+), 7 deletions(-)
>> 
>> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
>> b/src/mesa/drivers/dri/i965/brw_defines.h
>> index 497da9c..2889118 100644
>> --- a/src/mesa/drivers/dri/i965/brw_defines.h
>> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
>> @@ -2499,12 +2499,18 @@ enum brw_wm_barycentric_interp_mode {
>>   */
>>  #define CHV_MOCS_L3 0x78
>>  
>> -/* Skylake: MOCS is now an index into an array of 64 different configurable
>> - * cache settings.  We still use only either write-back or write-through; 
>> and
>> - * rely on the documented default values.
>> +/* Skylake: MOCS is now an index into an array of 64 different caching
>> + * configurations programmed by the kernel.
>>   */
>> -#define SKL_MOCS_WB (0b001001 << 1)
>> -#define SKL_MOCS_WT (0b000101 << 1)
>> +/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
>> +#define SKL_MOCS_WB  (1 << 1)
>> +/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
>> +#define SKL_MOCS_PTE (9 << 1)
>> +
>> +/* Broxton: As for Skylake this should match the tables set up by the 
>> kernel.
>> + */
>> +/* L3CC=WB */
>> +#define BXT_MOCS_L3  (9 << 1)
>>  
&

Re: [Mesa-dev] [PATCH] i965/gen9: Use custom MOCS entries set up by the kernel.

2015-06-30 Thread Francisco Jerez
Ben Widawsky  writes:

> On Wed, Jul 01, 2015 at 12:33:54AM +0300, Francisco Jerez wrote:
>> Ben Widawsky  writes:
>> 
>> > On Tue, Jun 30, 2015 at 11:25:42PM +0300, Francisco Jerez wrote:
>> >> Instead of relying on hardware defaults the i915 kernel driver is
>> >> going to program custom MOCS tables system-wide on Gen9 hardware.  The
>> >> "WT" entry previously used for renderbuffers had a number of problems:
>> >> It disabled caching on eLLC, it used a reserved L3 cacheability
>> >> setting, and it used to override the PTE controls making renderbuffers
>> >> always WT on LLC regardless of the kernel's setting.  Instead use an
>> >> entry from the new MOCS tables with parameters: TC=LLC/eLLC, LeCC=PTE,
>> >> L3CC=WB.
>> >> 
>> >> Even though the corresponding kernel change is in a way an ABI break
>> >> it doesn't seem necessary to check that the kernel is recent enough
>> >> because the change should only affect Gen9 which is still unreleased
>> >> hardware.
>> >
>> > I think the commit message is a bit confusing. You correctly mention the 
>> > WT->PTE
>> > fix, but then the reasoning for the WB change isn't clear [to me].
>> >
>> Right, I probably didn't mention it because the meaning of the WB define
>> didn't change at all, the index into the new MOCS table is different but
>> it should have the same semantics.
>> 
>
> I figured, just add it to the commit message :-)

OK, fixed.

>
>> > In any case, I think it makes a lot more sense to fix the PTE setting as 
>> > one
>> > patch for the old table, then a patch to update both WB and WT to the new 
>> > table
>> > settings.
>> 
>> I tried to split up the patch that way originally, but unfortunately
>> there's no entry in the default MOCS table equivalent to the new PTE
>> setting, and there is also no equivalent to the old WT setting in the
>> custom MOCS table (and it probably doesn't make sense to add one just
>> for the sake of having a nice git history), so it doesn't seem easily
>> possible to do it backwards either (first update to the new table, then
>> switch to the PTE MOCS setting).
>
> Hmm. I must not be following something because it sure looks like the HW
> defaults have indices for the PTE setting. The index you're using from the new
> table, 9 is just the hardware index 2, isn't it?
>
> 1000  10  11  0   0   00  000
>
> Can you explain what I'm missing?
>
I don't have the spec in front of me right now, but I remember that all
entries that had matching (e)LLC settings had different L3 settings (and
the other way around), so the change switching to the new table entries
would have necessarily been a functional change either way.

>> 
>> > Also, we do have customers (Canonical) that want to make this work on
>> > mesa 10.5, and with an older kernel. Therefore I think the two separate 
>> > patches,
>> > and doing it without the dependency on Ville's patch (which I like FWIW) 
>> > make
>> > the lives of everyone easiest. Then Ville can rebase his patch on top of 
>> > this
>> > for mesa 10.7 time.
>> >
>> The problem is that an equivalent patch not based on Ville's refactor
>> would involve a considerable amount of churn because the BXT and SKL WB
>> entries (which are used in many different places) don't match (sigh).
>> It may not be suitable for stable either way, unless we drop BXT support
>> or are OK with adding a bunch of ternary operators, basically anywhere
>> SKL_MOCS_WB is used.
>
> Yeah, we don't need BXT support in stable since BXT won't have PCI IDs until
> 10.7. So I'd be in favor of doing the easy SKL specific thing first if it's
> possible
>
That sounds OK to me, I'll send another version of the patch for stable
with the minimal set of changes for it to work on SKL.

> Please tell me there is a good reason that they didn't make BXT and SKL the
> same...

Heh, ask Peter. :)

>
>> 
>> > I did think of it, but never broached the subject if we want to send both 
>> > my
>> > MOCS patch, and the PTE version of this patch to stable.
>> >
>> > Anyway, the concept here is definitely
>> > Acked-by: Ben Widawsky 
>> >
>> >> ---
>> >> Note that this change is based on Ville's "[PATCH 1/2] i965: House
>> >> MOCS settings in brw_context/brw_device_info":

Re: [Mesa-dev] [PATCH 2/2] i965/fs: Use the builder directly for the gen6 interpolation add(32)

2015-07-01 Thread Francisco Jerez
Jason Ekstrand  writes:

> Now that we can create builders with a bigger width than their parent as
> long as it's exec_all, we don't need to create the instruction manually.
> ---
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 11 +--
>  1 file changed, 5 insertions(+), 6 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 79ebb2d..890ddc1 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> @@ -1358,12 +1358,11 @@ fs_visitor::emit_interpolation_setup_gen6()
> */
>fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
>BRW_REGISTER_TYPE_UW);
> -  fs_inst *add =
> - new (mem_ctx) fs_inst(BRW_OPCODE_ADD, dispatch_width * 2,
> -   int_pixel_xy,
> -   fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
> -   fs_reg(brw_imm_v(0x11001010)));
> -  abld.exec_all().emit(add);
> +
> +  const fs_builder abld32 = abld.exec_all().group(dispatch_width * 2, 0);

The abld32 name seems misleading because this can actually be a 16 or 32
wide builder depending on dispatch_width.  I suggest "dbld" (d for
double), or just expand the definition in its only user and get rid of
the temporary.  With that fixed:

Reviewed-by: Francisco Jerez 

> +  abld32.ADD(int_pixel_xy,
> + fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
> + fs_reg(brw_imm_v(0x11001010)));
>  
>this->pixel_x = vgrf(glsl_type::float_type);
>this->pixel_y = vgrf(glsl_type::float_type);
> -- 
> 2.4.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCHv0.5] i965/gen9: Use custom MOCS entries set up by the kernel.

2015-07-01 Thread Francisco Jerez
Instead of relying on hardware defaults the i915 kernel driver is
going to program custom MOCS tables system-wide on Gen9 hardware.  The
"WT" entry previously used for renderbuffers had a number of problems:
It disabled caching on eLLC, it used a reserved L3 cacheability
setting, and it used to override the PTE controls making renderbuffers
always WT on LLC regardless of the kernel's setting.  Instead use an
entry from the new MOCS tables with parameters: TC=LLC/eLLC, LeCC=PTE,
L3CC=WB.

The "WB" entry previously used for anything other than renderbuffers
has moved to a different index in the new MOCS tables but it should
have the same caching semantics as the old entry.

Even though the corresponding kernel change ("drm/i915 : Added
Programming of the MOCS") is in a way an ABI break it doesn't seem
necessary to check that the kernel is recent enough because the change
should only affect Gen9 which is still unreleased hardware.

v0.5: Drop BXT support to keep the change minimal and make it easier
  to back-port to stable. (Ben)
---
 src/mesa/drivers/dri/i965/brw_defines.h| 11 ++-
 src/mesa/drivers/dri/i965/gen8_surface_state.c |  3 +--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 66b9abc..6b8a5ea 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2491,12 +2491,13 @@ enum brw_wm_barycentric_interp_mode {
 #define BDW_MOCS_WT  0x58
 #define BDW_MOCS_PTE 0x18
 
-/* Skylake: MOCS is now an index into an array of 64 different configurable
- * cache settings.  We still use only either write-back or write-through; and
- * rely on the documented default values.
+/* Skylake: MOCS is now an index into an array of 64 different caching
+ * configurations programmed by the kernel.
  */
-#define SKL_MOCS_WB (0b001001 << 1)
-#define SKL_MOCS_WT (0b000101 << 1)
+/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
+#define SKL_MOCS_WB  (1 << 1)
+/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
+#define SKL_MOCS_PTE (9 << 1)
 
 #define MEDIA_VFE_STATE 0x7000
 /* GEN7 DW2, GEN8+ DW3 */
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c 
b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index b2d1a57..a154b0a 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -401,8 +401,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
   irb->mt_layer : (irb->mt_layer / MAX2(mt->num_samples, 1));
GLenum gl_target =
   rb->TexImage ? rb->TexImage->TexObject->Target : GL_TEXTURE_2D;
-   /* FINISHME: Use PTE MOCS on Skylake. */
-   uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_WT : BDW_MOCS_PTE;
+   uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_PTE : BDW_MOCS_PTE;
 
intel_miptree_used_for_rendering(mt);
 
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965/gen9: Use custom MOCS entries set up by the kernel on BXT.

2015-07-01 Thread Francisco Jerez
Follow-up to "i965/gen9: Use custom MOCS entries set up by the
kernel.", sent as a separate patch to make the SKL change easier to
back-port to stable branches.
---
This change depends on Ville's "[PATCH 1/2] i965: House MOCS settings
in brw_context/brw_device_info":

http://lists.freedesktop.org/archives/mesa-dev/2015-June/086665.html

 src/mesa/drivers/dri/i965/brw_defines.h | 5 +
 src/mesa/drivers/dri/i965/brw_device_info.c | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index ac9af6d..2889118 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2507,6 +2507,11 @@ enum brw_wm_barycentric_interp_mode {
 /* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
 #define SKL_MOCS_PTE (9 << 1)
 
+/* Broxton: As for Skylake this should match the tables set up by the kernel.
+ */
+/* L3CC=WB */
+#define BXT_MOCS_L3  (9 << 1)
+
 #define MEDIA_VFE_STATE 0x7000
 /* GEN7 DW2, GEN8+ DW3 */
 # define MEDIA_VFE_STATE_MAX_THREADS_SHIFT  16
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c 
b/src/mesa/drivers/dri/i965/brw_device_info.c
index cb80256..d5133e0 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -351,6 +351,8 @@ static const struct brw_device_info brw_device_info_bxt = {
.max_vs_threads = 112,
.max_gs_threads = 112,
.max_wm_threads = 32,
+   .mocs_pte = BXT_MOCS_L3,
+   .mocs_wb = BXT_MOCS_L3,
.urb = {
   .size = 64,
   .min_vs_entries = 34,
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 16/19] i965/fs: Use the builder dispatch_width for computing register offsets

2015-07-01 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Fri, Jun 26, 2015 at 11:51 AM, Francisco Jerez  
> wrote:
>> Jason Ekstrand  writes:
>>
>>> On Fri, Jun 26, 2015 at 8:52 AM, Francisco Jerez  
>>> wrote:
>>>> Jason Ekstrand  writes:
>>>>
>>>>> Reviewed-by: Topi Pohjolainen 
>>>>> ---
>>>>>  src/mesa/drivers/dri/i965/brw_fs.h | 2 +-
>>>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
>>>>> b/src/mesa/drivers/dri/i965/brw_fs.h
>>>>> index d4cc43d..d94a842 100644
>>>>> --- a/src/mesa/drivers/dri/i965/brw_fs.h
>>>>> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
>>>>> @@ -72,7 +72,7 @@ offset(fs_reg reg, const brw::fs_builder& bld, unsigned 
>>>>> delta)
>>>>> case MRF:
>>>>> case ATTR:
>>>>>return byte_offset(reg,
>>>>> - delta * MAX2(reg.width * reg.stride, 1) *
>>>>> + delta * bld.dispatch_width() * reg.stride *
>>>>
>>>> Er...  This doesn't look right for stride == 0.  If you keep the
>>>> MAX2(.., 1) expression this patch is:
>>>
>>> I don't think offset() even makes sense for something with stride ==
>>> 0.  I added "assert(stride != 0)" right above the byte_offset() call
>>> and it passed Jenkins.  Would that be an acceptable alternative?
>>
>> stride == 0 implies that each logical component of your vector takes 1
>> scalar component (because all the N channels of your SIMDN value are one
>> and the same scalar in your register file), that means that logically
>> independent components of a vector or array are stored 1 scalar apart,
>> and the previous code was doing the right thing.
>
> I still think offset() is bogus for stride == 0.  However, I don't
> really feel like arguing the point, so I added the MAX2().

No need to argue about it, let me explain it step by step:

In the FS back-end (SoA) registers are in general a sequence of SIMDN
values, each SIMDN value being itself a sequence of N per-channel scalar
values.

Agreed?

offset(reg, i) returns another register reg' based on the i-th SIMDN
value from the start of reg.  [IOW if reg logically represented a vector
(say in a high-level language like GLSL) reg' would be at the i-th
logical component of the vector]

Agreed?

offset(reg, i) is well-defined as long as the size of a single SIMDN
value is well-defined in the register file, because logically
independent elements of a SoA sequence of SIMDN values are simply stored
contiguously.

Agreed?

The size of a SIMDN value with stride=0 (i.e. a uniform) in the register
file is the same as the size of a single scalar value.  [And, although
it's irrelevant here, the size of a SIMDN value with stride!=0 is
stride*type_sz(type)]

Agreed?

If you agreed with the last two points, you'll also agree that
offset(reg, i) is well-defined for reg.stride=0.  If you're still not
convinced stop for a moment and consider the natural layout of an array
of uniforms in the GRF, and what would be the natural way to pick the
i-th component from such an array.

> --Jason
>
>>> --Jason
>>>
>>>> Reviewed-by: Francisco Jerez 
>>>>
>>>>>   type_sz(reg.type));
>>>>> case UNIFORM:
>>>>>reg.reg_offset += delta;
>>>>> --
>>>>> 2.4.3
>>>>>
>>>>> ___
>>>>> mesa-dev mailing list
>>>>> mesa-dev@lists.freedesktop.org
>>>>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH v2 23/82] glsl: Do not do CSE for expressions involving SSBO loads

2015-07-03 Thread Francisco Jerez
Samuel Iglesias Gonsálvez  writes:

> On 29/06/15 09:11, Jordan Justen wrote:
>> On 2015-06-24 07:36:24, Iago Toral wrote:
>>> On Wed, 2015-06-24 at 15:43 +0300, Francisco Jerez wrote:
>>>> AFAICT the reason why this (and many of the other changes in GLSL
>>>> optimization passes) is needed is because SSBO loads have been
>>>> implemented as ir_expression nodes instead of being lowered into
>>>> intrinsics (as other side-effectful operations do like
>>>> ARB_shader_image_load_store and ARB_shader_atomic_counters).  This
>>>> surely broke the assumption of a number of optimization passes that
>>>> ir_expression nodes behave as pure functions.  I guess the reason why
>>>> you've done it this way is because UBO loads were already being
>>>> represented as expressions, so I see why you may have wanted to use the
>>>> same approach for SSBOs even though there is a fundamental difference
>>>> between the two: UBO loads have no side effects and are constant for a
>>>> given set of arguments and a given shader execution, SSBO loads and
>>>> stores are not.  SSBO stores couldn't be accommodated into the same
>>>> framework so easily, and you decided to create a separate ir node for
>>>> them, which seems inconsistent with loads.  Intrinsics would probably
>>>> have been a good fit for both loads and stores, and would have made all
>>>> these optimization changes unnecessary...
>>>>
>>>> P.S.: Sorry for the late reply, I was on vacation when I was CC'ed.
>>>
>>> Right, your assessment about the reasons behind the current
>>> implementation is correct. I was not aware of these issues when I
>>> decided to go with the current implementation, now it does look like
>>> going with GLSL intrinsics would have made things a bit easier. I
>>> suppose it would make sense to revisit the implementation in the near
>>> future taking your work on arb_shader_image_load_store as a reference.
>> 
>> While we're waiting for curro's work to land, I was hoping to review
>> and let you guys land the first ~30 front end patches. These patches
>> would allow some compiler tests to pass if the extension is
>> overridden. (Plus, it would take a big chunk out of this large
>> series.)
>> 
>> Unfortunately, I think you should rework the load/store ops as
>> intrinsics as recommended by curro.
>> 
>> Once you have the extension working again with intrinsics, could you
>> re-post the early patches before the 'i965' patches start?
>> 
>> Does this seem like a reasonable plan?
>> 
>
Hi Samuel,

> Iago and I are working on defining SSBO load/store as GLSL IR
> intrinsics. After looking at what Francisco did for
> ARB_shader_image_load_store, we found some differences.
>
> ARB_shader_image_load_store defines imageStore() and imageLoad() as
> built-in functions and they are called explicitly by GLSL shaders. In
> our case SSBO load/store are implicit and, because of that, we need to
> do a lowering pass when we have all the needed SSBOs information, i.e.
> at link time, similar to what we did in the patch series.
>
> Our idea is to make that lowering pass to inject ir_call nodes replacing
> the creation of ir_ssbo_store nodes and ssbo_load expressions we had before.
>
> In order to inject those ir_call nodes, we are thinking about doing some
> steps similar to how built-in functions are defined but inside the
> lowering pass:
>
> 1) Create ir_function_signature for the corresponding intrinsic (SSBO
> store or load)
> 2) Create an ir_function with the desired name and add the signature
> created in first step to it.
> 3) Create an ir_call node passing as argument the created
> ir_function_signature and the list of variables that SSBO store/load
> need to work.
> 4) Add the new ir_call to the list of IR instructions.
>

That sounds roughly right to me.

> However we don't know if this is the proper approach for several reasons:
>
> * As we are executing the lowering pass in link time, we don't have the
> table of symbols (it was deleted before), so we cannot add the created
> ir_function to it (like built-in function's definition code does).
> * Creating ir_function_signature in the lowering pass doesn't seem right
> to us but, as the table of symbols has been deleted, we cannot get it
> from other place if it was created before.

I think these should be fine because GLSL shaders are not expected to
call the intrinsic explicitly so your lowering pass can just keep a set
of pointers to the intrinsic

Re: [Mesa-dev] [PATCH] i965/fs: Don't disable SIMD16 when using the pixel interpolator

2015-07-03 Thread Francisco Jerez
Neil Roberts  writes:

> There was a comment saying that in SIMD16 mode the pixel interpolator
> returns coords interleaved 8 channels at a time and that this requires
> extra work to support. However, this interleaved format is exactly
> what the PLN instruction requires so I don't think anything needs to
> be done to support it apart from removing the line to disable it and
> to ensure that the message lengths for the send message are correct.
>
> I am more convinced that this is correct because as it says in the
> comment this interleaved output is identical to what is given in the
> thread payload. The code generated to apply the plane equation to
> these coordinates is identical on SIMD16 and SIMD8 except that the
> dispatch width is larger which implies no special unmangling is
> needed.
>
> Perhaps the confusion stems from the fact that the description of the
> PLN instruction in the IVB PRM seems to imply that the src1 inputs are
> not interleaved so it wouldn't work. However, in the HSW and BDW PRMs,
> the pseudo-code is different and looks like it expects the interleaved
> format. Mesa doesn't seem to generate different code on IVB to
> uninterleave the payload registers and everything is working so I can
> only assume that the PRM is wrong.
>
> I tested the interpolateAt tests on HSW and did a full Piglit run on
> IVB and there were no regressions.
> ---
>
> I've CC'd Chris Forbes because according to git-annotate he wrote the
> original comment so he might know something I don't.
>
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 11 +++
>  1 file changed, 3 insertions(+), 8 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index 59081ea..717e597 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -1461,12 +1461,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
> nir_intrinsic_instr *instr
> case nir_intrinsic_interp_var_at_centroid:
> case nir_intrinsic_interp_var_at_sample:
> case nir_intrinsic_interp_var_at_offset: {
> -  /* in SIMD16 mode, the pixel interpolator returns coords interleaved
> -   * 8 channels at a time, same as the barycentric coords presented in
> -   * the FS payload. this requires a bit of extra work to support.
> -   */
> -  no16("interpolate_at_* not yet supported in SIMD16 mode.");
> -

Heh, I happened to come across this comment yesterday while looking for
the remaining no16 calls and wondered why on earth it couldn't do the
same thing that the normal interpolation code does.  After this patch and a
series coming up that will remove all SIMD8 fallbacks from the texturing
code, the only case left still applicable to Gen7 hardware and later
will be "SIMD16 explicit accumulator operands unsupported".  Anyone?

>fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
>  
>/* For most messages, we need one reg of ignored data; the hardware
> @@ -1531,7 +1525,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
> nir_intrinsic_instr *instr
> bld.SEL(offset(src, i), itemp, fs_reg(7)));
>  }
>  
> -mlen = 2;
> +mlen = 2 * dispatch_width / 8;
>  inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, 
> dst_xy, src,
>  fs_reg(0u));
>   }
> @@ -1543,7 +1537,8 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
> nir_intrinsic_instr *instr
>}
>  
>inst->mlen = mlen;
> -  inst->regs_written = 2; /* 2 floats per slot returned */
> +  /* 2 floats per slot returned */
> +  inst->regs_written = 2 * dispatch_width / 8;
>inst->pi_noperspective = instr->variables[0]->var->data.interpolation 
> ==
> INTERP_QUALIFIER_NOPERSPECTIVE;
>  
> -- 
> 1.9.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/3] clover: separate compile and link stages

2015-07-05 Thread Francisco Jerez
Hi EdB, a bunch of comments inline,

EdB  writes:

> ---
>  src/gallium/state_trackers/clover/api/program.cpp  |   6 +-
>  .../state_trackers/clover/core/compiler.hpp|   7 +-
>  src/gallium/state_trackers/clover/core/error.hpp   |  21 ++
>  src/gallium/state_trackers/clover/core/program.cpp |  93 ++-
>  src/gallium/state_trackers/clover/core/program.hpp |  10 +-
>  .../state_trackers/clover/llvm/invocation.cpp  | 281 
> +++--
>  6 files changed, 323 insertions(+), 95 deletions(-)
>
> diff --git a/src/gallium/state_trackers/clover/api/program.cpp 
> b/src/gallium/state_trackers/clover/api/program.cpp
> index e9b1f38..2441d81 100644
> --- a/src/gallium/state_trackers/clover/api/program.cpp
> +++ b/src/gallium/state_trackers/clover/api/program.cpp
> @@ -184,10 +184,6 @@ clBuildProgram(cl_program d_prog, cl_uint num_devs,
> prog.build(devs, opts);

I don't think there's any reason to keep the program::build method
around anymore, it's only going to be called from this entry point so
you could as well make the two function calls to ::compile() and
::link() directly from here.

> return CL_SUCCESS;
>  } catch (error &e) {
> -   if (e.get() == CL_INVALID_COMPILER_OPTIONS)
> -  return CL_INVALID_BUILD_OPTIONS;
> -   if (e.get() == CL_COMPILE_PROGRAM_FAILURE)
> -  return CL_BUILD_PROGRAM_FAILURE;
> return e.get();
>  }
>  
> @@ -224,7 +220,7 @@ clCompileProgram(cl_program d_prog, cl_uint num_devs,
>range(header_names, num_headers),
>objs(d_header_progs, num_headers));
>  
> -   prog.build(devs, opts, headers);
> +   prog.compile(devs, opts, headers);
> return CL_SUCCESS;
>  
>  } catch (error &e) {
> diff --git a/src/gallium/state_trackers/clover/core/compiler.hpp 
> b/src/gallium/state_trackers/clover/core/compiler.hpp
> index c68aa39..31fb6ee 100644
> --- a/src/gallium/state_trackers/clover/core/compiler.hpp
> +++ b/src/gallium/state_trackers/clover/core/compiler.hpp
> @@ -32,11 +32,16 @@ namespace clover {
>  
> module compile_program_llvm(const std::string &source,
> const header_map &headers,
> -   pipe_shader_ir ir,
> const std::string &target,
> const std::string &opts,
> std::string &r_log);
>  
> +   module link_program_llvm(const std::vector &modules,
> +enum pipe_shader_ir ir,
> +const std::string &target,
> +const std::string &opts,
> +std::string &r_log);
> +
> module compile_program_tgsi(const std::string &source);
>  }
>  
> diff --git a/src/gallium/state_trackers/clover/core/error.hpp 
> b/src/gallium/state_trackers/clover/core/error.hpp
> index 780b973..3c1bf90 100644
> --- a/src/gallium/state_trackers/clover/core/error.hpp
> +++ b/src/gallium/state_trackers/clover/core/error.hpp
> @@ -68,10 +68,31 @@ namespace clover {
> class build_error : public error {
> public:
>build_error(const std::string &what = "") :
> + error(CL_BUILD_PROGRAM_FAILURE, what) {
> +  }
> +   };
> +
This exception class now seems redundant -- With program::build() gone
build is no longer a thing.

> +   class compile_error : public error {
> +   public:
> +  compile_error(const std::string &what = "") :
>   error(CL_COMPILE_PROGRAM_FAILURE, what) {
>}
> };
>  
> +   class link_error : public error {
> +   public:
> +  link_error(const std::string &what = "") :
> + error(CL_LINK_PROGRAM_FAILURE, what) {
> +  }
> +   };
> +
> +   class link_option_error : public error {
> +   public:
> +  link_option_error(const std::string &what = "") :
> + error(CL_INVALID_LINKER_OPTIONS , what) {
> +  }
> +   };
> +

I don't think you really need to special-case link_option_error against
the less specific clover::error class?

> template
> class invalid_object_error;
>  
> diff --git a/src/gallium/state_trackers/clover/core/program.cpp 
> b/src/gallium/state_trackers/clover/core/program.cpp
> index 0d6cc40..21faf4e 100644
> --- a/src/gallium/state_trackers/clover/core/program.cpp
> +++ b/src/gallium/state_trackers/clover/core/program.cpp
> @@ -40,15 +40,37 @@ program::program(clover::context &ctx,
>  }
>  
>  void
> -program::build(const ref_vector &devs, const char *opts,
> -   const header_map &headers) {
> +program::build(const ref_vector &devs, const char *opts) {
> +

Unnecessary whitespace.

> +   if (has_source) {
> +  try {
> + compile(devs, opts, {});
> + if (!link(devs, opts, {*this}, true))
> +throw error(CL_BUILD_PROGRAM_FAILURE);
> +  } catch (error &e) {
> + switch (e.get()) {
> +case CL_INVALID_COMPILER_OPTIONS:
> +case CL_INVALID_LINKER_OPTIONS:
> +   e = error(CL_INVALID_BUILD_OPTIONS);
> +

Re: [Mesa-dev] [PATCH 2/3] clover: override ret_object

2015-07-05 Thread Francisco Jerez
EdB  writes:

> Return an API object from an intrusive smart reference Clover object,
> incrementing the reference count of the object.
> ---
>  src/gallium/state_trackers/clover/api/util.hpp | 12 
>  1 file changed, 12 insertions(+)
>
> diff --git a/src/gallium/state_trackers/clover/api/util.hpp 
> b/src/gallium/state_trackers/clover/api/util.hpp
> index 918df61..6af28f2 100644
> --- a/src/gallium/state_trackers/clover/api/util.hpp
> +++ b/src/gallium/state_trackers/clover/api/util.hpp
> @@ -61,6 +61,18 @@ namespace clover {
>   *p = desc(v());
>}
> }
> +
> +   ///
> +   /// Return an API object from an intrusive smart reference Clover object,
> +   /// incrementing the reference count of the object.
> +   ///

The function below looks OK, but the explanation doesn't make much sense
to me.  How about "[...] from an intrusive reference to a Clover object
[...]"?

With that fixed:
Reviewed-by: Francisco Jerez 

> +   template
> +   typename T::descriptor_type *
> +   ret_object(const intrusive_ref &v) {
> +  v().retain();
> +  return desc(v());
> +   }
> +
>  }
>  
>  #endif
> -- 
> 2.4.3


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/3] clover: separate compile and link stages

2015-07-05 Thread Francisco Jerez
EdB  writes:

> On Sunday 05 July 2015 18:15:33 Francisco Jerez wrote:
>>[...]
>> > --- a/src/gallium/state_trackers/clover/core/error.hpp
>> > +++ b/src/gallium/state_trackers/clover/core/error.hpp
>> > @@ -68,10 +68,31 @@ namespace clover {
>> > 
>> > class build_error : public error {
>> > 
>> > public:
>> >build_error(const std::string &what = "") :
>> > + error(CL_BUILD_PROGRAM_FAILURE, what) {
>> > +  }
>> > +   };
>> > +
>> 
>> This exception class now seems redundant -- With program::build() gone
>> build is no longer a thing.
>
> It's still needed by tgsi.
> I plan to rework this part later to make it consistent with the way it's 
> handled in llvm/invocation but first off I wanted to be done with clLink :/
>
The tgsi path could also throw compile_error AFAICT?

>> 
>> > +   class compile_error : public error {
>> > +   public:
>> > 
>> > +  compile_error(const std::string &what = "") :
>> >   error(CL_COMPILE_PROGRAM_FAILURE, what) {
>> >
>> >}
>> > 
>> > };
>> > 
>> > +   class link_error : public error {
>> > +   public:
>> > +  link_error(const std::string &what = "") :
>> > + error(CL_LINK_PROGRAM_FAILURE, what) {
>> > +  }
>> > +   };
>> > +
>> > +   class link_option_error : public error {
>> > +   public:
>> > +  link_option_error(const std::string &what = "") :
>> > + error(CL_INVALID_LINKER_OPTIONS , what) {
>> > +  }
>> > +   };
>> > +
>> 
>> I don't think you really need to special-case link_option_error against
>> the less specific clover::error class?
>
> clLinkProgram should not create a program if it failed to parse the given 
> options, I will use this class to handle this case. Other case should create 
> the said program.
> That said, it could also have been created in a later patch.
>

Ah, fair enough.

>> 
>> > template
>> > class invalid_object_error;
>> > 
>> > diff --git a/src/gallium/state_trackers/clover/core/program.cpp
>> > b/src/gallium/state_trackers/clover/core/program.cpp index
>> > 0d6cc40..21faf4e 100644
>> > --- a/src/gallium/state_trackers/clover/core/program.cpp
>> > +++ b/src/gallium/state_trackers/clover/core/program.cpp
>>[...]
>> 
>> > +  std::string log;
>> > +
>> > +  try {
>> > + auto module = link_program_llvm(mods,
>> > + dev.ir_format(),
>> > dev.ir_target(),
>> > + opts, log);
>> > + _binaries.insert({ &dev, module });
>> > + append_to_log(&dev, log);
>> > +  } catch (const link_option_error &) {
>> > + append_to_log(&dev, log);
>> > + throw;
>> > +  } catch (const error &) {
>> > + append_to_log(&dev, log);
>> > + r = false;
>> 
>> I suggest you just catch "const error &", update the error log and
>> rethrow here, so you save a catch block and an error subclass.
>
> As explained, clLinkProgram doesn't behave the same way regarding errors during 
> option parsing and after.
>
Still I doubt that you need to handle them separately here, just update
the log and rethrow whatever you got, clLinkProgram can still give
link_error special treatment and return the created program despite the
failure for the application to be able to read back the error log.

>[...]


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965/fs: Don't disable SIMD16 when using the pixel interpolator

2015-07-05 Thread Francisco Jerez
Hi Matt,

Matt Turner  writes:

> On Fri, Jul 3, 2015 at 3:46 AM, Francisco Jerez  wrote:
>> Heh, I happened to come across this comment yesterday while looking for
>> the remaining no16 calls and wondered why on earth it couldn't do the
>> same thing that the normal interpolation code does.  After this patch and a
>> series coming up that will remove all SIMD8 fallbacks from the texturing
>> code, the only case left still applicable to Gen7 hardware and later
>> will be "SIMD16 explicit accumulator operands unsupported".  Anyone?
>
> I can explain the problem:
>
> Prior to Gen7, there were two accumulator registers usable for most
> datatypes (acc0, acc1). On Gen7, they removed integer-support from
> acc1, which was necessary to implement SIMD16 integer multiplication
> using the normal MUL/MACH sequence.

IIRC they got rid of the acc1 register on IVB altogether, but managed to
emulate it for floating point types by taking advantage of the extra
precision not normally used for floating point arithmetic (the fake acc1
basically uses the same storage in the EU that holds the 32 MSBs of each
component of acc0), which explains the apparent asymmetry between integer
and floating point data types.

> I implemented 32-bit integer multiplication without using the
> accumulator in:
>
> commit f7df169ba13d22338e9276839a7e9629ca0a6b4f
> Author: Matt Turner 
> Date:   Wed May 13 18:34:03 2015 -0700
>
> i965/fs: Implement integer multiply without mul/mach.
>
> The remaining cases of "SIMD16 explicit accumulator operands
> unsupported" are ADDC, SUBB, and 32x32 -> high 32-bit multiplication.
> The remaining multiplication case can probably be reimplemented
> without the accumulator, like I did for the low 32-bit result.
>
Hmm, I have the suspicion that high 32-bit multiplication is the one
legit use-case of the accumulator we have left, any algorithm breaking
it up into individual 32/16-bit MULs would end up doing more
multiplications than the two MUL/MACH instructions we do now, because we
wouldn't be able to take advantage of the full precision implemented in
the hardware if we truncate the 48-bit intermediate results to fit in a
32-bit register.

How about we use the SIMD width lowering pass to split the computation
in half?  It should be quite straightforward but will probably require
adding a new virtual opcode so that the SIMD width lowering pass doesn't
have to deal with (seriously fucked-up) accumulators directly.

> The ADDC and SUBB instructions implicitly write a bit to the
> accumulator if their operations overflowed. The 1Q/2Q quarter control
> is supposed to select which register is implicitly written -- except
> that there is no acc1 for integer types. Haswell and newer ignore the
> quarter control and always write acc0, but IVB (and presumably BYT)
> attempt to write to the nonexistent acc1.
>
> You could split the SIMD16 operations into 2x SIMD8s and set
> force_writemask_all on the second, followed by a 2Q MOV from the
> accumulator. Maybe we'd rather use the .o (overflow) conditional mod
> on a result ADD to implement this.
>
Yeah.  I did in fact try to implement uaddCarry last Friday without
using the accumulator by doing something like:

| CMP.o tmp, src0, -src1
| MOV dst, -tmp

...which of course didn't work because of the extra argument precision
post-source modifiers and also because the .o condmod doesn't work at
all on CMP, but...

> Ideally, we'd recognize and merge the addition and carry operations into a
> single ADDC instruction, but it's pretty unimportant. It's all pretty
> academic -- I've never seen an application use either operation (or
> [iu]mulExtended either).

...if we did the following instead:

| ADD tmp, src0, src1
| CMP.l tmp, tmp, src0
| MOV dst, -tmp

the ADD could be easily CSE'ed with the original ADD instruction (and
the source modifier of the last MOV can also be easily propagated into
some other instruction), so even though it seems like one instruction
more than what we emit now it might be a net win (aside from it working
on SIMD16).  usubBorrow is even easier:

| CMP.l tmp, src0, src1
| MOV dst, -tmp

I was planning to run it through shader-db tomorrow but if you say
you've never seen them used I guess I shouldn't get my hopes too high? :P


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] clover: Implement image attribute getters

2015-07-06 Thread Francisco Jerez
Zoltán Gilián  writes:

>> This seems to be doing essentially the same thing as v1?  Is it the
>> right patch?
>
> The llvm pass was invoked in clover in v1. This patch relies on llvm
> to perform that task (). What this patch does basically is that it
> adds the image attributes to the end of the kernel input vector.
> The commit message of this patch is misleading, I'll fix it.
>
NAK.  Just like in v1, you're implementing the same pipe driver-specific
policy in Clover's core layer -- If you don't feel like fixing this
properly as I described in my reply to v1, it would be acceptable to
implement it for the time being using a workaround similar to
llvm/invocation.cpp:433 -- Hint: you'll need new
module::argument::semantic enums.

Thanks.

> On Wed, Jun 24, 2015 at 2:48 PM, Francisco Jerez  
> wrote:
>> Zoltan Gilian  writes:
>>
>>> Image attributes are passed to the kernel as hidden parameters after the
>>> image attribute itself. An llvm pass replaces the getter builtins to
>>> the appropriate parameters.
>>
>> This seems to be doing essentially the same thing as v1?  Is it the
>> right patch?
>>
>>> ---
>>>  src/gallium/state_trackers/clover/core/kernel.cpp  | 26 +++
>>>  src/gallium/state_trackers/clover/core/kernel.hpp  | 13 ++--
>>>  src/gallium/state_trackers/clover/core/memory.cpp  |  2 +-
>>>  .../state_trackers/clover/llvm/invocation.cpp  | 81 
>>> +-
>>>  4 files changed, 116 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp 
>>> b/src/gallium/state_trackers/clover/core/kernel.cpp
>>> index 0756f06..291c799 100644
>>> --- a/src/gallium/state_trackers/clover/core/kernel.cpp
>>> +++ b/src/gallium/state_trackers/clover/core/kernel.cpp
>>> @@ -185,6 +185,13 @@ 
>>> kernel::exec_context::bind(intrusive_ptr _q,
>>>}
>>> }
>>>
>>> +   // Bind image attribute args.
>>> +   for (const auto& arg: kern._args) {
>>> +  if (auto img_arg = dynamic_cast(arg.get())) {
>>> + img_arg->bind_attributes(*this);
>>> +  }
>>> +   }
>>> +
>>> // Create a new compute state if anything changed.
>>> if (!st || q != _q ||
>>> cs.req_local_mem != mem_local ||
>>> @@ -465,6 +472,25 @@ kernel::constant_argument::unbind(exec_context &ctx) {
>>>  }
>>>
>>>  void
>>> +kernel::image_argument::bind_attributes(exec_context &ctx) {
>>> +   cl_image_format format = img->format();
>>> +   cl_uint attributes[] = {
>>> + static_cast(img->width()),
>>> + static_cast(img->height()),
>>> + static_cast(img->depth()),
>>> + format.image_channel_data_type,
>>> + format.image_channel_order};
>>> +   for (unsigned i = 0; i < 5; ++i) {
>>> +  auto v = bytes(attributes[i]);
>>> +
>>> +  extend(v, module::argument::zero_ext, sizeof(cl_uint));
>>> +  byteswap(v, ctx.q->device().endianness());
>>> +  align(ctx.input, sizeof(cl_uint));
>>> +  insert(ctx.input, v);
>>> +   }
>>> +}
>>> +
>>> +void
>>>  kernel::image_rd_argument::set(size_t size, const void *value) {
>>> if (size != sizeof(cl_mem))
>>>throw error(CL_INVALID_ARG_SIZE);
>>> diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp 
>>> b/src/gallium/state_trackers/clover/core/kernel.hpp
>>> index d6432a4..8c15b2f 100644
>>> --- a/src/gallium/state_trackers/clover/core/kernel.hpp
>>> +++ b/src/gallium/state_trackers/clover/core/kernel.hpp
>>> @@ -190,7 +190,14 @@ namespace clover {
>>>   pipe_surface *st;
>>>};
>>>
>>> -  class image_rd_argument : public argument {
>>> +  class image_argument : public argument {
>>> +  public:
>>> + void bind_attributes(exec_context &ctx);
>>> +  protected:
>>> + image *img;
>>> +  };
>>> +
>>> +  class image_rd_argument : public image_argument {
>>>public:
>>>   virtual void set(size_t size, const void *value);
>>>   virtual void bind(exec_context &ctx,
>>> @@ -198,11 +205,10 @@ namespace clover {
>>>   virtual void unbind(exec_context &ctx);
>>>
>>>private:
>>> - image *img;
>&g

[Mesa-dev] [PATCH 1/3] i965/gen4-5: Set ENDIF dst and src0 fields to the null register.

2015-07-06 Thread Francisco Jerez
The hardware docs don't mention explicitly what these fields should
be, but I've verified experimentally on ILK that using a GRF as
destination causes the register to be corrupted when the execution
size of an ENDIF instruction is higher than 8 -- and because the
destination we were using was g0, eventually a hang.

Fixes some 150 piglit tests on Gen4-5 when forced to run shaders with
if conditionals 16-wide, e.g. shaders/glsl-fs-sampler-numbering-3.
---
 src/mesa/drivers/dri/i965/brw_eu_emit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c 
b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 0f53604..4d39762 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -1584,8 +1584,8 @@ brw_ENDIF(struct brw_codegen *p)
}
 
if (devinfo->gen < 6) {
-  brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
-  brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
+  brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+  brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0));
} else if (devinfo->gen == 6) {
   brw_set_dest(p, insn, brw_imm_w(0));
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/3] i965/gen4-5: Program the execution size correctly for DO/WHILE instructions.

2015-07-06 Thread Francisco Jerez
From the hardware docs for the DO instruction:

 "Execution size is ignored for this instruction."

My observation on ILK hardware contradicts the spec though, channels
over the execution size of a DO instruction won't enter the loop, and
channels over the execution size of a WHILE instruction will exit the
loop after the first iteration -- The latter is consistent with the
spec though, there's no claim that the execution size is ignored for
the WHILE instruction so it's not completely unexpected that it has an
influence on the evaluation of EMask.

The execute_size argument of brw_DO() shouldn't have any effect on
Gen6 and newer hardware.  On Gen4-5 WHILE instructions inherit the
execution size from the matching DO, so this patch should fix them
too.  The execution size of BREAK and CONT instructions was already
being set correctly.

Fixes some 50 piglit tests on Gen4-5 when forced to run shaders with
conditional and loop instructions 16-wide,
e.g. shaders/glsl-fs-continue-inside-do-while.
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 0a70bdc..c986d91 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1869,7 +1869,7 @@ fs_generator::generate_code(const cfg_t *cfg, int 
dispatch_width)
 break;
 
   case BRW_OPCODE_DO:
-brw_DO(p, BRW_EXECUTE_8);
+brw_DO(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
 break;
 
   case BRW_OPCODE_BREAK:
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] i965/gen4-5: Enable 16-wide dispatch on shaders with control flow.

2015-07-06 Thread Francisco Jerez
This was probably disabled due to a combination of several bugs in the
generator code (fixed earlier in this series) and a misunderstanding
of the hardware spec.  The documentation for most control flow
instructions mentions among other restrictions:

 "Instruction compression is not allowed."

This however doesn't have any implications on 16 wide not being
supported, because none of the control flow instructions have
multi-register operands (control flow instructions are not compressed
on more recent hardware either, except maybe SNB's IF with inline
compare).  In fact Gen4-5 had 16-wide control flow masks and stacks,
and the spec mentions in several places that control flow instructions
push and pop 16 channels worth of data -- Otherwise there doesn't seem
to be any indication that it shouldn't work.

Causes no piglit regressions, and gives the following shader-db
results on ILK:

 total instructions in shared programs: 4711384 -> 4711384 (0.00%)
 instructions in affected programs: 0 -> 0
 helped:0
 HURT:  0
 GAINED:1215
 LOST:  0
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index bd71404..5247738 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -417,18 +417,12 @@ fs_visitor::nir_emit_if(nir_if *if_stmt)
 
bld.emit(BRW_OPCODE_ENDIF);
 
-   if (!try_replace_with_sel() && devinfo->gen < 6) {
-  no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
+   try_replace_with_sel();
 }
 
 void
 fs_visitor::nir_emit_loop(nir_loop *loop)
 {
-   if (devinfo->gen < 6) {
-  no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
-
bld.emit(BRW_OPCODE_DO);
 
nir_emit_cf_list(&loop->body);
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965/fs: Don't disable SIMD16 when using the pixel interpolator

2015-07-07 Thread Francisco Jerez
Matt Turner  writes:

> On Sun, Jul 5, 2015 at 4:45 PM, Francisco Jerez  wrote:
>> Hi Matt,
>>
>> Matt Turner  writes:
>>
>>> On Fri, Jul 3, 2015 at 3:46 AM, Francisco Jerez  
>>> wrote:
>>>> Heh, I happened to come across this comment yesterday while looking for
>>>> the remaining no16 calls and wondered why on earth it couldn't do the
>>>> same that the normal interpolation code does.  After this patch and a
>>>> series coming up that will remove all SIMD8 fallbacks from the texturing
>>>> code, the only case left still applicable to Gen7 hardware and later
>>>> will be "SIMD16 explicit accumulator operands unsupported".  Anyone?
>>>
>>> I can explain the problem:
>>>
>>> Prior to Gen7, there were two accumulator registers usable for most
>>> datatypes (acc0, acc1). On Gen7, they removed integer-support from
>>> acc1, which was necessary to implement SIMD16 integer multiplication
>>> using the normal MUL/MACH sequence.
>>
>> IIRC they got rid of the acc1 register on IVB altogether, but managed to
>> emulate it for floating point types by taking advantage of the extra
>> precision not normally used for floating point arithmetic (the fake acc1
>> basically uses the same storage in the EU that holds the 32 MSBs of each
>> component of acc0), which explains the apparent asymmetry between integer
>> and floating point data types.
>
> I've never read anything that told me that -- what have you seen?

Heh, I'll try to dig up my reference and send it to you in private.

>
>>> I implemented 32-bit integer multiplication without using the
>>> accumulator in:
>>>
>>> commit f7df169ba13d22338e9276839a7e9629ca0a6b4f
>>> Author: Matt Turner 
>>> Date:   Wed May 13 18:34:03 2015 -0700
>>>
>>> i965/fs: Implement integer multiply without mul/mach.
>>>
>>> The remaining cases of "SIMD16 explicit accumulator operands
>>> unsupported" are ADDC, SUBB, and 32x32 -> high 32-bit multiplication.
>>> The remaining multiplication case can probably be reimplemented
>>> without the accumulator, like I did for the low 32-bit result.
>>>
>> Hmm, I have the suspicion that high 32-bit multiplication is the one
>> legit use-case of the accumulator we have left, any algorithm breaking
>> it up into individual 32/16-bit MULs would end up doing more
>> multiplications than the two MUL/MACH instructions we do now, because we
>> wouldn't be able to take advantage of the full precision implemented in
>> the hardware if we truncate the 48-bit intermediate results to fit in a
>> 32-bit register.
>
> That's probably true. It's just that Sandybridge and earlier don't
> expose the functionality (but could do 64-bit integer multiplication
> just fine), Ivybridge has the quarter-control/accumulator bug, Haswell
> works fine if you split the multiplication sequence into SIMD8, and
> Broadwell lets you do 32x32 -> 64-bit multiplication without the
> accumulator.
>
> So you have only two platforms where you have to use the
> accumulator, and one of them is broken (but I guess can be trivially
> fixed by some force-writemask-all hackery).
>

I guess there's also VLV, CHV and BXT, AFAIK the latter two have some
level of support for 64-bit multiplication (with the annoying alignment
restriction on the operands) but it might be easier for them to use the
accumulator path like earlier hardware.

> The best SIMD16 code for [iu]mulExtended() where both lsb and msb
> results are used is probably 2 sets of mul/mach/mov (with some kind of
> work around for Ivybridge), but that's kind of hard to recognize.
>
It's probably also the best SIMD16 code (on chips without reasonable
support for 64-bit multiply that is) for computing the high 32 bits of
the result, regardless of whether the optimizer is able to recognise that
the low 32 bits of the computation also come out as a side product, and
whether or not the low 32 bits are used by the shader.

A potential solution could be to have the visitor emit full 64-bit MULs
speculatively for any 32-bit integer multiplication (high or low),
together with a MOV to chop off the unnecessary bits, a later
optimization pass (run after CSE to give the optimizer the opportunity
to merge the 64-bits MULs from the high and low 32-bit computations)
would demote 64-bit MULs for which only the lowest 32-bits of the result
are used to 32-bit MULs, later on the SIMD width lowering pass would
split 16-wide 64-bit MULs in half, and a later pass would lower them
into the MUL/MACH sequence on platforms that don

Re: [Mesa-dev] [PATCH 0/8] Render node only opencl and pipe-loader cleanups

2015-07-07 Thread Francisco Jerez
Emil Velikov  writes:

> On 30/06/15 16:09, Emil Velikov wrote:
>> Hello all,
>> 
>> As mentioned over IRC a few weeks back, here is a series that removes 
>> support for non-render node devices.
>> 
>> The two main motivations being:
>>  - Currently we force X/xcb onto everyone that wants to use OpenCL
>> (headless OpenCL systems/farms anyone ?)
>>  - Nice overall cleanup - 43 insertions(+), 279 deletions(-)
>> 
>> 
>> Note that the final patches touch related code - from removing a unused 
>> function (pipe_loader_sw_probe_xlib) to using loader_open_device() over 
>> open(), with the former caring about CLOEXEC.
>>
> Francisco, Tom,
>
> Can you guys please take a look at the series. Even an Ack would be
> greatly appreciated.
>

Looks OK to me, assuming that Tom is OK with the general approach the
series is:
Reviewed-by: Francisco Jerez 

> Thanks
> Emil


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCHv2] i965/gen9: Use custom MOCS entries set up by the kernel.

2015-07-07 Thread Francisco Jerez
Instead of relying on hardware defaults the i915 kernel driver is
going to program custom MOCS tables system-wide on Gen9 hardware.  The
"WT" entry previously used for renderbuffers had a number of problems:
It disabled caching on eLLC, it used a reserved L3 cacheability
setting, and it used to override the PTE controls making renderbuffers
always WT on LLC regardless of the kernel's setting.  Instead use an
entry from the new MOCS tables with parameters: TC=LLC/eLLC, LeCC=PTE,
L3CC=WB.

The "WB" entry previously used for anything other than renderbuffers
has moved to a different index in the new MOCS tables but it should
have the same caching semantics as the old entry.

Even though the corresponding kernel change ("drm/i915: Added
Programming of the MOCS") is in a way an ABI break it doesn't seem
necessary to check that the kernel is recent enough because the change
should only affect Gen9 which is still unreleased hardware.

v2: Update MOCS values for the new Android-incompatible tables
introduced in v7 of the kernel patch.

Cc: 10.6 
---
 src/mesa/drivers/dri/i965/brw_defines.h| 11 ++-
 src/mesa/drivers/dri/i965/gen8_surface_state.c |  3 +--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 66b9abc..8ab8d62 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -2491,12 +2491,13 @@ enum brw_wm_barycentric_interp_mode {
 #define BDW_MOCS_WT  0x58
 #define BDW_MOCS_PTE 0x18
 
-/* Skylake: MOCS is now an index into an array of 64 different configurable
- * cache settings.  We still use only either write-back or write-through; and
- * rely on the documented default values.
+/* Skylake: MOCS is now an index into an array of 62 different caching
+ * configurations programmed by the kernel.
  */
-#define SKL_MOCS_WB (0b001001 << 1)
-#define SKL_MOCS_WT (0b000101 << 1)
+/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
+#define SKL_MOCS_WB  (2 << 1)
+/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
+#define SKL_MOCS_PTE (1 << 1)
 
 #define MEDIA_VFE_STATE 0x7000
 /* GEN7 DW2, GEN8+ DW3 */
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c 
b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index bd3eb00..dfaf762 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -401,8 +401,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
   irb->mt_layer : (irb->mt_layer / MAX2(mt->num_samples, 1));
GLenum gl_target =
   rb->TexImage ? rb->TexImage->TexObject->Target : GL_TEXTURE_2D;
-   /* FINISHME: Use PTE MOCS on Skylake. */
-   uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_WT : BDW_MOCS_PTE;
+   uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_PTE : BDW_MOCS_PTE;
 
intel_miptree_used_for_rendering(mt);
 
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [Mesa-stable] [PATCHv2] i965/gen9: Use custom MOCS entries set up by the kernel.

2015-07-09 Thread Francisco Jerez
Ben Widawsky  writes:

> On Tue, Jul 07, 2015 at 10:21:28PM +0300, Francisco Jerez wrote:
>> Instead of relying on hardware defaults the i915 kernel driver is
>> going to program custom MOCS tables system-wide on Gen9 hardware.  The
>> "WT" entry previously used for renderbuffers had a number of problems:
>> It disabled caching on eLLC, it used a reserved L3 cacheability
>> setting, and it used to override the PTE controls making renderbuffers
>> always WT on LLC regardless of the kernel's setting.  Instead use an
>> entry from the new MOCS tables with parameters: TC=LLC/eLLC, LeCC=PTE,
>> L3CC=WB.
>> 
>> The "WB" entry previously used for anything other than renderbuffers
>> has moved to a different index in the new MOCS tables but it should
>> have the same caching semantics as the old entry.
>> 
>> Even though the corresponding kernel change ("drm/i915: Added
>> Programming of the MOCS") is in a way an ABI break it doesn't seem
>> necessary to check that the kernel is recent enough because the change
>> should only affect Gen9 which is still unreleased hardware.
>> 
>> v2: Update MOCS values for the new Android-incompatible tables
>> introduced in v7 of the kernel patch.
>> 
>> Cc: 10.6 
>
> It'd be cool to get perf data, but certainly not a requirement here since the
> requirement to change is pretty obvious, IMO (mostly, I'm just curious). I do
> like having the References: in the commit for the kernel patch, but that's 
> just
> me, and I can live with whatever.
>
I ran SynMark on SKL with this patch applied last Monday and didn't spot
any significant differences.  Some of the benchmarks seemed to give
quite erratic results regardless.  Meh...

>> ---
>>  src/mesa/drivers/dri/i965/brw_defines.h| 11 ++-
>>  src/mesa/drivers/dri/i965/gen8_surface_state.c |  3 +--
>>  2 files changed, 7 insertions(+), 7 deletions(-)
>> 
>> diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
>> b/src/mesa/drivers/dri/i965/brw_defines.h
>> index 66b9abc..8ab8d62 100644
>> --- a/src/mesa/drivers/dri/i965/brw_defines.h
>> +++ b/src/mesa/drivers/dri/i965/brw_defines.h
>> @@ -2491,12 +2491,13 @@ enum brw_wm_barycentric_interp_mode {
>>  #define BDW_MOCS_WT  0x58
>>  #define BDW_MOCS_PTE 0x18
>>  
>> -/* Skylake: MOCS is now an index into an array of 64 different configurable
>> - * cache settings.  We still use only either write-back or write-through; 
>> and
>> - * rely on the documented default values.
>> +/* Skylake: MOCS is now an index into an array of 62 different caching
>> + * configurations programmed by the kernel.
>
> I'd keep the '64' instead of '62'; the latter is a software construct, but
> whatever you like.

It's an actual hardware limitation, the last two entries are reserved by
the hardware and are neither configurable (as the previous comment said)
nor can be programmed by the kernel (as my comment would imply had I
left the 64).

>
>>   */
>> -#define SKL_MOCS_WB (0b001001 << 1)
>> -#define SKL_MOCS_WT (0b000101 << 1)
>> +/* TC=LLC/eLLC, LeCC=WB, LRUM=3, L3CC=WB */
>> +#define SKL_MOCS_WB  (2 << 1)
>> +/* TC=LLC/eLLC, LeCC=PTE, LRUM=3, L3CC=WB */
>> +#define SKL_MOCS_PTE (1 << 1)
>>  
>>  #define MEDIA_VFE_STATE 0x7000
>>  /* GEN7 DW2, GEN8+ DW3 */
>> diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c 
>> b/src/mesa/drivers/dri/i965/gen8_surface_state.c
>> index bd3eb00..dfaf762 100644
>> --- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
>> +++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
>> @@ -401,8 +401,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
>>irb->mt_layer : (irb->mt_layer / MAX2(mt->num_samples, 1));
>> GLenum gl_target =
>>rb->TexImage ? rb->TexImage->TexObject->Target : GL_TEXTURE_2D;
>> -   /* FINISHME: Use PTE MOCS on Skylake. */
>> -   uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_WT : BDW_MOCS_PTE;
>> +   uint32_t mocs = brw->gen >= 9 ? SKL_MOCS_PTE : BDW_MOCS_PTE;
>
> I don't know the policy on const really, but this is a good opportunity to
> const.

Sure, why not, const is always good.

>>  
>> intel_miptree_used_for_rendering(mt);
>>  
>
> Reviewed-by: Ben Widawsky 

Thanks.

>
> ___
> mesa-stable mailing list
> mesa-sta...@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-stable


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [HACK] i965/fs: Fix ordering of src0 alpha and oMask in the framebuffer write payload.

2015-07-09 Thread Francisco Jerez
We were passing src0 alpha and oMask in reverse order.  There seems to
be no good way to pass them in the correct order to the new-style
LOAD_PAYLOAD (how surprising) because src0 alpha is per-channel while
oMask is not.  Just split src0 alpha in fixed-width registers and pass
them to LOAD_PAYLOAD as if they were part of the header as work-around
for now.

I've written a piglit test that demonstrates the problem by using
gl_SampleMask from a fragment shader with multiple color outputs [1].

[1] http://lists.freedesktop.org/archives/piglit/2015-July/016499.html
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 94d6a58..304ae74 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1535,6 +1535,19 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
   length++;
}
 
+   if (src0_alpha.file != BAD_FILE && color0.file != BAD_FILE) {
+  /* Neat, we need to chop the src0 alpha component and pass it as part of
+   * the header even though it has per-channel semantics, because the next
+   * optional field is header-like and LOAD_PAYLOAD requires all such
+   * fields to form a contiguous segment at the beginning of the message.
+   */
+  for (unsigned i = 0; i < exec_size / 8; i++) {
+ setup_color_payload(&sources[length], src0_alpha, 1, 8,
+ use_2nd_half || i == 1);
+ length++;
+  }
+   }
+
prog_data->uses_omask =
   prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
if (prog_data->uses_omask) {
@@ -1561,19 +1574,14 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
  offset(this->outputs[0], bld, 3),
  1, exec_size, false);
   length += 4;
-   } else if (color1.file == BAD_FILE) {
-  if (src0_alpha.file != BAD_FILE) {
- setup_color_payload(&sources[length], src0_alpha, 1, exec_size, 
false);
- length++;
-  }
-
-  setup_color_payload(&sources[length], color0, components,
-  exec_size, use_2nd_half);
-  length += 4;
} else {
   setup_color_payload(&sources[length], color0, components,
   exec_size, use_2nd_half);
   length += 4;
+
+   }
+
+   if (color1.file != BAD_FILE) {
   setup_color_payload(&sources[length], color1, components,
   exec_size, use_2nd_half);
   length += 4;
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [HACK] i965/fs: Fix rescale_texcoord() for SIMD16 and remove no16 fall-back.

2015-07-09 Thread Francisco Jerez
Aside from the trivial GRF underallocation problem in the
"devinfo->gen < 6 && is_rect" if-block, the texrect scale uniform
look-up code was assuming a one-to-one mapping between UNIFORM
register indices and the param array, which only holds during the
SIMD8 run.

It seems dubious that this needs to manipulate the param array
directly even though it doesn't have a fixed meaning (all constants if
you're building SIMD8, push constants if you're building SIMD16).  We
would probably be better off not using the ancient state token
tracking stuff which forces you to recompile the program anytime a
sampler uniform binding changes and doesn't work at all for
ARB_gpu_shader5-style variable indexing of samplers.  Instead this
could be implemented like images do by passing sampler metadata
preemptively at a fixed offset from the sampler uniform that is later
on eliminated by the optimizer in case it's not needed.

This depends on another patch I sent a while ago "i965/fs: Don't
overwrite fs_visitor::uniforms and ::param_size during the SIMD16
run." [1].  No piglit regressions.

[1] http://lists.freedesktop.org/archives/mesa-dev/2015-May/083484.html
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 23 +++
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 94d6a58..dcd2e4e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -708,17 +708,23 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int 
coord_components,
 0
   };
 
-  no16("rectangle scale uniform setup not supported on SIMD16\n");
-  if (dispatch_width == 16) {
-return coordinate;
-  }
-
   GLuint index = _mesa_add_state_reference(params,
   (gl_state_index *)tokens);
   /* Try to find existing copies of the texrect scale uniforms. */
   for (unsigned i = 0; i < uniforms; i++) {
- if (stage_prog_data->param[i] ==
- &prog->Parameters->ParameterValues[index][0]) {
+ /* Neat, there's an extra level of indirection between the fake
+  * UNIFORM file and the push/pull param arrays, but *only* during
+  * non-SIMD8 runs (i.e. SIMD16).
+  */
+ const gl_constant_value *param =
+(dispatch_width == 8 ? stage_prog_data->param[i] :
+ push_constant_loc[i] >= 0 ?
+stage_prog_data->param[push_constant_loc[i]] :
+ pull_constant_loc[i] >= 0 ?
+stage_prog_data->pull_param[pull_constant_loc[i]] :
+ NULL);
+
+ if (param == &prog->Parameters->ParameterValues[index][0]) {
 scale_x = fs_reg(UNIFORM, i);
 scale_y = fs_reg(UNIFORM, i + 1);
 break;
@@ -727,6 +733,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int 
coord_components,
 
   /* If we didn't already set them up, do so now. */
   if (scale_x.file == BAD_FILE) {
+ assert(dispatch_width == 8);
  scale_x = fs_reg(UNIFORM, uniforms);
  scale_y = fs_reg(UNIFORM, uniforms + 1);
 
@@ -742,7 +749,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int 
coord_components,
 * tracking to get the scaling factor.
 */
if (devinfo->gen < 6 && is_rect) {
-  fs_reg dst = fs_reg(GRF, alloc.allocate(coord_components));
+  fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_F, coord_components);
   fs_reg src = coordinate;
   coordinate = dst;
 
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] i965/fs: Reimplement nir_op_uadd_carry and _usub_borrow without accumulator.

2015-07-09 Thread Francisco Jerez
This gets rid of two no16() fall-backs and should allow better
scheduling of the generated IR.  There are no uses of usubBorrow() or
uaddCarry() in shader-db so no changes are expected.  However the
"arb_gpu_shader5/execution/built-in-functions/fs-usubBorrow" and
"arb_gpu_shader5/execution/built-in-functions/fs-uaddCarry" piglit
tests go from 40 to 28 instructions.  The reason is that the plain ADD
instruction can easily be CSE'ed with the original addition, and the
negation can easily be propagated into the source modifier of another
instruction, so effectively both operations can be performed with just
one instruction.

No piglit regressions.
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 33 +---
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 6d9e9d3..3b6aa0a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -829,29 +829,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
   break;
 
-   case nir_op_uadd_carry: {
-  if (devinfo->gen >= 7)
- no16("SIMD16 explicit accumulator operands unsupported\n");
-
-  struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-  BRW_REGISTER_TYPE_UD);
-
-  bld.ADDC(bld.null_reg_ud(), op[0], op[1]);
-  bld.MOV(result, fs_reg(acc));
+   case nir_op_uadd_carry:
+  /* Use signed operands for the ADD to be easily CSE'ed with the original
+   * addition (e.g. in case we're implementing the uaddCarry() GLSL
+   * built-in).
+   */
+  bld.ADD(result, retype(op[0], BRW_REGISTER_TYPE_D),
+  retype(op[1], BRW_REGISTER_TYPE_D));
+  bld.CMP(result, retype(result, BRW_REGISTER_TYPE_UD), op[0],
+  BRW_CONDITIONAL_L);
+  bld.MOV(result, negate(result));
   break;
-   }
 
-   case nir_op_usub_borrow: {
-  if (devinfo->gen >= 7)
- no16("SIMD16 explicit accumulator operands unsupported\n");
-
-  struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-  BRW_REGISTER_TYPE_UD);
-
-  bld.SUBB(bld.null_reg_ud(), op[0], op[1]);
-  bld.MOV(result, fs_reg(acc));
+   case nir_op_usub_borrow:
+  bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
+  bld.MOV(result, negate(result));
   break;
-   }
 
case nir_op_umod:
   bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965/fs: Reimplement nir_op_uadd_carry and _usub_borrow without accumulator.

2015-07-09 Thread Francisco Jerez
Ilia Mirkin  writes:

> FYI there's already a lowering pass that does this in the GLSL IR
> (CARRY_TO_ARITH in lower_instructions). Perhaps the right place to do
> this is NIR though, just wanted to let you know.
>
Ah, I wasn't aware of that flag, that seems even better.  I just tried
it and it seems to generate one instruction more per op than my assembly
code (apparently because our implementation of b2i is suboptimal, could
probably be fixed), but it would also work to get rid of the no16()
calls, which is all I care about right now.

I'll resend using your approach tomorrow.

> On Thu, Jul 9, 2015 at 3:51 PM, Francisco Jerez  wrote:
>> This gets rid of two no16() fall-backs and should allow better
>> scheduling of the generated IR.  There are no uses of usubBorrow() or
>> uaddCarry() in shader-db so no changes are expected.  However the
>> "arb_gpu_shader5/execution/built-in-functions/fs-usubBorrow" and
>> "arb_gpu_shader5/execution/built-in-functions/fs-uaddCarry" piglit
>> tests go from 40 to 28 instructions.  The reason is that the plain ADD
>> instruction can easily be CSE'ed with the original addition, and the
>> negation can easily be propagated into the source modifier of another
>> instruction, so effectively both operations can be performed with just
>> one instruction.
>>
>> No piglit regressions.
>> ---
>>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 33 
>> +---
>>  1 file changed, 13 insertions(+), 20 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> index 6d9e9d3..3b6aa0a 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> @@ -829,29 +829,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
>> nir_alu_instr *instr)
>>bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
>>break;
>>
>> -   case nir_op_uadd_carry: {
>> -  if (devinfo->gen >= 7)
>> - no16("SIMD16 explicit accumulator operands unsupported\n");
>> -
>> -  struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
>> -  BRW_REGISTER_TYPE_UD);
>> -
>> -  bld.ADDC(bld.null_reg_ud(), op[0], op[1]);
>> -  bld.MOV(result, fs_reg(acc));
>> +   case nir_op_uadd_carry:
>> +  /* Use signed operands for the ADD to be easily CSE'ed with the 
>> original
>> +   * addition (e.g. in case we're implementing the uaddCarry() GLSL
>> +   * built-in).
>> +   */
>> +  bld.ADD(result, retype(op[0], BRW_REGISTER_TYPE_D),
>> +  retype(op[1], BRW_REGISTER_TYPE_D));
>> +  bld.CMP(result, retype(result, BRW_REGISTER_TYPE_UD), op[0],
>> +  BRW_CONDITIONAL_L);
>> +  bld.MOV(result, negate(result));
>>break;
>> -   }
>>
>> -   case nir_op_usub_borrow: {
>> -  if (devinfo->gen >= 7)
>> - no16("SIMD16 explicit accumulator operands unsupported\n");
>> -
>> -  struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
>> -  BRW_REGISTER_TYPE_UD);
>> -
>> -  bld.SUBB(bld.null_reg_ud(), op[0], op[1]);
>> -  bld.MOV(result, fs_reg(acc));
>> +   case nir_op_usub_borrow:
>> +  bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
>> +  bld.MOV(result, negate(result));
>>break;
>> -   }
>>
>> case nir_op_umod:
>>bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
>> --
>> 2.4.3
>>
>> ___
>> mesa-dev mailing list
>> mesa-dev@lists.freedesktop.org
>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [HACK] i965/fs: Fix ordering of src0 alpha and oMask in the framebuffer write payload.

2015-07-10 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Jul 9, 2015 7:57 AM, "Francisco Jerez"  wrote:
>>
>> We were passing src0 alpha and oMask in reverse order.  There seems to
>> be no good way to pass them in the correct order to the new-style
>> LOAD_PAYLOAD (how surprising) because src0 alpha is per-channel while
>> oMask is not.  Just split src0 alpha in fixed-width registers and pass
>> them to LOAD_PAYLOAD as if they were part of the header as work-around
>> for now.
>
> Bah... I came across this when I did the LOAD_PAYLOAD rework but thought it
> was only theoretical.  I wasn't very familiar with what omask actually did
> and, since piglit didn't hit it, I wasn't sure if it was a real problem or
> not.  I probably should have done more digging and written a piglit test at
> the time. My bad.
>
> One solution that I proposed at the time was to turn header_size into
> header_mask in the obvious way. We can still use 8 bits because we should
> never have a header source higher than 8.
>

So your idea is to have one bit per source indicating whether it's
header-like or per-channel?  I don't think that extends to instructions
other than LOAD_PAYLOAD (e.g. FB_WRITE) where the same source is at the
same time header and payload.  One bit per 32B register would extend
easily but it would be rather ugly to deal with if you want to keep your
code SIMD width-invariant.

I think if you go with the per-source flag you'll want it to be in its
own subclass of fs_inst.  With its own subclass you could even have an
array of per-source sizes determining the number of registers read for
each source, which would be rather nice for the visitor (no need to
split vectors into components while passing them to LOAD_PAYLOAD).

Still I think the most elegant solution would be to simply get rid of
the header/payload distinction by using force_writemask_all and, if it
proves to be necessary, fix the optimizer to get rid of redundant
force_writemask_all flags where it doesn't do it already.

> Thoughts?
> --Jason
>
>> I've written a piglit test that demonstrates the problem by using
>> gl_SampleMask from a fragment shader with multiple color outputs [1].
>>
>> [1] http://lists.freedesktop.org/archives/piglit/2015-July/016499.html
>> ---
>>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 26
> +-
>>  1 file changed, 17 insertions(+), 9 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>> index 94d6a58..304ae74 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>> @@ -1535,6 +1535,19 @@ fs_visitor::emit_single_fb_write(const fs_builder
> &bld,
>>length++;
>> }
>>
>> +   if (src0_alpha.file != BAD_FILE && color0.file != BAD_FILE) {
>> +  /* Neat, we need to chop the src0 alpha component and pass it as
> part of
>> +   * the header even though it has per-channel semantics, because
> the next
>> +   * optional field is header-like and LOAD_PAYLOAD requires all such
>> +   * fields to form a contiguous segment at the beginning of the
> message.
>> +   */
>> +  for (unsigned i = 0; i < exec_size / 8; i++) {
>> + setup_color_payload(&sources[length], src0_alpha, 1, 8,
>> + use_2nd_half || i == 1);
>> + length++;
>> +  }
>> +   }
>> +
>> prog_data->uses_omask =
>>prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
>> if (prog_data->uses_omask) {
>> @@ -1561,19 +1574,14 @@ fs_visitor::emit_single_fb_write(const fs_builder
> &bld,
>>   offset(this->outputs[0], bld, 3),
>>   1, exec_size, false);
>>length += 4;
>> -   } else if (color1.file == BAD_FILE) {
>> -  if (src0_alpha.file != BAD_FILE) {
>> - setup_color_payload(&sources[length], src0_alpha, 1, exec_size,
> false);
>> - length++;
>> -  }
>> -
>> -  setup_color_payload(&sources[length], color0, components,
>> -  exec_size, use_2nd_half);
>> -  length += 4;
>> } else {
>>setup_color_payload(&sources[length], color0, components,
>>exec_size, use_2nd_half);
>>length += 4;
>> +
>> +   }
>> +
>> +   if (color1.file != BAD_FILE) {
>>setup_color_payload(&sources[length], color1, components,
>>exec_size, use_2nd_half);
>>length += 4;
>> --
>> 2.4.3
>>


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [HACK] i965/fs: Fix ordering of src0 alpha and oMask in the framebuffer write payload.

2015-07-10 Thread Francisco Jerez
Jason Ekstrand  writes:

> On Fri, Jul 10, 2015 at 5:25 AM, Francisco Jerez  
> wrote:
>> Jason Ekstrand  writes:
>>
>>> On Jul 9, 2015 7:57 AM, "Francisco Jerez"  wrote:
>>>>
>>>> We were passing src0 alpha and oMask in reverse order.  There seems to
>>>> be no good way to pass them in the correct order to the new-style
>>>> LOAD_PAYLOAD (how surprising) because src0 alpha is per-channel while
>>>> oMask is not.  Just split src0 alpha in fixed-width registers and pass
>>>> them to LOAD_PAYLOAD as if they were part of the header as work-around
>>>> for now.
>>>
>>> Bah... I came across this when I did the LOAD_PAYLOAD rework but thought it
>>> was only theoretical.  I wasn't very familiar with what omask actually did
>>> and, since piglit didn't hit it, I wasn't sure if it was a real problem or
>>> not.  I probably should have done more digging and written a piglit test at
>>> the time. My bad.
>>>
>>> One solution that I proposed at the time was to turn header_size into
>>> header_mask in the obvious way. We can still use 8 bits because we should
>>> never have a header source higher than 8.
>>>
>>
>> So your idea is to have one bit per source indicating whether it's
>> header-like or per-channel?  I don't think that extends to instructions
>> other than LOAD_PAYLOAD (e.g. FB_WRITE) where the same source is at the
>> same time header and payload.
>
> You're right, it doesn't.  We really shouldn't be conflating them.  We
> should have header_mask and header_present be different fields.  Maybe
> use a union to save space, but they should have different semantic
> meaning and different names.  We should probably also have a
> compr4_mask and get rid of the hackery there.
>
>> One bit per 32B register would extend
>> easily but it would be rather ugly to deal with if you want to keep your
>> code SIMD width-invariant.
>>
>> I think if you go with the per-source flag you'll want it to be in its
>> own subclass of fs_inst.  With its own subclass you could even have an
>> array of per-source sizes determining the number of registers read for
>> each source, which would be rather nice for the visitor (no need to
>> split vectors into components while passing them to LOAD_PAYLOAD).
>>
>> Still I think the most elegant solution would be to simply get rid of
>> the header/payload distinction by using force_writemask_all and, if it
>> proves to be necessary, fix the optimizer to get rid of redundant
>> force_writemask_all flags where it doesn't do it already.
>
> I really don't think that's a good long-term or short-term solution.
>
> How badly are you blocking on this?   I don't really have a lot of
> extra time to work on this at the moment but can carve some out if
> needed.

I'm not blocking on this at all, feel free to fix it however you like,
or just go with this hack for the moment if you have higher priority
stuff to work on right now, I honestly don't care.

> --jason
>
>>> Thoughts?
>>> --Jason
>>>
>>>> I've written a piglit test that demonstrates the problem by using
>>>> gl_SampleMask from a fragment shader with multiple color outputs [1].
>>>>
>>>> [1] http://lists.freedesktop.org/archives/piglit/2015-July/016499.html
>>>> ---
>>>>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 26
>>> +-
>>>>  1 file changed, 17 insertions(+), 9 deletions(-)
>>>>
>>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>>> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>>>> index 94d6a58..304ae74 100644
>>>> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>>>> +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
>>>> @@ -1535,6 +1535,19 @@ fs_visitor::emit_single_fb_write(const fs_builder
>>> &bld,
>>>>length++;
>>>> }
>>>>
>>>> +   if (src0_alpha.file != BAD_FILE && color0.file != BAD_FILE) {
>>>> +  /* Neat, we need to chop the src0 alpha component and pass it as
>>> part of
>>>> +   * the header even though it has per-channel semantics, because
>>> the next
>>>> +   * optional field is header-like and LOAD_PAYLOAD requires all such
>>>> +   * fields to form a contiguous segment at the beginning of the
>>> message.
>>>> +   */
>>>> +  

[Mesa-dev] [PATCH 1/2] i965: Implement b2f and b2i using negation.

2015-07-10 Thread Francisco Jerez
Booleans are represented as 0/-1 on modern hardware which means we can
just negate them to convert them into a numeric type.  Negation has
the benefit that it can be implemented using a source modifier which
can easily be propagated into some other instruction.  shader-db
results on HSW:

 total instructions in shared programs: 5264246 -> 5264211 (-0.00%)
 instructions in affected programs: 1464 -> 1429 (-2.39%)
 helped:15
 HURT:  1

No piglit regressions.
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp   | 4 +---
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 7 +--
 2 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 4690d00..64ff24c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -969,10 +969,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   break;
 
case nir_op_b2i:
-  bld.AND(result, op[0], fs_reg(1));
-  break;
case nir_op_b2f:
-  bld.AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], 
fs_reg(0x3f800000u));
+  bld.MOV(result, negate(op[0]));
   break;
 
case nir_op_f2b:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index c9c2661..fd94a70 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1733,16 +1733,11 @@ vec4_visitor::visit(ir_expression *ir)
   emit(MOV(result_dst, op[0]));
   break;
case ir_unop_b2i:
-  emit(AND(result_dst, op[0], src_reg(1)));
-  break;
case ir_unop_b2f:
   if (devinfo->gen <= 5) {
  resolve_bool_comparison(ir->operands[0], &op[0]);
   }
-  op[0].type = BRW_REGISTER_TYPE_D;
-  result_dst.type = BRW_REGISTER_TYPE_D;
-  emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
-  result_dst.type = BRW_REGISTER_TYPE_F;
+  emit(MOV(result_dst, negate(op[0])));
   break;
case ir_unop_f2b:
   emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCHv2 2/2] i965: Implement nir_op_uadd_carry and _usub_borrow without accumulator.

2015-07-10 Thread Francisco Jerez
This gets rid of two no16() fall-backs and should allow better
scheduling of the generated IR.  There are no uses of usubBorrow() or
uaddCarry() in shader-db so no changes are expected.  However the
"arb_gpu_shader5/execution/built-in-functions/fs-usubBorrow" and
"arb_gpu_shader5/execution/built-in-functions/fs-uaddCarry" piglit
tests go from 40 to 28 instructions.  The reason is that the plain ADD
instruction can easily be CSE'ed with the original addition, and the
b2i negation can easily be propagated into the source modifier of
another instruction, so effectively both operations are performed with
just one instruction.

No piglit regressions.

v2: Rely on carry_to_arith() and borrow_to_arith() to lower these
(Ilia Mirkin).
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp   | 26 --
 src/mesa/drivers/dri/i965/brw_shader.cpp   |  4 +++-
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 17 +
 3 files changed, 12 insertions(+), 35 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 64ff24c..9cccd7f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -836,29 +836,11 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
nir_alu_instr *instr)
   bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
   break;
 
-   case nir_op_uadd_carry: {
-  if (devinfo->gen >= 7)
- no16("SIMD16 explicit accumulator operands unsupported\n");
-
-  struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-  BRW_REGISTER_TYPE_UD);
-
-  bld.ADDC(bld.null_reg_ud(), op[0], op[1]);
-  bld.MOV(result, fs_reg(acc));
-  break;
-   }
+   case nir_op_uadd_carry:
+  unreachable("Should have been lowered by carry_to_arith().");
 
-   case nir_op_usub_borrow: {
-  if (devinfo->gen >= 7)
- no16("SIMD16 explicit accumulator operands unsupported\n");
-
-  struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-  BRW_REGISTER_TYPE_UD);
-
-  bld.SUBB(bld.null_reg_ud(), op[0], op[1]);
-  bld.MOV(result, fs_reg(acc));
-  break;
-   }
+   case nir_op_usub_borrow:
+  unreachable("Should have been lowered by borrow_to_arith().");
 
case nir_op_umod:
   bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 3e3d78b..d66baf3 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -259,7 +259,9 @@ process_glsl_ir(struct brw_context *brw,
   EXP_TO_EXP2 |
   LOG_TO_LOG2 |
   bitfield_insert |
-  LDEXP_TO_ARITH);
+  LDEXP_TO_ARITH |
+  CARRY_TO_ARITH |
+  BORROW_TO_ARITH);
 
/* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
 * if-statements need to be flattened.
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index fd94a70..da7561c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1601,20 +1601,13 @@ vec4_visitor::visit(ir_expression *ir)
   assert(ir->type->is_integer());
   emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
   break;
-   case ir_binop_carry: {
-  struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
 
-  emit(ADDC(dst_null_ud(), op[0], op[1]));
-  emit(MOV(result_dst, src_reg(acc)));
-  break;
-   }
-   case ir_binop_borrow: {
-  struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+   case ir_binop_carry:
+  unreachable("Should have been lowered by carry_to_arith().");
+
+   case ir_binop_borrow:
+  unreachable("Should have been lowered by borrow_to_arith().");
 
-  emit(SUBB(dst_null_ud(), op[0], op[1]));
-  emit(MOV(result_dst, src_reg(acc)));
-  break;
-   }
case ir_binop_mod:
   /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
   assert(ir->type->is_integer());
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] clover: Pass image attributes to the kernel

2015-07-10 Thread Francisco Jerez
Zoltan Gilian  writes:

> Read-only and write-only image arguments are recognized and
> distinguished.
> Attributes of the image arguments are passed to the kernel as implicit
> arguments.

Thanks, this looks much better.  One thing that still seems kind of
unfortunate is the fact that you've added a single "image_attributes"
argument that lumps image dimensions with format.  I expect the set of
targets that need format metadata to be a strict superset of the targets
that need image dimensions, so it would be nice if the target could
specify them as separate arguments (e.g. semantic::image_size and
::image_format).

Another related point is that you've chosen to pass the metadata for all
images together at the end of the input buffer.  I have the suspicion
that it would simplify both the OpenCL front-end and compiler back-end
code if the image metadata was interleaved with images themselves.
E.g. for each image argument and kernel the target would request an
argument list like

 type::imageNd semantic::general,
 type::scalar semantic::image_format,
 type::scalar semantic::image_size

and assume a struct-like layout for each image argument in the input
buffer:
 
 struct image_argument {
uint32_t index;
uint32_t size[3];
uint32_t format[2];
 };

For the back-end this would imply that the offset between a given image
argument and metadata field would be fixed, independent of how many
other arguments and how many images are being passed to the kernel, and
for the front-end it would mean you could get rid of the first pass of
the argument list you've added to exec_context::bind() (you could just
take the image from the last explicit_arg argument seen).

Some more nit-picks below.

> ---
>  src/gallium/state_trackers/clover/core/kernel.cpp  |  27 ++
>  src/gallium/state_trackers/clover/core/kernel.hpp  |  13 ++-
>  src/gallium/state_trackers/clover/core/memory.cpp  |   2 +-
>  src/gallium/state_trackers/clover/core/module.hpp  |   3 +-
>  .../state_trackers/clover/llvm/invocation.cpp  | 102 
> -
>  5 files changed, 140 insertions(+), 7 deletions(-)
>
> diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp 
> b/src/gallium/state_trackers/clover/core/kernel.cpp
> index 0756f06..d7d42a6 100644
> --- a/src/gallium/state_trackers/clover/core/kernel.cpp
> +++ b/src/gallium/state_trackers/clover/core/kernel.cpp
> @@ -159,6 +159,14 @@ kernel::exec_context::bind(intrusive_ptr 
> _q,
> auto msec = find(type_equals(module::section::text), m.secs);
> auto explicit_arg = kern._args.begin();
>  
> +   std::vector image_args;
> +   for (const auto& arg: kern._args) {
> +  if (auto img_arg = dynamic_cast(arg.get())) {
> + image_args.push_back(img_arg);
> +  }
> +   }
> +   auto image_arg = image_args.begin();
> +
> for (auto &marg : margs) {
>switch (marg.semantic) {
>case module::argument::general:
> @@ -182,9 +190,28 @@ kernel::exec_context::bind(intrusive_ptr 
> _q,
>   }
>   break;
>}
> +  case module::argument::image_attributes: {
> + auto img = (*image_arg++)->get_image();
> + cl_image_format fmt = img->format();
> + auto attributes = std::vector({
> +   static_cast(img->width()),
> +   static_cast(img->height()),
> +   static_cast(img->depth()),
> +   static_cast(fmt.image_channel_data_type),
> +   static_cast(fmt.image_channel_order)});

How about casting to cl_uint instead?  And you could do:

 std::vector attributes {
   ...
 };

> +
> + for (auto x: attributes) {
> +auto arg = argument::create(marg);
> +
> +arg->set(sizeof(x), &x);
> +arg->bind(*this, marg);
> + }
> + break;
> +  }
>}
> }
>  
> +

Unnecessary whitespace. 

> // Create a new compute state if anything changed.
> if (!st || q != _q ||
> cs.req_local_mem != mem_local ||
> diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp 
> b/src/gallium/state_trackers/clover/core/kernel.hpp
> index d6432a4..be9f783 100644
> --- a/src/gallium/state_trackers/clover/core/kernel.hpp
> +++ b/src/gallium/state_trackers/clover/core/kernel.hpp
> @@ -190,7 +190,14 @@ namespace clover {
>   pipe_surface *st;
>};
>  
> -  class image_rd_argument : public argument {
> +  class image_argument : public argument {
> +  public:
> + const image *get_image() const { return img; }

Can we call this method get() so the duality with set() is more obvious?

> +  protected:
> + image *img;
> +  };
> +
> +  class image_rd_argument : public image_argument {
>public:
>   virtual void set(size_t size, const void *value);
>   virtual void bind(exec_context &ctx,
> @@ -198,11 +205,10 @@ namespace clover {
>   virtual void unbind(exec_context &ctx);
>  
>private:
> - image *img;
> 

Re: [Mesa-dev] [PATCH 1/2] i965: Implement b2f and b2i using negation.

2015-07-10 Thread Francisco Jerez
Matt Turner  writes:

> On Fri, Jul 10, 2015 at 10:06 AM, Francisco Jerez  
> wrote:
>> Booleans are represented as 0/-1 on modern hardware which means we can
>> just negate them to convert them into a numeric type.  Negation has
>> the benefit that it can be implemented using a source modifier which
>> can easily be propagated into some other instruction.  shader-db
>> results on HSW:
>>
>>  total instructions in shared programs: 5264246 -> 5264211 (-0.00%)
>>  instructions in affected programs: 1464 -> 1429 (-2.39%)
>>  helped:15
>>  HURT:  1
>
> Strange, I get different (better) numbers on Haswell:
>
> total instructions in shared programs: 6279705 -> 6277316 (-0.04%)
> instructions in affected programs: 40948 -> 38559 (-5.83%)

Odd.  Apparently you have more instructions than I have overall so you
either have more shaders in your shader-db or some of them are not being
compiled for me for some reason.

> helped:123
> HURT:  1
> GAINED:1
> LOST:  0
>
> Certainly more than 15 helped programs in Civilization Beyond Earth alone.
>
> The one hurt program is
> rocketbirds-hardboiled-chicken/fp-2.shader_test, which is hurt because
> we do not CSE the MOV instructions. I'll send a patch to fix this.
>
>> No piglit regressions.
>
> As a rule, this is implied by sending the patch. Don't put it in the
> commit log -- in the worst case the patch is rebased and it's no
> longer true (this has happened, embarrassingly enough). Same thing in
> 2/2.

Hah, some people (including yourself earlier this week IIRC) have asked
me in the past whether some patch passes piglit after I sent it to the
mailing list, so I can only assume it's not redundant information.  You
also seemed to get angry recently because some commit I sent was missing
(from my point of view) redundant information you considered critical,
so don't be surprised to see all kinds of useless data in my commit
messages from now on.

>
>> ---
>>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp   | 4 +---
>>  src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 7 +--
>>  2 files changed, 2 insertions(+), 9 deletions(-)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> index 4690d00..64ff24c 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
>> @@ -969,10 +969,8 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, 
>> nir_alu_instr *instr)
>>break;
>>
>> case nir_op_b2i:
>> -  bld.AND(result, op[0], fs_reg(1));
>> -  break;
>> case nir_op_b2f:
>> -  bld.AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], 
>> fs_reg(0x3f800000u));
>> +  bld.MOV(result, negate(op[0]));
>>break;
>>
>> case nir_op_f2b:
>> diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp 
>> b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
>> index c9c2661..fd94a70 100644
>> --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
>> @@ -1733,16 +1733,11 @@ vec4_visitor::visit(ir_expression *ir)
>>emit(MOV(result_dst, op[0]));
>>break;
>> case ir_unop_b2i:
>> -  emit(AND(result_dst, op[0], src_reg(1)));
>> -  break;
>> case ir_unop_b2f:
>>if (devinfo->gen <= 5) {
>>   resolve_bool_comparison(ir->operands[0], &op[0]);
>>}
>> -  op[0].type = BRW_REGISTER_TYPE_D;
>> -  result_dst.type = BRW_REGISTER_TYPE_D;
>> -  emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
>> -  result_dst.type = BRW_REGISTER_TYPE_F;
>> +  emit(MOV(result_dst, negate(op[0])));
>>break;
>> case ir_unop_f2b:
>>emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
>> --
>> 2.4.3
>>
>
> Good idea. Not sure why I didn't think of that before.
>
> Both are:
>
> Reviewed-by: Matt Turner 


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] clover: Use threadsafe wrappers for pipe_context v2

2015-07-11 Thread Francisco Jerez
Tom Stellard  writes:

> Events can be added to an OpenCL command queue concurrently from multiple
> threads, but pipe_context objects are not threadsafe.  The threadsafe
> wrappers protect all pipe_context function calls with a mutex, so we
> can safely use them with multiple threads.
>
> v2:
>   - Don't use wrapper for pipe_screen.
>
> CC: 10.6 

Thanks, this patch is:
Reviewed-by: Francisco Jerez 

> ---
>  src/gallium/state_trackers/clover/core/queue.cpp | 2 ++
>  src/gallium/targets/opencl/Makefile.am   | 4 +++-
>  2 files changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/src/gallium/state_trackers/clover/core/queue.cpp 
> b/src/gallium/state_trackers/clover/core/queue.cpp
> index 87f9dcc..16b089c 100644
> --- a/src/gallium/state_trackers/clover/core/queue.cpp
> +++ b/src/gallium/state_trackers/clover/core/queue.cpp
> @@ -24,6 +24,7 @@
>  #include "core/event.hpp"
>  #include "pipe/p_screen.h"
>  #include "pipe/p_context.h"
> +#include "threadsafe/threadsafe.h"
>  
>  using namespace clover;
>  
> @@ -33,6 +34,7 @@ command_queue::command_queue(clover::context &ctx, 
> clover::device &dev,
> pipe = dev.pipe->context_create(dev.pipe, NULL);
> if (!pipe)
>throw error(CL_INVALID_DEVICE);
> +   pipe = pipe_threadsafe_context(pipe);
>  }
>  
>  command_queue::~command_queue() {
> diff --git a/src/gallium/targets/opencl/Makefile.am 
> b/src/gallium/targets/opencl/Makefile.am
> index 70e60e2..be5a59d 100644
> --- a/src/gallium/targets/opencl/Makefile.am
> +++ b/src/gallium/targets/opencl/Makefile.am
> @@ -16,6 +16,7 @@ endif
>  
>  lib@OPENCL_LIBNAME@_la_LIBADD = \
>   
> $(top_builddir)/src/gallium/auxiliary/pipe-loader/libpipe_loader_client.la \
> + $(top_builddir)/src/gallium/drivers/threadsafe/libthreadsafe.la \
>   $(top_builddir)/src/gallium/state_trackers/clover/libclover.la \
>   $(top_builddir)/src/gallium/auxiliary/libgallium.la \
>   $(top_builddir)/src/util/libmesautil.la \
> @@ -36,7 +37,8 @@ lib@OPENCL_LIBNAME@_la_LIBADD = \
>   -lclangEdit \
>   -lclangLex \
>   -lclangBasic \
> - $(LLVM_LIBS)
> + $(LLVM_LIBS) \
> + $(PTHREAD_LIBS)
>  
>  nodist_EXTRA_lib@OPENCL_LIBNAME@_la_SOURCES = dummy.cpp
>  lib@OPENCL_LIBNAME@_la_SOURCES =
> -- 
> 2.0.4


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] clover: Fix bug with computing hard_event status

2015-07-11 Thread Francisco Jerez
Tom Stellard  writes:

> pipe_context::flush() can return a NULL fence if the queue is already
> empty, so we should not assume that an event with a NULL fence
> has the status of CL_QUEUED.
>

This seems suspicious...  On the one hand it doesn't seem to be a
documented "feature" of pipe_context::flush to return NULL except in
error conditions (I'm pretty sure other drivers like nouveau won't), and
it seems like it could easily break assumptions of other state trackers.

IMO pipe_context::flush() should respect the invariant that whatever is
returned in the fence output argument (unless some error occurred) be a
valid argument for pipe_screen::fence_finish() and ::fence_signalled()
-- I don't think NULL is?

On the other hand this leaves me wondering how the queue could already
be empty when clover calls pipe_context::flush() -- I assume by queue
you mean the pipe driver's?  The fact that clover calls
pipe_context::flush() implies that clover's event queue is not empty
(i.e. there have been commands enqueued to the pipe driver since the
last call to pipe_context::flush()).  It sounds like this mismatch
between clover's and the pipe driver's command queue might be caused by
some race condition elsewhere?

Thanks.

> CC: 10.6 
> ---
>  src/gallium/state_trackers/clover/core/event.cpp | 7 ---
>  src/gallium/state_trackers/clover/core/event.hpp | 1 +
>  2 files changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/src/gallium/state_trackers/clover/core/event.cpp 
> b/src/gallium/state_trackers/clover/core/event.cpp
> index d75b839..b973c78 100644
> --- a/src/gallium/state_trackers/clover/core/event.cpp
> +++ b/src/gallium/state_trackers/clover/core/event.cpp
> @@ -118,7 +118,7 @@ event::wait() const {
>  hard_event::hard_event(command_queue &q, cl_command_type command,
> const ref_vector &deps, action action) :
> event(q.context(), deps, profile(q, action), [](event &ev){}),
> -   _queue(q), _command(command), _fence(NULL) {
> +   _queue(q), _command(command), _fence(NULL), _fenced(false) {
> if (q.profiling_enabled())
>_time_queued = timestamp::current(q);
>  
> @@ -138,7 +138,7 @@ hard_event::status() const {
> if (event::status() < 0)
>return event::status();
>  
> -   else if (!_fence)
> +   else if (!_fenced)
>return CL_QUEUED;
>  
> else if (!screen->fence_finish(screen, _fence, 0))
> @@ -167,7 +167,7 @@ hard_event::wait() const {
> if (status() == CL_QUEUED)
>queue()->flush();
>  
> -   if (!_fence ||
> +   if (!_fenced ||
> !screen->fence_finish(screen, _fence, PIPE_TIMEOUT_INFINITE))
>throw error(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
>  }
> @@ -196,6 +196,7 @@ void
>  hard_event::fence(pipe_fence_handle *fence) {
> pipe_screen *screen = queue()->device().pipe;
> screen->fence_reference(screen, &_fence, fence);
> +   _fenced = true;
>  }
>  
>  event::action
> diff --git a/src/gallium/state_trackers/clover/core/event.hpp 
> b/src/gallium/state_trackers/clover/core/event.hpp
> index 6469e48..fac62d2 100644
> --- a/src/gallium/state_trackers/clover/core/event.hpp
> +++ b/src/gallium/state_trackers/clover/core/event.hpp
> @@ -137,6 +137,7 @@ namespace clover {
>const intrusive_ref _queue;
>cl_command_type _command;
>pipe_fence_handle *_fence;
> +  bool _fenced;
>lazy _time_queued, _time_submit, _time_start, _time_end;
> };
>  
> -- 
> 2.0.4


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] i965/fs: Make the texturing helpers take NIR opcodes instead of old IR ones

2015-07-13 Thread Francisco Jerez
Jason Ekstrand  writes:

> Now that the old GLSL IR visitor code is gone, having the remap is silly.
> ---
>  src/mesa/drivers/dri/i965/brw_fs.h   |  12 +--
>  src/mesa/drivers/dri/i965/brw_fs_nir.cpp |  18 +---
>  src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 135 
> ++-
>  3 files changed, 75 insertions(+), 90 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
> b/src/mesa/drivers/dri/i965/brw_fs.h
> index 5243079..1690f4a 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.h
> +++ b/src/mesa/drivers/dri/i965/brw_fs.h
> @@ -109,7 +109,7 @@ public:
> void compute_clip_distance(gl_clip_plane *clip_planes);
>  
> uint32_t gather_channel(int orig_chan, uint32_t sampler);
> -   void swizzle_result(ir_texture_opcode op, int dest_components,
> +   void swizzle_result(nir_texop op, int dest_components,
> fs_reg orig_val, uint32_t sampler);
>  
> int type_size(const struct glsl_type *type);
> @@ -207,28 +207,28 @@ public:
> void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
> fs_reg rescale_texcoord(fs_reg coordinate, int coord_components,
> bool is_rect, uint32_t sampler, int texunit);
> -   fs_inst *emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
> +   fs_inst *emit_texture_gen4(nir_texop op, fs_reg dst,
>fs_reg coordinate, int coord_components,
>fs_reg shadow_comp,
>fs_reg lod, fs_reg lod2, int grad_components,
>uint32_t sampler);
> -   fs_inst *emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
> +   fs_inst *emit_texture_gen4_simd16(nir_texop op, fs_reg dst,
>   fs_reg coordinate, int vector_elements,
>   fs_reg shadow_c, fs_reg lod,
>   uint32_t sampler);
> -   fs_inst *emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
> +   fs_inst *emit_texture_gen5(nir_texop op, fs_reg dst,
>fs_reg coordinate, int coord_components,
>fs_reg shadow_comp,
>fs_reg lod, fs_reg lod2, int grad_components,
>fs_reg sample_index, uint32_t sampler,
>bool has_offset);
> -   fs_inst *emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
> +   fs_inst *emit_texture_gen7(nir_texop op, fs_reg dst,
>fs_reg coordinate, int coord_components,
>fs_reg shadow_comp,
>fs_reg lod, fs_reg lod2, int grad_components,
>fs_reg sample_index, fs_reg mcs, fs_reg 
> sampler,
>fs_reg offset_value);
> -   void emit_texture(ir_texture_opcode op,
> +   void emit_texture(nir_texop op,
>   const glsl_type *dest_type,
>   fs_reg coordinate, int components,
>   fs_reg shadow_c,

Hold on, I'm about to send a series that gets rid of the
ir_texture_opcode argument of these gen-specific functions and replaces
it with a backend_instruction opcode.  If you wait a little this change
will involve much less churn.

> diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> index caf1300..d8a6f3c 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
> @@ -1739,23 +1739,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, 
> nir_tex_instr *instr)
>glsl_type::get_instance(dest_base_type, nir_tex_instr_dest_size(instr),
>1);
>  
> -   ir_texture_opcode op;
> -   switch (instr->op) {
> -   case nir_texop_lod: op = ir_lod; break;
> -   case nir_texop_query_levels: op = ir_query_levels; break;
> -   case nir_texop_tex: op = ir_tex; break;
> -   case nir_texop_tg4: op = ir_tg4; break;
> -   case nir_texop_txb: op = ir_txb; break;
> -   case nir_texop_txd: op = ir_txd; break;
> -   case nir_texop_txf: op = ir_txf; break;
> -   case nir_texop_txf_ms: op = ir_txf_ms; break;
> -   case nir_texop_txl: op = ir_txl; break;
> -   case nir_texop_txs: op = ir_txs; break;
> -   default:
> -  unreachable("unknown texture opcode");
> -   }
> -
> -   emit_texture(op, dest_type, coordinate, instr->coord_components,
> +   emit_texture(instr->op, dest_type, coordinate, instr->coord_components,
>  shadow_comparitor, lod, lod2, lod_components, sample_index,
>  tex_offset, mcs, gather_component,
>  is_cube_array, is_rect, sampler, sampler_reg, texunit);
> diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> index 94d6a58..c726dcc 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
> +++ b/src/mesa/dr

Re: [Mesa-dev] [PATCH 2/5] i965/fs: fix stride and type for hw_reg's in regs_read()

2015-07-14 Thread Francisco Jerez
Connor Abbott  writes:

> sources with file == HW_REG get all their information from the
> fixed_hw_reg field, so we need to get the stride and type from there
> when computing the size.
>
> Signed-off-by: Connor Abbott 
> ---
>  src/mesa/drivers/dri/i965/brw_fs.cpp | 24 ++--
>  1 file changed, 18 insertions(+), 6 deletions(-)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
> b/src/mesa/drivers/dri/i965/brw_fs.cpp
> index 38b9095..64f093b 100644
> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
> @@ -696,24 +696,36 @@ fs_inst::regs_read(int arg) const
>break;
> }
>  
> +   unsigned stride;
> +   enum brw_reg_type type;
> +
> switch (src[arg].file) {
> case BAD_FILE:
> case UNIFORM:
> case IMM:
>return 1;
> +
> case GRF:
> +  stride = src[arg].stride;
> +  type = src[arg].type;
> +  break;
> +
> case HW_REG:
> -  if (src[arg].stride == 0) {
> - return 1;
> -  } else {
> - int size = components * this->exec_size * type_sz(src[arg].type);
> - return DIV_ROUND_UP(size * src[arg].stride, 32);
> -  }
> +  stride = src[arg].fixed_hw_reg.hstride;
> +  type = src[arg].fixed_hw_reg.type;
> +  break;
> +
> case MRF:
>unreachable("MRF registers are not allowed as sources");
> default:
>unreachable("Invalid register file");
> }
> +
> +   if (stride == 0)
> +  return 1;
> +
> +   int size = components * this->exec_size * type_sz(type);
> +   return DIV_ROUND_UP(size * stride, 32);

I don't think this will work unfortunately, brw_reg::hstride is the log2
of the actual stride, unlike fs_reg::stride.  Did I already mention I'm
appalled by the fact that fs_reg has a number of fields with overlapping
semantics but different representation, one or the other being
applicable depending on the occasion?  I guess it would be more or less
bearable if these data members were declared private and some reasonable
abstraction was provided to access them.

How do you like the attached patch?  It doesn't solve the fundamental
problem but it seems to improve the situation slightly.

>  }
>  
>  bool
> -- 
> 2.4.3
>
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev

From f09181eadd3ff1cd10f1afeee13e6c4bb86caa91 Mon Sep 17 00:00:00 2001
From: Francisco Jerez 
Date: Tue, 14 Jul 2015 15:43:44 +0300
Subject: [PATCH] i965/fs: Factor out universally broken calculation of the
 register component size.

This in principle simple calculation was being open-coded in a number
of places (in a series I haven't yet sent for review there will be a
couple more), all of them were subtly broken in one way or another:
None of them were handling the HW_REG case correctly as pointed out by
Connor, and fs_inst::regs_read() was handling the stride=0 case rather
naively.  This patch solves both problems and factors out the
calculation as a new fs_reg method.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp   | 21 +
 src/mesa/drivers/dri/i965/brw_fs.h |  6 +++---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp |  2 +-
 src/mesa/drivers/dri/i965/brw_ir_fs.h  |  6 ++
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index feb4c6c..9f15560 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -78,8 +78,8 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
case HW_REG:
case MRF:
case ATTR:
-  this->regs_written =
- DIV_ROUND_UP(MAX2(exec_size * dst.stride, 1) * type_sz(dst.type), 32);
+  this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
+REG_SIZE);
   break;
case BAD_FILE:
   this->regs_written = 0;
@@ -443,6 +443,15 @@ fs_reg::is_contiguous() const
return stride == 1;
 }
 
+unsigned
+fs_reg::component_size(unsigned width) const
+{
+   const unsigned stride = (file != HW_REG ? this->stride :
+fixed_hw_reg.hstride == 0 ? 0 :
+1 << (fixed_hw_reg.hstride - 1));
+   return MAX2(width * stride, 1) * type_sz(type);
+}
+
 int
 fs_visitor::type_size(const struct glsl_type *type)
 {
@@ -703,12 +712,8 @@ fs_inst::regs_read(int arg) const
   return 1;
case GRF:
case HW_REG:
-  if (src[arg].stride == 0) {
- return 1;
-  } else {
- int size = components * this->exec_size * type_sz(src[arg].type);
- return DIV_ROUND

Re: [Mesa-dev] [PATCH] clover: Pass image attributes to the kernel

2015-07-14 Thread Francisco Jerez
Zoltán Gilián  writes:

>>  I have the suspicion
>> that it would simplify both the OpenCL front-end and compiler back-end
>> code if the image metadata was interleaved with images themselves.
>
> In fact this complicates the back-end, since the location of each
> argument following an image argument changes because of the metadata
> args. 

Yeah, of course, it was implicit in the idea that you'd redefine
imageNd_t as a <6 x i32>-like type so the addresses of subsequent
arguments would be calculated correctly, but...

> My least problematic solution to this problem is via an llvm
> pass which adds the image attribute arguments to the kernel args in IR
> form to maintain the correspondence between llvm IR args and input
> buffer values. Otherwise wrong locations will be calculated during
> lowering of the formal parameters.
> If I add these extra parameters in a pass, maybe it's better to wire
> those args in during the same pass by replacing attribute getters with
> the args. This removes the benefit of the constant offset between the
> image arg and the attribute.

...this actually sounds really good.  Lowering the implicit arguments
into regular ones at the LLVM IR level means you would be one step from
being able to get rid of the target-specific hacks from invocation.cpp
-- The only thing left to do would be to define some metadata values
you'd attach to each LLVM kernel argument (along the lines of
clover::module::argument::semantic) telling clover whether some argument
is explicit or implicit, and in the latter case what sort of implicit
argument it is, so clover could do the right thing for the target.

Thanks.

> Do you find the proposed approach (i.e. append the attributes to the
> input buffer) objectionable? Do you have any suggestions on how to
> overcome this problem, so the metadata could be passed interleaved?
>
> On Fri, Jul 10, 2015 at 8:08 PM, Francisco Jerez  
> wrote:
>> Zoltan Gilian  writes:
>>
>>> Read-only and write-only image arguments are recognized and
>>> distinguished.
>>> Attributes of the image arguments are passed to the kernel as implicit
>>> arguments.
>>
>> Thanks, this looks much better.  One thing that still seems kind of
>> unfortunate is the fact that you've added a single "image_attributes"
>> argument that lumps image dimensions with format.  I expect the set of
>> targets that need format metadata to be a strict superset of the targets
>> that need image dimensions, so it would be nice if the target could
>> specify them as separate arguments (e.g. semantic::image_size and
>> ::image_format).
>>
>> Another related point is that you've chosen to pass the metadata for all
>> images together at the end of the input buffer.  I have the suspicion
>> that it would simplify both the OpenCL front-end and compiler back-end
>> code if the image metadata was interleaved with images themselves.
>> E.g. for each image argument and kernel the target would request an
>> argument list like
>>
>>  type::imageNd semantic::general,
>>  type::scalar semantic::image_format,
>>  type::scalar semantic::image_size
>>
>> and assume a struct-like layout for each image argument in the input
>> buffer:
>>
>>  struct image_argument {
>> uint32_t index;
>> uint32_t size[3];
>> uint32_t format[2];
>>  };
>>
>> For the back-end this would imply that the offset between a given image
>> argument and metadata field would be fixed, independent of how many
>> other arguments and how many images are being passed to the kernel, and
>> for the front-end it would mean you could get rid of the first pass of
>> the argument list you've added to exec_context::bind() (you could just
>> take the image from the last explicit_arg argument seen).
>>
>> Some more nit-picks below.
>>
>>> ---
>>>  src/gallium/state_trackers/clover/core/kernel.cpp  |  27 ++
>>>  src/gallium/state_trackers/clover/core/kernel.hpp  |  13 ++-
>>>  src/gallium/state_trackers/clover/core/memory.cpp  |   2 +-
>>>  src/gallium/state_trackers/clover/core/module.hpp  |   3 +-
>>>  .../state_trackers/clover/llvm/invocation.cpp  | 102 
>>> -
>>>  5 files changed, 140 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp 
>>> b/src/gallium/state_trackers/clover/core/kernel.cpp
>>> index 0756f06..d7d42a6 100644
>>> --- a/src/gallium/state_trackers/clover/core/kernel.cpp
>>> +++ b/src/gallium/state_trackers/clover/core/kernel.cp

Re: [Mesa-dev] [PATCH 2/5] i965/fs: fix stride and type for hw_reg's in regs_read()

2015-07-15 Thread Francisco Jerez
Connor Abbott  writes:

> On Tue, Jul 14, 2015 at 6:02 AM, Francisco Jerez  
> wrote:
>> Connor Abbott  writes:
>>
>>> sources with file == HW_REG get all their information from the
>>> fixed_hw_reg field, so we need to get the stride and type from there
>>> when computing the size.
>>>
>>> Signed-off-by: Connor Abbott 
>>> ---
>>>  src/mesa/drivers/dri/i965/brw_fs.cpp | 24 ++--
>>>  1 file changed, 18 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>>> b/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> index 38b9095..64f093b 100644
>>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>>> @@ -696,24 +696,36 @@ fs_inst::regs_read(int arg) const
>>>break;
>>> }
>>>
>>> +   unsigned stride;
>>> +   enum brw_reg_type type;
>>> +
>>> switch (src[arg].file) {
>>> case BAD_FILE:
>>> case UNIFORM:
>>> case IMM:
>>>return 1;
>>> +
>>> case GRF:
>>> +  stride = src[arg].stride;
>>> +  type = src[arg].type;
>>> +  break;
>>> +
>>> case HW_REG:
>>> -  if (src[arg].stride == 0) {
>>> - return 1;
>>> -  } else {
>>> - int size = components * this->exec_size * type_sz(src[arg].type);
>>> - return DIV_ROUND_UP(size * src[arg].stride, 32);
>>> -  }
>>> +  stride = src[arg].fixed_hw_reg.hstride;
>>> +  type = src[arg].fixed_hw_reg.type;
>>> +  break;
>>> +
>>> case MRF:
>>>unreachable("MRF registers are not allowed as sources");
>>> default:
>>>unreachable("Invalid register file");
>>> }
>>> +
>>> +   if (stride == 0)
>>> +  return 1;
>>> +
>>> +   int size = components * this->exec_size * type_sz(type);
>>> +   return DIV_ROUND_UP(size * stride, 32);
>>
>> I don't think this will work unfortunately, brw_reg::hstride is the log2
>> of the actual stride, unlike fs_reg::stride.  Did I already mention I'm
>> appalled by the fact that fs_reg has a number of fields with overlapping
>> semantics but different representation, one or the other being
>> applicable depending on the occasion.  I guess it would be more or less
>> bearable if these data members were declared private and some reasonable
>> abstraction was provided to access them.
>
> I don't think anybody's happy with it, but refactoring that is its
> own can of worms.
>

Sure, it would be a pile of work, but I think it should be quite
straightforward in principle.  We could just punt fixed_hw_reg and
replace it with an ARF file and a fixed-GRF file using the same fields
normal registers use.  For immediates we'd have to add a union
with float/unsigned/int fields similar to brw_reg::dw1.

>>
>> How do you like the attached patch?  It doesn't solve the fundamental
>> problem but it seems to improve the situation slightly.
>
> It seems fine to me... I was more paranoid about getting the type from
> the fixed_hw_reg too, but in brw_reg_from_fs_reg() in the generator we
> have:
>
> assert(reg->type == reg->fixed_hw_reg.type);
>
> so it seems my paranoia wasn't justified. I'd like someone else who's
> more experienced to take a look though. I suspect that others might
> want to bikeshed about the name, but I don't have a better suggestion.
>

Yeah, your paranoia was definitely justified, it's essentially the same
problem.  It has led to actual bugs in the past, which is why I
added that assertion and changed retype() to keep the type of the
fixed_hw_reg in sync with the normal type...

>>
>>>  }
>>>
>>>  bool
>>> --
>>> 2.4.3
>>>
>>> ___
>>> mesa-dev mailing list
>>> mesa-dev@lists.freedesktop.org
>>> http://lists.freedesktop.org/mailman/listinfo/mesa-dev
>>


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/3] i965/fs: Fix stride for immediate registers.

2015-07-16 Thread Francisco Jerez
When the width field was removed from fs_reg the BROADCAST handling
code in opt_algebraic() started to miss a number of trivial
optimization cases resulting in the ugly indirect-addressing sequence
being emitted unnecessarily for some variable-indexed texturing and
UBO loads regardless of one of the sources of BROADCAST being
immediate.  Apparently the reason was that we were setting the stride
field to one for immediates even though they are typically uniform.
Width used to be set to one too which is why this optimization used to
work previously until the "reg.width == 1" check was removed.

The stride field of vector immediates is intentionally left equal to
one, because they are strictly speaking not uniform.  The assertion in
fs_generator makes sure that immediates have the expected stride as a
consistency check.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp   | 3 +++
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 4 
 2 files changed, 7 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ff0675d..537ccbe 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -362,6 +362,7 @@ fs_reg::fs_reg(float f)
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_F;
+   this->stride = 0;
this->fixed_hw_reg.dw1.f = f;
 }
 
@@ -371,6 +372,7 @@ fs_reg::fs_reg(int32_t i)
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_D;
+   this->stride = 0;
this->fixed_hw_reg.dw1.d = i;
 }
 
@@ -380,6 +382,7 @@ fs_reg::fs_reg(uint32_t u)
init();
this->file = IMM;
this->type = BRW_REGISTER_TYPE_UD;
+   this->stride = 0;
this->fixed_hw_reg.dw1.ud = u;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index bae7216..8a3af47 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -79,6 +79,10 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg)
   brw_reg = byte_offset(brw_reg, reg->subreg_offset);
   break;
case IMM:
+  assert(reg->stride == (reg->type == BRW_REGISTER_TYPE_V ||
+ reg->type == BRW_REGISTER_TYPE_UV ||
+ reg->type == BRW_REGISTER_TYPE_VF ? 1 : 0));
+
   switch (reg->type) {
   case BRW_REGISTER_TYPE_F:
 brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 3/3] i965: Fix stride field for the result of emit_uniformize().

2015-07-16 Thread Francisco Jerez
This is essentially the same problem fixed in an earlier patch for
immediates.  Setting the stride to zero will be particularly useful
for my future SIMD lowering pass, because we will be able to just
check whether the stride of a source register is zero and skip
emitting the copies required to unzip it in that case.

Instead of setting stride to zero in every caller of emit_uniformize()
I've changed the function to return the result as its return value
(previously it was being written into a caller-provided destination
register), because this way we can enforce that the result is used with
the correct regioning from the function itself.

The changes to the prototype of its VEC4 counterpart are mainly for
the sake of symmetry; VEC4 registers don't have a stride.
---
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 16 +---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp   |  4 ++--
 src/mesa/drivers/dri/i965/brw_vec4.h   |  7 +--
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 18 ++
 4 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h 
b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 34646d7..ab4680f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -350,17 +350,19 @@ namespace brw {
   }
 
   /**
-   * Copy any live channel from \p src to the first channel of \p dst.
+   * Copy any live channel from \p src to the first channel of the result.
*/
-  void
-  emit_uniformize(const dst_reg &dst, const src_reg &src) const
+  src_reg
+  emit_uniformize(const src_reg &src) const
   {
  const fs_builder ubld = exec_all();
- const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
+ const dst_reg chan_index = component(vgrf(BRW_REGISTER_TYPE_UD), 0);
+ const dst_reg dst = component(vgrf(src.type), 0);
+
+ ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
+ ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index);
 
- ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
- ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
-   src, component(chan_index, 0));
+ return dst;
   }
 
   /**
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 3099dc4..4e45118 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -1386,7 +1386,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, 
nir_intrinsic_instr *instr
  surf_index = vgrf(glsl_type::uint_type);
  bld.ADD(surf_index, get_nir_src(instr->src[0]),
  fs_reg(stage_prog_data->binding_table.ubo_start));
- bld.emit_uniformize(surf_index, surf_index);
+ surf_index = bld.emit_uniformize(surf_index);
 
  /* Assume this may touch any UBO. It would be nice to provide
   * a tighter bound, but the array information is already lowered away.
@@ -1681,7 +1681,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, 
nir_tex_instr *instr)
  /* Emit code to evaluate the actual indexing expression */
  sampler_reg = vgrf(glsl_type::uint_type);
  bld.ADD(sampler_reg, src, fs_reg(sampler));
- bld.emit_uniformize(sampler_reg, sampler_reg);
+ sampler_reg = bld.emit_uniformize(sampler_reg);
  break;
   }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h 
b/src/mesa/drivers/dri/i965/brw_vec4.h
index 3643651..7bf027a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -293,8 +293,11 @@ public:
void emit_lrp(const dst_reg &dst,
  const src_reg &x, const src_reg &y, const src_reg &a);
 
-   /** Copy any live channel from \p src to the first channel of \p dst. */
-   void emit_uniformize(const dst_reg &dst, const src_reg &src);
+   /**
+* Copy any live channel from \p src to the first channel of the
+* result.
+*/
+   src_reg emit_uniformize(const src_reg &src);
 
void emit_block_move(dst_reg *dst, src_reg *src,
 const struct glsl_type *type, brw_predicate predicate);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index f351bf4..a6eee47 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1374,15 +1374,19 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
   emit(pull);
 }
 
-void
-vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
+src_reg
+vec4_visitor::emit_uniformize(const src_reg &src)
 {
const src_reg chan_index(this, glsl_type::uint_type);
+   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
+  src.type);
 
emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(c

[Mesa-dev] [PATCH 2/3] i965/fs: Fix stride field for uniforms.

2015-07-16 Thread Francisco Jerez
This fixes essentially the same problem as for immediates.  Registers
of the UNIFORM file are typically accessed according to the formula:

 read_uniform(r, channel_index, array_index) =
read_element(r, channel_index * 0 + array_index * 1)

Which matches the general direct addressing formula for stride=0:

 read_direct(r, channel_index, array_index) =
read_element(r, channel_index * stride +
array_index * max{1, stride * width})

In either case if reladdr is present the access will be according to
the composition of two register regions, the first one determining the
per-channel array_index used for the second, like:

 read_indirect(r, channel_index, array_index) =
read_direct(r, channel_index,
read(r.reladdr, channel_index, array_index))

where:
 read(r, channel_index, array_index) = if r.reladdr == NULL
then read_direct(r, channel_index, array_index)
else read_indirect(r, channel_index, array_index)

In conclusion we can handle uniforms consistently with the other
register files if we set stride to zero.  After lowering to a GRF
using VARYING_PULL_CONSTANT_LOAD in demote_pull_constant_loads() the
stride of the source is set to one again because the result of
VARYING_PULL_CONSTANT_LOAD is generally non-uniform.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 537ccbe..2e835b1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -819,6 +819,7 @@ fs_reg::fs_reg(enum register_file file, int reg)
this->file = file;
this->reg = reg;
this->type = BRW_REGISTER_TYPE_F;
+   this->stride = (file == UNIFORM ? 0 : 1);
 }
 
 /** Fixed HW reg constructor. */
@@ -828,6 +829,7 @@ fs_reg::fs_reg(enum register_file file, int reg, enum 
brw_reg_type type)
this->file = file;
this->reg = reg;
this->type = type;
+   this->stride = (file == UNIFORM ? 0 : 1);
 }
 
 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
@@ -1268,6 +1270,7 @@ fs_visitor::assign_curb_setup()
  constant_nr / 8,
  constant_nr % 8);
 
+assert(inst->src[i].stride == 0);
inst->src[i].file = HW_REG;
inst->src[i].fixed_hw_reg = byte_offset(
retype(brw_reg, inst->src[i].type),
@@ -1818,6 +1821,8 @@ fs_visitor::demote_pull_constants()
  fs_reg 
surf_index(stage_prog_data->binding_table.pull_constants_start);
  fs_reg dst = vgrf(glsl_type::float_type);
 
+ assert(inst->src[i].stride == 0);
+
  /* Generate a pull load into dst. */
  if (inst->src[i].reladdr) {
 VARYING_PULL_CONSTANT_LOAD(ibld, dst,
@@ -1825,6 +1830,7 @@ fs_visitor::demote_pull_constants()
*inst->src[i].reladdr,
pull_index);
 inst->src[i].reladdr = NULL;
+inst->src[i].stride = 1;
  } else {
 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
 ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/4] i965/fs: Add stub lowering pass for logical send-message opcodes.

2015-07-16 Thread Francisco Jerez
This pass will house ad-hoc lowering code for several send
message-like virtual opcodes that will represent their logically
independent arguments as separate instruction sources rather than as a
single payload blob.  This pass will basically just take the separate
arguments that are supposed to be part of the payload and concatenate
them to construct a message in the form required by the hardware.
Virtual instructions in separate-source form will eventually allow
some simplification of the visitor code and make several
transformations easier like lowering SIMD16 instructions to SIMD8
algorithmically in cases where the hardware doesn't support the former
natively.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 29 -
 src/mesa/drivers/dri/i965/brw_fs.h   |  1 +
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 2e835b1..09ff587 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3179,6 +3179,30 @@ fs_visitor::lower_integer_multiplication()
return progress;
 }
 
+bool
+fs_visitor::lower_logical_sends()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+  const fs_builder ibld = bld.exec_all(inst->force_writemask_all)
+ .group(inst->exec_size, inst->force_sechalf)
+ .at(block, inst);
+
+  switch (inst->opcode) {
+  default:
+ continue;
+  }
+
+  progress = true;
+   }
+
+   if (progress)
+  invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::dump_instructions()
 {
@@ -3626,9 +3650,12 @@ fs_visitor::optimize()
   backend_shader::dump_instructions(filename);
}
 
-   bool progress;
+   bool progress = false;
int iteration = 0;
int pass_num = 0;
+
+   OPT(lower_logical_sends);
+
do {
   progress = false;
   pass_num = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index c005666..f3850d1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -182,6 +182,7 @@ public:
void no16(const char *msg);
void lower_uniform_pull_constant_loads();
bool lower_load_payload();
+   bool lower_logical_sends();
bool lower_integer_multiplication();
bool opt_combine_constants();
 
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 2/4] i965/fs: Add builder emit method taking a variable number of source registers.

2015-07-16 Thread Francisco Jerez
And start using it in fs_builder::LOAD_PAYLOAD().  This will be used
to emit logical send message opcodes which have an unusually large
number of arguments.
---
 src/mesa/drivers/dri/i965/brw_fs_builder.h | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h 
b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index ab4680f..1dba66f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -307,6 +307,17 @@ namespace brw {
   }
 
   /**
+   * Create and insert an instruction with a variable number of sources
+   * into the program.
+   */
+  instruction *
+  emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
+   unsigned n) const
+  {
+ return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
+  }
+
+  /**
* Insert a preallocated instruction into the program.
*/
   instruction *
@@ -518,9 +529,7 @@ namespace brw {
   LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
unsigned sources, unsigned header_size) const
   {
- instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
-  dispatch_width(), dst,
-  src, sources));
+ instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, 
sources);
  inst->header_size = header_size;
  inst->regs_written = header_size +
   (sources - header_size) * (dispatch_width() / 8);
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 4/4] i965/fs: Implement pass to lower instructions of unsupported SIMD width.

2015-07-16 Thread Francisco Jerez
This lowering pass implements an algorithm to expand SIMDN
instructions into a sequence of SIMDM instructions in cases where the
hardware doesn't support the original execution size natively for some
particular instruction.  The most important use-cases are:

 - Lowering send message instructions that don't support SIMD16
   natively into SIMD8 (several texturing, framebuffer write and typed
   surface operations).

 - Lowering messages that don't support SIMD8 natively into SIMD16
   (*cough*gen4*cough*).

 - 64-bit precision operations (e.g. FP64 and 64-bit integer
   multiplication).

 - SIMD32.

The algorithm works by splitting the sources of the original
instruction into chunks of width appropriate for the lowered
instructions, and then interleaving the results component-wise into
the destination of the original instruction.  The pass is controlled
by the get_lowered_simd_width() function that currently just returns
the original execution size making the whole pass a no-op for the
moment until some user is introduced.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 142 +++
 src/mesa/drivers/dri/i965/brw_fs.h   |   1 +
 2 files changed, 143 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index d031352..eeb6938 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3204,6 +3204,147 @@ fs_visitor::lower_logical_sends()
return progress;
 }
 
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst.  The instruction will be left untouched by
+ * fs_visitor::lower_simd_width() if the returned value is equal to the
+ * original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct brw_device_info *devinfo,
+   const fs_inst *inst)
+{
+   switch (inst->opcode) {
+   default:
+  return inst->exec_size;
+   }
+}
+
+/**
+ * The \p rows array of registers represents a \p num_rows by \p num_columns
+ * matrix in row-major order, write it in column-major order into the register
+ * passed as destination.  \p stride gives the separation between matrix
+ * elements in the input in fs_builder::dispatch_width() units.
+ */
+static void
+emit_transpose(const fs_builder &bld,
+   const fs_reg &dst, const fs_reg *rows,
+   unsigned num_rows, unsigned num_columns, unsigned stride)
+{
+   fs_reg *const components = new fs_reg[num_rows * num_columns];
+
+   for (unsigned i = 0; i < num_columns; ++i) {
+  for (unsigned j = 0; j < num_rows; ++j)
+ components[num_rows * i + j] = offset(rows[j], bld, stride * i);
+   }
+
+   bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);
+
+   delete[] components;
+}
+
+bool
+fs_visitor::lower_simd_width()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+  const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
+
+  if (lower_width != inst->exec_size) {
+ /* Builder matching the original instruction. */
+ const fs_builder ibld = bld.at(block, inst)
+.exec_all(inst->force_writemask_all)
+.group(inst->exec_size, 
inst->force_sechalf);
+
+ /* Split the copies in chunks of the execution width of either the
+  * original or the lowered instruction, whichever is lower.
+  */
+ const unsigned copy_width = MIN2(lower_width, inst->exec_size);
+ const unsigned n = inst->exec_size / copy_width;
+ const unsigned dst_size = inst->regs_written * REG_SIZE /
+inst->dst.component_size(inst->exec_size);
+ fs_reg dsts[4];
+
+ assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
+!inst->writes_accumulator && !inst->mlen);
+
+ for (unsigned i = 0; i < n; i++) {
+/* Emit a copy of the original instruction with the lowered width.
+ * If the EOT flag was set throw it away except for the last
+ * instruction to avoid killing the thread prematurely.
+ */
+fs_inst tmp_inst = *inst;
+tmp_inst.exec_size = lower_width;
+tmp_inst.eot = inst->eot && i == n - 1;
+
+/* Set exec_all if the lowered width is higher than the original
+ * to avoid breaking the compiler invariant that no control
+ * flow-masked instruction is wider than the shader's
+ * dispatch_width.  Then emit the lowered instruction.
+ */
+const fs_builder lbld = ibld.exec_all(lower_width > 
inst->exec_size)
+.group(lower_width, i);
+fs_inst *split_inst = lbld.emit(tmp_inst);
+
+for (unsigned j = 0; j < inst->sources; j++) {
+   if (inst->src[j].file != BAD_FILE &&
+   !is_uniform(inst->src[j])) {
+  /* Get

[Mesa-dev] [PATCH 3/4] i965/fs: Fix return value of fs_inst::regs_read() for BAD_FILE.

2015-07-16 Thread Francisco Jerez
Typically BAD_FILE sources are used to mark a source as not present,
which implies that no registers are read.  This will become much more
frequent with logical send opcodes which have a large number of
sources, many of them optionally used and marked as BAD_FILE when they
aren't applicable.  It will prove to be useful to be able to rely on
the value of regs_read() regardless of whether a source is present or
not.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 09ff587..d031352 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -709,6 +709,7 @@ fs_inst::regs_read(int arg) const
 
switch (src[arg].file) {
case BAD_FILE:
+  return 0;
case UNIFORM:
case IMM:
   return 1;
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 04/12] i965/fs: Fix slight layering violation in emit_single_fb_writes().

2015-07-16 Thread Francisco Jerez
In cases where the color0 argument wasn't being provided,
emit_single_fb_writes() would take the alpha channel directly from the
visitor state instead of taking it from its arguments.  This sort of
hack didn't fit nicely into the logical send-message approach because
all parameters of the instruction have to be visible to the SIMD
lowering pass for it to be able to split them into halves at all.

Fix it by using LOAD_PAYLOAD in fs_visitor::emit_fb_writes() to
provide an actual color0 vector with undefined contents except for the
alpha component to match the previous behavior when no color buffers
are enabled.
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 94d6a58..6564d5f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1551,17 +1551,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
 
payload_header_size = length;
 
-   if (color0.file == BAD_FILE) {
-  /* Even if there's no color buffers enabled, we still need to send
-   * alpha out the pipeline to our null renderbuffer to support
-   * alpha-testing, alpha-to-coverage, and so on.
-   */
-  if (this->outputs[0].file != BAD_FILE)
- setup_color_payload(&sources[length + 3],
- offset(this->outputs[0], bld, 3),
- 1, exec_size, false);
-  length += 4;
-   } else if (color1.file == BAD_FILE) {
+   if (color1.file == BAD_FILE) {
   if (src0_alpha.file != BAD_FILE) {
  setup_color_payload(&sources[length], src0_alpha, 1, exec_size, 
false);
  length++;
@@ -1709,7 +1699,15 @@ fs_visitor::emit_fb_writes()
* alpha out the pipeline to our null renderbuffer to support
* alpha-testing, alpha-to-coverage, and so on.
*/
-  inst = emit_single_fb_write(bld, reg_undef, reg_undef, reg_undef, 0,
+  /* FINISHME: Factor out this frequently recurring pattern into a
+   * helper function.
+   */
+  const fs_reg srcs[] = { reg_undef, reg_undef,
+  reg_undef, offset(this->outputs[0], bld, 3) };
+  const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+  bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
+
+  inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4,
   dispatch_width);
   inst->target = 0;
}
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 06/12] i965/fs: Move prog_data->uses_omask assignment up to brw_codegen_wm_prog().

2015-07-16 Thread Francisco Jerez
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 2 --
 src/mesa/drivers/dri/i965/brw_wm.c   | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 08d9abf..c489010 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1535,8 +1535,6 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
   length++;
}
 
-   prog_data->uses_omask =
-  prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
if (prog_data->uses_omask) {
   assert(this->sample_mask.file != BAD_FILE);
   /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c 
b/src/mesa/drivers/dri/i965/brw_wm.c
index 592a729..b590b17 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -181,7 +181,8 @@ brw_codegen_wm_prog(struct brw_context *brw,
 * so the shader definitely kills pixels.
 */
prog_data.uses_kill = fp->program.UsesKill || key->alpha_test_func;
-
+   prog_data.uses_omask =
+  fp->program.Base.OutputsWritten & 
BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
prog_data.computed_depth_mode = computed_depth_mode(&fp->program);
 
/* Use ALT floating point mode for ARB programs so that 0^0 == 1. */
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 02/12] i965/fs: Honour the instruction force_sechalf and exec_size fields for FB writes.

2015-07-16 Thread Francisco Jerez
We were previously guessing the half based on the EOT flag which seems
rather gross.
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 8a3af47..e94f34e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -221,11 +221,11 @@ fs_generator::fire_fb_write(fs_inst *inst,
if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
   msg_control = 
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
else if (prog_data->dual_src_blend) {
-  if (dispatch_width == 8 || !inst->eot)
+  if (!inst->force_sechalf)
  msg_control = 
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
   else
  msg_control = 
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
-   } else if (dispatch_width == 16)
+   } else if (inst->exec_size == 16)
   msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
else
   msg_control = 
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/12] i965/fs: Define logical framebuffer write opcode.

2015-07-16 Thread Francisco Jerez
The logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
that make up the payload separately as individual sources, like:

 fb_write_logical null, color0, color1, src0_alpha,
src_depth, dst_depth, sample_mask, num_components

This patch defines the opcode and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
self-documentation of its source registers.
---
 src/mesa/drivers/dri/i965/brw_defines.h  |  1 +
 src/mesa/drivers/dri/i965/brw_fs.cpp | 34 
 src/mesa/drivers/dri/i965/brw_fs.h   |  2 +-
 src/mesa/drivers/dri/i965/brw_shader.cpp |  2 ++
 4 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 5bf53e3..65685a9 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -875,6 +875,7 @@ enum opcode {
 * instructions.
 */
FS_OPCODE_FB_WRITE = 128,
+   FS_OPCODE_FB_WRITE_LOGICAL,
FS_OPCODE_BLORP_FB_WRITE,
FS_OPCODE_REP_FB_WRITE,
SHADER_OPCODE_RCP,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index eeb6938..ae050b7 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -701,6 +701,13 @@ fs_inst::regs_read(int arg) const
  return 1;
   break;
 
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+  assert(src[6].file == IMM);
+  /* First/second FB write color. */
+  if (arg < 2)
+ components = src[6].fixed_hw_reg.dw1.ud;
+  break;
+
default:
   if (is_tex() && arg == 0 && src[0].file == GRF)
  return mlen;
@@ -3180,6 +3187,25 @@ fs_visitor::lower_integer_multiplication()
return progress;
 }
 
+static void
+lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
+const brw_wm_prog_data *prog_data,
+const brw_wm_prog_key *key,
+const fs_visitor::thread_payload &payload)
+{
+   assert(inst->src[6].file == IMM);
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg &color0 = inst->src[0];
+   const fs_reg &color1 = inst->src[1];
+   const fs_reg &src0_alpha = inst->src[2];
+   const fs_reg &src_depth = inst->src[3];
+   const fs_reg &dst_depth = inst->src[4];
+   fs_reg sample_mask = inst->src[5];
+   const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
+
+   assert(!"Not implemented");
+}
+
 bool
 fs_visitor::lower_logical_sends()
 {
@@ -3191,6 +3217,14 @@ fs_visitor::lower_logical_sends()
  .at(block, inst);
 
   switch (inst->opcode) {
+  case FS_OPCODE_FB_WRITE_LOGICAL:
+ assert(stage == MESA_SHADER_FRAGMENT);
+ lower_fb_write_logical_send(ibld, inst,
+ (const brw_wm_prog_data *)prog_data,
+ (const brw_wm_prog_key *)key,
+ payload);
+ break;
+
   default:
  continue;
   }
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index 9582648..3533d38 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -387,7 +387,7 @@ public:
fs_reg result;
 
/** Register numbers for thread payload fields. */
-   struct {
+   struct thread_payload {
   uint8_t source_depth_reg;
   uint8_t source_w_reg;
   uint8_t aa_dest_stencil_reg;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
b/src/mesa/drivers/dri/i965/brw_shader.cpp
index d66baf3..44681e4 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -533,6 +533,8 @@ brw_instruction_name(enum opcode op)
   return opcode_descs[op].name;
case FS_OPCODE_FB_WRITE:
   return "fb_write";
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+  return "fb_write_logical";
case FS_OPCODE_BLORP_FB_WRITE:
   return "blorp_fb_write";
case FS_OPCODE_REP_FB_WRITE:
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 03/12] i965/fs: Make sure that the type sizes are compatible during copy propagation.

2015-07-16 Thread Francisco Jerez
It's surprising that we weren't checking for this already.  A future
patch will cause code like the following to be emitted:

 MOV(16) tmp<1>:uw, src
 MOV(8) dst<1>:ud, tmp<8,8,1>:ud

The second MOV comes from the expansion of a LOAD_PAYLOAD header copy,
so I don't have control over its types.  Copy propagation will happily
turn this into:

 MOV(8) dst<1>:ud, src

Which has different semantics.  Fix it by preventing propagation in
cases where a single channel of the instruction would span several
channels of the copy (this requirement could in fact be relaxed if the
copy is just a trivial memcpy, but this case is unusual enough that I
don't think it matters in practice).

I'm deliberately only checking if the type of the instruction is
larger than the original, because the converse case seems to be
handled correctly already in the code below.
---
 src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp | 8 
 1 file changed, 8 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 54e9114..269bdb5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -339,6 +339,14 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, 
acp_entry *entry)
if (entry->src.stride * inst->src[arg].stride > 4)
   return false;
 
+   /* Bail if the instruction type is larger than the execution type of the
+* copy, which implies that each channel is reading multiple channels of the
+* destination of the copy, and simply replacing the sources would give a
+* program with different semantics.
+*/
+   if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
+  return false;
+
/* Bail if the result of composing both strides cannot be expressed
 * as another stride. This avoids, for example, trying to transform
 * this:
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 09/12] i965/fs: Remove the FS_OPCODE_SET_OMASK pseudo-opcode.

2015-07-16 Thread Francisco Jerez
This is now unused.
---
 src/mesa/drivers/dri/i965/brw_defines.h|  1 -
 src/mesa/drivers/dri/i965/brw_fs.h |  4 ---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 35 --
 src/mesa/drivers/dri/i965/brw_shader.cpp   |  2 --
 4 files changed, 42 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 65685a9..9099676 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -970,7 +970,6 @@ enum opcode {
FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
FS_OPCODE_DISCARD_JUMP,
-   FS_OPCODE_SET_OMASK,
FS_OPCODE_SET_SAMPLE_ID,
FS_OPCODE_SET_SIMD4X2_OFFSET,
FS_OPCODE_PACK_HALF_2x16_SPLIT,
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index 3533d38..1ae79a9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -496,10 +496,6 @@ private:
   struct brw_reg msg_data,
   unsigned msg_type);
 
-   void generate_set_omask(fs_inst *inst,
-   struct brw_reg dst,
-   struct brw_reg sample_mask);
-
void generate_set_sample_id(fs_inst *inst,
struct brw_reg dst,
struct brw_reg src0,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index e94f34e..2d5d352 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -1364,37 +1364,6 @@ fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
brw_pop_insn_state(p);
 }
 
-/* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
- * (when mask is passed as a uniform) of register mask before moving it
- * to register dst.
- */
-void
-fs_generator::generate_set_omask(fs_inst *inst,
- struct brw_reg dst,
- struct brw_reg mask)
-{
-   bool stride_8_8_1 =
-(mask.vstride == BRW_VERTICAL_STRIDE_8 &&
- mask.width == BRW_WIDTH_8 &&
- mask.hstride == BRW_HORIZONTAL_STRIDE_1);
-
-   bool stride_0_1_0 = has_scalar_region(mask);
-
-   assert(stride_8_8_1 || stride_0_1_0);
-   assert(dst.type == BRW_REGISTER_TYPE_UW);
-
-   brw_push_insn_state(p);
-   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
-
-   if (stride_8_8_1) {
-  brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
-   } else if (stride_0_1_0) {
-  brw_MOV(p, dst, retype(mask, dst.type));
-   }
-   brw_pop_insn_state(p);
-}
-
 /* Sets vstride=1, width=4, hstride=0 of register src1 during
  * the ADD instruction.
  */
@@ -2074,10 +2043,6 @@ fs_generator::generate_code(const cfg_t *cfg, int 
dispatch_width)
  brw_broadcast(p, dst, src[0], src[1]);
  break;
 
-  case FS_OPCODE_SET_OMASK:
- generate_set_omask(inst, dst, src[0]);
- break;
-
   case FS_OPCODE_SET_SAMPLE_ID:
  generate_set_sample_id(inst, dst, src[0], src[1]);
  break;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp 
b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 44681e4..36a383b 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -660,8 +660,6 @@ brw_instruction_name(enum opcode op)
case FS_OPCODE_DISCARD_JUMP:
   return "discard_jump";
 
-   case FS_OPCODE_SET_OMASK:
-  return "set_omask";
case FS_OPCODE_SET_SAMPLE_ID:
   return "set_sample_id";
case FS_OPCODE_SET_SIMD4X2_OFFSET:
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 10/12] i965/fs: Hook up SIMD lowering to unroll FB writes of unsupported width.

2015-07-16 Thread Francisco Jerez
This shouldn't have any effect because we don't emit logical
framebuffer writes yet.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 9 +
 1 file changed, 9 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index ae050b7..70fdc5e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3249,6 +3249,15 @@ get_lowered_simd_width(const struct brw_device_info 
*devinfo,
const fs_inst *inst)
 {
switch (inst->opcode) {
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+  /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
+   * here.
+   */
+  assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE ||
+ inst->exec_size <= 8);
+  /* Dual-source FB writes are unsupported in SIMD16 mode. */
+  return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
+
default:
   return inst->exec_size;
}
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/12] i965/fs: Don't attempt to copy the useless half of oMask for SIMD8 FB writes.

2015-07-16 Thread Francisco Jerez
There's no need to initialize the wrong half of oMask in the payload
when we're doing an 8-wide framebuffer write because it will be
ignored by the hardware anyway.  By doing it this way we can let the
SIMD lowering pass split the sample_mask source as a regular
per-channel source, otherwise we would have to introduce some sort of
per-instruction source query or use fs_inst::header_size for the
lowering pass to be able to find out whether some source is
header-like, and leave the source untouched in that case.

As a bonus this achieves the same purpose as the previous code without
making use of the SET_OMASK pseudo-instruction, which will be removed
in a future commit.
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 26 ++
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index b5a42b1..ba4b177 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1499,6 +1499,7 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
assert(stage == MESA_SHADER_FRAGMENT);
brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
int header_size = 2, payload_header_size;
 
/* We can potentially have a message length of up to 15, so we have to set
@@ -1536,14 +1537,24 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
}
 
if (prog_data->uses_omask) {
-  assert(this->sample_mask.file != BAD_FILE);
-  /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
-   * it's unsinged single words, one vgrf is always 16-wide.
-   */
   sources[length] = fs_reg(GRF, alloc.allocate(1),
-   BRW_REGISTER_TYPE_UW);
-  bld.exec_all().annotate("FB write oMask")
- .emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
+   BRW_REGISTER_TYPE_UD);
+
+  /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
+   * relevant.  Since it's unsigned single words one vgrf is always
+   * 16-wide, but only the lower or higher 8 channels will be used by the
+   * hardware when doing a SIMD8 write depending on whether we have
+   * selected the subspans for the first or second half respectively.
+   */
+  fs_reg sample_mask = this->sample_mask;
+  assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+  sample_mask.type = BRW_REGISTER_TYPE_UW;
+  sample_mask.stride *= 2;
+
+  ubld.annotate("FB write oMask")
+  .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
+use_2nd_half),
+   half(sample_mask, use_2nd_half));
   length++;
}
 
@@ -1590,7 +1601,6 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
if (payload.dest_depth_reg)
   sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
 
-   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
fs_inst *load;
fs_inst *write;
if (devinfo->gen >= 7) {
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 11/12] i965/fs: Implement lowering of logical framebuffer writes.

2015-07-16 Thread Francisco Jerez
This does essentially the same thing as
fs_visitor::emit_single_fb_write(), with some slight differences:

 - We don't have to worry about exec_size and use_2nd_half anymore,
   16-wide sources have already been lowered to 8-wide thanks to the
   previous commit and the manual argument unzipping is no longer
   required.

 - The src/dst_depth and sample_mask values are now explicit sources
   of the instruction instead of being taken from the visitor state
   directly.  The same goes for the kill-pixel mask that will be
   passed to the instruction explicitly as predicate.

 - Everything is now done in static functions to improve
   encapsulation.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 137 ++-
 1 file changed, 136 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 70fdc5e..787849d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3188,6 +3188,25 @@ fs_visitor::lower_integer_multiplication()
 }
 
 static void
+setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
+fs_reg *dst, fs_reg color, unsigned components)
+{
+   if (key->clamp_fragment_color) {
+  fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+  assert(color.type == BRW_REGISTER_TYPE_F);
+
+  for (unsigned i = 0; i < components; i++)
+ set_saturate(true,
+  bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
+
+  color = tmp;
+   }
+
+   for (unsigned i = 0; i < components; i++)
+  dst[i] = offset(color, bld, i);
+}
+
+static void
 lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
 const brw_wm_prog_data *prog_data,
 const brw_wm_prog_key *key,
@@ -3203,7 +3222,123 @@ lower_fb_write_logical_send(const fs_builder &bld, 
fs_inst *inst,
fs_reg sample_mask = inst->src[5];
const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;
 
-   assert(!"Not implemented");
+   /* We can potentially have a message length of up to 15, so we have to set
+* base_mrf to either 0 or 1 in order to fit in m0..m15.
+*/
+   fs_reg sources[15];
+   int header_size = 2, payload_header_size;
+   unsigned length = 0;
+
+   /* From the Sandy Bridge PRM, volume 4, page 198:
+*
+* "Dispatched Pixel Enables. One bit per pixel indicating
+*  which pixels were originally enabled when the thread was
+*  dispatched. This field is only required for the end-of-
+*  thread message and on all dual-source messages."
+*/
+   if (devinfo->gen >= 6 &&
+   (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
+   color1.file == BAD_FILE &&
+   key->nr_color_regions == 1) {
+  header_size = 0;
+   }
+
+   if (header_size != 0) {
+  assert(header_size == 2);
+  /* Allocate 2 registers for a header */
+  length += 2;
+   }
+
+   if (payload.aa_dest_stencil_reg) {
+  sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
+  bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
+ .MOV(sources[length],
+  fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+  length++;
+   }
+
+   if (prog_data->uses_omask) {
+  sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
+   BRW_REGISTER_TYPE_UD);
+
+  /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
+   * relevant.  Since it's unsigned single words one vgrf is always
+   * 16-wide, but only the lower or higher 8 channels will be used by the
+   * hardware when doing a SIMD8 write depending on whether we have
+   * selected the subspans for the first or second half respectively.
+   */
+  assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+  sample_mask.type = BRW_REGISTER_TYPE_UW;
+  sample_mask.stride *= 2;
+
+  bld.exec_all().annotate("FB write oMask")
+ .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
+   inst->force_sechalf),
+  sample_mask);
+  length++;
+   }
+
+   payload_header_size = length;
+
+   if (src0_alpha.file != BAD_FILE) {
+  /* FIXME: This is being passed at the wrong location in the payload and
+   * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+   * It's supposed to be immediately before oMask but there seems to be no
+   * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+   * requires header sources to form a contiguous segment at the beginning
+   * of the message and src0_alpha has per-channel semantics.
+   */
+  setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
+  length++;
+   }
+
+   setup_color_payload(bld, key, &sources[length], color0, components);
+   length += 4;
+
+   if (color1.file != BAD_FILE) 

[Mesa-dev] [PATCH 05/12] i965/fs: Simplify control flow in emit_single_fb_write().

2015-07-16 Thread Francisco Jerez
Flatten the if ladder to match the way that the ordering of these
fields is specified in the hardware documentation a bit more closely.
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 6564d5f..08d9abf 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1551,19 +1551,23 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
 
payload_header_size = length;
 
-   if (color1.file == BAD_FILE) {
-  if (src0_alpha.file != BAD_FILE) {
- setup_color_payload(&sources[length], src0_alpha, 1, exec_size, 
false);
- length++;
-  }
+   if (src0_alpha.file != BAD_FILE) {
+  /* FIXME: This is being passed at the wrong location in the payload and
+   * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+   * It's supposed to be immediately before oMask but there seems to be no
+   * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+   * requires header sources to form a contiguous segment at the beginning
+   * of the message and src0_alpha has per-channel semantics.
+   */
+  setup_color_payload(&sources[length], src0_alpha, 1, exec_size, false);
+  length++;
+   }
 
-  setup_color_payload(&sources[length], color0, components,
-  exec_size, use_2nd_half);
-  length += 4;
-   } else {
-  setup_color_payload(&sources[length], color0, components,
-  exec_size, use_2nd_half);
-  length += 4;
+   setup_color_payload(&sources[length], color0, components,
+   exec_size, use_2nd_half);
+   length += 4;
+
+   if (color1.file != BAD_FILE) {
   setup_color_payload(&sources[length], color1, components,
   exec_size, use_2nd_half);
   length += 4;
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 12/12] i965/fs: Reimplement emit_single_fb_write() in terms of logical framebuffer writes.

2015-07-16 Thread Francisco Jerez
The only non-trivial thing it still has to do is figure out where to
take the src/dst depth values from and predicate the instruction if
discard is in use.  The manual SIMD unrolling logic in the dual-source
case goes away because this is now handled transparently by the SIMD
lowering pass.
---
 src/mesa/drivers/dri/i965/brw_fs.h   |   5 +-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 203 +++
 2 files changed, 20 insertions(+), 188 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h 
b/src/mesa/drivers/dri/i965/brw_fs.h
index 1ae79a9..64f89d4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -290,13 +290,10 @@ public:
bool optimize_frontfacing_ternary(nir_alu_instr *instr,
  const fs_reg &result);
 
-   void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
-unsigned exec_size, bool use_2nd_half);
void emit_alpha_test();
fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
  fs_reg color1, fs_reg color2,
- fs_reg src0_alpha, unsigned components,
- unsigned exec_size, bool use_2nd_half = 
false);
+ fs_reg src0_alpha, unsigned components);
void emit_fb_writes();
void emit_urb_writes();
void emit_cs_terminate();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index ba4b177..bcfeaa0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1409,33 +1409,6 @@ fs_visitor::emit_interpolation_setup_gen6()
}
 }
 
-void
-fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
-unsigned exec_size, bool use_2nd_half)
-{
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   fs_inst *inst;
-
-   if (key->clamp_fragment_color) {
-  fs_reg tmp = vgrf(glsl_type::vec4_type);
-  assert(color.type == BRW_REGISTER_TYPE_F);
-  for (unsigned i = 0; i < components; i++) {
- inst = bld.MOV(offset(tmp, bld, i), offset(color, bld, i));
- inst->saturate = true;
-  }
-  color = tmp;
-   }
-
-   if (exec_size < dispatch_width) {
-  unsigned half_idx = use_2nd_half ? 1 : 0;
-  for (unsigned i = 0; i < components; i++)
- dst[i] = half(offset(color, bld, i), half_idx);
-   } else {
-  for (unsigned i = 0; i < components; i++)
- dst[i] = offset(color, bld, i);
-   }
-}
-
 static enum brw_conditional_mod
 cond_for_alpha_func(GLenum func)
 {
@@ -1493,146 +1466,34 @@ fs_visitor::emit_alpha_test()
 fs_inst *
 fs_visitor::emit_single_fb_write(const fs_builder &bld,
  fs_reg color0, fs_reg color1,
- fs_reg src0_alpha, unsigned components,
- unsigned exec_size, bool use_2nd_half)
+ fs_reg src0_alpha, unsigned components)
 {
assert(stage == MESA_SHADER_FRAGMENT);
brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
-   int header_size = 2, payload_header_size;
-
-   /* We can potentially have a message length of up to 15, so we have to set
-* base_mrf to either 0 or 1 in order to fit in m0..m15.
-*/
-   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 15);
-   int length = 0;
-
-   /* From the Sandy Bridge PRM, volume 4, page 198:
-*
-* "Dispatched Pixel Enables. One bit per pixel indicating
-*  which pixels were originally enabled when the thread was
-*  dispatched. This field is only required for the end-of-
-*  thread message and on all dual-source messages."
-*/
-   if (devinfo->gen >= 6 &&
-   (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
-   color1.file == BAD_FILE &&
-   key->nr_color_regions == 1) {
-  header_size = 0;
-   }
-
-   if (header_size != 0) {
-  assert(header_size == 2);
-  /* Allocate 2 registers for a header */
-  length += 2;
-   }
-
-   if (payload.aa_dest_stencil_reg) {
-  sources[length] = fs_reg(GRF, alloc.allocate(1));
-  bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
- .MOV(sources[length],
-  fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
-  length++;
-   }
-
-   if (prog_data->uses_omask) {
-  sources[length] = fs_reg(GRF, alloc.allocate(1),
-   BRW_REGISTER_TYPE_UD);
-
-  /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
-   * relevant.  Since it's unsigned single words one vgrf is always
-   * 16-wide, but only the lower or higher 8 channels will be used by the
-   *

[Mesa-dev] [PATCH 07/12] i965/fs: Move up Gen6 no16 check to emit_fb_writes().

2015-07-16 Thread Francisco Jerez
And update the comment.
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index c489010..b5a42b1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -1572,15 +1572,6 @@ fs_visitor::emit_single_fb_write(const fs_builder &bld,
}
 
if (source_depth_to_render_target) {
-  if (devinfo->gen == 6) {
-/* For outputting oDepth on gen6, SIMD8 writes have to be
- * used.  This would require SIMD8 moves of each half to
- * message regs, kind of like pre-gen5 SIMD16 FB writes.
- * Just bail on doing so for now.
- */
-no16("Missing support for simd16 depth writes on gen6\n");
-  }
-
   if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
 /* Hand over gl_FragDepth. */
 assert(this->frag_depth.file != BAD_FILE);
@@ -1643,6 +1634,17 @@ fs_visitor::emit_fb_writes()
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
 
fs_inst *inst = NULL;
+
+   if (source_depth_to_render_target && devinfo->gen == 6) {
+  /* For outputting oDepth on gen6, SIMD8 writes have to be used.  This
+   * would require SIMD8 moves of each half to message regs, e.g. by using
+   * the SIMD lowering pass.  Unfortunately this is more difficult than it
+   * sounds because the SIMD8 single-source message lacks channel selects
+   * for the second and third subspans.
+   */
+  no16("Missing support for simd16 depth writes on gen6\n");
+   }
+
if (do_dual_src) {
   const fs_builder abld = bld.annotate("FB dual-source write");
 
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/3] i965/fs: Fix stride for immediate registers.

2015-07-17 Thread Francisco Jerez
Samuel Iglesias Gonsálvez  writes:

> On 16/07/15 17:33, Francisco Jerez wrote:
>> When the width field was removed from fs_reg the BROADCAST handling
>> code in opt_algebraic() started to miss a number of trivial
>> optimization cases resulting in the ugly indirect-addressing sequence
>> to be emitted unnecessarily for some variable-indexed texturing and
>> UBO loads regardless of one of the sources of BROADCAST being
>> immediate.  Apparently the reason was that we were setting the stride
>> field to one for immediates even though they are typically uniform.
>> Width used to be set to one too which is why this optimization used to
>> work previously until the "reg.width == 1" check was removed.
>> 
>> The stride field of vector immediates is intentionally left equal to
>> one, because they are strictly speaking not uniform.  The assertion in
>> fs_generator makes sure that immediates have the expected stride as
>> consistency check.
>> ---
>>  src/mesa/drivers/dri/i965/brw_fs.cpp   | 3 +++
>>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 4 
>>  2 files changed, 7 insertions(+)
>> 
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> index ff0675d..537ccbe 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> @@ -362,6 +362,7 @@ fs_reg::fs_reg(float f)
>> init();
>> this->file = IMM;
>> this->type = BRW_REGISTER_TYPE_F;
>> +   this->stride = 0;
>> this->fixed_hw_reg.dw1.f = f;
>>  }
>>  
>> @@ -371,6 +372,7 @@ fs_reg::fs_reg(int32_t i)
>> init();
>> this->file = IMM;
>> this->type = BRW_REGISTER_TYPE_D;
>> +   this->stride = 0;
>> this->fixed_hw_reg.dw1.d = i;
>>  }
>>  
>> @@ -380,6 +382,7 @@ fs_reg::fs_reg(uint32_t u)
>> init();
>> this->file = IMM;
>> this->type = BRW_REGISTER_TYPE_UD;
>> +   this->stride = 0;
>> this->fixed_hw_reg.dw1.ud = u;
>>  }
>>  
>> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
>> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> index bae7216..8a3af47 100644
>> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> @@ -79,6 +79,10 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg)
>>brw_reg = byte_offset(brw_reg, reg->subreg_offset);
>>break;
>> case IMM:
>> +  assert(reg->stride == (reg->type == BRW_REGISTER_TYPE_V ||
>> + reg->type == BRW_REGISTER_TYPE_UV ||
>> + reg->type == BRW_REGISTER_TYPE_VF ? 1 : 0));
>> +
>
> Just nitpicking: I would put extra parenthesis to enclose the whole
> condition.
>
Hah, in fact there is a good reason to put parentheses around the whole
?: expression rather than around the condition only: The ternary operator
has one of the lowest precedences (one could argue it's dangerously low
-- it has the same precedence as the assignment operator!), which means
one is far more likely to make the opposite mistake of letting the
ternary operator bind too loosely, as would have happened in this patch
if I had put the parentheses around the condition instead:

|  assert(reg->stride == (reg->type == BRW_REGISTER_TYPE_V ||
| reg->type == BRW_REGISTER_TYPE_UV ||
| reg->type == BRW_REGISTER_TYPE_VF) ? 1 : 0);

would be parsed as:

|  assert((reg->stride == (reg->type == BRW_REGISTER_TYPE_V ||
|  reg->type == BRW_REGISTER_TYPE_UV ||
|  reg->type == BRW_REGISTER_TYPE_VF)) ? 1 : 0);

which isn't what I intended but *happens* to be equivalent by pure luck
because "boolean-expression ? 1 : 0" is in fact a no-op -- In general
it isn't. :)

The opposite case is almost impossible: Without parentheses around the
condition it will only ever bind more tightly than intended if you want
to do an assignment or use the comma operator in the condition, which is
hardly ever the case, so the parentheses around the condition are almost
always redundant.

> Reviewed-by: Samuel Iglesias Gonsálvez 
>

Thanks!

> Sam
>
>>switch (reg->type) {
>>case BRW_REGISTER_TYPE_F:
>>   brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
>> 


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/3] i965/fs: Fix stride for immediate registers.

2015-07-17 Thread Francisco Jerez
Samuel Iglesias Gonsálvez  writes:

> On Fri, 2015-07-17 at 16:33 +0300, Francisco Jerez wrote:
>> Samuel Iglesias Gonsálvez  writes:
>> 
>> > On 16/07/15 17:33, Francisco Jerez wrote:
>> >> When the width field was removed from fs_reg the BROADCAST handling
>> >> code in opt_algebraic() started to miss a number of trivial
>> >> optimization cases resulting in the ugly indirect-addressing sequence
>> >> to be emitted unnecessarily for some variable-indexed texturing and
>> >> UBO loads regardless of one of the sources of BROADCAST being
>> >> immediate.  Apparently the reason was that we were setting the stride
>> >> field to one for immediates even though they are typically uniform.
>> >> Width used to be set to one too which is why this optimization used to
>> >> work previously until the "reg.width == 1" check was removed.
>> >> 
>> >> The stride field of vector immediates is intentionally left equal to
>> >> one, because they are strictly speaking not uniform.  The assertion in
>> >> fs_generator makes sure that immediates have the expected stride as
>> >> consistency check.
>> >> ---
>> >>  src/mesa/drivers/dri/i965/brw_fs.cpp   | 3 +++
>> >>  src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 4 
>> >>  2 files changed, 7 insertions(+)
>> >> 
>> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
>> >> b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> >> index ff0675d..537ccbe 100644
>> >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp
>> >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
>> >> @@ -362,6 +362,7 @@ fs_reg::fs_reg(float f)
>> >> init();
>> >> this->file = IMM;
>> >> this->type = BRW_REGISTER_TYPE_F;
>> >> +   this->stride = 0;
>> >> this->fixed_hw_reg.dw1.f = f;
>> >>  }
>> >>  
>> >> @@ -371,6 +372,7 @@ fs_reg::fs_reg(int32_t i)
>> >> init();
>> >> this->file = IMM;
>> >> this->type = BRW_REGISTER_TYPE_D;
>> >> +   this->stride = 0;
>> >> this->fixed_hw_reg.dw1.d = i;
>> >>  }
>> >>  
>> >> @@ -380,6 +382,7 @@ fs_reg::fs_reg(uint32_t u)
>> >> init();
>> >> this->file = IMM;
>> >> this->type = BRW_REGISTER_TYPE_UD;
>> >> +   this->stride = 0;
>> >> this->fixed_hw_reg.dw1.ud = u;
>> >>  }
>> >>  
>> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
>> >> b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> >> index bae7216..8a3af47 100644
>> >> --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> >> +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
>> >> @@ -79,6 +79,10 @@ brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg)
>> >>brw_reg = byte_offset(brw_reg, reg->subreg_offset);
>> >>break;
>> >> case IMM:
>> >> +  assert(reg->stride == (reg->type == BRW_REGISTER_TYPE_V ||
>> >> + reg->type == BRW_REGISTER_TYPE_UV ||
>> >> + reg->type == BRW_REGISTER_TYPE_VF ? 1 : 0));
>> >> +
>> >
>> > Just nitpicking: I would put extra parenthesis to enclose the whole
>> > condition.
>> >
>> Hah, in fact there is a good reason to put parentheses around the whole
>> ?: expression rather than around the condition only: The ternary operator
>> has one of the lowest precedences (one could argue it's dangerously low
>> -- it has the same precedence as the assignment operator!), which means
>> one is far more likely to make the opposite mistake of letting the
>> ternary operator bind too loosely, as would have happened in this patch
>> if I had put the parentheses around the condition instead:
>> 
>> |  assert(reg->stride == (reg->type == BRW_REGISTER_TYPE_V ||
>> | reg->type == BRW_REGISTER_TYPE_UV ||
>> | reg->type == BRW_REGISTER_TYPE_VF) ? 1 : 0);
>> 
>> would be parsed as:
>> 
>> |  assert((reg->stride == (reg->type == BRW_REGISTER_TYPE_V ||
>> |  reg->type == BRW_REGISTER_TYPE_UV ||
>> |  reg->type == BRW_REGISTER_TYPE_VF)) ? 1 : 0);
>> 
>> which isn't what I inten

Re: [Mesa-dev] [PATCH] radeonsi: don't return NULL fence if no fence is available

2015-07-18 Thread Francisco Jerez
Michel Dänzer  writes:

> On 17.07.2015 06:03, Marek Olšák wrote:
>> From: Marek Olšák 
>> 
>> An alternative (and ugly) solution to the current clover issue.
>
> How about something like this instead? (Compile tested only)
>

I'm rather unfamiliar with the radeonsi pipe driver code so I should
probably hold myself back from giving you an R-b, but I must say that
this seems much cleaner than the last two solutions proposed so
far... :)

>
> diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c 
> b/src/gallium/drivers/radeonsi/si_hw_context.c
> index 08cc08e..dc8702e 100644
> --- a/src/gallium/drivers/radeonsi/si_hw_context.c
> +++ b/src/gallium/drivers/radeonsi/si_hw_context.c
> @@ -84,7 +84,8 @@ void si_context_gfx_flush(void *context, unsigned flags,
> struct radeon_winsys_cs *cs = ctx->b.rings.gfx.cs;
> struct radeon_winsys *ws = ctx->b.ws;
>
> -   if (cs->cdw == ctx->b.initial_gfx_cs_size) {
> +   if (cs->cdw == ctx->b.initial_gfx_cs_size &&
> +   (!fence || ctx->last_gfx_fence)) {
> if (fence)
> ws->fence_reference(fence, ctx->last_gfx_fence);
> if (!(flags & RADEON_FLUSH_ASYNC))
>
>
> -- 
> Earthling Michel Dänzer   |   http://www.amd.com
> Libre software enthusiast | Mesa and X developer
> ___
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-dev


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 02/12] i965/fs: Use exec_size instead of dispatch_width to determine the message variant.

2015-07-18 Thread Francisco Jerez
dispatch_width is global for a single compilation and doesn't
necessarily match the desired execution width if we had to lower the
original full-width instruction due to hardware limitations.  These
were all inside a Gen4-specific branch so this patch shouldn't have
any effect on more recent hardware.
---
 src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index a176fcf..811fb73 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -655,7 +655,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
 /* Note that G45 and older determines shadow compare and dispatch width
  * from message length for most messages.
  */
- if (dispatch_width == 8) {
+ if (inst->exec_size == 8) {
 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
 if (inst->shadow_compare) {
assert(inst->mlen == 6);
@@ -674,7 +674,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
 break;
   case FS_OPCODE_TXB:
 if (inst->shadow_compare) {
-assert(dispatch_width == 8);
+assert(inst->exec_size == 8);
assert(inst->mlen == 6);
msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
 } else {
@@ -685,7 +685,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
 break;
   case SHADER_OPCODE_TXL:
 if (inst->shadow_compare) {
-assert(dispatch_width == 8);
+assert(inst->exec_size == 8);
assert(inst->mlen == 6);
msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
 } else {
@@ -696,7 +696,7 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg 
dst, struct brw_reg src
 break;
   case SHADER_OPCODE_TXD:
 /* There is no sample_d_c message; comparisons are done manually */
- assert(dispatch_width == 8);
+ assert(inst->exec_size == 8);
 assert(inst->mlen == 7 || inst->mlen == 10);
 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
 break;
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 03/12] i965/fs: Fix opt_zero_samples() for texturing ops not matching dispatch_width.

2015-07-18 Thread Francisco Jerez
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 6afb9fe..c31a0e1 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -2145,11 +2145,11 @@ fs_visitor::opt_zero_samples()
* "Parameter 0 is required except for the sampleinfo message, which
*  has no parameter 0"
*/
-  while (inst->mlen > inst->header_size + dispatch_width / 8 &&
+  while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
  load_payload->src[(inst->mlen - inst->header_size) /
-   (dispatch_width / 8) +
+   (inst->exec_size / 8) +
inst->header_size - 1].is_zero()) {
- inst->mlen -= dispatch_width / 8;
+ inst->mlen -= inst->exec_size / 8;
  progress = true;
   }
}
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 07/12] i965/fs: Implement lowering of logical texturing opcodes on Gen5-6.

2015-07-18 Thread Francisco Jerez
This should be largely equivalent to emit_texture_gen5() except for
slight code style changes and the use of i965 opcodes instead of the
ir_texture_opcode enum; see "i965/fs: Implement lowering of logical
texturing opcodes on Gen7+." for the mapping between them.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 103 +++
 1 file changed, 103 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 7387ca5..5233ac3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3369,6 +3369,104 @@ lower_fb_write_logical_send(const fs_builder &bld, 
fs_inst *inst,
inst->header_size = header_size;
 }
 
+static void
+lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode 
op,
+fs_reg coordinate,
+const fs_reg &shadow_c,
+fs_reg lod, fs_reg lod2,
+const fs_reg &sample_index,
+const fs_reg &sampler,
+const fs_reg &offset_value,
+unsigned coord_components,
+unsigned grad_components)
+{
+   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
+   fs_reg msg_coords = message;
+   unsigned header_size = 0;
+
+   if (offset_value.file != BAD_FILE) {
+  /* The offsets set up by the visitor are in the m1 header, so we can't
+   * go headerless.
+   */
+  header_size = 1;
+  message.reg--;
+   }
+
+   for (unsigned i = 0; i < coord_components; i++) {
+  bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type), coordinate);
+  coordinate = offset(coordinate, bld, 1);
+   }
+   fs_reg msg_end = offset(msg_coords, bld, coord_components);
+   fs_reg msg_lod = offset(msg_coords, bld, 4);
+
+   if (shadow_c.file != BAD_FILE) {
+  fs_reg msg_shadow = msg_lod;
+  bld.MOV(msg_shadow, shadow_c);
+  msg_lod = offset(msg_shadow, bld, 1);
+  msg_end = msg_lod;
+   }
+
+   switch (op) {
+   case SHADER_OPCODE_TXL:
+   case FS_OPCODE_TXB:
+  bld.MOV(msg_lod, lod);
+  msg_end = offset(msg_lod, bld, 1);
+  break;
+   case SHADER_OPCODE_TXD:
+  /**
+   *  P   =  u,v,r
+   * dPdx = dudx, dvdx, drdx
+   * dPdy = dudy, dvdy, drdy
+   *
+   * Load up these values:
+   * - dudx   dudy   dvdx   dvdy   drdx   drdy
+   * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
+   */
+  msg_end = msg_lod;
+  for (unsigned i = 0; i < grad_components; i++) {
+ bld.MOV(msg_end, lod);
+ lod = offset(lod, bld, 1);
+ msg_end = offset(msg_end, bld, 1);
+
+ bld.MOV(msg_end, lod2);
+ lod2 = offset(lod2, bld, 1);
+ msg_end = offset(msg_end, bld, 1);
+  }
+  break;
+   case SHADER_OPCODE_TXS:
+  msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
+  bld.MOV(msg_lod, lod);
+  msg_end = offset(msg_lod, bld, 1);
+  break;
+   case SHADER_OPCODE_TXF:
+  msg_lod = offset(msg_coords, bld, 3);
+  bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
+  msg_end = offset(msg_lod, bld, 1);
+  break;
+   case SHADER_OPCODE_TXF_CMS:
+  msg_lod = offset(msg_coords, bld, 3);
+  /* lod */
+  bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
+  /* sample index */
+  bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), 
sample_index);
+  msg_end = offset(msg_lod, bld, 2);
+  break;
+   default:
+  break;
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = sampler;
+   inst->resize_sources(2);
+   inst->base_mrf = message.reg;
+   inst->mlen = msg_end.reg - message.reg;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
 static bool
 is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
 {
@@ -3604,6 +3702,11 @@ lower_sampler_logical_send(const fs_builder &bld, 
fs_inst *inst, opcode op)
   shadow_c, lod, lod2, sample_index,
   mcs, sampler, offset_value,
   coord_components, grad_components);
+   } else if (devinfo->gen >= 5) {
+  lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
+  shadow_c, lod, lod2, sample_index,
+  sampler, offset_value,
+  coord_components, grad_components);
} else {
   assert(!"Not implemented");
}
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 09/12] i965/fs: Hook up SIMD lowering to handle texturing opcodes of unsupported width.

2015-07-18 Thread Francisco Jerez
This should match the set of cases in which we currently call fail()
or no16() from the emit_texture_*() methods and the ones in which
emit_texture_gen4() enables the SIMD16 workaround.

Hint for reviewers: It's not a big deal if I happen to have missed
some case here; it will just lead to an assertion failure down the
road which is easily fixable.  However, being stricter than necessary
won't cause any visible breakage; it would just decrease performance
silently due to the unnecessary message splitting, so feel free to
double-check that all cases listed here already cause a SIMD8/16
fall-back with the current texturing code -- You may want to skip over
the Gen5-6 cases though if you don't have pencil and paper at hand.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 043d9e9..f291202 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3918,6 +3918,33 @@ get_lowered_simd_width(const struct brw_device_info 
*devinfo,
   /* Dual-source FB writes are unsupported in SIMD16 mode. */
   return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);
 
+   case SHADER_OPCODE_TXD_LOGICAL:
+  /* TXD is unsupported in SIMD16 mode. */
+  return 8;
+
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
+  /* gather4_po_c is unsupported in SIMD16 mode. */
+  const fs_reg &shadow_c = inst->src[1];
+  return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
+   }
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL: {
+  /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions, and
+   * Gen4-6 don't support TXL and TXB with shadow comparison in SIMD16
+   * mode.
+   */
+  const fs_reg &shadow_c = inst->src[1];
+  return (devinfo->gen == 4 && shadow_c.file == BAD_FILE ? 16 :
+  devinfo->gen < 7 && shadow_c.file != BAD_FILE ? 8 :
+  inst->exec_size);
+   }
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+  /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
+   * messages.  Use SIMD16 instead.
+   */
+  return (devinfo->gen == 4 ? 16 : inst->exec_size);
+
default:
   return inst->exec_size;
}
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 06/12] i965/fs: Lower SHADER_OPCODE_TXF_UMS/MCS_LOGICAL too on Gen7+.

2015-07-18 Thread Francisco Jerez
These weren't being handled by emit_texture_gen7() but we can easily
lower them here for consistency with other texturing opcodes.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 06cfc97..7387ca5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3498,12 +3498,18 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, 
fs_inst *inst, opcode op,
   coordinate_done = true;
   break;
case SHADER_OPCODE_TXF_CMS:
-  bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
-  length++;
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS:
+  if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+ length++;
+  }
 
-  /* Data from the multisample control surface. */
-  bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
-  length++;
+  if (op == SHADER_OPCODE_TXF_CMS) {
+ /* Data from the multisample control surface. */
+ bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+ length++;
+  }
 
   /* There is no offsetting for this message; just copy in the integer
* texture coordinates.
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 01/12] i965/fs: Define logical texture sampling opcodes.

2015-07-18 Thread Francisco Jerez
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:

 tex_logical dst, coordinates, shadow_c, lod, lod2,
  sample_index, mcs, sampler, offset,
  num_coordinate_components, num_grad_components

This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
---
 src/mesa/drivers/dri/i965/brw_defines.h  | 12 +
 src/mesa/drivers/dri/i965/brw_fs.cpp | 92 
 src/mesa/drivers/dri/i965/brw_shader.cpp | 25 +
 3 files changed, 129 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h 
b/src/mesa/drivers/dri/i965/brw_defines.h
index 9099676..193fcbe 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -890,17 +890,29 @@ enum opcode {
SHADER_OPCODE_COS,
 
SHADER_OPCODE_TEX,
+   SHADER_OPCODE_TEX_LOGICAL,
SHADER_OPCODE_TXD,
+   SHADER_OPCODE_TXD_LOGICAL,
SHADER_OPCODE_TXF,
+   SHADER_OPCODE_TXF_LOGICAL,
SHADER_OPCODE_TXL,
+   SHADER_OPCODE_TXL_LOGICAL,
SHADER_OPCODE_TXS,
+   SHADER_OPCODE_TXS_LOGICAL,
FS_OPCODE_TXB,
+   FS_OPCODE_TXB_LOGICAL,
SHADER_OPCODE_TXF_CMS,
+   SHADER_OPCODE_TXF_CMS_LOGICAL,
SHADER_OPCODE_TXF_UMS,
+   SHADER_OPCODE_TXF_UMS_LOGICAL,
SHADER_OPCODE_TXF_MCS,
+   SHADER_OPCODE_TXF_MCS_LOGICAL,
SHADER_OPCODE_LOD,
+   SHADER_OPCODE_LOD_LOGICAL,
SHADER_OPCODE_TG4,
+   SHADER_OPCODE_TG4_LOGICAL,
SHADER_OPCODE_TG4_OFFSET,
+   SHADER_OPCODE_TG4_OFFSET_LOGICAL,
 
/**
 * Combines multiple sources of size 1 into a larger virtual GRF.
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 503d4d8..6afb9fe 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -711,6 +711,31 @@ fs_inst::regs_read(int arg) const
  components = src[6].fixed_hw_reg.dw1.ud;
   break;
 
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+  assert(src[8].file == IMM && src[9].file == IMM);
+  /* Texture coordinates. */
+  if (arg == 0)
+ components = src[8].fixed_hw_reg.dw1.ud;
+  /* Texture derivatives/LOD. */
+  else if (arg == 2 || arg == 3)
+ components = (opcode == SHADER_OPCODE_TXD_LOGICAL ?
+   src[9].fixed_hw_reg.dw1.ud : 1);
+  /* Texture offset. */
+  else if (arg == 7)
+ components = 2;
+  break;
+
default:
   if (is_tex() && arg == 0 && src[0].file == GRF)
  return mlen;
@@ -3344,6 +3369,25 @@ lower_fb_write_logical_send(const fs_builder &bld, 
fs_inst *inst,
inst->header_size = header_size;
 }
 
+static void
+lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+{
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg &coordinate = inst->src[0];
+   const fs_reg &shadow_c = inst->src[1];
+   const fs_reg &lod = inst->src[2];
+   const fs_reg &lod2 = inst->src[3];
+   const fs_reg &sample_index = inst->src[4];
+   const fs_reg &mcs = inst->src[5];
+   const fs_reg &sampler = inst->src[6];
+   const fs_reg &offset_value = inst->src[7];
+   assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
+   const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
+   const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
+
+   assert(!"Not implemented");
+}
+
 bool
 fs_visitor::lower_logical_sends()
 {
@@ -3363,6 +3407,54 @@ fs_visitor::lower_logical_sends()
  payload);
  break;
 
+  case SHADER_OPCODE_TEX_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
+ break;
+
+  case SHADER_OPCODE_TXD_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
+ break;
+
+  case SHADER_OPCODE_TXF_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
+ break;
+
+  case SHADER_OPCODE_TXL_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
+ break;
+
+  case SHADER_OPCODE_TXS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
+ break;
+
+  case FS_OPCODE_TXB_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
+ break;
+
+  case SHADER_OPCODE_TXF_CMS_LOGICAL:
+ lower_sampler_logical_send(ibld, inst, SHADER_

[Mesa-dev] [PATCH 05/12] i965/fs: Implement lowering of logical texturing opcodes on Gen7+.

2015-07-18 Thread Francisco Jerez
This should be largely equivalent to emit_texture_gen7() except that
we now get i965 sampling opcodes directly rather than
ir_texture_opcode enum values.  The mapping is as follows:

 - ir_tex -> SHADER_OPCODE_TEX
 - ir_txb -> FS_OPCODE_TXB
 - ir_txl -> SHADER_OPCODE_TXL
 - ir_txd -> SHADER_OPCODE_TXD
 - ir_txf -> SHADER_OPCODE_TXF
 - ir_txf_ms -> SHADER_OPCODE_TXF_CMS
 - ir_txs -> SHADER_OPCODE_TXS
 - ir_query_levels -> SHADER_OPCODE_TXS too, the visitor will make
  sure that the provided lod value is zero in this
  case.
 - ir_lod -> SHADER_OPCODE_LOD
 - ir_tg4 -> SHADER_OPCODE_TG4_OFFSET if the offset value is not
 immediate, SHADER_OPCODE_TG4 otherwise.

Other than that there are only minor changes and style fixes like the
implementation now being factored out in static functions to improve
encapsulation.
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 217 ++-
 1 file changed, 216 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index c31a0e1..06cfc97 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3369,6 +3369,214 @@ lower_fb_write_logical_send(const fs_builder &bld, 
fs_inst *inst,
inst->header_size = header_size;
 }
 
+static bool
+is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
+{
+   if (devinfo->gen < 8 && !devinfo->is_haswell)
+  return false;
+
+   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+}
+
+static void
+lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode 
op,
+fs_reg coordinate,
+const fs_reg &shadow_c,
+fs_reg lod, fs_reg lod2,
+const fs_reg &sample_index,
+const fs_reg &mcs, const fs_reg &sampler,
+fs_reg offset_value,
+unsigned coord_components,
+unsigned grad_components)
+{
+   const brw_device_info *devinfo = bld.shader->devinfo;
+   int reg_width = bld.dispatch_width() / 8;
+   unsigned header_size = 0, length = 0;
+   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
+   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
+  sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
+   offset_value.file != BAD_FILE ||
+   is_high_sampler(devinfo, sampler)) {
+  /* For general texture offsets (no txf workaround), we need a header to
+   * put them in.  Note that we're only reserving space for it in the
+   * message payload as it will be initialized implicitly by the
+   * generator.
+   *
+   * TG4 needs to place its channel select in the header, for interaction
+   * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
+   * larger sampler numbers we need to offset the Sampler State Pointer in
+   * the header.
+   */
+  header_size = 1;
+  sources[0] = fs_reg();
+  length++;
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+  bld.MOV(sources[length], shadow_c);
+  length++;
+   }
+
+   bool coordinate_done = false;
+
+   /* The sampler can only meaningfully compute LOD for fragment shader
+* messages. For all other stages, we change the opcode to TXL and
+* hardcode the LOD to 0.
+*/
+   if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
+   op == SHADER_OPCODE_TEX) {
+  op = SHADER_OPCODE_TXL;
+  lod = fs_reg(0.0f);
+   }
+
+   /* Set up the LOD info */
+   switch (op) {
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXL:
+  bld.MOV(sources[length], lod);
+  length++;
+  break;
+   case SHADER_OPCODE_TXD:
+  /* TXD should have been lowered in SIMD16 mode. */
+  assert(bld.dispatch_width() == 8);
+
+  /* Load dPdx and the coordinate together:
+   * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+   */
+  for (unsigned i = 0; i < coord_components; i++) {
+ bld.MOV(sources[length], coordinate);
+ coordinate = offset(coordinate, bld, 1);
+ length++;
+
+ /* For cube map array, the coordinate is (u,v,r,ai) but there are
+  * only derivatives for (u, v, r).
+  */
+ if (i < grad_components) {
+bld.MOV(sources[length], lod);
+lod = offset(lod, bld, 1);
+length++;
+
+bld.MOV(sources[length], lod2);
+lod2 = offset(lod2, bld, 1);
+length++;
+ }
+  }
+
+  coordinate_done = true;
+  break;
+   case SHADER_OPCODE_TXS:
+  bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+  length++;
+  break;
+   case SHADER_OPCODE_TXF:
+  /* Unfortunately, the parameters for LD are intermixed: u, lo

[Mesa-dev] [PATCH 04/12] i965/fs: Pass a BAD_FILE header source to LOAD_PAYLOAD in emit_texture_gen7().

2015-07-18 Thread Francisco Jerez
So that it's left uninitialized by LOAD_PAYLOAD, we only need to
reserve space for it in the message since it will be initialized
implicitly by the generator.
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 522e13e..89fcc49 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -473,8 +473,9 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg 
dst,
if (op == ir_tg4 || offset_value.file != BAD_FILE ||
is_high_sampler(devinfo, sampler)) {
   /* For general texture offsets (no txf workaround), we need a header to
-   * put them in.  Note that for SIMD16 we're making space for two actual
-   * hardware registers here, so the emit will have to fix up for this.
+   * put them in.  Note that we're only reserving space for it in the
+   * message payload as it will be initialized implicitly by the
+   * generator.
*
* * ir4_tg4 needs to place its channel select in the header,
* for interaction with ARB_texture_swizzle
@@ -483,7 +484,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg 
dst,
* need to offset the Sampler State Pointer in the header.
*/
   header_size = 1;
-  sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+  sources[0] = fs_reg();
   length++;
}
 
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 08/12] i965/fs: Implement lowering of logical texturing opcodes on Gen4.

2015-07-18 Thread Francisco Jerez
Unlike its Gen5 and Gen7 counterparts, this patch isn't a plain
refactor of the previous Gen4 texturing code; it's more of a rewrite
largely based on emit_texture_gen4_simd16().  The reason is that, on
the one hand, the original emit_texture_gen4() code didn't seem easily
fixable to be SIMD width-invariant and had plenty of clutter to
support SIMD-width workarounds which are no longer required.  On the
other hand, emit_texture_gen4_simd16() was missing a number of
SIMD8-only opcodes.  This should generalize both and roughly match
their current behaviour where there is overlap.

Incidentally this will fix the following piglits on Gen4:

arb_shader_texture_lod.execution.arb_shader_texture_lod-texgrad
arb_shader_texture_lod.execution.tex-miplevel-selection *gradarb 2d
arb_shader_texture_lod.execution.tex-miplevel-selection *gradarb 3d
arb_shader_texture_lod.execution.tex-miplevel-selection *projgradarb 2d
arb_shader_texture_lod.execution.tex-miplevel-selection *projgradarb 
2d_projvec4
arb_shader_texture_lod.execution.tex-miplevel-selection *projgradarb 3d
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 108 ++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp 
b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5233ac3..043d9e9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3370,6 +3370,110 @@ lower_fb_write_logical_send(const fs_builder &bld, 
fs_inst *inst,
 }
 
 static void
+lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode 
op,
+const fs_reg &coordinate,
+const fs_reg &shadow_c,
+const fs_reg &lod, const fs_reg &lod2,
+const fs_reg &sampler,
+unsigned coord_components,
+unsigned grad_components)
+{
+   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
+ op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
+   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
+   fs_reg msg_end = msg_begin;
+
+   /* g0 header. */
+   msg_end = offset(msg_end, bld.group(8, 0), 1);
+
+   for (unsigned i = 0; i < coord_components; i++)
+  bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
+  offset(coordinate, bld, i));
+
+   msg_end = offset(msg_end, bld, coord_components);
+
+   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
+* require all three components to be present and zero if they are unused.
+*/
+   if (coord_components > 0 &&
+   (has_lod || shadow_c.file != BAD_FILE ||
+(op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
+  for (unsigned i = coord_components; i < 3; i++)
+ bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));
+
+  msg_end = offset(msg_end, bld, 3 - coord_components);
+   }
+
+   if (op == SHADER_OPCODE_TXD) {
+  /* TXD unsupported in SIMD16 mode. */
+  assert(bld.dispatch_width() == 8);
+
+  /* the slots for u and v are always present, but r is optional */
+  if (coord_components < 2)
+ msg_end = offset(msg_end, bld, 2 - coord_components);
+
+  /*  P   = u, v, r
+   * dPdx = dudx, dvdx, drdx
+   * dPdy = dudy, dvdy, drdy
+   *
+   * 1-arg: Does not exist.
+   *
+   * 2-arg: dudx   dvdx   dudy   dvdy
+   *dPdx.x dPdx.y dPdy.x dPdy.y
+   *m4 m5 m6 m7
+   *
+   * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
+   *dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
+   *m5 m6 m7 m8 m9 m10
+   */
+  for (unsigned i = 0; i < grad_components; i++)
+ bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
+
+  msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+
+  for (unsigned i = 0; i < grad_components; i++)
+ bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
+
+  msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+   }
+
+   if (has_lod) {
+  /* Bias/LOD with shadow comparitor is unsupported in SIMD16 -- *Without*
+   * shadow comparitor (including RESINFO) it's unsupported in SIMD8 mode.
+   */
+  assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
+ bld.dispatch_width() == 16);
+
+  const brw_reg_type type =
+ (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
+  BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
+  bld.MOV(retype(msg_end, type), lod);
+  msg_end = offset(msg_end, bld, 1);
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+  if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
+ /* There's no plain shadow compare message, so we use shadow
+  * compare with a bias of 0.0.
+  */
+ bld.MOV(msg_end, fs_reg(0.0f));
+ msg

[Mesa-dev] [PATCH 10/12] i965/fs: Reimplement emit_texture() in terms of logical send messages.

2015-07-18 Thread Francisco Jerez
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 66 +---
 1 file changed, 49 insertions(+), 17 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 
b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 89fcc49..4011639 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -861,6 +861,14 @@ fs_visitor::emit_texture(ir_texture_opcode op,
   }
}
 
+   if (op == ir_query_levels) {
+  /* textureQueryLevels() is implemented in terms of TXS so we need to
+   * pass a valid LOD argument.
+   */
+  assert(lod.file == BAD_FILE);
+  lod = fs_reg(0u);
+   }
+
if (coordinate.file != BAD_FILE) {
   /* FINISHME: Texture coordinate rescaling doesn't work with non-constant
* samplers.  This should only be a problem with GL_CLAMP on Gen7.
@@ -873,26 +881,50 @@ fs_visitor::emit_texture(ir_texture_opcode op,
 * samples, so don't worry about them.
 */
fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
+   const fs_reg srcs[] = {
+  coordinate, shadow_c, lod, lod2,
+  sample_index, mcs, sampler_reg, offset_value,
+  fs_reg(coord_components), fs_reg(grad_components)
+   };
+   enum opcode opcode;
 
-   if (devinfo->gen >= 7) {
-  inst = emit_texture_gen7(op, dst, coordinate, coord_components,
-   shadow_c, lod, lod2, grad_components,
-   sample_index, mcs, sampler_reg,
-   offset_value);
-   } else if (devinfo->gen >= 5) {
-  inst = emit_texture_gen5(op, dst, coordinate, coord_components,
-   shadow_c, lod, lod2, grad_components,
-   sample_index, sampler,
-   offset_value.file != BAD_FILE);
-   } else if (dispatch_width == 16) {
-  inst = emit_texture_gen4_simd16(op, dst, coordinate, coord_components,
-  shadow_c, lod, sampler);
-   } else {
-  inst = emit_texture_gen4(op, dst, coordinate, coord_components,
-   shadow_c, lod, lod2, grad_components,
-   sampler);
+   switch (op) {
+   case ir_tex:
+  opcode = SHADER_OPCODE_TEX_LOGICAL;
+  break;
+   case ir_txb:
+  opcode = FS_OPCODE_TXB_LOGICAL;
+  break;
+   case ir_txl:
+  opcode = SHADER_OPCODE_TXL_LOGICAL;
+  break;
+   case ir_txd:
+  opcode = SHADER_OPCODE_TXD_LOGICAL;
+  break;
+   case ir_txf:
+  opcode = SHADER_OPCODE_TXF_LOGICAL;
+  break;
+   case ir_txf_ms:
+  opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
+  break;
+   case ir_txs:
+   case ir_query_levels:
+  opcode = SHADER_OPCODE_TXS_LOGICAL;
+  break;
+   case ir_lod:
+  opcode = SHADER_OPCODE_LOD_LOGICAL;
+  break;
+   case ir_tg4:
+  opcode = (offset_value.file != BAD_FILE && offset_value.file != IMM ?
+SHADER_OPCODE_TG4_OFFSET_LOGICAL : SHADER_OPCODE_TG4_LOGICAL);
+  break;
+   default:
+  unreachable("not reached");
}
 
+   inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
+   inst->regs_written = 4 * dispatch_width / 8;
+
if (shadow_c.file != BAD_FILE)
   inst->shadow_compare = true;
 
-- 
2.4.3

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


  1   2   3   4   5   6   7   8   9   10   >