Re: [Mesa-dev] [PATCH 5/6] nvc0/ir: add support for PK2H/UP2H

2016-01-03 Thread Samuel Pitoiset

Reviewed-by: Samuel Pitoiset 

On 01/03/2016 01:38 AM, Ilia Mirkin wrote:

Signed-off-by: Ilia Mirkin 
---
  .../drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp |  1 +
  .../drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp  |  5 -
  .../drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp  | 23 ++
  src/gallium/drivers/nouveau/nvc0/nvc0_screen.c |  2 +-
  4 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index e9ddd36..ec74e7a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -740,6 +740,7 @@ CodeEmitterGM107::emitF2F()
 emitCC   (0x2f);
 emitField(0x2d, 1, (insn->op == OP_NEG) || insn->src(0).mod.neg());
 emitFMZ  (0x2c, 1);
+   emitField(0x29, 1, insn->subOp);
 emitRND  (0x27, rnd, 0x2a);
 emitField(0x0a, 2, util_logbase2(typeSizeof(insn->sType)));
 emitField(0x08, 2, util_logbase2(typeSizeof(insn->dType)));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 1d4f0d9..0b28047 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1030,7 +1030,10 @@ CodeEmitterNVC0::emitCVT(Instruction *i)

// for 8/16 source types, the byte/word is in subOp. word 1 is
// represented as 2.
-  code[1] |= i->subOp << 0x17;
+  if (!isFloatType(i->sType))
+ code[1] |= i->subOp << 0x17;
+  else
+ code[1] |= i->subOp << 0x18;

if (sat)
   code[0] |= 0x20;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp 
b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index beb67fe..e0b9435 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -319,6 +319,10 @@ unsigned int Instruction::srcMask(unsigned int s) const
   x |= 2;
return x;
 }
+   case TGSI_OPCODE_PK2H:
+  return 0x3;
+   case TGSI_OPCODE_UP2H:
+  return 0x1;
 default:
break;
 }
@@ -452,6 +456,7 @@ nv50_ir::DataType Instruction::inferSrcType() const
 case TGSI_OPCODE_ATOMUMAX:
 case TGSI_OPCODE_UBFE:
 case TGSI_OPCODE_UMSB:
+   case TGSI_OPCODE_UP2H:
return nv50_ir::TYPE_U32;
 case TGSI_OPCODE_I2F:
 case TGSI_OPCODE_I2D:
@@ -516,10 +521,12 @@ nv50_ir::DataType Instruction::inferDstType() const
 case TGSI_OPCODE_DSGE:
 case TGSI_OPCODE_DSLT:
 case TGSI_OPCODE_DSNE:
+   case TGSI_OPCODE_PK2H:
return nv50_ir::TYPE_U32;
 case TGSI_OPCODE_I2F:
 case TGSI_OPCODE_U2F:
 case TGSI_OPCODE_D2F:
+   case TGSI_OPCODE_UP2H:
return nv50_ir::TYPE_F32;
 case TGSI_OPCODE_I2D:
 case TGSI_OPCODE_U2D:
@@ -2807,6 +2814,22 @@ Converter::handleInstruction(const struct 
tgsi_full_instruction *insn)
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
   mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c));
break;
+   case TGSI_OPCODE_PK2H:
+  val0 = getScratch();
+  val1 = getScratch();
+  mkCvt(OP_CVT, TYPE_F16, val0, TYPE_F32, fetchSrc(0, 0));
+  mkCvt(OP_CVT, TYPE_F16, val1, TYPE_F32, fetchSrc(0, 1));
+  mkOp3(OP_INSBF, TYPE_U32, dst0[0], val1, mkImm(0x1010), val0);
+  break;
+   case TGSI_OPCODE_UP2H:
+  src0 = fetchSrc(0, 0);
+  if (dst0[0])
+ mkCvt(OP_CVT, TYPE_F32, dst0[0], TYPE_F16, src0);
+  if (dst0[1]) {
+ geni = mkCvt(OP_CVT, TYPE_F32, dst0[1], TYPE_F16, src0);
+ geni->subOp = 1;
+  }
+  break;
 case TGSI_OPCODE_EMIT:
/* export the saved viewport index */
if (viewport != NULL) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c 
b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 58b712e..43f6164 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -197,6 +197,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
 case PIPE_CAP_DRAW_PARAMETERS:
 case PIPE_CAP_MULTI_DRAW_INDIRECT:
 case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
return 1;
 case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
return (class_3d >= NVE4_3D_CLASS) ? 1 : 0;
@@ -219,7 +220,6 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum 
pipe_cap param)
 case PIPE_CAP_VERTEXID_NOBASE:
 case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
 case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
-   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
return 0;

 case PIPE_CAP_VENDOR_ID:


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 5/9] gallium/radeon: always add +DumpCode to the LLVM target machine for LLVM <= 3.5

2016-01-03 Thread Marek Olšák
On Sat, Jan 2, 2016 at 11:01 PM, Nicolai Hähnle  wrote:
> What's the reason for always having +DumpCode? Generating the assembly is
> some overhead that's usually unnecessary. Even if it's a small part of the
> profiles I've seen, it still seems like a natural thing to just skip. From
> what I can tell it should be dependent on any of the shader dumping flags +
> DBG_CHECK_VM being set. In any case, I suppose that would be for a separate
> commit.

Yeah, I agree that we shouldn't always generate the assembly string.

However, there is one case where we probably want to dump it always:
when a shader cache is used. We'll have only one chance to compile a
shader with a shader cache. If we decide not to generate the assembly,
we won't have it for all subsequent uses of the shader (even in other
processes).

Marek
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 75165] compute.c:464:49: error: function definition is not allowed here

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=75165

Samuel Pitoiset  changed:

   What|Removed |Added

 Status|NEW |RESOLVED
 Resolution|--- |FIXED

--- Comment #4 from Samuel Pitoiset  ---
Fixed with "gallium/tests: fix build with clang compiler".

http://cgit.freedesktop.org/mesa/mesa/commit/?id=6a49fcfb1f28b563b89f2b37e82d9f87c0671228

-- 
You are receiving this mail because:
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/5] configure.ac: Detect if running on POWER8 arch

2016-01-03 Thread Oded Gabbay
On Thu, Dec 31, 2015 at 8:48 PM, Roland Scheidegger  wrote:
> Am 31.12.2015 um 10:30 schrieb Oded Gabbay:
>> On Wed, Dec 30, 2015 at 5:41 PM, Roland Scheidegger  
>> wrote:
>>> Am 30.12.2015 um 10:46 schrieb Oded Gabbay:
 On Wed, Dec 30, 2015 at 1:11 AM, Roland Scheidegger  
 wrote:
>
> So, if I see that right, you will automatically generate binaries using
> power8 instructions if compiled on power8 capable box, which then won't
> run on boxes not supporting power8? Is that really what you want?
> Maybe some runtime detection would be a good idea (though I don't know
> if anyone cares about power7)?

 The problem is I don't think I can eliminate the build time check
 (although I would very much like to) because I need:
 1. To pass a special flag to the GCC compiler: -mpower8-vector
 2. To define _ARCH_PWR8 so GCC will include the newer intrinsic

 Without those two things, I won't be able to use vec_vgbbd which I
 need to implement the _mm_movemask_epi8 efficiently, and without that,
 all this patch series can be thrown out the window. The emulation of
 _mm_movemask_epi8 using regular instructions is just horrible.

 You are correct that once you build a binary with this flag on power8
 machine, that binary won't run on power7 machine. You get "cannot
 execute binary file"

 Unfortunately, I don't see a way around this because even if I
 condition the use of vec_vgbbd on a runtime check/define, the library
 still won't be executable because it was built with -mpower8-vector.

 Having said that, because I *assume* IBM right now mostly cares about
 Linux running on POWER8 with little-endian, I think it is a fair
 compromise.
>>>
>>> Note I don't have anything against a build time check. My concern here
>>> is something along the lines of unsuspecting distros shipping binaries
>>> which won't work, as it looks to me like this will get picked up
>>> automatically. That is different to how for instance sse41 is handled.
>>> That is I believe this should only get enabled if someone has specified
>>> some -mcpu=power8 or whatever flag explicitly somewhere already.
>>>
>>> Roland
>>
>> I understand and I share your concern. Maybe we should add
>> "--disable-pwr8-inst" to mesa's configure ? if that flag is given to
>> configure, it would disable the optimization code (won't add
>> _ARCH_PWR8 to defines and won't add -mpower8-vector to gcc flags).
>>
>> What do you think ?
> If the generated code with all automatically picked up compile options
> really doesn't run on power7 just because of this, I think it would be
> nicer if this were an explicit enable.
>
> Roland
>

So the problem is a bit worse than that and requires a harsher solution.
Apparently, when that compiler flag (power8-vector) is given to GCC,
GCC uses POWER8-only instructions in other places as well! What I have
seen so far, is that it uses such instructions in the implementation
of exp2() and/or log2() (in f.cpp) and also saw it in
__glXInitializeVisualConfigFromTags(). The instructions used are not
vector instructions, but floating point instructions, which were added
only in PowerISA 2.07

Therefore, I think that for now, I will limit the entire optimization
code to POWER8 *and* Little-Endian. Because ppc64le packages can
*only* run on POWER8 systems, and because you can't transfer binaries
between LE and BE machines, this workaround eliminates the danger of
crashing on "illegal instruction". In addition, there is no more need
for runtime checks.

I hope you agree that with this change, it is better to enable the
power8-vector by default when building on POWER8 machine installed
with Linux LE. For all other archs it will be disabled by default.

I will try to contact IBM GCC devs to see how we can overcome this
problem (or if they even care) so I can expand these optimizations to
BE as well.

I will send the revised patches shortly.

   Oded

>
>
>
>>
>> Oded
>>
>>>

 Oded

> So far we didn't bother with that for SSE
> but it has to be said SSE2 is a really low bar (and the manual assembly
> stuff doesn't use anything more advanced, even though clearly things
> like the emulated mm_mullo_epi32 are suboptimal if your cpu supports
> sse41). And even then on non-x86 you actually might not get
> PIPE_ARCH_SSE if you didn't set gcc's compile flags accordingly.
>
> Roland
>
>
> Am 29.12.2015 um 17:12 schrieb Oded Gabbay:
>> To determine if we could use special POWER8 assembly directives, we first
>> need to detect whether we are running on POWER8 architecture. This patch
>> adds this detection to configure.ac and adds the necessary compilation
>> flags accordingly.
>>
>> Signed-off-by: Oded Gabbay 
>> ---
>>  configure.ac | 30 ++
>>  1 file changed, 30 insertions(+)
>>
>> diff --git a/configure.ac b/configur

[Mesa-dev] [PATCH v2 4/5] llvmpipe: Optimize BUILD_MASK(_LINEAR) for POWER8

2016-01-03 Thread Oded Gabbay
This patch converts the SSE-optimized build_mask_32() and
build_mask_linear_32() to VMX/VSX.

I measured the results on POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.

  FPS/Score
  NameBefore AfterDelta

glmark2 (score)   139.8  142.72.07%

openarena and xonotic didn't show a significant (more than 1%)
difference.

v2: Make sure code is built only on POWER8 LE machine

Signed-off-by: Oded Gabbay 
---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 150 +
 1 file changed, 110 insertions(+), 40 deletions(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c 
b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index c9b9221..09a182a 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -133,36 +133,8 @@ lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
lp_rast_triangle_4(task, arg2);
 }
 
-#if !defined(PIPE_ARCH_SSE)
+#if defined(PIPE_ARCH_SSE)
 
-void
-lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
- const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<3)-1;
-   lp_rast_triangle_32_3(task, arg2);
-}
-
-void
-lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
- const union lp_rast_cmd_arg arg)
-{
-   union lp_rast_cmd_arg arg2;
-   arg2.triangle.tri = arg.triangle.tri;
-   arg2.triangle.plane_mask = (1<<4)-1;
-   lp_rast_triangle_32_4(task, arg2);
-}
-
-void
-lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
-  const union lp_rast_cmd_arg arg)
-{
-   lp_rast_triangle_32_3_16(task, arg);
-}
-
-#else
 #include 
 #include "util/u_sse.h"
 
@@ -265,12 +237,6 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 #define NR_PLANES 3
 
-
-
-
-
-
-
 void
 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
   const union lp_rast_cmd_arg arg)
@@ -381,10 +347,6 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
0x & ~out[i].mask);
 }
 
-
-
-
-
 void
 lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
  const union lp_rast_cmd_arg arg)
@@ -471,6 +433,114 @@ lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
 }
 
 #undef NR_PLANES
+
+#else
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#include 
+#include "util/u_pwr8.h"
+
+static inline void
+build_masks_32(int c,
+   int cdiff,
+   int dcdx,
+   int dcdy,
+   unsigned *outmask,
+   unsigned *partmask)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+*/
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   {
+  __m128i cstep01, cstep23, result;
+
+  cstep01 = vec_packs_epi32(cstep0, cstep1);
+  cstep23 = vec_packs_epi32(cstep2, cstep3);
+  result = vec_packs_epi16(cstep01, cstep23);
+
+  *outmask |= vec_movemask_epi8(result);
+   }
+
+
+   {
+  __m128i cio4 = (__m128i) vec_splats(cdiff);
+  __m128i cstep01, cstep23, result;
+
+  cstep0 = vec_add_epi32(cstep0, cio4);
+  cstep1 = vec_add_epi32(cstep1, cio4);
+  cstep2 = vec_add_epi32(cstep2, cio4);
+  cstep3 = vec_add_epi32(cstep3, cio4);
+
+  cstep01 = vec_packs_epi32(cstep0, cstep1);
+  cstep23 = vec_packs_epi32(cstep2, cstep3);
+  result = vec_packs_epi16(cstep01, cstep23);
+
+  *partmask |= vec_movemask_epi8(result);
+   }
+}
+
+static inline unsigned
+build_mask_linear_32(int c, int dcdx, int dcdy)
+{
+   __m128i cstep0 = vec_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
+   __m128i xdcdy = (__m128i) vec_splats(dcdy);
+
+   /* Get values across the quad
+*/
+   __m128i cstep1 = vec_add_epi32(cstep0, xdcdy);
+   __m128i cstep2 = vec_add_epi32(cstep1, xdcdy);
+   __m128i cstep3 = vec_add_epi32(cstep2, xdcdy);
+
+   /* pack pairs of results into epi16
+*/
+   __m128i cstep01 = vec_packs_epi32(cstep0, cstep1);
+   __m128i cstep23 = vec_packs_epi32(cstep2, cstep3);
+
+   /* pack into epi8, preserving sign bits
+*/
+   __m128i result = vec_packs_epi16(cstep01, cstep23);
+
+   /* extract sign bits to create mask
+*/
+   return vec_movemask_epi8(result);
+}
+
+#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+ const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<3)-1;
+   lp_rast_triangle_32_3(task, arg2);
+}
+
+void
+lp_rast_triangle_32_4_16(struct lp_rasterizer_task *task,
+ const union lp

[Mesa-dev] [PATCH v2 0/5] Optimizing llvmpipe for POWER8 architecture

2016-01-03 Thread Oded Gabbay
Hi,

Here is the 2nd version of the patch series. The main change is that the new 
code is limited to POWER8 Little-Endian machines, due to a special compiler 
flag (power8-vector) that must be turned on but creates code that can't run 
on POWER7 machines. 

As only POWER8 has an LE mode (ppc64le), the code can be enclosed with:
#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)

and because an LE binary can't be run on BE machine anyway, we can safely 
enable this code by default for ppc64le architecture.

A few more changes are detailed in each commit message.

Thanks,

  - Oded

Oded Gabbay (5):
  configure.ac: Detect if running on POWER8 arch
  llvmpipe: add POWER8 portability file - u_pwr8.h
  llvmpipe: Optimize do_triangle_ccw for POWER8
  llvmpipe: Optimize BUILD_MASK(_LINEAR) for POWER8
  llvmpipe: Optimize lp_rast_triangle_32_3_16 for POWER8

 configure.ac|  55 +
 src/gallium/auxiliary/util/u_pwr8.h | 310 
 src/gallium/drivers/llvmpipe/lp_rast_tri.c  | 290 ++
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 100 +
 4 files changed, 715 insertions(+), 40 deletions(-)
 create mode 100644 src/gallium/auxiliary/util/u_pwr8.h

-- 
2.5.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 2/5] llvmpipe: add POWER8 portability file - u_pwr8.h

2016-01-03 Thread Oded Gabbay
This file provides a portability layer that will make it easier to convert
SSE-based functions to VMX/VSX-based functions.

All the functions implemented in this file are prefixed using "vec_".
Therefore, when converting from SSE-based function, one needs to simply
replace the "_mm_" prefix of the SSE function being called to "vec_".

Having said that, not all functions could be converted as such, due to the
differences between the architectures. So, when such a direct conversion
would have hurt performance, I preferred to implement a more ad-hoc
solution. For example, converting the _mm_shuffle_epi32 needed to be done
using ad-hoc masks instead of a generic function.

All the functions in this file support both little-endian and big-endian
but currently the file is built only on POWER8 LE machine.

All of the functions are implemented using the Altivec/VMX intrinsics,
except one where I needed to use inline assembly (due to missing
intrinsic).

v2:
- Use vec_vgbbd instead of __builtin_vec_vgbbd
- Add an aligned load function
- Don't use typeof()
- Make file build only on POWER8 LE machine

Signed-off-by: Oded Gabbay 
---
 src/gallium/auxiliary/util/u_pwr8.h | 310 
 1 file changed, 310 insertions(+)
 create mode 100644 src/gallium/auxiliary/util/u_pwr8.h

diff --git a/src/gallium/auxiliary/util/u_pwr8.h 
b/src/gallium/auxiliary/util/u_pwr8.h
new file mode 100644
index 000..1eca6d6
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_pwr8.h
@@ -0,0 +1,310 @@
+/*
+ * Copyright 2015 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Oded Gabbay 
+ */
+
+/**
+ * @file
+ * POWER8 intrinsics portability header.
+ *
+ */
+
+#ifndef U_PWR8_H_
+#define U_PWR8_H_
+
+#if defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+
+#define VECTOR_ALIGN_16 __attribute__ ((__aligned__ (16)))
+
+typedef VECTOR_ALIGN_16 vector unsigned char __m128i;
+
+typedef VECTOR_ALIGN_16 union m128i {
+   __m128i m128i;
+   vector signed int m128si;
+   vector unsigned int m128ui;
+   ubyte ub[16];
+   ushort us[8];
+   int i[4];
+   uint ui[4];
+} __m128i_union;
+
+static inline __m128i
+vec_set_epi32 (int i3, int i2, int i1, int i0)
+{
+   __m128i_union vdst;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vdst.i[0] = i0;
+   vdst.i[1] = i1;
+   vdst.i[2] = i2;
+   vdst.i[3] = i3;
+#else
+   vdst.i[3] = i0;
+   vdst.i[2] = i1;
+   vdst.i[1] = i2;
+   vdst.i[0] = i3;
+#endif
+
+   return (__m128i) vdst.m128si;
+}
+
+static inline __m128i
+vec_setr_epi32 (int i0, int i1, int i2, int i3)
+{
+  return vec_set_epi32 (i3, i2, i1, i0);
+}
+
+static inline __m128i
+vec_unpacklo_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+  { 0,  1,  2,  3, 16, 17, 18, 19,  4,  5,  6,  7, 20, 21, 22, 23};
+#else
+  {24, 25, 26, 27,  8,  9, 10, 11, 28, 29, 30, 31, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi32 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+  { 8,  9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31};
+#else
+  {16, 17, 18, 19,  0,  1,  2,  3, 20, 21, 22, 23,  4,  5,  6,  7};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpacklo_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+  { 0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23};
+#else
+  {24, 25, 26, 27, 28, 29, 30, 31,  8,  9, 10, 11, 12, 13, 14, 15};
+#endif
+
+   return vec_perm (even, odd, perm_mask);
+}
+
+static inline __m128i
+vec_unpackhi_epi64 (__m128i even, __m128i odd)
+{
+   static const __m128i perm_mask =
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+  { 8,  9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+#else
+  {16, 17, 18, 19, 20, 

[Mesa-dev] [PATCH v2 3/5] llvmpipe: Optimize do_triangle_ccw for POWER8

2016-01-03 Thread Oded Gabbay
This patch converts the SSE optimization done in do_triangle_ccw to
VMX/VSX.

I measured the results on POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.

  FPS/Score
  NameBefore AfterDelta

glmark2 (score)   136.6  139.82.34%
openarena 16.14  16.351.30%
xonotic   4.655  4.7071.11%

v2:

- Convert loads to use aligned loads
- Make sure code is built only on POWER8 LE machine

Signed-off-by: Oded Gabbay 
---
 src/gallium/drivers/llvmpipe/lp_setup_tri.c | 100 
 1 file changed, 100 insertions(+)

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c 
b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index b1671dd..0ff10a2 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -46,6 +46,9 @@
 
 #if defined(PIPE_ARCH_SSE)
 #include 
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+#include 
+#include "util/u_pwr8.h"
 #endif
 
 static inline int
@@ -462,6 +465,103 @@ do_triangle_ccw(struct lp_setup_context *setup,
   STORE_PLANE(plane[2], p2);
 #undef STORE_PLANE
} else
+#elif defined(_ARCH_PWR8) && defined(PIPE_ARCH_LITTLE_ENDIAN)
+   /*
+* XXX this code is effectively disabled for all practical purposes,
+* as the allowed fb size is tiny if FIXED_ORDER is 8.
+*/
+   if (setup->fb.width <= MAX_FIXED_LENGTH32 &&
+   setup->fb.height <= MAX_FIXED_LENGTH32 &&
+   (bbox.x1 - bbox.x0) <= MAX_FIXED_LENGTH32 &&
+   (bbox.y1 - bbox.y0) <= MAX_FIXED_LENGTH32) {
+  unsigned int bottom_edge;
+  __m128i vertx, verty;
+  __m128i shufx, shufy;
+  __m128i dcdx, dcdy, c;
+  __m128i unused;
+  __m128i dcdx_neg_mask;
+  __m128i dcdy_neg_mask;
+  __m128i dcdx_zero_mask;
+  __m128i top_left_flag;
+  __m128i c_inc_mask, c_inc;
+  __m128i eo, p0, p1, p2;
+  __m128i_union vshuf_mask;
+  __m128i zero = vec_splats((unsigned char) 0);
+  PIPE_ALIGN_VAR(16) int32_t temp_vec[4];
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+  vshuf_mask.i[0] = 0x07060504;
+  vshuf_mask.i[1] = 0x0B0A0908;
+  vshuf_mask.i[2] = 0x03020100;
+  vshuf_mask.i[3] = 0x0F0E0D0C;
+#else
+  vshuf_mask.i[0] = 0x00010203;
+  vshuf_mask.i[1] = 0x0C0D0E0F;
+  vshuf_mask.i[2] = 0x04050607;
+  vshuf_mask.i[3] = 0x08090A0B;
+#endif
+
+  /* vertex x coords */
+  vertx = vec_load_si128((const uint32_t *) position->x);
+  /* vertex y coords */
+  verty = vec_load_si128((const uint32_t *) position->y);
+
+  shufx = vec_perm (vertx, vertx, vshuf_mask.m128i);
+  shufy = vec_perm (verty, verty, vshuf_mask.m128i);
+
+  dcdx = vec_sub_epi32(verty, shufy);
+  dcdy = vec_sub_epi32(vertx, shufx);
+
+  dcdx_neg_mask = vec_srai_epi32(dcdx, 31);
+  dcdx_zero_mask = vec_cmpeq_epi32(dcdx, zero);
+  dcdy_neg_mask = vec_srai_epi32(dcdy, 31);
+
+  bottom_edge = (setup->bottom_edge_rule == 0) ? ~0 : 0;
+  top_left_flag = (__m128i) vec_splats(bottom_edge);
+
+  c_inc_mask = vec_or(dcdx_neg_mask,
+vec_and(dcdx_zero_mask,
+  vec_xor(dcdy_neg_mask,
+top_left_flag)));
+
+  c_inc = vec_srli_epi32(c_inc_mask, 31);
+
+  c = vec_sub_epi32(vec_mullo_epi32(dcdx, vertx),
+vec_mullo_epi32(dcdy, verty));
+
+  c = vec_add_epi32(c, c_inc);
+
+  /* Scale up to match c:
+   */
+  dcdx = vec_slli_epi32(dcdx, FIXED_ORDER);
+  dcdy = vec_slli_epi32(dcdy, FIXED_ORDER);
+
+  /* Calculate trivial reject values:
+   */
+  eo = vec_sub_epi32(vec_andc(dcdy_neg_mask, dcdy),
+ vec_and(dcdx_neg_mask, dcdx));
+
+  /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
+  /* Pointless transpose which gets undone immediately in
+   * rasterization:
+   */
+  transpose4_epi32(&c, &dcdx, &dcdy, &eo,
+   &p0, &p1, &p2, &unused);
+
+#define STORE_PLANE(plane, vec) do {  \
+ vec_store_si128((uint32_t *)&temp_vec, vec); \
+ plane.c= (int64_t)temp_vec[0];   \
+ plane.dcdx = temp_vec[1];\
+ plane.dcdy = temp_vec[2];\
+ plane.eo   = temp_vec[3];\
+  } while(0)
+
+  STORE_PLANE(plane[0], p0);
+  STORE_PLANE(plane[1], p1);
+  STORE_PLANE(plane[2], p2);
+#undef STORE_PLANE
+   } else
 #endif
{
   int i;
-- 
2.5.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v2 5/5] llvmpipe: Optimize lp_rast_triangle_32_3_16 for POWER8

2016-01-03 Thread Oded Gabbay
This patch converts the SSE-optimized lp_rast_triangle_32_3_16()
to VMX/VSX.

I measured the results on POWER8 machine with 32 cores at 3.4GHz and
16GB of RAM.

  FPS/Score
 NameBefore AfterDelta

openarena16.35  16.7 2.14%
xonotic  4.707  4.97 5.57%

glmark2 didn't show a significant (more than 1%) difference.

v2: Make sure code is built only on POWER8 LE machine

Signed-off-by: Oded Gabbay 
---
 src/gallium/drivers/llvmpipe/lp_rast_tri.c | 142 -
 1 file changed, 141 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c 
b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 09a182a..232c859 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -512,7 +512,145 @@ build_mask_linear_32(int c, int dcdx, int dcdy)
return vec_movemask_epi8(result);
 }
 
-#endif /* _ARCH_PWR8 && PIPE_ARCH_LITTLE_ENDIAN */
+static inline __m128i
+lp_plane_to_m128i(const struct lp_rast_plane *plane)
+{
+   return vec_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
+ (int32_t)plane->dcdy, (int32_t)plane->eo);
+}
+
+#define NR_PLANES 3
+
+void
+lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
+  const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = vec_splats((unsigned char) 0);
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+
+   __m128i span_0;/* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;/* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;/* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+
+   __m128i vshuf_mask0;
+   __m128i vshuf_mask1;
+   __m128i vshuf_mask2;
+
+#ifdef PIPE_ARCH_LITTLE_ENDIAN
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
+#else
+   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
+   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
+   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
+#endif
+
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+&c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+*/
+   dcdx = vec_sub_epi32(zero, dcdx);
+
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
+   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
+   rej4 = vec_slli_epi32(rej4, 2);
+
+   /*
+* Adjust so we can just check the sign bit (< 0 comparison),
+* instead of having to do a less efficient <= 0 comparison
+*/
+   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
+   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));
+
+   dcdx2 = vec_add_epi32(dcdx, dcdx);
+   dcdx3 = vec_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+&span_0, &span_1, &span_2, &unused);
+
+   for (i = 0; i < 4; i++) {
+  __m128i cx = c;
+
+  for (j = 0; j < 4; j++) {
+ __m128i c4rej = vec_add_epi32(cx, rej4);
+ __m128i rej_masks = vec_srai_epi32(c4rej, 31);
+
+ /* if (is_zero(rej_masks)) */
+ if (vec_movemask_epi8(rej_masks) == 0) {
+__m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), 
span_0);
+__m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), 
span_1);
+__m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), 
span_2);
+
+__m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);
+
+__m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, 
vshuf_mask0));
+__m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, 
vshuf_mask1));
+__m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, 
vshuf_mask2));
+
+__m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
+__m128i c_01 = vec_packs_epi32(c_0, c_1);
+
+__m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, 
vshuf_mask0));
+__m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, 
vshuf_mask1));
+__m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, 
vshuf_mask2));
+
+__m128i c_2 = vec_or(vec_or(c0_

[Mesa-dev] [PATCH v2 1/5] configure.ac: Detect if running on POWER8 arch

2016-01-03 Thread Oded Gabbay
To determine if we could use special POWER8 assembly directives, we first
need to detect whether we are running on POWER8 architecture. This patch
adds this detection to configure.ac and adds the necessary compilation
flags accordingly.

v2:

- Add option to disable POWER8 instructions generation
- Detect whether building on BE or LE machine and build with
  -mpower8-vector only on LE machine
- Make the printed messages more standard

Signed-off-by: Oded Gabbay 
---
 configure.ac | 55 +++
 1 file changed, 55 insertions(+)

diff --git a/configure.ac b/configure.ac
index f8a70be..b1c1d7d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -396,6 +396,61 @@ fi
 AM_CONDITIONAL([SSE41_SUPPORTED], [test x$SSE41_SUPPORTED = x1])
 AC_SUBST([SSE41_CFLAGS], $SSE41_CFLAGS)
 
+dnl Check for Endianness
+AC_C_BIGENDIAN(
+   little_endian=no,
+   little_endian=yes,
+   little_endian=no,
+   little_endian=no
+)
+
+dnl Check for POWER8 Architecture
+PWR8_CFLAGS="-mpower8-vector"
+have_pwr8_intrinsics=no
+AC_MSG_CHECKING(whether gcc supports -mpower8-vector)
+save_CFLAGS=$CFLAGS
+CFLAGS="$PWR8_CFLAGS $CFLAGS"
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 
8))
+#error "Need GCC >= 4.8 for sane POWER8 support"
+#endif
+#include <altivec.h>
+int main () {
+vector unsigned char r;
+vector unsigned int v = vec_splat_u32 (1);
+r = __builtin_vec_vgbbd ((vector unsigned char) v);
+return 0;
+}]])], have_pwr8_intrinsics=yes)
+CFLAGS=$save_CFLAGS
+
+AC_ARG_ENABLE(pwr8,
+   [AC_HELP_STRING([--disable-pwr8-inst],
+   [disable POWER8-specific instructions])],
+   [enable_pwr8=$enableval], [enable_pwr8=auto])
+
+if test "x$enable_pwr8" = xno ; then
+   have_pwr8_intrinsics=disabled
+fi
+
+if test $have_pwr8_intrinsics = yes && test $little_endian = yes ; then
+   DEFINES="$DEFINES -D_ARCH_PWR8"
+   CXXFLAGS="$CXXFLAGS $PWR8_CFLAGS"
+   CFLAGS="$CFLAGS $PWR8_CFLAGS"
+else
+   PWR8_CFLAGS=
+fi
+
+AC_MSG_RESULT($have_pwr8_intrinsics)
+if test "x$enable_pwr8" = xyes && test $have_pwr8_intrinsics = no ; then
+   AC_MSG_ERROR([POWER8 compiler support not detected])
+fi
+
+if test $have_pwr8_intrinsics = yes && test $little_endian = no ; then
+   AC_MSG_WARN([POWER8 optimization is enabled only on POWER8 Little-Endian])
+fi
+
+AC_SUBST([PWR8_CFLAGS], $PWR8_CFLAGS)
+
 dnl Can't have static and shared libraries, default to static if user
 dnl explicitly requested. If both disabled, set to static since shared
 dnl was explicitly requested.
-- 
2.5.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/6] i965/gen8+: Invalidate color calc state when switching to the GPGPU pipeline.

2016-01-03 Thread Matt Turner
On Sun, Jan 3, 2016 at 1:48 AM, Francisco Jerez  wrote:
> This hardware bug can cause a hang on context restore while the
> current pipeline is set to GPGPU (BDWGFX HSD 1909593).  In addition to
> clearing the valid bit, mark the CC state as dirty to make sure that
> the CC indirect state pointer is re-emitted when we switch back to the
> 3D pipeline.
> ---
>  src/mesa/drivers/dri/i965/brw_misc_state.c | 20 
>  1 file changed, 20 insertions(+)
>
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c 
> b/src/mesa/drivers/dri/i965/brw_misc_state.c
> index cf6ba5b..7d53d18 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -868,6 +868,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum 
> brw_pipeline pipeline)
> const uint32_t _3DSTATE_PIPELINE_SELECT =
>is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45;
>
> +   if (brw->gen >= 8) {
> +  /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
> +   * PIPELINE_SELECT [DevBWR+]":
> +   *
> +   *   Project: BDW, SKL

I think we should restrict this block to brw->gen == 8 || brw->gen ==
9 in that case?

I can't find evidence that the workaround applies to later hardware
(and in fact the page cited has a different workaround for a later
generation).
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 0/6] i965: GPGPU/3D pipeline switching fixes.

2016-01-03 Thread Matt Turner
On Sun, Jan 3, 2016 at 1:47 AM, Francisco Jerez  wrote:
> The PIPELINE_SELECT command has a number of awkward restrictions we
> don't currently take into account while switching between the GPGPU
> and 3D pipeline, which in some cases can lead to corruption or hangs.
> This series should implement all workarounds mentioned in the hardware
> spec ("BXML » GT » MI » vol1a GPU Overview » [Instruction]
> PIPELINE_SELECT [DevBWR+]") that seem to be relevant to us.

I had a question about patch 2, and with that sorted out the series is

Reviewed-by: Matt Turner 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

Bug ID: 93570
   Summary: the image of llvmpipe has a low quality on arm (with
too many points on it)
   Product: Mesa
   Version: 11.0
  Hardware: ARM
OS: Linux (All)
Status: NEW
  Severity: normal
  Priority: medium
 Component: Drivers/X11
  Assignee: mesa-dev@lists.freedesktop.org
  Reporter: icenowy...@gmail.com
QA Contact: mesa-dev@lists.freedesktop.org

Created attachment 120782
  --> https://bugs.freedesktop.org/attachment.cgi?id=120782&action=edit
The result on arm, the points are shown

I've built a llvmpipe-enabled mesa on my Allwinner A33 (Quad Cortex-A7) device.

I tried to run glmark2 on it, but I found that the image has a very very low
quality.

On my laptop (i5-3230m), when I forced LIBGL_ALWAYS_SOFTWARE, this problem does
not occur.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

--- Comment #1 from Icenowy Zheng  ---
The attachment is a JPEG, shot with my mobile phone

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H

2016-01-03 Thread Roland Scheidegger
Am 03.01.2016 um 01:37 schrieb Ilia Mirkin:
> Signed-off-by: Ilia Mirkin 
> ---
>  src/gallium/docs/source/tgsi.rst | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/src/gallium/docs/source/tgsi.rst 
> b/src/gallium/docs/source/tgsi.rst
> index 955ece8..f69998f 100644
> --- a/src/gallium/docs/source/tgsi.rst
> +++ b/src/gallium/docs/source/tgsi.rst
> @@ -458,7 +458,9 @@ while DDY is allowed to be the same for the entire 2x2 
> quad.
>  
>  .. opcode:: PK2H - Pack Two 16-bit Floats
>  
> -  TBD
> +.. math::
> +
> +  dst.x = f32\_to\_f16(src.x) | f32\_to\_f16(src.y) << 16
This doesn't quite match the tgsi info description (which says that the
result is
replicated). If you don't want channel replication probably should make
that CHAN
there instead.



>  
>  .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars
> @@ -615,7 +617,11 @@ This instruction replicates its result.
>  
>  .. opcode:: UP2H - Unpack Two 16-Bit Floats
>  
> -  TBD
> +.. math::
> +
> +  dst.x = f16\_to\_f32(src0.x \& 0xffff)
> +
> +  dst.y = f16\_to\_f32(src0.x >> 16)
>
I'm certainly ok with that, albeit (just like PK2H unless you do
replication) it's not what the original source for this opcode does
(which would have been NV_fragment_program).

For the series (with the first point addressed either way, though a tgsi
exec implementation, which should be trivial, wouldn't hurt either)
Reviewed-by: Roland Scheidegger 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

--- Comment #2 from Icenowy Zheng  ---
icenowy [ ~ ] ! glmark2 -b build
libGL error: unable to load driver: mali_drm_dri.so
libGL error: driver pointer missing
libGL error: failed to load driver: mali_drm
** GLX does not support GLX_EXT_swap_control or GLX_MESA_swap_control!
** Failed to set swap interval. Results may be bounded above by refresh rate.
===
glmark2 2014.03
===
OpenGL Information
GL_VENDOR: VMware, Inc.
GL_RENDERER:   Gallium 0.4 on llvmpipe (LLVM 3.7, 128 bits)
GL_VERSION:3.0 Mesa 11.1.0
===
** GLX does not support GLX_EXT_swap_control or GLX_MESA_swap_control!
** Failed to set swap interval. Results may be bounded above by refresh rate.
[build] : FPS: 13 FrameTime: 76.923 ms
===
  glmark2 Score: 13 
===


Here's some log

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] st/mesa: fix parameter names for tesseval/tessctrl prototypes

2016-01-03 Thread Samuel Pitoiset
Cc: Ilia Mirkin 
Signed-off-by: Samuel Pitoiset 
---
 src/mesa/state_tracker/st_program.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_program.h 
b/src/mesa/state_tracker/st_program.h
index a8571f0..a745315 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -405,12 +405,12 @@ st_get_gp_variant(struct st_context *st,
 
 extern struct st_tcp_variant *
 st_get_tcp_variant(struct st_context *st,
-   struct st_tessctrl_program *stgp,
+   struct st_tessctrl_program *sttcp,
const struct st_tcp_variant_key *key);
 
 extern struct st_tep_variant *
 st_get_tep_variant(struct st_context *st,
-   struct st_tesseval_program *stgp,
+   struct st_tesseval_program *sttep,
const struct st_tep_variant_key *key);
 
 extern void
@@ -427,11 +427,11 @@ st_release_gp_variants(struct st_context *st,
 
 extern void
 st_release_tcp_variants(struct st_context *st,
-struct st_tessctrl_program *stgp);
+struct st_tessctrl_program *sttcp);
 
 extern void
 st_release_tep_variants(struct st_context *st,
-struct st_tesseval_program *stgp);
+struct st_tesseval_program *sttep);
 
 extern void
 st_destroy_program_variants(struct st_context *st);
-- 
2.6.4

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H

2016-01-03 Thread Ilia Mirkin
On Sun, Jan 3, 2016 at 12:33 PM, Roland Scheidegger  wrote:
> Am 03.01.2016 um 01:37 schrieb Ilia Mirkin:
>> Signed-off-by: Ilia Mirkin 
>> ---
>>  src/gallium/docs/source/tgsi.rst | 10 --
>>  1 file changed, 8 insertions(+), 2 deletions(-)
>>
>> diff --git a/src/gallium/docs/source/tgsi.rst 
>> b/src/gallium/docs/source/tgsi.rst
>> index 955ece8..f69998f 100644
>> --- a/src/gallium/docs/source/tgsi.rst
>> +++ b/src/gallium/docs/source/tgsi.rst
>> @@ -458,7 +458,9 @@ while DDY is allowed to be the same for the entire 2x2 
>> quad.
>>
>>  .. opcode:: PK2H - Pack Two 16-bit Floats
>>
>> -  TBD
>> +.. math::
>> +
>> +  dst.x = f32\_to\_f16(src.x) | f32\_to\_f16(src.y) << 16
> This doesn't quite match the tgsi info description (which says that the
> result is
> replicated). If you don't want channel replication probably should make
> that CHAN
> there instead.

I'll add the replication to the docs. Looks like NV_fragment_program
also wanted this:

  tmp0 = VectorLoad(op0);
  /* result obtained by combining raw bits of tmp0.x, tmp0.y */
  result.x = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16);
  result.y = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16);
  result.z = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16);
  result.w = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16);

But looks like it's just packing, not actually converting. And it's
unclear whether UP2H is converting or not... let's assume that they do
the conversions or else this is going to be useless.

>
>
>
>>
>>  .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars
>> @@ -615,7 +617,11 @@ This instruction replicates its result.
>>
>>  .. opcode:: UP2H - Unpack Two 16-Bit Floats
>>
>> -  TBD
>> +.. math::
>> +
>> +  dst.x = f16\_to\_f32(src0.x \& 0x)
>> +
>> +  dst.y = f16\_to\_f32(src0.x >> 16)
>>
> I'm certainly ok with that, albeit (just like PK2H unless you do
> replication) it's not what the original source for this opcode does
> (which would have been NV_fragment_program).

  tmp = ScalarLoad(op0);
  result.x = (fp16) (RawBits(tmp) & 0xFFFF);
  result.y = (fp16) ((RawBits(tmp) >> 16) & 0xFFFF);
  result.z = (fp16) (RawBits(tmp) & 0xFFFF);
  result.w = (fp16) ((RawBits(tmp) >> 16) & 0xFFFF);

Happy to add the .zw = .xy bit here as well. I was previously not
aware that these ops came from NV_fragment_program, and instead
assumed that they came from some incomplete attempt to do...
something. (I guess it was for implementing NV_fragment_program ;) )

>
> For the series (with the first point addressed either way,though a tgsi
> exec implementation which should be trivial wouldn't hurt neither)
> Reviewed-by: Roland Scheidegger 

Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I
won't be able to enable the cap since it will still use gallivm by
default for vertices. I have a gallivm implementation as well, but it
hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or
another, but it must have been on another box with a more recent LLVM.

  -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] st/mesa: fix parameter names for tesseval/tessctrl prototypes

2016-01-03 Thread Ilia Mirkin
Reviewed-by: Ilia Mirkin 

Can you guess where I copy-pasted these from? :)

On Sun, Jan 3, 2016 at 12:47 PM, Samuel Pitoiset
 wrote:
> Cc: Ilia Mirkin 
> Signed-off-by: Samuel Pitoiset 
> ---
>  src/mesa/state_tracker/st_program.h | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/src/mesa/state_tracker/st_program.h 
> b/src/mesa/state_tracker/st_program.h
> index a8571f0..a745315 100644
> --- a/src/mesa/state_tracker/st_program.h
> +++ b/src/mesa/state_tracker/st_program.h
> @@ -405,12 +405,12 @@ st_get_gp_variant(struct st_context *st,
>
>  extern struct st_tcp_variant *
>  st_get_tcp_variant(struct st_context *st,
> -   struct st_tessctrl_program *stgp,
> +   struct st_tessctrl_program *sttcp,
> const struct st_tcp_variant_key *key);
>
>  extern struct st_tep_variant *
>  st_get_tep_variant(struct st_context *st,
> -   struct st_tesseval_program *stgp,
> +   struct st_tesseval_program *sttep,
> const struct st_tep_variant_key *key);
>
>  extern void
> @@ -427,11 +427,11 @@ st_release_gp_variants(struct st_context *st,
>
>  extern void
>  st_release_tcp_variants(struct st_context *st,
> -struct st_tessctrl_program *stgp);
> +struct st_tessctrl_program *sttcp);
>
>  extern void
>  st_release_tep_variants(struct st_context *st,
> -struct st_tesseval_program *stgp);
> +struct st_tesseval_program *sttep);
>
>  extern void
>  st_destroy_program_variants(struct st_context *st);
> --
> 2.6.4
>
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] st/mesa: fix parameter names for tesseval/tessctrl prototypes

2016-01-03 Thread Samuel Pitoiset



On 01/03/2016 07:03 PM, Ilia Mirkin wrote:

Reviewed-by: Ilia Mirkin 

Can you guess where I copy-pasted these from? :)


Two lines above? :-)



On Sun, Jan 3, 2016 at 12:47 PM, Samuel Pitoiset
 wrote:

Cc: Ilia Mirkin 
Signed-off-by: Samuel Pitoiset 
---
  src/mesa/state_tracker/st_program.h | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/state_tracker/st_program.h 
b/src/mesa/state_tracker/st_program.h
index a8571f0..a745315 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -405,12 +405,12 @@ st_get_gp_variant(struct st_context *st,

  extern struct st_tcp_variant *
  st_get_tcp_variant(struct st_context *st,
-   struct st_tessctrl_program *stgp,
+   struct st_tessctrl_program *sttcp,
 const struct st_tcp_variant_key *key);

  extern struct st_tep_variant *
  st_get_tep_variant(struct st_context *st,
-   struct st_tesseval_program *stgp,
+   struct st_tesseval_program *sttep,
 const struct st_tep_variant_key *key);

  extern void
@@ -427,11 +427,11 @@ st_release_gp_variants(struct st_context *st,

  extern void
  st_release_tcp_variants(struct st_context *st,
-struct st_tessctrl_program *stgp);
+struct st_tessctrl_program *sttcp);

  extern void
  st_release_tep_variants(struct st_context *st,
-struct st_tesseval_program *stgp);
+struct st_tesseval_program *sttep);

  extern void
  st_destroy_program_variants(struct st_context *st);
--
2.6.4


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/6] i965/gen8+: Invalidate color calc state when switching to the GPGPU pipeline.

2016-01-03 Thread Francisco Jerez
Matt Turner  writes:

> On Sun, Jan 3, 2016 at 1:48 AM, Francisco Jerez  wrote:
>> This hardware bug can cause a hang on context restore while the
>> current pipeline is set to GPGPU (BDWGFX HSD 1909593).  In addition to
>> clearing the valid bit, mark the CC state as dirty to make sure that
>> the CC indirect state pointer is re-emitted when we switch back to the
>> 3D pipeline.
>> ---
>>  src/mesa/drivers/dri/i965/brw_misc_state.c | 20 
>>  1 file changed, 20 insertions(+)
>>
>> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c 
>> b/src/mesa/drivers/dri/i965/brw_misc_state.c
>> index cf6ba5b..7d53d18 100644
>> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
>> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
>> @@ -868,6 +868,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum 
>> brw_pipeline pipeline)
>> const uint32_t _3DSTATE_PIPELINE_SELECT =
>>is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45;
>>
>> +   if (brw->gen >= 8) {
>> +  /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
>> +   * PIPELINE_SELECT [DevBWR+]":
>> +   *
>> +   *   Project: BDW, SKL
>
> I think we should restrict this block to brw->gen == 8 || brw->gen ==
> 9 in that case?
>
> I can't find evidence that the workaround applies to later hardware
> (and in fact the page cited has a different workaround for a later
> generation).

Yeah, Gen10 will need a different workaround but I wasn't sure we could
release the details already.  Anyway I've changed the above locally to
be limited to pre-Gen10 for now.

Thanks.


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

Ilia Mirkin  changed:

   What|Removed |Added

 Attachment #120782|text/plain  |image/jpeg
  mime type||

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/5] configure.ac: Detect if running on POWER8 arch

2016-01-03 Thread Roland Scheidegger
Am 03.01.2016 um 13:50 schrieb Oded Gabbay:
> On Thu, Dec 31, 2015 at 8:48 PM, Roland Scheidegger  
> wrote:
>> Am 31.12.2015 um 10:30 schrieb Oded Gabbay:
>>> On Wed, Dec 30, 2015 at 5:41 PM, Roland Scheidegger  
>>> wrote:
 Am 30.12.2015 um 10:46 schrieb Oded Gabbay:
> On Wed, Dec 30, 2015 at 1:11 AM, Roland Scheidegger  
> wrote:
>>
>> So, if I see that right, you will automatically generate binaries using
>> power8 instructions if compiled on power8 capable box, which then won't
>> run on boxes not supporting power8? Is that really what you want?
>> Maybe some runtime detection would be a good idea (though I don't know
>> if anyone cares about power7)?
>
> The problem is I don't think I can eliminate the build time check
> (although I would very much like to) because I need:
> 1. To pass a special flag to the GCC compiler: -mpower8-vector
> 2. To define _ARCH_PWR8 so GCC will include the newer intrinsic
>
> Without those two things, I won't be able to use vec_vgbbd which I
> need to implement the _mm_movemask_epi8 efficiently, and without that,
> all this patch series can be thrown out the window. The emulation of
> _mm_movemask_epi8 using regular instructions is just horrible.
>
> You are correct that once you build a binary with this flag on power8
> machine, that binary won't run on power7 machine. You get "cannot
> execute binary file"
>
> Unfortunately, I don't see a way around this because even if I
> condition the use of vec_vgbbd on a runtime check/define, the library
> still won't be executable because it was built with -mpower8-vector.
>
> Having said that, because I *assume* IBM right now mostly cares about
> Linux running on POWER8 with little-endian, I think it is a fair
> compromise.

 Note I don't have anything against a build time check. My concern here
 is something along the lines of unsuspecting distros shipping binaries
 which won't work, as it looks to me like this will get picked up
 automatically. That is different to how for instance sse41 is handled.
 That is I believe this should only get enabled if someone has specified
 some -mcpu=power8 or whatever flag explicitly somewhere already.

 Roland
>>>
>>> I understand and I share your concern. Maybe we should add
>>> "--disable-pwr8-inst" to mesa's configure ? if that flag is given to
>>> configure, it would disable the optimization code (won't add
>>> _ARCH_PWR8 to defines and won't add -mpower8-vector to gcc flags).
>>>
>>> What do you think ?
>> If the generated code with all automatically picked up compile options
>> really doesn't run on power7 just because of this, I think it would be
>> nicer if this were an explicit enable.
>>
>> Roland
>>
> 
> So the problem is a bit worse than that and requires a harsher solution.
> Apparently, when that compiler flag (power8-vector) is given to GCC,
> GCC uses POWER8-only instructions in other places as well! What I have
> seen so far, is that it uses such instructions in the implementation
> of exp2() and/or log2() (in f.cpp) and also saw it in
> __glXInitializeVisualConfigFromTags(). The instructions used are not
> vector instructions, but floating point instructions, which were added
> only in PowerISA 2.07
> 
> Therefore, I think that for now, I will limit the entire optimization
> code to POWER8 *and* Little-Endian. Because ppc64le packages can
> *only* run on POWER8 systems, and because you can't transfer binaries
> between LE and BE machines, this workaround eliminates the danger of
> crashing on "illegal instruction". In addition, there is no more need
> for runtime checks.
> 
> I hope you agree that with this change, it is better to enable the
> power8-vector by default when building on POWER8 machine installed
> with Linux LE. For all other archs it will be disabled by default.

Yes, that looks reasonable.

> I will try to contact IBM GCC devs to see how we can overcome this
> problem (or if they even care) so I can expand these optimizations to
> BE as well.
IIRC this is problematic for things like sse41 etc. as well, it is just
how gcc works. I think the typical workaround is to move code using
intrinsics which need special compile flags to their own file (or rather
compile unit), which you then can compile with those flags. (This
implies of course separate functions, that is you can't have any
run-time check plus the assembly in the same file or function.)
This is how mesa handles _mesa_streaming_load_memcpy for the i965 driver.

Roland


> 
> I will send the revised patches shortly.
> 
>Oded
> 
>>
>>
>>
>>>
>>> Oded
>>>

>
> Oded
>
>> So far we didn't bother with that for SSE
>> but it has to be said SSE2 is a really low bar (and the manual assembly
>> stuff doesn't use anything more advanced, even though clearly things
>> like the emulated mm_mullo_epi32 are suboptimal if your c

[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

--- Comment #3 from Ilia Mirkin  ---
A fairer comparison would be

LIBGL_ALWAYS_SOFTWARE=1 LP_NATIVE_VECTOR_WIDTH=128

Since I assume it will otherwise use 256-bit wide vectors on your intel CPU.
But it appears that the rendering is actually incorrect there. My personal
guess is that llvm on arm has some issues, but this is based purely on the fact
that you're seeing misrendering :)

It may be worthwhile to build mesa-git and llvm-svn and see if the issue
persists. (mesa-git should be able to build against the llvm head, while
released mesa probably won't.)

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/5] configure.ac: Detect if running on POWER8 arch

2016-01-03 Thread Oded Gabbay
On Sun, Jan 3, 2016 at 8:14 PM, Roland Scheidegger  wrote:
> Am 03.01.2016 um 13:50 schrieb Oded Gabbay:
>> On Thu, Dec 31, 2015 at 8:48 PM, Roland Scheidegger  
>> wrote:
>>> Am 31.12.2015 um 10:30 schrieb Oded Gabbay:
 On Wed, Dec 30, 2015 at 5:41 PM, Roland Scheidegger  
 wrote:
> Am 30.12.2015 um 10:46 schrieb Oded Gabbay:
>> On Wed, Dec 30, 2015 at 1:11 AM, Roland Scheidegger  
>> wrote:
>>>
>>> So, if I see that right, you will automatically generate binaries using
>>> power8 instructions if compiled on power8 capable box, which then won't
>>> run on boxes not supporting power8? Is that really what you want?
>>> Maybe some runtime detection would be a good idea (though I don't know
>>> if anyone cares about power7)?
>>
>> The problem is I don't think I can eliminate the build time check
>> (although I would very much like to) because I need:
>> 1. To pass a special flag to the GCC compiler: -mpower8-vector
>> 2. To define _ARCH_PWR8 so GCC will include the newer intrinsic
>>
>> Without those two things, I won't be able to use vec_vgbbd which I
>> need to implement the _mm_movemask_epi8 efficiently, and without that,
>> all this patch series can be thrown out the window. The emulation of
>> _mm_movemask_epi8 using regular instructions is just horrible.
>>
>> You are correct that once you build a binary with this flag on power8
>> machine, that binary won't run on power7 machine. You get "cannot
>> execute binary file"
>>
>> Unfortunately, I don't see a way around this because even if I
>> condition the use of vec_vgbbd on a runtime check/define, the library
>> still won't be executable because it was built with -mpower8-vector.
>>
>> Having said that, because I *assume* IBM right now mostly cares about
>> Linux running on POWER8 with little-endian, I think it is a fair
>> compromise.
>
> Note I don't have anything against a build time check. My concern here
> is something along the lines of unsuspecting distros shipping binaries
> which won't work, as it looks to me like this will get picked up
> automatically. That is different to how for instance sse41 is handled.
> That is I believe this should only get enabled if someone has specified
> some -mcpu=power8 or whatever flag explicitly somewhere already.
>
> Roland

 I understand and I share your concern. Maybe we should add
 "--disable-pwr8-inst" to mesa's configure ? if that flag is given to
 configure, it would disable the optimization code (won't add
 _ARCH_PWR8 to defines and won't add -mpower8-vector to gcc flags).

 What do you think ?
>>> If the generated code with all automatically picked up compile options
>>> really doesn't run on power7 just because of this, I think it would be
>>> nicer if this were an explicit enable.
>>>
>>> Roland
>>>
>>
>> So the problem is a bit worse than that and requires a harsher solution.
>> Apparently, when that compiler flag (power8-vector) is given to GCC,
>> GCC uses POWER8-only instructions in other places as well! What I have
>> seen so far, is that it uses such instructions in the implementation
>> of exp2() and/or log2() (in f.cpp) and also saw it in
>> __glXInitializeVisualConfigFromTags(). The instructions used are not
>> vector instructions, but floating point instructions, which were added
>> only in PowerISA 2.07
>>
>> Therefore, I think that for now, I will limit the entire optimization
>> code to POWER8 *and* Little-Endian. Because ppc64le packages can
>> *only* run on POWER8 systems, and because you can't transfer binaries
>> between LE and BE machines, this workaround eliminates the danger of
>> crashing on "illegal instruction". In addition, there is no more need
>> for runtime checks.
>>
>> I hope you agree that with this change, it is better to enable the
>> power8-vector by default when building on POWER8 machine installed
>> with Linux LE. For all other archs it will be disabled by default.
>
> Yes, that looks reasonable.

Thanks.

>
>> I will try to contact IBM GCC devs to see how we can overcome this
>> problem (or if they even care) so I can expand these optimizations to
>> BE as well.
> IIRC this is problematic for things like sse41 etc. as well, it is just
> how gcc works. I think the typical workaround is to move code using
> intrinsics which need special compile flags to their own file (or rather
> compile unit), which you then can compile with those flags. (This
> implies of course separate functions, that is you can't have any
> run-time check plus the assembly in the same file or function.)
> This is how mesa handles _mesa_streaming_load_memcpy for the i965 driver.
>
> Roland
>
Yeah, I imagined as much, but I didn't know this technique was already
in use in mesa.

I hate this fragmentation of code, and I think a better solution, at
the compiler level, is to have some kind of flag which te

[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

--- Comment #4 from Icenowy Zheng  ---
Of course I am not benchmarking.
But the image *does* misrender.

It will take me days of time to build a llvm and mesa combination, and may fail
(as my device has only 512MB RAM)

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H

2016-01-03 Thread Roland Scheidegger
Am 03.01.2016 um 19:02 schrieb Ilia Mirkin:
> On Sun, Jan 3, 2016 at 12:33 PM, Roland Scheidegger  
> wrote:
>> Am 03.01.2016 um 01:37 schrieb Ilia Mirkin:
>>> Signed-off-by: Ilia Mirkin 
>>> ---
>>>  src/gallium/docs/source/tgsi.rst | 10 --
>>>  1 file changed, 8 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/src/gallium/docs/source/tgsi.rst 
>>> b/src/gallium/docs/source/tgsi.rst
>>> index 955ece8..f69998f 100644
>>> --- a/src/gallium/docs/source/tgsi.rst
>>> +++ b/src/gallium/docs/source/tgsi.rst
>>> @@ -458,7 +458,9 @@ while DDY is allowed to be the same for the entire 2x2 
>>> quad.
>>>
>>>  .. opcode:: PK2H - Pack Two 16-bit Floats
>>>
>>> -  TBD
>>> +.. math::
>>> +
>>> +  dst.x = f32\_to\_f16(src.x) | f32\_to\_f16(src.y) << 16
>> This doesn't quite match the tgsi info description (which says that the
>> result is
>> replicated). If you don't want channel replication probably should make
>> that CHAN
>> there instead.
> 
> I'll add the replication to the docs. Looks like NV_fragment_program
> also wanted this:
> 
>   tmp0 = VectorLoad(op0);
>   /* result obtained by combining raw bits of tmp0.x, tmp0.y */
>   result.x = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16);
>   result.y = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16);
>   result.z = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16);
>   result.w = RawBits(tmp0.x) | (RawBits(tmp0.y) << 16);
> 
> But looks like it's just packing, not actually converting. And it's
> unclear whether UP2H is converting or not... let's assume that they do
> the conversions or else this is going to be useless.
I don't think that's quite true it only packs (the pseudo-code is
probably a bit sloppy...), given what nv30 could do this doesn't make
sense. Also, UP2H clearly states "...undoes the type conversion and
packing performed by the PK2H instruction". Albeit the pseudo-code
doesn't really mention float anywhere there neither. I think though this
is due to the possibility of the src (for pk2h) or dst (for up2h) being
either a float or half reg, so in the latter case you wouldn't get any
conversion (but don't quote me on that...).


> 
>>
>>
>>
>>>
>>>  .. opcode:: PK2US - Pack Two Unsigned 16-bit Scalars
>>> @@ -615,7 +617,11 @@ This instruction replicates its result.
>>>
>>>  .. opcode:: UP2H - Unpack Two 16-Bit Floats
>>>
>>> -  TBD
>>> +.. math::
>>> +
>>> +  dst.x = f16\_to\_f32(src0.x \& 0xffff)
>>> +
>>> +  dst.y = f16\_to\_f32(src0.x >> 16)
>>>
>> I'm certainly ok with that, albeit (just like PK2H unless you do
>> replication) it's not what the original source for this opcode does
>> (which would have been NV_fragment_program).
> 
>   tmp = ScalarLoad(op0);
>   result.x = (fp16) (RawBits(tmp) & 0xFFFF);
>   result.y = (fp16) ((RawBits(tmp) >> 16) & 0xFFFF);
>   result.z = (fp16) (RawBits(tmp) & 0xFFFF);
>   result.w = (fp16) ((RawBits(tmp) >> 16) & 0xFFFF);
> 
> Happy to add the .zw = .xy bit here as well. I was previously not
> aware that these ops came from NV_fragment_program, and instead
> assumed that they came from some incomplete attempt to do...
> something. (I guess it was for implementing NV_fragment_program ;) )
Yes. I don't think any real effort was really ever made to support it,
but tgsi was supposed to provide a superset of all available opcodes
coming from somewhere (be it gl extensions or coming from d3d9) then.
There's actually an ooold branch sitting on fdo where Michal
removed support for all these opcodes, but it was never merged
(http://cgit.freedesktop.org/mesa/mesa/commit/?id=5efeade4dc7ffe2d10b231b56fac60dbaa8aa0c8)

So, if you want slightly different semantics that should be fine, but if
the original ones aren't annoying could of course just stick to them.

Roland


> 
>>
>> For the series (with the first point addressed either way,though a tgsi
>> exec implementation which should be trivial wouldn't hurt neither)
>> Reviewed-by: Roland Scheidegger 
> 
> Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I
> won't be able to enable the cap since it will still use gallivm by
> default for vertices. I have a gallivm implementation as well, but it
> hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or
> another, but it must have been on another box with a more recent LLVM.

Ah right. f16 conversion is pretty annoying indeed, though I'd hope the
helpers for that should work. In any case, I only really suggested that
because I'd thought it would be trivial, so if it's not I don't consider
that important...

Roland


___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H

2016-01-03 Thread Ilia Mirkin
On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger  wrote:
>>> For the series (with the first point addressed either way,though a tgsi
>>> exec implementation which should be trivial wouldn't hurt neither)
>>> Reviewed-by: Roland Scheidegger 
>>
>> Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I
>> won't be able to enable the cap since it will still use gallivm by
>> default for vertices. I have a gallivm implementation as well, but it
>> hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or
>> another, but it must have been on another box with a more recent LLVM.
>
> Ah right. f16 conversion is pretty annoying indeed, though I'd hope the
> helpers for that should work. In any case, I only really suggested that
> because I'd thought it would be trivial, so if it's not I don't consider
> that important...

I'll send it out as a separate series, including my (semi?) broken
gallivm impl and leave it to you to fix it if you care, or ignore if
you don't. (I already have it, so might as well...) I understand
neither how LLVM works, nor how gallivm uses LLVM, which isn't a great
combination :)
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/5] configure.ac: Detect if running on POWER8 arch

2016-01-03 Thread Roland Scheidegger
Am 03.01.2016 um 19:20 schrieb Oded Gabbay:
> On Sun, Jan 3, 2016 at 8:14 PM, Roland Scheidegger  wrote:
>> Am 03.01.2016 um 13:50 schrieb Oded Gabbay:
>>> On Thu, Dec 31, 2015 at 8:48 PM, Roland Scheidegger  
>>> wrote:
>>>> Am 31.12.2015 um 10:30 schrieb Oded Gabbay:
>>>>> On Wed, Dec 30, 2015 at 5:41 PM, Roland Scheidegger  
>>>>> wrote:
>>>>>> Am 30.12.2015 um 10:46 schrieb Oded Gabbay:
>>>>>>> On Wed, Dec 30, 2015 at 1:11 AM, Roland Scheidegger 
>>>>>>>  wrote:
>>>>>>>>
>>>>>>>> So, if I see that right, you will automatically generate binaries using
>>>>>>>> power8 instructions if compiled on power8 capable box, which then won't
>>>>>>>> run on boxes not supporting power8? Is that really what you want?
>>>>>>>> Maybe some runtime detection would be a good idea (though I don't know
>>>>>>>> if anyone cares about power7)?
>>>>>>>
>>>>>>> The problem is I don't think I can eliminate the build time check
>>>>>>> (although I would very much like to) because I need:
>>>>>>> 1. To pass a special flag to the GCC compiler: -mpower8-vector
>>>>>>> 2. To define _ARCH_PWR8 so GCC will include the newer intrinsic
>>>>>>>
>>>>>>> Without those two things, I won't be able to use vec_vgbbd which I
>>>>>>> need to implement the _mm_movemask_epi8 efficiently, and without that,
>>>>>>> all this patch series can be thrown out the window. The emulation of
>>>>>>> _mm_movemask_epi8 using regular instructions is just horrible.
>>>>>>>
>>>>>>> You are correct that once you build a binary with this flag on power8
>>>>>>> machine, that binary won't run on power7 machine. You get "cannot
>>>>>>> execute binary file"
>>>>>>>
>>>>>>> Unfortunately, I don't see a way around this because even if I
>>>>>>> condition the use of vec_vgbbd on a runtime check/define, the library
>>>>>>> still won't be executable because it was built with -mpower8-vector.
>>>>>>>
>>>>>>> Having said that, because I *assume* IBM right now mostly cares about
>>>>>>> Linux running on POWER8 with little-endian, I think it is a fair
>>>>>>> compromise.
>>>>>>
>>>>>> Note I don't have anything against a build time check. My concern here
>>>>>> is something along the lines of unsuspecting distros shipping binaries
>>>>>> which won't work, as it looks to me like this will get picked up
>>>>>> automatically. That is different to how for instance sse41 is handled.
>>>>>> That is I believe this should only get enabled if someone has specified
>>>>>> some -mcpu=power8 or whatever flag explicitly somewhere already.
>>>>>>
>>>>>> Roland
>>>>>
>>>>> I understand and I share your concern. Maybe we should add
>>>>> "--disable-pwr8-inst" to mesa's configure ? if that flag is given to
>>>>> configure, it would disable the optimization code (won't add
>>>>> _ARCH_PWR8 to defines and won't add -mpower8-vector to gcc flags).
>>>>>
>>>>> What do you think ?
>>>> If the generated code with all automatically picked up compile options
>>>> really doesn't run on power7 just because of this, I think it would be
>>>> nicer if this were an explicit enable.
>>>>
>>>> Roland
>>>>
>>>
>>> So the problem is a bit worse than that and requires a harsher solution.
>>> Apparently, when that compiler flag (power8-vector) is given to GCC,
>>> GCC uses POWER8-only instructions in other places as well! What I have
>>> seen so far, is that it uses such instructions in the implementation
>>> of exp2() and/or log2() (in f.cpp) and also saw it in
>>> __glXInitializeVisualConfigFromTags(). The instructions used are not
>>> vector instructions, but floating point instructions, which were added
>>> only in PowerISA 2.07
>>>
>>> Therefore, I think that for now, I will limit the entire optimization
>>> code to POWER8 *and* Little-Endian. Because ppc64le packages can
>>> *only* run on POWER8 systems, and because you can't transfer binaries
>>> between LE and BE machines, this workaround eliminates the danger of
>>> crashing on "illegal instruction". In addition, there is no more need
>>> for runtime checks.
>>>
>>> I hope you agree that with this change, it is better to enable the
>>> power8-vector by default when building on POWER8 machine installed
>>> with Linux LE. For all other archs it will be disabled by default.
>>
>> Yes, that looks reasonable.
> 
> Thanks.
> 
>>
>>> I will try to contact IBM GCC devs to see how we can overcome this
>>> problem (or if they even care) so I can expand these optimizations to
>>> BE as well.
>> IIRC this is problematic for things like sse41 etc. as well, it is just
>> how gcc works. I think the typical workaround is to move code using
>> intrinsics which need special compile flags to their own file (or rather
>> compile unit), which you then can compile with those flags. (This
>> implies of course separate functions, that is you can't have any
>> run-time check plus the assembly in the same file or function.)
>> This is how mesa handles _mesa_streaming_load_memcpy for the i965 driver.
>>
>> Roland
>>
> Yeah, I imagined as much, but I didn't know this technique was al

[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

--- Comment #5 from Roland Scheidegger  ---
This looks rather interesting, like a shuffle gone wrong (always affects the
same 3 pixel in 4x4 pixel stamp). This chip does have NEON instructions right?
I think llvm used to have quite some problems if it needed to lower all the
vector code to scalars (not to mention the horrific performance).
Theoretically llvmpipe should work pretty ok on arm (albeit there's no arm
specific optimizations, so slower than possible) but there were spurious
reports of it not working well earlier (not many people try, it seems).
It could also well be a llvm bug, I'd suggest a newer version if you're not
already using the latest.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H

2016-01-03 Thread Ilia Mirkin
On Sun, Jan 3, 2016 at 2:15 PM, Ilia Mirkin  wrote:
> On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger  wrote:
>>>> For the series (with the first point addressed either way,though a tgsi
>>>> exec implementation which should be trivial wouldn't hurt neither)
>>>> Reviewed-by: Roland Scheidegger 
>>>
>>> Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I
>>> won't be able to enable the cap since it will still use gallivm by
>>> default for vertices. I have a gallivm implementation as well, but it
>>> hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or
>>> another, but it must have been on another box with a more recent LLVM.
>>
>> Ah right. f16 conversion is pretty annoying indeed, though I'd hope the
>> helpers for that should work. In any case, I only really suggested that
>> because I'd thought it would be trivial, so if it's not I don't consider
>> that important...
>
> I'll send it out as a separate series, including my (semi?) broken
> gallivm impl and leave it to you to fix it if you care, or ignore if
> you don't. (I already have it, so might as well...) I understand
> neither how LLVM works, nor how gallivm uses LLVM, which isn't a great
> combination :)

And of course the piglits expect out-of-bounds numbers to be
represented as infinities, instead of the clamped value, which is what
util_float_to_half does :(
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 6/6] r600: add support for PK2H/UP2H

2016-01-03 Thread Ilia Mirkin
Glenn (or anyone else interested in r600): feel free to take this
patch over. I've pushed the core (and nvc0) support, but due to
Roland's feedback I changed the return values to be replicated(ish),
not 100% sure how to do that and it seems best to leave it to you.

On Sat, Jan 2, 2016 at 7:38 PM, Ilia Mirkin  wrote:
> Signed-off-by: Ilia Mirkin 
> ---
>  src/gallium/drivers/r600/r600_pipe.c   |   2 +-
>  src/gallium/drivers/r600/r600_shader.c | 102 
> +++--
>  2 files changed, 99 insertions(+), 5 deletions(-)
>
> diff --git a/src/gallium/drivers/r600/r600_pipe.c 
> b/src/gallium/drivers/r600/r600_pipe.c
> index 70c1ec1..359fe41 100644
> --- a/src/gallium/drivers/r600/r600_pipe.c
> +++ b/src/gallium/drivers/r600/r600_pipe.c
> @@ -328,6 +328,7 @@ static int r600_get_param(struct pipe_screen* pscreen, 
> enum pipe_cap param)
> case PIPE_CAP_TEXTURE_QUERY_LOD:
> case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
> case PIPE_CAP_SAMPLER_VIEW_TARGET:
> +   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
> return family >= CHIP_CEDAR ? 1 : 0;
> case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
> return family >= CHIP_CEDAR ? 4 : 0;
> @@ -351,7 +352,6 @@ static int r600_get_param(struct pipe_screen* pscreen, 
> enum pipe_cap param)
> case PIPE_CAP_DRAW_PARAMETERS:
> case PIPE_CAP_MULTI_DRAW_INDIRECT:
> case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
> -   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
> return 0;
>
> case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
> diff --git a/src/gallium/drivers/r600/r600_shader.c 
> b/src/gallium/drivers/r600/r600_shader.c
> index d411b0b..23ea34e 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -8959,6 +8959,100 @@ static int tgsi_umad(struct r600_shader_ctx *ctx)
> return 0;
>  }
>
> +static int tgsi_pk2h(struct r600_shader_ctx *ctx)
> +{
> +   struct tgsi_full_instruction *inst = 
> &ctx->parse.FullToken.FullInstruction;
> +   struct r600_bytecode_alu alu;
> +   int r;
> +
> +   /* temp.xy = f32_to_f16(src) */
> +   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +   alu.op = ALU_OP1_FLT32_TO_FLT16;
> +   alu.dst.chan = 0;
> +   alu.dst.sel = ctx->temp_reg;
> +   alu.dst.write = 1;
> +   r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
> +   r = r600_bytecode_add_alu(ctx->bc, &alu);
> +   if (r)
> +   return r;
> +   alu.dst.chan = 1;
> +   r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
> +   alu.last = 1;
> +   r = r600_bytecode_add_alu(ctx->bc, &alu);
> +   if (r)
> +   return r;
> +
> +   /* dst.x = temp.y * 0x10000 + temp.x */
> +   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +   alu.op = ALU_OP3_MULADD_UINT24;
> +   alu.is_op3 = 1;
> +   tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
> +   alu.last = 1;
> +   alu.src[0].sel = ctx->temp_reg;
> +   alu.src[0].chan = 1;
> +   alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +   alu.src[1].value = 0x10000;
> +   alu.src[2].sel = ctx->temp_reg;
> +   alu.src[2].chan = 0;
> +   r = r600_bytecode_add_alu(ctx->bc, &alu);
> +   if (r)
> +   return r;
> +
> +   return 0;
> +}
> +
> +static int tgsi_up2h(struct r600_shader_ctx *ctx)
> +{
> +   struct tgsi_full_instruction *inst = 
> &ctx->parse.FullToken.FullInstruction;
> +   struct r600_bytecode_alu alu;
> +   int r;
> +   int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
> +
> +   /* temp.x = src.x */
> +   /* note: no need to mask out the high bits */
> +   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +   alu.op = ALU_OP1_MOV;
> +   alu.dst.chan = 0;
> +   alu.dst.sel = ctx->temp_reg;
> +   alu.dst.write = 1;
> +   r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
> +   r = r600_bytecode_add_alu(ctx->bc, &alu);
> +   if (r)
> +   return r;
> +
> +   /* temp.y = src.x >> 16 */
> +   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +   alu.op = ALU_OP2_LSHR_INT;
> +   alu.dst.chan = 1;
> +   alu.dst.sel = ctx->temp_reg;
> +   alu.dst.write = 1;
> +   r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
> +   alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +   alu.src[1].value = 16;
> +   alu.last = 1;
> +   r = r600_bytecode_add_alu(ctx->bc, &alu);
> +   if (r)
> +   return r;
> +
> +   /* dst.xy = f16_to_f32(temp.xy) */
> +   for (int i = 0; i < lasti + 1; i++) {
> +   if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
> +   continue;
> +   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +   tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> +   alu.op = ALU_OP1_FLT16_TO_FLT32;
> +   alu.src[0].sel = ctx->temp_reg;
> +   

[Mesa-dev] [PATCH 2/2] WIP tgsi: add PK2H/UP2H support

2016-01-03 Thread Ilia Mirkin
It seems like there's something horribly wrong with the
util_float_to_half function. In a standalone compilation it works fine
for -6.10203e-05, generating 0x8400, but inside mesa it ends up with
0x8000. The result of the magic.f multiplication is 0. No idea why.

Signed-off-by: Ilia Mirkin 
---

See above comments for why I didn't include this at all. Should you figure out
what was going wrong, note that this can only be enabled in softpipe if the
whole pipeline is using tgsi_exec (or if the gallivm patch is fixed/upstreamed).

Feel free to take this over and/or modify as necessary.

 src/gallium/auxiliary/tgsi/tgsi_exec.c   | 44 ++--
 src/gallium/drivers/softpipe/sp_screen.c |  2 +-
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index f67c162..12a477b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -58,6 +58,7 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
+#include "util/u_half.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
@@ -3058,6 +3059,45 @@ exec_dp2(struct tgsi_exec_machine *mach,
 }
 
 static void
+exec_pk2h(struct tgsi_exec_machine *mach,
+  const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg[2], dst;
+
+   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, 
TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, 
TGSI_EXEC_DATA_FLOAT);
+   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
+  dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
+ (util_float_to_half(arg[1].f[chan]) << 16);
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+  if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+ store_dest(mach, &dst, &inst->Dst[0], inst, chan, 
TGSI_EXEC_DATA_UINT);
+  }
+   }
+}
+
+static void
+exec_up2h(struct tgsi_exec_machine *mach,
+  const struct tgsi_full_instruction *inst)
+{
+   unsigned int chan;
+   union tgsi_exec_channel arg, dst[2];
+
+   fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
+   for (chan = 0; chan < 4; chan++) {
+  dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
+  dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+  if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+ store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, 
TGSI_EXEC_DATA_FLOAT);
+  }
+   }
+}
+
+static void
 exec_scs(struct tgsi_exec_machine *mach,
  const struct tgsi_full_instruction *inst)
 {
@@ -4339,7 +4379,7 @@ exec_instruction(
   break;
 
case TGSI_OPCODE_PK2H:
-  assert (0);
+  exec_pk2h(mach, inst);
   break;
 
case TGSI_OPCODE_PK2US:
@@ -4425,7 +4465,7 @@ exec_instruction(
   break;
 
case TGSI_OPCODE_UP2H:
-  assert (0);
+  exec_up2h(mach, inst);
   break;
 
case TGSI_OPCODE_UP2US:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c 
b/src/gallium/drivers/softpipe/sp_screen.c
index e74044b..d4526ef 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -236,6 +236,7 @@ softpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
case PIPE_CAP_CLIP_HALFZ:
case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
   return 1;
case PIPE_CAP_VERTEXID_NOBASE:
   return 0;
@@ -252,7 +253,6 @@ softpipe_get_param(struct pipe_screen *screen, enum 
pipe_cap param)
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_CLEAR_TEXTURE:
case PIPE_CAP_DRAW_PARAMETERS:
-   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
case PIPE_CAP_MULTI_DRAW_INDIRECT:
case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
   return 0;
-- 
2.4.10

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH 1/2] WIP gallivm: add support for PK2H/UP2H

2016-01-03 Thread Ilia Mirkin
This hits assertion failures on LLVM 3.5

Signed-off-by: Ilia Mirkin 
---

It definitely worked at one point or another, but it might have been with
a later LLVM version and/or on a different CPU. On my i7-920 with LLVM 3.5
I definitely get assertion errors from inside LLVM. Any interested party
can take this patch over and fix it as they see fit. Or ignore it.

 src/gallium/auxiliary/gallivm/lp_bld_tgsi.c|  1 -
 src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 87 ++
 src/gallium/drivers/llvmpipe/lp_screen.c   |  2 +-
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index c88dfbf..1cbe47c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
/* Ignore deprecated instructions */
switch (inst->Instruction.Opcode) {
 
-   case TGSI_OPCODE_UP2H:
case TGSI_OPCODE_UP2US:
case TGSI_OPCODE_UP4B:
case TGSI_OPCODE_UP4UB:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c 
b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 3d5e2cb..ac3298d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -1020,6 +1020,88 @@ static void dfrac_emit(
emit_data->args[0], 
tmp, "");
 }
 
+static void
+pk2h_fetch_args(
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   /* src0.x */
+   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+0, TGSI_CHAN_X);
+   /* src0.y */
+   emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+0, TGSI_CHAN_Y);
+}
+
+static void
+emit_pk2h(const struct lp_build_tgsi_action *action,
+  struct lp_build_tgsi_context *bld_base,
+  struct lp_build_emit_data *emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMContextRef context = bld_base->base.gallivm->context;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   LLVMTypeRef fp16 = LLVMVectorType(LLVMHalfTypeInContext(context),
+ bld_base->base.type.length);
+   LLVMTypeRef i16 = LLVMVectorType(LLVMInt16TypeInContext(context),
+bld_base->base.type.length);
+   LLVMValueRef const16 = lp_build_const_vec(uint_bld->gallivm, uint_bld->type,
+ 16);
+
+   LLVMValueRef low = LLVMBuildFPTrunc(
+  builder, emit_data->args[0], fp16, "");
+   LLVMValueRef high = LLVMBuildFPTrunc(
+  builder, emit_data->args[1], fp16, "");
+
+   low = LLVMBuildZExt(builder, LLVMBuildBitCast(builder, low, i16, ""),
+   uint_bld->vec_type, "");
+   high = LLVMBuildZExt(builder, LLVMBuildBitCast(builder, high, i16, ""),
+uint_bld->vec_type, "");
+
+   emit_data->output[emit_data->chan] =
+  LLVMBuildOr(builder, low, LLVMBuildShl(builder, high, const16, ""), "");
+}
+
+static void
+up2h_fetch_args(
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   /* src0.x */
+   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+0, TGSI_CHAN_X);
+}
+
+static void
+emit_up2h(const struct lp_build_tgsi_action *action,
+  struct lp_build_tgsi_context *bld_base,
+  struct lp_build_emit_data *emit_data)
+{
+   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
+   LLVMContextRef context = bld_base->base.gallivm->context;
+   struct lp_build_context *uint_bld = &bld_base->uint_bld;
+   LLVMTypeRef fp16 = LLVMVectorType(LLVMHalfTypeInContext(context),
+ bld_base->base.type.length);
+   LLVMTypeRef i16 = LLVMVectorType(LLVMInt16TypeInContext(context),
+bld_base->base.type.length);
+   LLVMValueRef const16 = lp_build_const_vec(uint_bld->gallivm, uint_bld->type,
+ 16);
+
+   LLVMValueRef input = LLVMBuildBitCast(
+  builder, emit_data->args[0], bld_base->base.int_vec_type, "");
+   int i;
+
+   for (i = 0; i < 2; i++) {
+  LLVMValueRef val = input;
+  if (i == 1)
+ val = LLVMBuildLShr(builder, val, const16, "");
+  val = LLVMBuildTrunc(builder, val, i16, "");
+  val = LLVMBuildBitCast(builder, val, fp16, "");
+  emit_data->output[i] =
+ LLVMBuildFPExt(builder, val, bld_base->base.vec_type, "");
+   }
+}
+
 void
 lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
 {
@@ -1093,6 +1175,11 @@ lp_set_default_actions(struct lp_build_tgsi_context * 
bld_base)
bld_base->op_actions[TGSI_OPCODE_DRCP].emit = drcp_emit;
bld_base->op_

Re: [Mesa-dev] [PATCH 2/6] i965/gen8+: Invalidate color calc state when switching to the GPGPU pipeline.

2016-01-03 Thread Kenneth Graunke
On Saturday, January 2, 2016 10:48:01 PM PST Francisco Jerez wrote:
> This hardware bug can cause a hang on context restore while the
> current pipeline is set to GPGPU (BDWGFX HSD 1909593).  In addition to
> clearing the valid bit, mark the CC state as dirty to make sure that
> the CC indirect state pointer is re-emitted when we switch back to the
> 3D pipeline.
> ---
>  src/mesa/drivers/dri/i965/brw_misc_state.c | 20 
>  1 file changed, 20 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/
dri/i965/brw_misc_state.c
> index cf6ba5b..7d53d18 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -868,6 +868,26 @@ brw_emit_select_pipeline(struct brw_context *brw, enum 
brw_pipeline pipeline)
> const uint32_t _3DSTATE_PIPELINE_SELECT =
>is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45;
>  
> +   if (brw->gen >= 8) {
> +  /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
> +   * PIPELINE_SELECT [DevBWR+]":

How about:

  /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:

(We try to cite the public docs where possible.)

Patches 1-2 are:
Reviewed-by: Kenneth Graunke 

Thanks for fixing this - good catch!


signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 3/6] i965/gen6-7: Implement stall and flushes required prior to switching pipelines.

2016-01-03 Thread Kenneth Graunke
On Saturday, January 2, 2016 10:48:02 PM PST Francisco Jerez wrote:
> Switching the current pipeline while it's not completely idle or the
> read and write caches aren't flushed can lead to corruption.  Fixes
> misrendering of at least the following Khronos CTS test:
> 
>  ES31-CTS.shader_image_load_store.basic-allTargets-store-fs
> 
> The stall and flushes are no longer required on Gen8+.
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93323
> ---
>  src/mesa/drivers/dri/i965/brw_misc_state.c | 28 +++
+
>  1 file changed, 28 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/
dri/i965/brw_misc_state.c
> index 7d53d18..75540c1 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -886,6 +886,34 @@ brw_emit_select_pipeline(struct brw_context *brw, enum 
brw_pipeline pipeline)
>  
>   brw->ctx.NewDriverState |= BRW_NEW_CC_STATE;
>}
> +
> +   } else if (brw->gen >= 6) {
> +  /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
> +   * PIPELINE_SELECT [DevBWR+]":

Can we cite the public docs?

> +   *
> +   *   Project: DEVSNB+
> +   *
> +   *   Software must ensure all the write caches are flushed through a
> +   *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
> +   *   command to invalidate read only caches prior to programming
> +   *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
> +   */
> +  const unsigned dc_flush =
> + brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_INVALIDATE : 0;

I was going to suggest doing a brw_emit_post_sync_nonzero_flush first
on Sandybridge, but I forgot that we now just emit that at the start
of every state upload.  Fairly moot anyway since we don't do GPGPU on
Sandybridge anyway.

> +
> +  brw_emit_pipe_control_flush(brw,
> +  PIPE_CONTROL_RENDER_TARGET_FLUSH |
> +  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
> +  dc_flush |
> +  PIPE_CONTROL_NO_WRITE |
> +  PIPE_CONTROL_CS_STALL);

Why RENDER_TARGET_FLUSH, DEPTH_CACHE_FLUSH, DATA_CACHE_INVALIDATE,
and NO_WRITE?  The cited workaround explains a CS Stall and the RO
invalidations below, but I'm not seeing why the others are needed.

> +
> +  brw_emit_pipe_control_flush(brw,
> +  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
> +  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
> +  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
> +  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
> +  PIPE_CONTROL_NO_WRITE);
> }
>  
> /* Select the pipeline */
> 



signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 4/6] i965/gen4-5: Emit MI_FLUSH as required prior to switching pipelines.

2016-01-03 Thread Kenneth Graunke
On Saturday, January 2, 2016 10:48:03 PM PST Francisco Jerez wrote:
> AFAIK brw_emit_select_pipeline() is only called once during context
> init on Gen4-5, at which point the pipeline is likely to be already
> idle so it may just happen to work by luck regardless of the MI_FLUSH.
> ---
>  src/mesa/drivers/dri/i965/brw_misc_state.c | 13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/
dri/i965/brw_misc_state.c
> index 75540c1..e5af1da 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -914,6 +914,19 @@ brw_emit_select_pipeline(struct brw_context *brw, enum 
brw_pipeline pipeline)
>PIPE_CONTROL_STATE_CACHE_INVALIDATE |
>PIPE_CONTROL_INSTRUCTION_INVALIDATE |
>PIPE_CONTROL_NO_WRITE);
> +
> +   } else {
> +  /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
> +   * PIPELINE_SELECT [DevBWR+]":
> +   *
> +   *   Project: PRE-DEVSNB
> +   *
> +   *   Software must ensure the current pipeline is flushed via an
> +   *   MI_FLUSH or PIPE_CONTROL prior to the execution of 
PIPELINE_SELECT.
> +   */
> +  BEGIN_BATCH(1);
> +  OUT_BATCH(MI_FLUSH);
> +  ADVANCE_BATCH();
> }
>  
> /* Select the pipeline */
> 

Patches 4-5 are:
Reviewed-by: Kenneth Graunke 

Patch 6 already has Matt's review, so I'm going to leave it be.


signature.asc
Description: This is a digitally signed message part.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] r600g: Add support for PK2H/UP2H

2016-01-03 Thread Glenn Kennard
Based off of Ilia's original patch, but with output values replicated so
that it matches the TGSI semantics.

Signed-off-by: Glenn Kennard 
---
 src/gallium/drivers/r600/r600_pipe.c   |   2 +-
 src/gallium/drivers/r600/r600_shader.c | 107 +++--
 2 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.c 
b/src/gallium/drivers/r600/r600_pipe.c
index d71082f..3b5d26c 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -328,6 +328,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
case PIPE_CAP_TEXTURE_QUERY_LOD:
case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
case PIPE_CAP_SAMPLER_VIEW_TARGET:
+   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
return family >= CHIP_CEDAR ? 1 : 0;
case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
return family >= CHIP_CEDAR ? 4 : 0;
@@ -349,7 +350,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum 
pipe_cap param)
case PIPE_CAP_SHAREABLE_SHADERS:
case PIPE_CAP_CLEAR_TEXTURE:
case PIPE_CAP_DRAW_PARAMETERS:
-   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
return 0;
 
case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 9c040ae..7b1eade 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -8960,6 +8960,105 @@ static int tgsi_umad(struct r600_shader_ctx *ctx)
return 0;
 }
 
+static int tgsi_pk2h(struct r600_shader_ctx *ctx)
+{
+   struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;
+   struct r600_bytecode_alu alu;
+   int r, i;
+   int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
+
+   /* temp.xy = f32_to_f16(src) */
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   alu.op = ALU_OP1_FLT32_TO_FLT16;
+   alu.dst.chan = 0;
+   alu.dst.sel = ctx->temp_reg;
+   alu.dst.write = 1;
+   r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+   r = r600_bytecode_add_alu(ctx->bc, &alu);
+   if (r)
+   return r;
+   alu.dst.chan = 1;
+   r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
+   alu.last = 1;
+   r = r600_bytecode_add_alu(ctx->bc, &alu);
+   if (r)
+   return r;
+
+   /* dst.x = temp.y * 0x10000 + temp.x */
+   for (i = 0; i < lasti + 1; i++) {
+   if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+   continue;
+
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   alu.op = ALU_OP3_MULADD_UINT24;
+   alu.is_op3 = 1;
+   tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+   alu.last = i == lasti;
+   alu.src[0].sel = ctx->temp_reg;
+   alu.src[0].chan = 1;
+   alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+   alu.src[1].value = 0x10000;
+   alu.src[2].sel = ctx->temp_reg;
+   alu.src[2].chan = 0;
+   r = r600_bytecode_add_alu(ctx->bc, &alu);
+   if (r)
+   return r;
+   }
+
+   return 0;
+}
+
+static int tgsi_up2h(struct r600_shader_ctx *ctx)
+{
+   struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;
+   struct r600_bytecode_alu alu;
+   int r, i;
+   int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
+
+   /* temp.x = src.x */
+   /* note: no need to mask out the high bits */
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   alu.op = ALU_OP1_MOV;
+   alu.dst.chan = 0;
+   alu.dst.sel = ctx->temp_reg;
+   alu.dst.write = 1;
+   r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+   r = r600_bytecode_add_alu(ctx->bc, &alu);
+   if (r)
+   return r;
+
+   /* temp.y = src.x >> 16 */
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   alu.op = ALU_OP2_LSHR_INT;
+   alu.dst.chan = 1;
+   alu.dst.sel = ctx->temp_reg;
+   alu.dst.write = 1;
+   r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
+   alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+   alu.src[1].value = 16;
+   alu.last = 1;
+   r = r600_bytecode_add_alu(ctx->bc, &alu);
+   if (r)
+   return r;
+
+   /* dst.wz = dst.xy = f16_to_f32(temp.xy) */
+   for (i = 0; i < lasti + 1; i++) {
+   if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
+   continue;
+   memset(&alu, 0, sizeof(struct r600_bytecode_alu));
+   tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
+   alu.op = ALU_OP1_FLT16_TO_FLT32;
+   alu.src[0].sel = ctx->temp_reg;
+   alu.src[0].chan = i % 2;
+   alu.last = i == lasti;
+   r = r600_bytecod

Re: [Mesa-dev] [PATCH 3/6] i965/gen6-7: Implement stall and flushes required prior to switching pipelines.

2016-01-03 Thread Francisco Jerez
Kenneth Graunke  writes:

> On Saturday, January 2, 2016 10:48:02 PM PST Francisco Jerez wrote:
>> Switching the current pipeline while it's not completely idle or the
>> read and write caches aren't flushed can lead to corruption.  Fixes
>> misrendering of at least the following Khronos CTS test:
>> 
>>  ES31-CTS.shader_image_load_store.basic-allTargets-store-fs
>> 
>> The stall and flushes are no longer required on Gen8+.
>> 
>> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93323
>> ---
>>  src/mesa/drivers/dri/i965/brw_misc_state.c | 28 +++
> +
>>  1 file changed, 28 insertions(+)
>> 
>> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/
> dri/i965/brw_misc_state.c
>> index 7d53d18..75540c1 100644
>> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
>> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
>> @@ -886,6 +886,34 @@ brw_emit_select_pipeline(struct brw_context *brw, enum 
> brw_pipeline pipeline)
>>  
>>   brw->ctx.NewDriverState |= BRW_NEW_CC_STATE;
>>}
>> +
>> +   } else if (brw->gen >= 6) {
>> +  /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
>> +   * PIPELINE_SELECT [DevBWR+]":
>
> Can we cite the public docs?
>

The public docs for PIPELINE_SELECT seemed rather inaccurate.  The IVB
version I have in front of me right now is missing this one workaround,
and the BDW version mentions it incorrectly.  Sigh...

>> +   *
>> +   *   Project: DEVSNB+
>> +   *
>> +   *   Software must ensure all the write caches are flushed through a
>> +   *   stalling PIPE_CONTROL command followed by another PIPE_CONTROL
>> +   *   command to invalidate read only caches prior to programming
>> +   *   MI_PIPELINE_SELECT command to change the Pipeline Select Mode.
>> +   */
>> +  const unsigned dc_flush =
>> + brw->gen >= 7 ? PIPE_CONTROL_DATA_CACHE_INVALIDATE : 0;
>
> I was going to suggest doing a brw_emit_post_sync_nonzero_flush first
> on Sandybridge, but I forgot that we now just emit that at the start
> of every state upload.  Fairly moot anyway since we don't do GPGPU on
> Sandybridge anyway.
>
Hmm, that sounds very sensible to me, it would be rather fragile for
this function to rely on a flush with post-sync op having been done
previously, even if at this point this will only be called once at
context creation on SNB -- Although for the same reason it seems rather
fragile for brw_emit_pipe_control_flush() to assume that the workaround
has been applied already.  I'd be inclined to change
brw_emit_pipe_control_flush() to emit the post-sync op when needed on
SNB just like we do for other PIPE_CONTROL workarounds on Gen7 and Gen8.

>> +
>> +  brw_emit_pipe_control_flush(brw,
>> +  PIPE_CONTROL_RENDER_TARGET_FLUSH |
>> +  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
>> +  dc_flush |
>> +  PIPE_CONTROL_NO_WRITE |
>> +  PIPE_CONTROL_CS_STALL);
>
> Why RENDER_TARGET_FLUSH, DEPTH_CACHE_FLUSH, DATA_CACHE_INVALIDATE,
> and NO_WRITE?  The cited workaround explains a CS Stall and the RO
> invalidations below, but I'm not seeing why the others are needed.
>
It also says that "software must ensure all the write caches are
flushed".

>> +
>> +  brw_emit_pipe_control_flush(brw,
>> +  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
>> +  PIPE_CONTROL_CONST_CACHE_INVALIDATE |
>> +  PIPE_CONTROL_STATE_CACHE_INVALIDATE |
>> +  PIPE_CONTROL_INSTRUCTION_INVALIDATE |
>> +  PIPE_CONTROL_NO_WRITE);
>> }
>>  
>> /* Select the pipeline */
>> 


signature.asc
Description: PGP signature
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H

2016-01-03 Thread Roland Scheidegger
Am 03.01.2016 um 21:32 schrieb Ilia Mirkin:
> On Sun, Jan 3, 2016 at 2:15 PM, Ilia Mirkin  wrote:
>> On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger  
>> wrote:
> For the series (with the first point addressed either way,though a tgsi
> exec implementation which should be trivial wouldn't hurt neither)
> Reviewed-by: Roland Scheidegger 

 Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I
 won't be able to enable the cap since it will still use gallivm by
 default for vertices. I have a gallivm implementation as well, but it
 hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or
 another, but it must have been on another box with a more recent LLVM.
>>>
>>> Ah right. f16 conversion is pretty annoying indeed, though I'd hope the
>>> helpers for that should work. In any case, I only really suggested that
>>> because I'd thought it would be trivial, so if it's not I don't consider
>>> that important...
>>
>> I'll send it out as a separate series, including my (semi?) broken
>> gallivm impl and leave it to you to fix it if you care, or ignore if
>> you don't. (I already have it, so might as well...) I understand
>> neither how LLVM works, nor how gallivm uses LLVM, which isn't a great
>> combination :)
> 
> And of course the piglits expect out-of-bounds numbers to be
> represented as infinities, instead of the clamped value

This is, imho, a bug; they should allow both, because GL permits
round-towards-zero when converting floats to half, although
round-to-nearest-even is preferred. And the former gets you the clamped
values.

> which is what util_float_to_half does :(
Yep. The reason both the util and gallivm code do round-towards zero is
that for such conversions GL allows both, but d3d10 is deeply unhappy if
you do round-toward-nearest-even (for float to float conversions), at
least for the clamp vs. infinite issue. As per the data conversion
rules:
https://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx
Albeit there's no specific half float conversion instructions in d3d10
(but in d3d11), render target conversions etc. must honor these rules too.
I suspect most hw can do both without too much fuss (x86 f16c certainly
can).

Roland

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H

2016-01-03 Thread Ilia Mirkin
On Sun, Jan 3, 2016 at 7:51 PM, Roland Scheidegger  wrote:
> Am 03.01.2016 um 21:32 schrieb Ilia Mirkin:
>> On Sun, Jan 3, 2016 at 2:15 PM, Ilia Mirkin  wrote:
>>> On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger  
>>> wrote:
>> For the series (with the first point addressed either way,though a tgsi
>> exec implementation which should be trivial wouldn't hurt neither)
>> Reviewed-by: Roland Scheidegger 
>
> Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I
> won't be able to enable the cap since it will still use gallivm by
> default for vertices. I have a gallivm implementation as well, but it
> hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or
> another, but it must have been on another box with a more recent LLVM.

 Ah right. f16 conversion is pretty annoying indeed, though I'd hope the
 helpers for that should work. In any case, I only really suggested that
 because I'd thought it would be trivial, so if it's not I don't consider
 that important...
>>>
>>> I'll send it out as a separate series, including my (semi?) broken
>>> gallivm impl and leave it to you to fix it if you care, or ignore if
>>> you don't. (I already have it, so might as well...) I understand
>>> neither how LLVM works, nor how gallivm uses LLVM, which isn't a great
>>> combination :)
>>
>> And of course the piglits expect out-of-bounds numbers to be
>> represented as infinities, instead of the clamped value
>
> This is, imho, a bug, they should allow both. Because round-towards-zero
> when converting is allowed by GL when converting floats to half, albeit
> round-to-nearest-even is preferred. And the former gets you the clamped
> values.
>
>> which is what util_float_to_half does :(
> Yep. The reason both the util and gallivm code do round-towards zero is
> that for such conversions GL allows both, but d3d10 is deeply unhappy if
> you do round-toward-nearest-even (for float to float conversions), at
> least for the clamp vs. infinite issue. As per the data conversion
> rules:
> https://msdn.microsoft.com/en-us/library/windows/desktop/dd607323%28v=vs.85%29.aspx
> Albeit there's no specific half float conversion instructions in d3d10
> (but in d3d11), render target conversions etc. must honor these rules too.
> I suspect most hw can do both without too much fuzz (x86 f16c certainly
> can).

Take it up with people who aren't me :)

http://cgit.freedesktop.org/mesa/mesa/tree/src/glsl/lower_packing_builtins.cpp#n990

FWIW the f32 -> f16 opcode this maps to on nvc0 has the same
behaviour. Now it also has rounding mode flags which I don't set and
perhaps one of them would yield the behaviour that you're talking
about, but I don't know offhand how to get it. Curiously from the PTX
ISA docs: "Conversions to floating-point that are beyond the range of
floating-point numbers are represented with the maximum floating-point
value (IEEE 754 Inf for f32 and f64, and ~131,000 for f16)."

If you get the piglit tests changed, I guess I'll poke around.

  -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 2/2] WIP tgsi: add PK2H/UP2H support

2016-01-03 Thread Roland Scheidegger
Am 03.01.2016 um 22:29 schrieb Ilia Mirkin:
> It seems like there's something horribly wrong with the
> util_float_to_half function. In a standalone compilation it works fine
> for -6.10203e-05, generating 0x8400, but inside mesa it ends up with
> 0x8000. The result of the magic.f multiplication is 0. No idea why.

Ahh, that's easy: it's because we switch off denorms (on x86...) - look at
util_fpstate_set_denorms_to_zero.
This number would be _just_ below the smallest normal number, so the
float mul would produce a denorm, which gets flushed to zero.

That said, this is wrong (the gallivm code will not hit this issue, as
it uses all int math, pretty much because of that, it should be noted
that if you actually hit those denorms it's going to be dead slow on a
lot of intel cpus). Should probably switch the util code to some other
method for conversion as well.
d3d10 says you must flush denorms to zero, and I can't see any reason
why you'd want them for GL graphics either. However, conversion to/from
f16 is an exception - denorm numbers must be correctly represented (at
least for d3d10 - I wouldn't be surprised if GL doesn't care).

So it's a pretty minor issue (albeit really should be addressed at some
point). The magic mul method is somewhat elegant, but that's a
limitation it has.

Roland


> 
> Signed-off-by: Ilia Mirkin 
> ---
> 
> See above comments for why I didn't include this at all. Should you figure out
> what was going wrong, note that this can only be enabled in softpipe if the
> whole pipeline is using tgsi_exec (or if the gallivm patch is 
> fixed/upstreamed).
> 
> Feel free to take this over and/or modify as necessary.
> 
>  src/gallium/auxiliary/tgsi/tgsi_exec.c   | 44 
> ++--
>  src/gallium/drivers/softpipe/sp_screen.c |  2 +-
>  2 files changed, 43 insertions(+), 3 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c 
> b/src/gallium/auxiliary/tgsi/tgsi_exec.c
> index f67c162..12a477b 100644
> --- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
> +++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
> @@ -58,6 +58,7 @@
>  #include "tgsi/tgsi_parse.h"
>  #include "tgsi/tgsi_util.h"
>  #include "tgsi_exec.h"
> +#include "util/u_half.h"
>  #include "util/u_memory.h"
>  #include "util/u_math.h"
>  
> @@ -3058,6 +3059,45 @@ exec_dp2(struct tgsi_exec_machine *mach,
>  }
>  
>  static void
> +exec_pk2h(struct tgsi_exec_machine *mach,
> +  const struct tgsi_full_instruction *inst)
> +{
> +   unsigned int chan;
> +   union tgsi_exec_channel arg[2], dst;
> +
> +   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, 
> TGSI_EXEC_DATA_FLOAT);
> +   fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, 
> TGSI_EXEC_DATA_FLOAT);
> +   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
> +  dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
> + (util_float_to_half(arg[1].f[chan]) << 16);
> +   }
> +   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
> +  if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
> + store_dest(mach, &dst, &inst->Dst[0], inst, chan, 
> TGSI_EXEC_DATA_UINT);
> +  }
> +   }
> +}
> +
> +static void
> +exec_up2h(struct tgsi_exec_machine *mach,
> +  const struct tgsi_full_instruction *inst)
> +{
> +   unsigned int chan;
> +   union tgsi_exec_channel arg, dst[2];
> +
> +   fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
> +   for (chan = 0; chan < 4; chan++) {
> +      dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
> +  dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
> +   }
> +   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
> +  if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
> + store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, 
> TGSI_EXEC_DATA_FLOAT);
> +  }
> +   }
> +}
> +
> +static void
>  exec_scs(struct tgsi_exec_machine *mach,
>   const struct tgsi_full_instruction *inst)
>  {
> @@ -4339,7 +4379,7 @@ exec_instruction(
>break;
>  
> case TGSI_OPCODE_PK2H:
> -  assert (0);
> +  exec_pk2h(mach, inst);
>break;
>  
> case TGSI_OPCODE_PK2US:
> @@ -4425,7 +4465,7 @@ exec_instruction(
>break;
>  
> case TGSI_OPCODE_UP2H:
> -  assert (0);
> +  exec_up2h(mach, inst);
>break;
>  
> case TGSI_OPCODE_UP2US:
> diff --git a/src/gallium/drivers/softpipe/sp_screen.c 
> b/src/gallium/drivers/softpipe/sp_screen.c
> index e74044b..d4526ef 100644
> --- a/src/gallium/drivers/softpipe/sp_screen.c
> +++ b/src/gallium/drivers/softpipe/sp_screen.c
> @@ -236,6 +236,7 @@ softpipe_get_param(struct pipe_screen *screen, enum 
> pipe_cap param)
> case PIPE_CAP_CLIP_HALFZ:
> case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
> case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
> +   case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
>return 1;
> case PIPE_CAP_VERTEXID_NOBASE:
>return 0;
> @@ -252,7 +253,6 @@ softpipe_get_param(struct pipe_screen *screen, enum 
> pipe_cap param)
> 

Re: [Mesa-dev] [PATCH 1/2] WIP gallivm: add support for PK2H/UP2H

2016-01-03 Thread Roland Scheidegger
Am 03.01.2016 um 22:29 schrieb Ilia Mirkin:
> This hits assertion failures on LLVM 3.5
> 
> Signed-off-by: Ilia Mirkin 
> ---
> 
> It definitely worked at one point or another, but it might have been with
> a later LLVM version and/or on a different CPU. On my i7-920 with LLVM 3.5
> I definitely get assertion errors from inside LLVM. Any interested party
> can take this patch over and fix it as they see fit. Or ignore it.

Interesting. I wasn't even aware using fptrunc could work at all with
f16 type. And on some quick look this was indeed introduced later, I
think llvm 3.6 (some backends might still not do it today). There are
also llvm.convert.to.fp16 (and f32) operations (probably the same
backends won't do them neither...). I'm not really sure what rounding
mode semantics they'll end up with. Seems like fptrunc actually might do
round-to-nearest-even (I suppose llvm.convert.to.fp16 too), but
depending on how llvm ends up doing it it might well be subject to the
same no-denorm issue as the util code.
(And unfortunately, it looks like we don't have any direct control over
rounding mode neither for them so we can't ditch
lp_build_float_to_smallfloat and lp_build_smallfloat_to_float.)

Roland


> 
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi.c|  1 -
>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 87 
> ++
>  src/gallium/drivers/llvmpipe/lp_screen.c   |  2 +-
>  3 files changed, 88 insertions(+), 2 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c 
> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> index c88dfbf..1cbe47c 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
> @@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
> /* Ignore deprecated instructions */
> switch (inst->Instruction.Opcode) {
>  
> -   case TGSI_OPCODE_UP2H:
> case TGSI_OPCODE_UP2US:
> case TGSI_OPCODE_UP4B:
> case TGSI_OPCODE_UP4UB:
> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c 
> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> index 3d5e2cb..ac3298d 100644
> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
> @@ -1020,6 +1020,88 @@ static void dfrac_emit(
> emit_data->args[0], 
> tmp, "");
>  }
>  
> +static void
> +pk2h_fetch_args(
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   /* src0.x */
> +   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
> +0, TGSI_CHAN_X);
> +   /* src0.y */
> +   emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
> +0, TGSI_CHAN_Y);
> +}
> +
> +static void
> +emit_pk2h(const struct lp_build_tgsi_action *action,
> +  struct lp_build_tgsi_context *bld_base,
> +  struct lp_build_emit_data *emit_data)
> +{
> +   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> +   LLVMContextRef context = bld_base->base.gallivm->context;
> +   struct lp_build_context *uint_bld = &bld_base->uint_bld;
> +   LLVMTypeRef fp16 = LLVMVectorType(LLVMHalfTypeInContext(context),
> + bld_base->base.type.length);
> +   LLVMTypeRef i16 = LLVMVectorType(LLVMInt16TypeInContext(context),
> +bld_base->base.type.length);
> +   LLVMValueRef const16 = lp_build_const_vec(uint_bld->gallivm, 
> uint_bld->type,
> + 16);
> +
> +   LLVMValueRef low = LLVMBuildFPTrunc(
> +  builder, emit_data->args[0], fp16, "");
> +   LLVMValueRef high = LLVMBuildFPTrunc(
> +  builder, emit_data->args[1], fp16, "");
> +
> +   low = LLVMBuildZExt(builder, LLVMBuildBitCast(builder, low, i16, ""),
> +   uint_bld->vec_type, "");
> +   high = LLVMBuildZExt(builder, LLVMBuildBitCast(builder, high, i16, ""),
> +uint_bld->vec_type, "");
> +
> +   emit_data->output[emit_data->chan] =
> +  LLVMBuildOr(builder, low, LLVMBuildShl(builder, high, const16, ""), 
> "");
> +}
> +
> +static void
> +up2h_fetch_args(
> +   struct lp_build_tgsi_context * bld_base,
> +   struct lp_build_emit_data * emit_data)
> +{
> +   /* src0.x */
> +   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
> +0, TGSI_CHAN_X);
> +}
> +
> +static void
> +emit_up2h(const struct lp_build_tgsi_action *action,
> +  struct lp_build_tgsi_context *bld_base,
> +  struct lp_build_emit_data *emit_data)
> +{
> +   LLVMBuilderRef builder = bld_base->base.gallivm->builder;
> +   LLVMContextRef context = bld_base->base.gallivm->context;
> +   struct lp_build_context *uint_bld = &bld_base->uint_bld;
> +   LLVMTypeRef fp16 = LLVMVectorType(LLVMHalfTypeInContext(context),
> +  

Re: [Mesa-dev] [PATCH 1/2] WIP gallivm: add support for PK2H/UP2H

2016-01-03 Thread Ilia Mirkin
On Sun, Jan 3, 2016 at 8:37 PM, Roland Scheidegger  wrote:
> Am 03.01.2016 um 22:29 schrieb Ilia Mirkin:
>> This hits assertion failures on LLVM 3.5
>>
>> Signed-off-by: Ilia Mirkin 
>> ---
>>
>> It definitely worked at one point or another, but it might have been with
>> a later LLVM version and/or on a different CPU. On my i7-920 with LLVM 3.5
>> I definitely get assertion errors from inside LLVM. Any interested party
>> can take this patch over and fix it as they see fit. Or ignore it.
>
> Interesting. I wasn't even aware using fptrunc could work at all with
> f16 type. And on some quick look this was indeed introduced later, I
> think llvm 3.6 (some backends might still not do it today). There are
> also llvm.convert.to.fp16 (and f32) operations (probably the same
> backends won't do them neither...). I'm not really sure what rounding
> mode semantics they'll end up with. Seems like fptrunc actually might do
> round-to-nearest-even (I suppose llvm.convert.to.fp16 too), but
> depending on how llvm ends up doing it it might well be subject to the
> same no-denorm issue as the util code.
> (And unfortunately, it looks like we don't have any direct control over
> rounding mode neither for them so we can't ditch
> lp_build_float_to_smallfloat and lp_build_smallfloat_to_float.)

My (admittedly faint) recollection is that this passed the existing
piglit tests on a Haswell CPU and I guess at least LLVM 3.6 or maybe
even 3.7 (not at the machine right now). But depending on CPU
different code might be emitted of course. I wasn't aware of the
lp_build_float_to_smallfloat stuff. I don't plan on pursuing this
patch further, if you're interested, feel free to redo it.

This whole series was mostly about me _really_ hating the code that
mesa lowered the half-float pack/unpack into, not any actual
performance thing. I don't think that we're aware of a single usage of
these builtins outside of piglit. Curiously I saw that GRID Autosport
makes use of f32tof16 (and back) functions, but they're
locally-defined as

uint packfp32(in float fp32)
{
uint result;
uint temp = floatBitsToUint(fp32);
result = ((temp & 0x80000000u) >> 16) | (((temp & 0x7fffffffu)
>> 13) - (0x38000000u >> 13));
return result;
}

Which later is used as:

r9.y = uintBitsToFloat(uint(0x3f800000));
r0.x = uintBitsToFloat(uint(r9.y));
r0.y = uintBitsToFloat(f32tof16(r0.y));
r2.w =
intBitsToFloat(bfi(floatBitsToInt(r0.x), floatBitsToInt(r0.y),
int(16), int(16)));

I guess it was too much trouble to use packHalf2x16(r0.xy) [and it
*appears* that they forgot to f32tof16 r0.x... this all gets stored
off into a ssbo and presumably reused somewhere, so can't tell if it
was intended].

  -ilia
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 1/6] gallium: document PK2H/UP2H

2016-01-03 Thread Roland Scheidegger
Am 04.01.2016 um 02:05 schrieb Ilia Mirkin:
> On Sun, Jan 3, 2016 at 7:51 PM, Roland Scheidegger  wrote:
>> Am 03.01.2016 um 21:32 schrieb Ilia Mirkin:
>>> On Sun, Jan 3, 2016 at 2:15 PM, Ilia Mirkin  wrote:
 On Sun, Jan 3, 2016 at 2:08 PM, Roland Scheidegger  
 wrote:
>>> For the series (with the first point addressed either way,though a tgsi
>>> exec implementation which should be trivial wouldn't hurt neither)
>>> Reviewed-by: Roland Scheidegger 
>>
>> Thanks! I'll do a patch for that shortly (tgsi_exec). Unfortunately I
>> won't be able to enable the cap since it will still use gallivm by
>> default for vertices. I have a gallivm implementation as well, but it
>> hits asserts on LLVM 3.5. I'm pretty sure I tested it at one point or
>> another, but it must have been on another box with a more recent LLVM.
>
> Ah right. f16 conversion is pretty annoying indeed, though I'd hope the
> helpers for that should work. In any case, I only really suggested that
> because I'd thought it would be trivial, so if it's not I don't consider
> that important...

 I'll send it out as a separate series, including my (semi?) broken
 gallivm impl and leave it to you to fix it if you care, or ignore if
 you don't. (I already have it, so might as well...) I understand
 neither how LLVM works, nor how gallivm uses LLVM, which isn't a great
 combination :)
>>>
>>> And of course the piglits expect out-of-bounds numbers to be
>>> represented as infinities, instead of the clamped value
>>
>> This is, imho, a bug, they should allow both. Because round-towards-zero
>> when converting is allowed by GL when converting floats to half, albeit
>> round-to-nearest-even is preferred. And the former gets you the clamped
>> values.
>>
>>> which is what util_float_to_half does :(
>> Yep. The reason both the util and gallivm code do round-towards zero is
>> that for such conversions GL allows both, but d3d10 is deeply unhappy if
>> you do round-toward-nearest-even (for float to float conversions), at
>> least for the clamp vs. infinite issue. As per the data conversion
>> rules:
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__msdn.microsoft.com_en-2Dus_library_windows_desktop_dd607323-2528v-3Dvs.85-2529.aspx&d=BQIBaQ&c=Sqcl0Ez6M0X8aeM67LKIiDJAXVeAw-YihVMNtXt-uEs&r=Vjtt0vs_iqoI31UfJxBl7yv9I2FeiaeAYgMTLKRBc_I&m=Gd7OrjAeguJzGQHAmmnWwz-_ok3_P7HVdfP1UqlD06w&s=c9EvJslgDjJWgBgsKb_VdSLRtbWWq30XqYi0689ilkQ&e=
>>  
>> Albeit there's no specific half float conversion instructions in d3d10
>> (but in d3d11), render target conversions etc. must honor these rules too.
>> I suspect most hw can do both without too much fuzz (x86 f16c certainly
>> can).
> 
> Take it up with people who aren't me :)
>
http://cgit.freedesktop.org/mesa/mesa/tree/src/glsl/lower_packing_builtins.cpp#n990

Yes, it is actually imho somewhat surprising intel gpus can't even do
the round-toward-zero behavior natively (meaning they'd most likely have
to emulate that one way or the other for the d3d10 driver).


> FWIW the f32 -> f16 opcode this maps to on nvc0 has the same
> behaviour. Now it also has rounding mode flags which I don't set and
> perhaps one of them would yield the behaviour that you're talking
> about, but I don't know offhand how to get it. Curiously from the PTX
> ISA docs: "Conversions to floating-point that are beyond the range of
> floating-point numbers are represented with the maximum floating-point
> value (IEEE 754 Inf for f32 and f64, and ~131,000 for f16)."
Yes that's somewhat odd. imho if you set round-towards-zero you should
get the maxf value. With round-to-nearest(-even) you should get the
infinities. This is per standard ieee754 rules. Getting something like
round-to-nearest but the overflowing values still clamped to maxf (or
vice versa) doesn't really make all that much sense, if that is what's
somehow implied by this paragraph.


> 
> If you get the piglit tests changed, I guess I'll poke around.

Hmm quite some python code, so I probably don't have time to dig into
that. Albeit what I can tell is the rounding mode functions inside
gen_builtin_packing_tests.py are (to me) somewhat confusingly named,
"round to nearest" and "round to even" - both are round to nearest, one
is just "round to nearest_even" the other is really "round to
nearest_away_from_zero".
But really, glsl just says "The rounding mode cannot be set and is
undefined." And that is true for ALL operations.
The section also says though for "implicit and explicit conversions
between types - Correctly rounded". It is possible to interpret that as
meaning that while the rounding mode is undefined, it must be consistent
for all operations (in which case it would indeed not be legal to do
ordinary arithmetic with round-to-nearest but packHalf2x16 with
round-toward-zero). If that's true, we need some way to distinguish
between the two possible float->half conversions in gallium, which
sounds like quite

[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

--- Comment #6 from Icenowy Zheng  ---
My CPU (Cortex-A7) *does* have NEON. All A7 cores have NEON.

I used llvm-3.7, and I'm now building llvm-svn (used
https://github.com/llvm-mirror/llvm)

Note: you can run a glmark2 and compare the image to the one that I uploaded.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 6/6] i965/gen7.5+: Disable resource streamer during GPGPU workloads.

2016-01-03 Thread Abdiel Janulgue


On 01/03/2016 08:48 AM, Francisco Jerez wrote:
> The RS and hardware binding tables are only supported on the 3D
> pipeline and can lead to corruption if left enabled during a GPGPU
> workload.  Disable it when switching to the GPGPU (or media) pipeline
> and re-enable it when switching back to the 3D pipeline.

Yep, this is the way it is.

Reviewed-by: Abdiel Janulgue 

> ---
>  src/mesa/drivers/dri/i965/brw_binding_tables.c |  2 +-
>  src/mesa/drivers/dri/i965/brw_misc_state.c | 38 
> ++
>  src/mesa/drivers/dri/i965/brw_state.h  |  1 +
>  3 files changed, 40 insertions(+), 1 deletion(-)
> 
> diff --git a/src/mesa/drivers/dri/i965/brw_binding_tables.c 
> b/src/mesa/drivers/dri/i965/brw_binding_tables.c
> index 80935cf..5c5aa0e 100644
> --- a/src/mesa/drivers/dri/i965/brw_binding_tables.c
> +++ b/src/mesa/drivers/dri/i965/brw_binding_tables.c
> @@ -364,7 +364,7 @@ gen7_disable_hw_binding_tables(struct brw_context *brw)
>  /**
>   * Enable hardware binding tables and set up the binding table pool.
>   */
> -static void
> +void
>  gen7_enable_hw_binding_tables(struct brw_context *brw)
>  {
> if (!brw->use_resource_streamer)
> diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c 
> b/src/mesa/drivers/dri/i965/brw_misc_state.c
> index 2263604..7e68838 100644
> --- a/src/mesa/drivers/dri/i965/brw_misc_state.c
> +++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
> @@ -868,6 +868,25 @@ brw_emit_select_pipeline(struct brw_context *brw, enum 
> brw_pipeline pipeline)
> const uint32_t _3DSTATE_PIPELINE_SELECT =
>is_965 ? CMD_PIPELINE_SELECT_965 : CMD_PIPELINE_SELECT_GM45;
>  
> +   if (brw->use_resource_streamer && pipeline != BRW_RENDER_PIPELINE) {
> +  /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
> +   * PIPELINE_SELECT [DevBWR+]":
> +   *
> +   * Project: HSW, BDW, CHV, SKL, BXT
> +   *
> +   * Hardware Binding Tables are only supported for 3D workloads. 
> Resource
> +   * streamer must be enabled only for 3D workloads. Resource streamer
> +   * must be disabled for Media and GPGPU workloads.
> +   */
> +  BEGIN_BATCH(1);
> +  OUT_BATCH(MI_RS_CONTROL | 0);
> +  ADVANCE_BATCH();
> +
> +  gen7_disable_hw_binding_tables(brw);
> +
> +  /* XXX - Disable gather constant pool too when we start using it. */
> +   }
> +
> if (brw->gen >= 8) {
>/* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
> * PIPELINE_SELECT [DevBWR+]":
> @@ -959,6 +978,25 @@ brw_emit_select_pipeline(struct brw_context *brw, enum 
> brw_pipeline pipeline)
>OUT_BATCH(0);
>ADVANCE_BATCH();
> }
> +
> +   if (brw->use_resource_streamer && pipeline == BRW_RENDER_PIPELINE) {
> +  /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
> +   * PIPELINE_SELECT [DevBWR+]":
> +   *
> +   * Project: HSW, BDW, CHV, SKL, BXT
> +   *
> +   * Hardware Binding Tables are only supported for 3D workloads. 
> Resource
> +   * streamer must be enabled only for 3D workloads. Resource streamer
> +   * must be disabled for Media and GPGPU workloads.
> +   */
> +  BEGIN_BATCH(1);
> +  OUT_BATCH(MI_RS_CONTROL | 1);
> +  ADVANCE_BATCH();
> +
> +  gen7_enable_hw_binding_tables(brw);
> +
> +  /* XXX - Re-enable gather constant pool here. */
> +   }
>  }
>  
>  /**
> diff --git a/src/mesa/drivers/dri/i965/brw_state.h 
> b/src/mesa/drivers/dri/i965/brw_state.h
> index d29b997..7d61b7c 100644
> --- a/src/mesa/drivers/dri/i965/brw_state.h
> +++ b/src/mesa/drivers/dri/i965/brw_state.h
> @@ -396,6 +396,7 @@ void gen7_update_binding_table_from_array(struct 
> brw_context *brw,
>gl_shader_stage stage,
>const uint32_t* binding_table,
>int num_surfaces);
> +void gen7_enable_hw_binding_tables(struct brw_context *brw);
>  void gen7_disable_hw_binding_tables(struct brw_context *brw);
>  void gen7_reset_hw_bt_pool_offsets(struct brw_context *brw);
>  
> 
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH] mesa: use gl_shader_variable in program resource list

2016-01-03 Thread Tapani Pälli



On 12/30/2015 09:53 PM, Marek Olšák wrote:

On Mon, Nov 2, 2015 at 10:12 AM, Tapani Pälli  wrote:



On 11/02/2015 09:16 AM, Ilia Mirkin wrote:


On Mon, Nov 2, 2015 at 1:58 AM, Tapani Pälli 
wrote:


Patch changes linker to allocate gl_shader_variable instead of using
ir_variable. This makes it possible to get rid of ir_variables and ir
in memory after linking.

v2: check that we do not create duplicate entries with
  packed varyings

Signed-off-by: Tapani Pälli 
---
   src/glsl/linker.cpp| 58
+++---
   src/mesa/main/mtypes.h | 56

   src/mesa/main/shader_query.cpp | 36 +-
   3 files changed, 123 insertions(+), 27 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index 48dd2d3..d0353b4 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3341,6 +3341,27 @@ build_stageref(struct gl_shader_program *shProg,
const char *name,
  return stages;
   }

+/**
+ * Create gl_shader_variable from ir_variable class.
+ */
+static gl_shader_variable *
+create_shader_variable(struct gl_shader_program *shProg, const
ir_variable *in)
+{
+   gl_shader_variable *out = ralloc(shProg, struct gl_shader_variable);
+   if (!out)
+  return NULL;
+
+   out->type = in->type;
+   out->name = ralloc_strdup(shProg, in->name);



This can fail too, right? Might be nice to error-check.



Thanks, static analysis might complain about this, will fix.



+
+   out->location = in->data.location;
+   out->index = in->data.index;
+   out->patch = in->data.patch;
+   out->mode = in->data.mode;
+
+   return out;
+}
+
   static bool
   add_interface_variables(struct gl_shader_program *shProg,
   exec_list *ir, GLenum programInterface)
@@ -3392,9 +3413,13 @@ add_interface_variables(struct gl_shader_program
*shProg,
 if (strncmp(var->name, "gl_out_FragData", 15) == 0)
continue;

-  if (!add_program_resource(shProg, programInterface, var,
-build_stageref(shProg, var->name,
-   var->data.mode) | mask))
+  gl_shader_variable *sha_v = create_shader_variable(shProg, var);
+  if (!sha_v)
+ return false;
+
+  if (!add_program_resource(shProg, programInterface, sha_v,
+build_stageref(shProg, sha_v->name,
+   sha_v->mode) | mask))
return false;
  }
  return true;
@@ -3422,9 +3447,14 @@ add_packed_varyings(struct gl_shader_program
*shProg, int stage)
default:
   unreachable("unexpected type");
}
- if (!add_program_resource(shProg, iface, var,
-   build_stageref(shProg, var->name,
-  var->data.mode)))
+
+ gl_shader_variable *sha_v = create_shader_variable(shProg,
var);
+ if (!sha_v)
+return false;
+
+ if (!add_program_resource(shProg, iface, sha_v,
+   build_stageref(shProg, sha_v->name,
+  sha_v->mode)))
   return false;
 }
  }
@@ -3443,7 +3473,12 @@ add_fragdata_arrays(struct gl_shader_program
*shProg)
 ir_variable *var = node->as_variable();
 if (var) {
assert(var->data.mode == ir_var_shader_out);
- if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, var,
+
+ gl_shader_variable *sha_v = create_shader_variable(shProg,
var);
+ if (!sha_v)
+return false;
+
+ if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, sha_v,
  1 << MESA_SHADER_FRAGMENT))
   return false;
 }
@@ -3723,8 +3758,13 @@ build_program_resource_list(struct
gl_shader_program *shProg)
  if (shProg->SeparateShader) {
 if (!add_packed_varyings(shProg, input_stage))
return;
-  if (!add_packed_varyings(shProg, output_stage))
- return;
+  /* Only when dealing with multiple stages, otherwise we would have
+   * duplicate gl_shader_variable entries.
+   */
+  if (input_stage != output_stage) {
+ if (!add_packed_varyings(shProg, output_stage))
+return;
+  }
  }

  if (!add_fragdata_arrays(shProg))
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index d6c1eb8..0316769 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2519,6 +2519,62 @@ struct gl_active_atomic_buffer
   };

   /**
+ * Data container for shader queries. This holds only the minimal
+ * amount of required information for resource queries to work.
+ */
+struct gl_shader_variable
+{
+   /**
+* Declared type of the variable
+*/
+   const struct glsl_type *type;
+
+   /**
+* Declared name of the variable
+*/
+   char *name;
+
+   /**

[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

--- Comment #7 from Icenowy Zheng  ---
In addition, my DDX driver is https://github.com/ssvb/xf86-video-fbturbo, which
uses NEON to accelerate operations such as bitblit.

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [Bug 93570] the image of llvmpipe has a low quality on arm (with too many points on it)

2016-01-03 Thread bugzilla-daemon
https://bugs.freedesktop.org/show_bug.cgi?id=93570

--- Comment #8 from Icenowy Zheng  ---
It seems that the answer to the question above is "No". I changed to the
original fbdev DDX, and the problem is still here.

Is there any way to dump the binary code generated by LLVM?

-- 
You are receiving this mail because:
You are the QA Contact for the bug.
You are the assignee for the bug.
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH] mesa: do not validate io of non-compute and compute stage

2016-01-03 Thread Tapani Pälli
Fixes regression on SSO tests that have both non-compute and
compute programs in a program pipeline.

Signed-off-by: Tapani Pälli 
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93532
---
 src/mesa/main/shader_query.cpp | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index e526119..570acfa 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -1496,6 +1496,13 @@ _mesa_validate_pipeline_io(struct gl_pipeline_object 
*pipeline)
 
for (idx = prev + 1; idx < ARRAY_SIZE(pipeline->CurrentProgram); idx++) {
   if (shProg[idx]) {
+ /* Pipeline might include both non-compute and a compute program, do
+  * not attempt to validate varyings between non-compute and compute
+  * stage.
+  */
+ if (shProg[idx]->_LinkedShaders[idx]->Stage == MESA_SHADER_COMPUTE)
+break;
+
  if (!validate_io(shProg[prev]->_LinkedShaders[prev],
   shProg[idx]->_LinkedShaders[idx],
   shProg[prev]->IsES || shProg[idx]->IsES))
-- 
2.5.0

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev


[Mesa-dev] [PATCH v3] mesa: use gl_shader_variable in program resource list

2016-01-03 Thread Tapani Pälli
Patch changes linker to allocate gl_shader_variable instead of using
ir_variable. This makes it possible to get rid of ir_variables and ir
in memory after linking.

v2: check that we do not create duplicate entries with
packed varyings

v3: document 'patch' bit (Ilia Mirkin)

Signed-off-by: Tapani Pälli 
---
 src/glsl/linker.cpp| 61 +++---
 src/mesa/main/mtypes.h | 61 ++
 src/mesa/main/shader_query.cpp | 38 +-
 3 files changed, 132 insertions(+), 28 deletions(-)

diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index a6e81b4..45daa12 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -3373,6 +3373,30 @@ build_stageref(struct gl_shader_program *shProg, const 
char *name,
return stages;
 }
 
+/**
+ * Create gl_shader_variable from ir_variable class.
+ */
+static gl_shader_variable *
+create_shader_variable(struct gl_shader_program *shProg, const ir_variable *in)
+{
+   gl_shader_variable *out = ralloc(shProg, struct gl_shader_variable);
+   if (!out)
+  return NULL;
+
+   out->type = in->type;
+   out->name = ralloc_strdup(shProg, in->name);
+
+   if (!out->name)
+  return NULL;
+
+   out->location = in->data.location;
+   out->index = in->data.index;
+   out->patch = in->data.patch;
+   out->mode = in->data.mode;
+
+   return out;
+}
+
 static bool
 add_interface_variables(struct gl_shader_program *shProg,
 exec_list *ir, GLenum programInterface)
@@ -3424,9 +3448,13 @@ add_interface_variables(struct gl_shader_program *shProg,
   if (strncmp(var->name, "gl_out_FragData", 15) == 0)
  continue;
 
-  if (!add_program_resource(shProg, programInterface, var,
-build_stageref(shProg, var->name,
-   var->data.mode) | mask))
+  gl_shader_variable *sha_v = create_shader_variable(shProg, var);
+  if (!sha_v)
+ return false;
+
+  if (!add_program_resource(shProg, programInterface, sha_v,
+build_stageref(shProg, sha_v->name,
+   sha_v->mode) | mask))
  return false;
}
return true;
@@ -3454,9 +3482,14 @@ add_packed_varyings(struct gl_shader_program *shProg, 
int stage)
  default:
 unreachable("unexpected type");
  }
- if (!add_program_resource(shProg, iface, var,
-   build_stageref(shProg, var->name,
-  var->data.mode)))
+
+ gl_shader_variable *sha_v = create_shader_variable(shProg, var);
+ if (!sha_v)
+return false;
+
+ if (!add_program_resource(shProg, iface, sha_v,
+   build_stageref(shProg, sha_v->name,
+  sha_v->mode)))
 return false;
   }
}
@@ -3475,7 +3508,12 @@ add_fragdata_arrays(struct gl_shader_program *shProg)
   ir_variable *var = node->as_variable();
   if (var) {
  assert(var->data.mode == ir_var_shader_out);
- if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, var,
+
+ gl_shader_variable *sha_v = create_shader_variable(shProg, var);
+ if (!sha_v)
+return false;
+
+ if (!add_program_resource(shProg, GL_PROGRAM_OUTPUT, sha_v,
1 << MESA_SHADER_FRAGMENT))
 return false;
   }
@@ -3726,8 +3764,13 @@ build_program_resource_list(struct gl_shader_program 
*shProg)
if (shProg->SeparateShader) {
   if (!add_packed_varyings(shProg, input_stage))
  return;
-  if (!add_packed_varyings(shProg, output_stage))
- return;
+  /* Only when dealing with multiple stages, otherwise we would have
+   * duplicate gl_shader_variable entries.
+   */
+  if (input_stage != output_stage) {
+ if (!add_packed_varyings(shProg, output_stage))
+return;
+  }
}
 
if (!add_fragdata_arrays(shProg))
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 5b9fce8..c9fe728 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2525,6 +2525,67 @@ struct gl_active_atomic_buffer
 };
 
 /**
+ * Data container for shader queries. This holds only the minimal
+ * amount of required information for resource queries to work.
+ */
+struct gl_shader_variable
+{
+   /**
+* Declared type of the variable
+*/
+   const struct glsl_type *type;
+
+   /**
+* Declared name of the variable
+*/
+   char *name;
+
+   /**
+* Storage location of the base of this variable
+*
+* The precise meaning of this field depends on the nature of the variable.
+*
+*   - Vertex shader input: one of the values from \c gl_vert_attrib.
+*   - Vertex shader output: one of the values from \c gl_varying_slot.
+*   -