[Mesa-dev] [RFC v3] drm/i915: Select engines via class and instance in execbuffer2

2017-05-17 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 16 bits of flags to pass in
the engine class and instance (8 bits each), plus a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.

The new uAPI also removes access to the weak VCS engine
balancing as currently existing in the driver.

Example usage to send a command to VCS0:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 0);

Or to send a command to VCS1:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 1);
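
For illustration, a hypothetical userspace sketch of such a submission,
assuming the proposed i915_execbuffer2_engine() helper and the
DRM_I915_ENGINE_CLASS_VIDEO_DECODE enum from this series were present in
the installed i915_drm.h (they are not part of any released uAPI):

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Submit a batch buffer object to VCS1 using the proposed class/instance
 * selection instead of the I915_EXEC_BSD flags. bo_handle is assumed to
 * contain a valid batch terminated with MI_BATCH_BUFFER_END.
 */
static int submit_to_vcs1(int fd, uint32_t ctx_id, uint32_t bo_handle,
			  uint32_t batch_len)
{
	struct drm_i915_gem_exec_object2 obj;
	struct drm_i915_gem_execbuffer2 eb;

	memset(&obj, 0, sizeof(obj));
	obj.handle = bo_handle;

	memset(&eb, 0, sizeof(eb));
	eb.buffers_ptr = (uintptr_t)&obj;
	eb.buffer_count = 1;
	eb.batch_len = batch_len;
	/* Class/instance selection replaces the old ring/BSD flag soup. */
	eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 1);
	i915_execbuffer2_set_context_id(eb, ctx_id);

	return drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &eb);
}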

v2:
 * Fix unknown flags mask.
 * Use I915_EXEC_RING_MASK for class. (Chris Wilson)

v3:
 * Add a map for fast class-instance engine lookup. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_drv.h|  1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 30 ++
 drivers/gpu/drm/i915/i915_reg.h|  3 +++
 drivers/gpu/drm/i915/intel_engine_cs.c |  7 +++
 include/uapi/drm/i915_drm.h| 11 ++-
 5 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5dfa4a12e647..7bf4fd42480c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2066,6 +2066,7 @@ struct drm_i915_private {
struct pci_dev *bridge_dev;
struct i915_gem_context *kernel_context;
struct intel_engine_cs *engine[I915_NUM_ENGINES];
+   struct intel_engine_cs *engine_class[MAX_ENGINE_CLASS + 
1][MAX_ENGINE_INSTANCE + 1];
struct i915_vma *semaphore;
 
struct drm_dma_handle *status_page_dmah;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..c1ad49ab64cd 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1492,6 +1492,33 @@ gen8_dispatch_bsd_engine(struct drm_i915_private 
*dev_priv,
return file_priv->bsd_engine;
 }
 
+extern u8 user_class_map[DRM_I915_ENGINE_CLASS_MAX];
+
+static struct intel_engine_cs *get_engine_class(struct drm_i915_private *i915,
+   u8 class, u8 instance)
+{
+   if (class > MAX_ENGINE_CLASS || instance > MAX_ENGINE_INSTANCE)
+   return NULL;
+
+   return i915->engine_class[class][instance];
+}
+
+static struct intel_engine_cs *
+eb_select_engine_class_instance(struct drm_i915_private *i915,
+   struct drm_i915_gem_execbuffer2 *args)
+{
+   u8 class, instance;
+
+   class = args->flags & I915_EXEC_RING_MASK;
+   if (class >= DRM_I915_ENGINE_CLASS_MAX)
+   return NULL;
+
+   instance = (args->flags & I915_EXEC_INSTANCE_MASK) >>
+  I915_EXEC_INSTANCE_SHIFT;
+
+   return get_engine_class(i915, user_class_map[class], instance);
+}
+
 #define I915_USER_RINGS (4)
 
 static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
@@ -1510,6 +1537,9 @@ eb_select_engine(struct drm_i915_private *dev_priv,
unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK;
struct intel_engine_cs *engine;
 
+   if (args->flags & I915_EXEC_CLASS_INSTANCE)
+   return eb_select_engine_class_instance(dev_priv, args);
+
if (user_ring_id > I915_USER_RINGS) {
DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id);
return NULL;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index ee144ec57935..a3b59043b991 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -92,6 +92,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define VIDEO_ENHANCEMENT_CLASS	2
 #define COPY_ENGINE_CLASS	3
 #define OTHER_CLASS	4
+#define MAX_ENGINE_CLASS	4
+
+#define MAX_ENGINE_INSTANCE	1
 
 /* PCI config space */
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 7566cf48012f..c5ad51c43d23 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -225,7 +225,14 @@ intel_engin

Re: [Mesa-dev] [Intel-gfx] [RFC v3] drm/i915: Select engines via class and instance in execbuffer2

2017-05-18 Thread Tvrtko Ursulin


On 18/05/2017 12:10, Chris Wilson wrote:

On Thu, May 18, 2017 at 01:55:59PM +0300, Joonas Lahtinen wrote:

On ke, 2017-05-17 at 16:40 +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 16 bits of flags to pass in
the engine class and instance (8 bits each), plus a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.


Note this text doesn't describe the interface in v3.


Would it make sense to use bitmask for future proofing? Then we could
allow passing multiple options in the future.


No. The first use case has to be explicit control of engine. That's
orthogonal to asking to select any of a particular class.


Was the suggestion to allow instance=any and instance=N? Or even all the 
way to instance=N-or-M?


If not the latter, then I can think of two other possible approaches:

1. Reserve 0xff as instance=any, aka 128 instances should be enough for
everybody. :) Could simply enforce in the uAPI that instance ==
I915_EXEC_INSTANCE_MASK returns -EINVAL for now, or something (see the
sketch after these two options).


2. Do nothing now and add say I915_EXEC_CLASS at a later point. This 
patch adds I915_EXEC_CLASS_INSTANCE so I915_EXEC_CLASS would not sound 
out of place.
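
For the first option, a rough sketch against the selector in this patch
(purely illustrative; the instance=any semantics themselves would come
later, so for now the reserved value is simply rejected):

static struct intel_engine_cs *
eb_select_engine_class_instance(struct drm_i915_private *i915,
				struct drm_i915_gem_execbuffer2 *args)
{
	u8 class = args->flags & I915_EXEC_RING_MASK;
	u8 instance;

	if (class >= DRM_I915_ENGINE_CLASS_MAX)
		return NULL;

	instance = (args->flags & I915_EXEC_INSTANCE_MASK) >>
		   I915_EXEC_INSTANCE_SHIFT;

	/* 0xff reserved for a future instance=any / load-balancing mode. */
	if (instance == 0xff)
		return NULL; /* would become -EINVAL at the ioctl level */

	return get_engine_class(i915, user_class_map[class], instance);
}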


Regards,

Tvrtko


Re: [Mesa-dev] [Intel-gfx] [RFC v3] drm/i915: Select engines via class and instance in execbuffer2

2017-05-18 Thread Tvrtko Ursulin


On 18/05/2017 13:24, Chris Wilson wrote:

On Thu, May 18, 2017 at 01:13:20PM +0100, Tvrtko Ursulin wrote:


On 18/05/2017 12:10, Chris Wilson wrote:

On Thu, May 18, 2017 at 01:55:59PM +0300, Joonas Lahtinen wrote:

On ke, 2017-05-17 at 16:40 +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 16 bits of flags to pass in
the engine class and instance (8 bits each), plus a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.


Note this text doesn't describe the interface in v3.


Would it make sense to use bitmask for future proofing? Then we could
allow passing multiple options in the future.


No. The first use case has to be explicit control of engine. That's
orthogonal to asking to select any of a particular class.


Was the suggestion to allow instance=any and instance=N? Or even all
the way to instance=N-or-M?

If not the latter, then I can think of two other possible approaches:

1. Reserve 0xff as instance=any, aka 128 instances should be enough
for everybody. :) Could simply enforce in the uAPI that instance ==
I915_EXEC_INSTANCE_MASK returns -EINVAL for now, or something.

2. Do nothing now and add say I915_EXEC_CLASS at a later point. This
patch adds I915_EXEC_CLASS_INSTANCE so I915_EXEC_CLASS would not
sound out of place.


Yes, I would argue to defer it until later. One problem I have at the
moment is that not all members of a class are equal, HEVC-capable
engines and the rest do not belong to the same class (i.e. my hope is
that we could just say <class> | [ <instance> | INSTANCE_MASK ] | LOAD_BALANCE)
So I can see the sense in having instance as a mask, or at least making
the current instance field large enough to store a mask in future. I
just feel uneasy as that field could grow quite large, and maybe it will
be better to set the constraint via a context param (all dependency on
frequency and tuning of the LOAD_BALANCE). Hmm, liking having the
instance-mask on the context atm.


I don't think a per-context mask would work unless you want to mandate
multi-contexts where they wouldn't otherwise be needed.


But this problem in general can also be solved separately from 
class-instance addressing via engine feature masking.


As the GEM_ENGINE_INFO ioctl proposal defines a set of engine flags, at 
a later point execbuf could be extended with a new flag. For example:


eb.flags = I915_EXEC_CLASS | I915_ENGINE_CLASS_VIDEO | 
I915_EXEC_ENGINE_FEATURES | I915_ENGINE_HAS_HEVC;


The only problem I see is that engine flags in the current proposal are u64,
so it would be a bit challenging to fit that in eb.flags.


Not sure if it would make sense to split the engine info flags into a
smaller and a larger set: a u8 which would be flags "compatible" with
I915_EXEC_ENGINE_FEATURES, and the remainder which would be something 
else, or unused/reserved? Like:


struct drm_i915_engine_info {
/** Engine instance number. */
__u32   instance;
__u32   rsvd;

/** Engine specific features and info. */
#define DRM_I915_ENGINE_HAS_HEVCBIT(0)
__u8 features;
__u8 rsvd;

__u32 info;
};

Or at that point you bring on eb3.

Regards,

Tvrtko


Re: [Mesa-dev] [Intel-gfx] [RFC v3] drm/i915: Select engines via class and instance in execbuffer2

2017-05-18 Thread Tvrtko Ursulin


On 17/05/2017 17:44, Chris Wilson wrote:

On Wed, May 17, 2017 at 04:40:57PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 16 bits of flags to pass in
the engine class and instance (8 bits each), plus a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.

The new uAPI also removes access to the weak VCS engine
balancing as currently existing in the driver.

Example usage to send a command to VCS0:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 0);

Or to send a command to VCS1:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 1);

v2:
 * Fix unknown flags mask.
 * Use I915_EXEC_RING_MASK for class. (Chris Wilson)

v3:
 * Add a map for fast class-instance engine lookup. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_drv.h|  1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 30 ++
 drivers/gpu/drm/i915/i915_reg.h|  3 +++
 drivers/gpu/drm/i915/intel_engine_cs.c |  7 +++
 include/uapi/drm/i915_drm.h| 11 ++-
 5 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5dfa4a12e647..7bf4fd42480c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2066,6 +2066,7 @@ struct drm_i915_private {
struct pci_dev *bridge_dev;
struct i915_gem_context *kernel_context;
struct intel_engine_cs *engine[I915_NUM_ENGINES];
+   struct intel_engine_cs *engine_class[MAX_ENGINE_CLASS + 
1][MAX_ENGINE_INSTANCE + 1];
struct i915_vma *semaphore;

struct drm_dma_handle *status_page_dmah;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..c1ad49ab64cd 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1492,6 +1492,33 @@ gen8_dispatch_bsd_engine(struct drm_i915_private 
*dev_priv,
return file_priv->bsd_engine;
 }

+extern u8 user_class_map[DRM_I915_ENGINE_CLASS_MAX];
+
+static struct intel_engine_cs *get_engine_class(struct drm_i915_private *i915,
+   u8 class, u8 instance)
+{
+   if (class > MAX_ENGINE_CLASS || instance > MAX_ENGINE_INSTANCE)
+   return NULL;
+
+   return i915->engine_class[class][instance];
+}


Be bold and make this an intel_engine_lookup(); I foresee some other
users appearing very shortly.


Still static, or do you want to export it straight away? Any preference
for where to put it: here or intel_engine_cs.c?



+static struct intel_engine_cs *
+eb_select_engine_class_instance(struct drm_i915_private *i915,
+   struct drm_i915_gem_execbuffer2 *args)
+{
+   u8 class, instance;
+
+   class = args->flags & I915_EXEC_RING_MASK;
+   if (class >= DRM_I915_ENGINE_CLASS_MAX)
+   return NULL;
+
+   instance = (args->flags & I915_EXEC_INSTANCE_MASK) >>
+  I915_EXEC_INSTANCE_SHIFT;
+
+   return get_engine_class(i915, user_class_map[class], instance);
+}
+
 #define I915_USER_RINGS (4)

 static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
@@ -1510,6 +1537,9 @@ eb_select_engine(struct drm_i915_private *dev_priv,
unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK;
struct intel_engine_cs *engine;

+   if (args->flags & I915_EXEC_CLASS_INSTANCE)
+   return eb_select_engine_class_instance(dev_priv, args);
+
if (user_ring_id > I915_USER_RINGS) {
DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id);
return NULL;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index ee144ec57935..a3b59043b991 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -92,6 +92,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define VIDEO_ENHANCEMENT_CLASS	2
 #define COPY_ENGINE_CLASS	3
 #define OTHER_CLASS	4
+#define MAX_ENGINE_CLASS	4
+
+#define MAX_ENGI

Re: [Mesa-dev] [Intel-gfx] [RFC v3] drm/i915: Select engines via class and instance in execbuffer2

2017-05-18 Thread Tvrtko Ursulin


On 18/05/2017 15:10, Emil Velikov wrote:

Hi Tvrtko,

On 18 May 2017 at 14:06, Tvrtko Ursulin  wrote:


struct drm_i915_engine_info {
/** Engine instance number. */
__u32   instance;
__u32   rsvd;

/** Engine specific features and info. */
#define DRM_I915_ENGINE_HAS_HEVCBIT(0)
__u8 features;
__u8 rsvd;

__u32 info;
};


To spare yourself from writing a compat ioctl, you want to have the
members at the same offset on both 32 and 64bit builds.
At the same time the whole struct size should be a multiple of 64 bits.

This and a few others are covered in Daniel Vetter's "Botching up ioctls" [1]


Yes, thanks, I failed to add up to 64. :)
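
For reference, the earlier sketch with explicit padding along those lines
(illustrative only; the field names are from the sketch above, not any
final uAPI). Every member then sits at the same offset on 32- and 64-bit
builds and the total size is 16 bytes, a multiple of 64 bits:

struct drm_i915_engine_info {
	/** Engine instance number. */
	__u32	instance;
	__u32	rsvd0;

	/** Engine specific features and info. */
#define DRM_I915_ENGINE_HAS_HEVC	BIT(0)
	__u8	features;
	__u8	rsvd1[3];
	__u32	info;
};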

Regards,

Tvrtko


Re: [Mesa-dev] [Intel-gfx] [RFC v3] drm/i915: Select engines via class and instance in execbuffer2

2017-05-18 Thread Tvrtko Ursulin


On 18/05/2017 14:37, Chris Wilson wrote:

On Thu, May 18, 2017 at 02:06:35PM +0100, Tvrtko Ursulin wrote:


On 18/05/2017 13:24, Chris Wilson wrote:

Yes, I would argue to defer it until later. One problem I have at the
moment is that not all members of a class are equal, HEVC-capable
engines and the rest do not belong to the same class (i.e. my hope is
that we could just say <class> | [ <instance> | INSTANCE_MASK ] | LOAD_BALANCE)
So I can see the sense in having instance as a mask, or at least making
the current instance field large enough to store a mask in future. I
just feel uneasy as that field could grow quite large, and maybe it will
be better to set the constraint via a context param (all dependency on
frequency and tuning of the LOAD_BALANCE). Hmm, liking having the
instance-mask on the context atm.


I don't think a per-context mask would work unless you want to
mandate multi-contexts where they wouldn't otherwise be needed.


Contexts are not thread-friendly. About the only way you can make them
safe (wrt execbuf) is through the use of userspace GTT allocation (i.e.
assigning an address on creation and making it permanent with softpin).

So in general you end up serialising around execbuf and copying the
state in/out of the ioctl. That gives a window of opportunity to use
context_setparam as an extension for irregular parameter updates.

It is not as nice as providing space in the execbuf ioctl, because of
the extra state being carried around in the context.


Yes not nice, I can't say that I like this very much.


But this problem in general can also be solved separately from
class-instance addressing via engine feature masking.


But imo all members of a class should have the same features. That would
be my definition of a class!


That sounds very totalitarian! :)) To me a class is a group of some 
entities which share some common characteristics - not necessarily 
completely uniform.


Regards,

Tvrtko


[Mesa-dev] [RFC v3 1/2] drm/i915: Engine discovery uAPI

2017-05-18 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Engine discovery uAPI allows userspace to probe for engine
configuration and features without needing to maintain the
internal PCI id based database.

This enables removal of code duplications across userspace
components.

Probing is done via the new DRM_IOCTL_I915_GEM_ENGINE_INFO ioctl
which returns the number and information on the specified engine
class.

Currently only general engine configuration and HEVC feature of
the VCS engine can be probed but the uAPI is designed to be
generic and extensible.

Code is based almost exactly on the earlier proposal on the
topic by Jon Bloomfield. Engine class and instance refactoring
made recently by Daniele Ceraolo Spurio enabled this to be
implemented in an elegant fashion.

To probe the configuration, userspace sets the engine class it wants
to query (in struct drm_i915_gem_engine_info) and provides an array
of drm_i915_engine_info structs which will be filled in by the
driver. Userspace also has to tell i915 how many elements are in
the array, and the driver will report back the total number of
engine instances in any case.
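
A hypothetical userspace sketch of that two-pass flow, assuming the
proposed DRM_IOCTL_I915_GEM_ENGINE_INFO ioctl and the engine_class,
num_engines and info_ptr members used by the ioctl handler added below
(none of this is in a released i915_drm.h):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Return the number of video (VCS) engines, or -1 on error. */
static int count_video_engines(int fd)
{
	struct drm_i915_gem_engine_info args;
	struct drm_i915_engine_info *info;
	int num;

	memset(&args, 0, sizeof(args));
	args.engine_class = I915_ENGINE_CLASS_VIDEO;

	/* Pass 1: no array provided, the driver reports the total count. */
	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_ENGINE_INFO, &args))
		return -1;
	num = args.num_engines;

	/* Pass 2: provide an array large enough for every instance. */
	info = calloc(num, sizeof(*info));
	if (!info)
		return -1;
	args.info_ptr = (uintptr_t)info;
	args.num_engines = num;
	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_ENGINE_INFO, &args))
		num = -1;

	free(info);
	return num;
}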

v2:
 * Add a version field and consolidate to one engine count.
   (Chris Wilson)
 * Rename uAPI flags for VCS engines to DRM_I915_ENGINE_CLASS_VIDEO.
   (Gong Zhipeng)

v3:
 * Drop the DRM_ prefix from uAPI enums. (Chris Wilson)
 * Only reserve 8-bits for the engine info for time being.

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_drv.c|  1 +
 drivers/gpu/drm/i915/i915_drv.h|  3 ++
 drivers/gpu/drm/i915/intel_engine_cs.c | 64 ++
 include/uapi/drm/i915_drm.h| 41 ++
 4 files changed, 109 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 72fb47a439d2..dc268f41a609 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2623,6 +2623,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_GETPARAM, 
i915_gem_context_getparam_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_SETPARAM, 
i915_gem_context_setparam_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_PERF_OPEN, i915_perf_open_ioctl, 
DRM_RENDER_ALLOW),
+   DRM_IOCTL_DEF_DRV(I915_GEM_ENGINE_INFO, i915_gem_engine_info_ioctl, 
DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver driver = {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 17883a84b8c1..1e08b82c4823 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3521,6 +3521,9 @@ i915_gem_context_lookup_timeline(struct i915_gem_context 
*ctx,
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 struct drm_file *file);
 
+int i915_gem_engine_info_ioctl(struct drm_device *dev, void *data,
+  struct drm_file *file);
+
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
  u64 min_size, u64 alignment,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 413bfd8d4bf4..8d370c5d07bc 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -25,6 +25,7 @@
 #include "i915_drv.h"
 #include "intel_ringbuffer.h"
 #include "intel_lrc.h"
+#include 
 
 /* Haswell does have the CXT_SIZE register however it does not appear to be
  * valid. Now, docs explain in dwords what is in the context object. The full
@@ -1286,6 +1287,69 @@ void intel_engines_mark_idle(struct drm_i915_private 
*i915)
}
 }
 
+u8 user_class_map[I915_ENGINE_CLASS_MAX] = {
+   [I915_ENGINE_CLASS_OTHER] = OTHER_CLASS,
+   [I915_ENGINE_CLASS_RENDER] = RENDER_CLASS,
+   [I915_ENGINE_CLASS_COPY] = COPY_ENGINE_CLASS,
+   [I915_ENGINE_CLASS_VIDEO] = VIDEO_DECODE_CLASS,
+   [I915_ENGINE_CLASS_VIDEO_ENHANCE] = VIDEO_ENHANCEMENT_CLASS,
+};
+
+int i915_gem_engine_info_ioctl(struct drm_device *dev, void *data,
+  struct drm_file *file)
+{
+   struct drm_i915_private *i915 = to_i915(dev);
+   struct drm_i915_gem_engine_info *args = data;
+   struct drm_i915_engine_info __user *user_info =
+   u64_to_user_ptr(args->info_ptr);
+   unsigned int info_size = args->num_engines;
+   struct drm_i915_engine_info info;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u8 class;
+
+   if (args->rsvd)
+   return -EINVAL;
+
+   switch (args->engine_class) {
+   case 

[Mesa-dev] [RFC v4 2/2] drm/i915: Select engines via class and instance in execbuffer2

2017-05-18 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 8 bits of flags to pass in the
engine instance, re-use the existing engine selection bits for
the class selection, and add a new flag named
I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.

The new uAPI also removes access to the weak VCS engine
balancing as currently existing in the driver.

Example usage to send a command to VCS0:

  eb.flags = i915_execbuffer2_engine(I915_ENGINE_CLASS_VIDEO_DECODE, 0);

Or to send a command to VCS1:

  eb.flags = i915_execbuffer2_engine(I915_ENGINE_CLASS_VIDEO_DECODE, 1);
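
As a rough sketch of the resulting flags layout (bit positions taken from
the v2/v3 uAPI diff elsewhere in the thread and assumed unchanged here;
the helper name is purely illustrative):

/*
 * Proposed packing, mirroring i915_execbuffer2_engine():
 *   existing I915_EXEC_RING_MASK bits - engine class
 *   bit 18                            - I915_EXEC_CLASS_INSTANCE
 *   bits 19-26                        - engine instance
 */
static inline __u64 exec_class_instance_flags(__u32 class, __u32 instance)
{
	return I915_EXEC_CLASS_INSTANCE |
	       (class & I915_EXEC_RING_MASK) |
	       ((__u64)instance << I915_EXEC_INSTANCE_SHIFT);
}

/* e.g.: eb.flags = exec_class_instance_flags(I915_ENGINE_CLASS_VIDEO_DECODE, 1); */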

v2:
 * Fix unknown flags mask.
 * Use I915_EXEC_RING_MASK for class. (Chris Wilson)

v3:
 * Add a map for fast class-instance engine lookup. (Chris Wilson)

v4:
 * Update commit to reflect v3.
 * Export intel_engine_lookup for other users. (Chris Wilson)
 * Split out some warns. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_drv.h|  1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 20 
 drivers/gpu/drm/i915/i915_reg.h|  3 +++
 drivers/gpu/drm/i915/intel_engine_cs.c | 20 
 drivers/gpu/drm/i915/intel_ringbuffer.h|  3 +++
 include/uapi/drm/i915_drm.h| 12 +++-
 6 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1e08b82c4823..53b41963f672 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2115,6 +2115,7 @@ struct drm_i915_private {
struct pci_dev *bridge_dev;
struct i915_gem_context *kernel_context;
struct intel_engine_cs *engine[I915_NUM_ENGINES];
+   struct intel_engine_cs *engine_class[MAX_ENGINE_CLASS + 
1][MAX_ENGINE_INSTANCE + 1];
struct i915_vma *semaphore;
 
struct drm_dma_handle *status_page_dmah;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..006c8046af5f 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1492,6 +1492,23 @@ gen8_dispatch_bsd_engine(struct drm_i915_private 
*dev_priv,
return file_priv->bsd_engine;
 }
 
+static struct intel_engine_cs *
+eb_select_engine_class_instance(struct drm_i915_private *i915,
+   struct drm_i915_gem_execbuffer2 *args)
+{
+   u8 class = args->flags & I915_EXEC_RING_MASK;
+   extern u8 user_class_map[I915_ENGINE_CLASS_MAX];
+   u8 instance;
+
+   if (class >= ARRAY_SIZE(user_class_map))
+   return NULL;
+
+   instance = (args->flags & I915_EXEC_INSTANCE_MASK) >>
+  I915_EXEC_INSTANCE_SHIFT;
+
+   return intel_engine_lookup(i915, user_class_map[class], instance);
+}
+
 #define I915_USER_RINGS (4)
 
 static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
@@ -1510,6 +1527,9 @@ eb_select_engine(struct drm_i915_private *dev_priv,
unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK;
struct intel_engine_cs *engine;
 
+   if (args->flags & I915_EXEC_CLASS_INSTANCE)
+   return eb_select_engine_class_instance(dev_priv, args);
+
if (user_ring_id > I915_USER_RINGS) {
DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id);
return NULL;
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 89888adb9af1..fa4a3841c4af 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -92,6 +92,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define VIDEO_ENHANCEMENT_CLASS	2
 #define COPY_ENGINE_CLASS	3
 #define OTHER_CLASS	4
+#define MAX_ENGINE_CLASS	4
+
+#define MAX_ENGINE_INSTANCE	1
 
 /* PCI config space */
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 8d370c5d07bc..8f31514b01d5 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -198,6 +198,15 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
GEM_BUG_ON(info->class >= ARRAY_SIZE(intel_engi

Re: [Mesa-dev] [Intel-gfx] [RFC v3] drm/i915: Select engines via class and instance in execbuffer2

2017-05-24 Thread Tvrtko Ursulin


On 18/05/2017 18:00, Chris Wilson wrote:

On Thu, May 18, 2017 at 05:20:38PM +0100, Tvrtko Ursulin wrote:


On 18/05/2017 14:37, Chris Wilson wrote:

On Thu, May 18, 2017 at 02:06:35PM +0100, Tvrtko Ursulin wrote:


But this problem in general can also be solved separately from
class-instance addressing via engine feature masking.


But imo all members of a class should have the same features. That would
be my definition of a class!


That sounds very totalitarian! :)) To me a class is a group of some
entities which share some common characteristics - not necessarily
completely uniform.


The problem otherwise is that we then have to define yet another
interface based on features. To me that sounds like too much
duplication that we could avoid from the beginning. Curse the hw for
being asymmetrical!


Hm, I don't see a problem with feature-based engine selection on top.
You still do, because of the desire that classes be equal in features?


To sum up what I (and we) talked about in various parts of the thread(s):

Step 1a: New execbuf engine selection uAPI.

 - execbuf class=VCS instance=1

Step 1b: Engine discovery uAPI.

Same as above but userspace can figure out how many VCS engines there
are without PCI probing.

I didn't get much feedback on this one. :(

Step 2: Feature masks for execbuf.

 - execbuf class=VCS instance=0 features=HEVC = OK
 - execbuf class=VCS instance=1 features=HEVC = FAIL

But userspace can use engine discovery to figure out which are the valid 
combinations.


This could be a simpler, but less featureful and not very elegant 
alternative to step 2.


Otherwise just a prep step for the subsequent steps below.

Step 3a: (One day maybe) userspace selects a class, i915 picks the engine

 - execbuf class=VCS instance=any

Step 3b: userspace selected class and features

 - execbuf class=VCS instance=any features=HEVC

This RFC proposed steps 1a and 1b. The rest we leave for later.

How does that sound? Acceptable?

Is the engine discovery useful enough, or what other features could
we put in it to make it more useful for userspace? Potentially enable
dropping PCI id probing altogether and enable libva/mesa/??? to probe 
everything using i915 ioctls.


Regards,

Tvrtko


Re: [Mesa-dev] [Intel-gfx] [RFC 1/2] drm/i915: Engine discovery uAPI

2017-04-24 Thread Tvrtko Ursulin


On 18/04/2017 21:13, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 05:56:14PM +0100, Tvrtko Ursulin wrote:

+enum drm_i915_gem_engine_class {
+   DRM_I915_ENGINE_CLASS_OTHER = 0,
+   DRM_I915_ENGINE_CLASS_RENDER = 1,
+   DRM_I915_ENGINE_CLASS_COPY = 2,
+   DRM_I915_ENGINE_CLASS_VIDEO_DECODE = 3,
+   DRM_I915_ENGINE_CLASS_VIDEO_ENHANCE = 4,
+   DRM_I915_ENGINE_CLASS_MAX /* non-ABI */
+};
+
+struct drm_i915_engine_info {
+   /** Engine instance number. */
+   __u32   instance;
+   __u32   rsvd;
+
+   /** Engine specific info. */
+#define DRM_I915_ENGINE_HAS_HEVC   BIT(0)
+   __u64 info;
+};


So the main question is how can we extend this in future, keeping
forwards/backwards compat?

I think if we put a query version into info and then the kernel supplies
an array matching that version (or reports the most recent version
supported if the request is too modern.)

The kernel has to keep all the old struct variants and exporters
indefinitely.


Versioning sounds good to me.


Another alternative would be an ENGINE_GETPARAM where we just have a
switch of all possible questions. Maybe better as a CONTEXT_GETPARAM if
we start thinking about allowing CONTEXT_SETPARAM to fine tune
individual clients.


This idea I did not get - what is the switch of all possible questions? 
You mean a new ioctl like ENGINE_GETPARAM which would return a list of
queries supported by CONTEXT_GETPARAM? Which would effectively be a 
dispatcher-in-dispatcher kind of thing?



+struct drm_i915_gem_engine_info {
+   /** in: Engine class to probe (enum drm_i915_gem_engine_class). */
+   __u32 engine_class;


__u32 [in/out] version ? (see above)


+
+   /** out: Actual number of hardware engines. */
+   __u32 num_engines;
+
+   /**
+* in: Number of struct drm_i915_engine_info entries in the provided
+* info array.
+*/
+   __u32 info_size;


This is superfluous with num_engines. The standard 2 pass, discovery of
size, followed by allocation and final query.


This is also fine. I was on the fence about whether to actually condense it
to one field in the first posting or not myself.


Regards,

Tvrtko


Re: [Mesa-dev] [Intel-gfx] [RFC 1/2] drm/i915: Engine discovery uAPI

2017-04-24 Thread Tvrtko Ursulin


On 19/04/2017 06:22, Kenneth Graunke wrote:

On Tuesday, April 18, 2017 9:56:14 AM PDT Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Engine discovery uAPI allows userspace to probe for engine
configuration and features without needing to maintain the
internal PCI id based database.


I don't understand why I would want to query the existence of engines
from the kernel.  As a userspace driver developer, I have to know how to
program the specific generation of hardware I'm on.  I better know what
engines that GPU has, or else there's no way I can competently program it.

In Mesa, we recently imported libdrm and deleted all the engine checks
(a460e1eb51406e5ca54abda42112bfb8523ff046).  All generations have an
RCS, Gen6+ has a separate BLT, and we don't need to use the others.
It's completely statically determinable with a simple check.  Runtime
checks make sense for optional things...but not "is there a 3D engine?".

Plus, even if you add this to the kernel, we still support back to 3.6
(and ChromeOS needs us to continue supporting 3.8), so we won't be able
to magically use the new uABI - we'd need to support both.  Which, if
the point is to delete code...we'd actually have /more/ code for a few
years.  Or, we could not use it...but then nobody would be testing it,
and if a bug creeps in...that pushes it back more years still.


Okay the argument of more code for a while is I suppose always true with 
these types of work. But in general, the idea is to consolidate the 
queries and avoid (maybe only partially) duplicate PCI databases across
components.


Because I suspect today you do some device discovery via libpciaccess 
(or something) and some via i915 GET_PARAMs and so on. So the idea is to
consolidate all that and do it via i915. Since another argument you
raise is that you have to know what the GPU looks like to be able
to competently program it, in which case, who knows better than the
kernel driver?


But I think the main part of the argument is: why collect and derive
this information from various sources when perhaps it could come from only one?


Maybe the exact idea is not so interesting for Mesa, which wouldn't
surprise me at all since the idea was born from libva and the BSD engine
usage there. In which case perhaps Mesa could become more interested if 
the proposal was exporting some other data to userspace?


I don't think it is critical to find something like that in Mesa, but it 
may be interesting. I think Ben mentioned at one point he had some ideas 
in this area, or something was discussed in the past which may be 
similar. I forgot the exact details now.


So I think for now, if there is nothing which would be interesting in 
Mesa along the lines described so far, please just keep an eye on this. 
Just to make sure that, if some other component is interested and we end
up implementing something, it is at least not hindering you, in case
we cannot find anything useful for you in here.


Regards,

Tvrtko


Re: [Mesa-dev] [Intel-gfx] [RFC 2/2] drm/i915: Select engines via class and instance in execbuffer2

2017-04-24 Thread Tvrtko Ursulin


On 18/04/2017 22:10, Chris Wilson wrote:

On Tue, Apr 18, 2017 at 05:56:15PM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 16 bits of flags to pass in
the engine class and instance (8 bits each), plus a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.

The new uAPI also removes access to the weak VCS engine
balancing as currently existing in the driver.

Example usage to send a command to VCS0:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 0);

Or to send a command to VCS1:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 1);


To save a bit of space, we can use the ring selector as a class selector
if bit18 is set, with 19-27 as instance. That limits us to 64 classes -
hopefully not a problem for the near future. At least I might have sold
you on a flexible execbuf3 by then.


I was considering re-using those bits, yes. I was thinking about the pro
of keeping it completely separate but I suppose there is not much value
in that. So I can re-use the ring selector just as well and have a
smaller impact on the number of bits left over.



(As a digression, some cryptic notes for an implementation I did over Easter:
/*
 * Execbuf3!
 *
 * ringbuffer
 *  - per context
 *  - per engine


We have this already so I am missing something I guess.


 *  - PAGE_SIZE ctl [ro head, rw tai] + user pot
 *  - kthread [i915/$ctx-$engine] (optional?)


No idea what these two are. :)


 *  - assumes NO_RELOC-esque awareness


Ok ok NO_RELOC. :)


 *
 * SYNC flags [wait/signal], handle [semaphore/fence]


Sync fence in out just as today, but probably more?


 *
 * BIND handle, offset [user provided]
 * ALLOC[32,64] handle, flags, *offset [kernel provided, need RELOC]
 * RELOC[32,64] handle, target_handle, offset, delta
 * CLEAR flags, handle
 * UNBIND handle


Explicit VMA management? Maybe a separate ioctl would be better?


 *
 * BATCH flags, handle, offset
 * [or SVM flags, address]
 *   PIN flags (MAY_RELOC), count, handle[count]
 *   FENCE flags, count, handle[count]
 * SUBMIT handle [fence/NULL with error]
 */


No idea again. :)


At the moment it is just trying to do execbuf2, but more compactly and
with fewer ioctls. But one of the main selling points is that we can
extend the information passed around more freely than execbuf2.)


I have nothing against a better eb since I trust you know much better
whether it is needed and when. But I don't know how long it will take
to get there.
This class/instance idea could be implemented quickly to solve the sore 
point of VCS/VCS2 engine selection. But yeah, it is another uABI to keep 
in that case.


Regards,

Tvrtko


Re: [Mesa-dev] [Intel-gfx] [RFC v2 2/2] drm/i915: Select engines via class and instance in execbuffer2

2017-04-27 Thread Tvrtko Ursulin


On 27/04/2017 10:25, Chris Wilson wrote:

On Thu, Apr 27, 2017 at 10:10:34AM +0100, Tvrtko Ursulin wrote:

From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 16 bits of flags to pass in
the engine class and instance (8 bits each), plus a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.

The new uAPI also removes access to the weak VCS engine
balancing as currently existing in the driver.

Example usage to send a command to VCS0:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 0);

Or to send a command to VCS1:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 1);

v2:
 * Fix unknown flags mask.
 * Use I915_EXEC_RING_MASK for class. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 29 +
 include/uapi/drm/i915_drm.h| 11 ++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..ecd1486642a7 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1492,6 +1492,32 @@ gen8_dispatch_bsd_engine(struct drm_i915_private 
*dev_priv,
return file_priv->bsd_engine;
 }

+extern u8 user_class_map[DRM_I915_ENGINE_CLASS_MAX];
+
+static struct intel_engine_cs *
+eb_select_engine_class_instance(struct drm_i915_private *i915,
+   struct drm_i915_gem_execbuffer2 *args)
+{
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u8 class, instance;
+
+   class = args->flags & I915_EXEC_RING_MASK;
+   if (class >= DRM_I915_ENGINE_CLASS_MAX)
+   return NULL;
+   class = user_class_map[class];
+
+   instance = (args->flags & I915_EXEC_INSTANCE_MASK) >>
+  I915_EXEC_INSTANCE_SHIFT;
+
+   for_each_engine(engine, i915, id) {
+   if (engine->class == class && engine->instance == instance)
+   return engine;
+   }


I am underwhelmed. No, i915->class_engine[class][instance] ?


Hey it's just an RFC for the uAPI proposal! Implementation efficiency 
only comes later! :)



Still, at what point do we kill busy-ioctl per-engine reporting? Should


It's the one we already broke before without anyone noticing, where
userspace effectively only cares about a boolean value?


If so, do you recommend we make it a real boolean?


we update all tracepoints to use class:instance (I think that's a better
abi going forward).


I can't think of any big problems doing so. Could rename ring= to
engine= there as well. engine=<class>.<instance>, for example?


Regards,

Tvrtko


[Mesa-dev] [RFC v2 1/2] drm/i915: Engine discovery uAPI

2017-04-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Engine discovery uAPI allows userspace to probe for engine
configuration and features without needing to maintain the
internal PCI id based database.

This enables removal of code duplications across userspace
components.

Probing is done via the new DRM_IOCTL_I915_GEM_ENGINE_INFO ioctl
which returns the number and information on the specified engine
class.

Currently only general engine configuration and HEVC feature of
the VCS engine can be probed but the uAPI is designed to be
generic and extensible.

Code is based almost exactly on the earlier proposal on the
topic by Jon Bloomfield. Engine class and instance refactoring
made recently by Daniele Ceraolo Spurio enabled this to be
implemented in an elegant fashion.

To probe the configuration, userspace sets the engine class it wants
to query (in struct drm_i915_gem_engine_info) and provides an array
of drm_i915_engine_info structs which will be filled in by the
driver. Userspace also has to tell i915 how many elements are in
the array, and the driver will report back the total number of
engine instances in any case.

v2:
 * Add a version field and consolidate to one engine count.
   (Chris Wilson)
 * Rename uAPI flags for VCS engines to DRM_I915_ENGINE_CLASS_VIDEO.
   (Gong Zhipeng)

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_drv.c|  1 +
 drivers/gpu/drm/i915/i915_drv.h|  3 ++
 drivers/gpu/drm/i915/intel_engine_cs.c | 64 ++
 include/uapi/drm/i915_drm.h| 40 +
 4 files changed, 108 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index c7d68e789642..1a3f0859227b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2609,6 +2609,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_GETPARAM, 
i915_gem_context_getparam_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_SETPARAM, 
i915_gem_context_setparam_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_PERF_OPEN, i915_perf_open_ioctl, 
DRM_RENDER_ALLOW),
+   DRM_IOCTL_DEF_DRV(I915_GEM_ENGINE_INFO, i915_gem_engine_info_ioctl, 
DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver driver = {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 357b6c6c2f04..6eed0e854561 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3547,6 +3547,9 @@ i915_gem_context_lookup_timeline(struct i915_gem_context 
*ctx,
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 struct drm_file *file);
 
+int i915_gem_engine_info_ioctl(struct drm_device *dev, void *data,
+  struct drm_file *file);
+
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
  u64 min_size, u64 alignment,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 82a274b336c5..caed32dbd912 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -25,6 +25,7 @@
 #include "i915_drv.h"
 #include "intel_ringbuffer.h"
 #include "intel_lrc.h"
+#include 
 
 struct engine_class_info {
const char *name;
@@ -1187,6 +1188,69 @@ void intel_engines_reset_default_submission(struct 
drm_i915_private *i915)
engine->set_default_submission(engine);
 }
 
+u8 user_class_map[DRM_I915_ENGINE_CLASS_MAX] = {
+   [DRM_I915_ENGINE_CLASS_OTHER] = OTHER_CLASS,
+   [DRM_I915_ENGINE_CLASS_RENDER] = RENDER_CLASS,
+   [DRM_I915_ENGINE_CLASS_COPY] = COPY_ENGINE_CLASS,
+   [DRM_I915_ENGINE_CLASS_VIDEO] = VIDEO_DECODE_CLASS,
+   [DRM_I915_ENGINE_CLASS_VIDEO_ENHANCE] = VIDEO_ENHANCEMENT_CLASS,
+};
+
+int i915_gem_engine_info_ioctl(struct drm_device *dev, void *data,
+  struct drm_file *file)
+{
+   struct drm_i915_private *i915 = to_i915(dev);
+   struct drm_i915_gem_engine_info *args = data;
+   struct drm_i915_engine_info __user *user_info =
+   u64_to_user_ptr(args->info_ptr);
+   unsigned int info_size = args->num_engines;
+   struct drm_i915_engine_info info;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u8 class;
+
+   if (args->rsvd)
+   return -EINVAL;
+
+   switch (args->engine_class) {
+   case DRM_I915_ENGINE_CLASS_OTHER:
+   case DRM_I915_ENGINE_CLASS_RENDER:
+   case DRM_I915_ENGINE_CLASS_COPY:
+   case DRM_I915_ENG

[Mesa-dev] [RFC v2 2/2] drm/i915: Select engines via class and instance in execbuffer2

2017-04-27 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 16 bits of flags to pass in
the engine class and instance (8 bits each), plus a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.

The new uAPI also removes access to the weak VCS engine
balancing as currently existing in the driver.

Example usage to send a command to VCS0:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 0);

Or to send a command to VCS1:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 1);

v2:
 * Fix unknown flags mask.
 * Use I915_EXEC_RING_MASK for class. (Chris Wilson)

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 29 +
 include/uapi/drm/i915_drm.h| 11 ++-
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..ecd1486642a7 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1492,6 +1492,32 @@ gen8_dispatch_bsd_engine(struct drm_i915_private 
*dev_priv,
return file_priv->bsd_engine;
 }
 
+extern u8 user_class_map[DRM_I915_ENGINE_CLASS_MAX];
+
+static struct intel_engine_cs *
+eb_select_engine_class_instance(struct drm_i915_private *i915,
+   struct drm_i915_gem_execbuffer2 *args)
+{
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u8 class, instance;
+
+   class = args->flags & I915_EXEC_RING_MASK;
+   if (class >= DRM_I915_ENGINE_CLASS_MAX)
+   return NULL;
+   class = user_class_map[class];
+
+   instance = (args->flags & I915_EXEC_INSTANCE_MASK) >>
+  I915_EXEC_INSTANCE_SHIFT;
+
+   for_each_engine(engine, i915, id) {
+   if (engine->class == class && engine->instance == instance)
+   return engine;
+   }
+
+   return NULL;
+}
+
 #define I915_USER_RINGS (4)
 
 static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
@@ -1510,6 +1536,9 @@ eb_select_engine(struct drm_i915_private *dev_priv,
unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK;
struct intel_engine_cs *engine;
 
+   if (args->flags & I915_EXEC_CLASS_INSTANCE)
+   return eb_select_engine_class_instance(dev_priv, args);
+
if (user_ring_id > I915_USER_RINGS) {
DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id);
return NULL;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 2ac6667e57ea..6a26bdf5e684 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -906,7 +906,12 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_FENCE_OUT		(1<<17)
 
-#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
+#define I915_EXEC_CLASS_INSTANCE   (1<<18)
+
+#define I915_EXEC_INSTANCE_SHIFT   (19)
+#define I915_EXEC_INSTANCE_MASK(0xff << 
I915_EXEC_INSTANCE_SHIFT)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-((1 << 27) << 1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
@@ -914,6 +919,10 @@ struct drm_i915_gem_execbuffer2 {
 #define i915_execbuffer2_get_context_id(eb2) \
((eb2).rsvd1 & I915_EXEC_CONTEXT_ID_MASK)
 
+#define i915_execbuffer2_engine(class, instance) \
+   (I915_EXEC_CLASS_INSTANCE | (class) | \
+   ((instance) << I915_EXEC_INSTANCE_SHIFT))
+
 struct drm_i915_gem_pin {
/** Handle of the buffer to be pinned. */
__u32 handle;
-- 
2.9.3



Re: [Mesa-dev] [Intel-gfx] [RFC v3] drm/i915: Select engines via class and instance in execbuffer2

2017-06-15 Thread Tvrtko Ursulin


On 25/05/2017 15:14, Chris Wilson wrote:

On Wed, May 24, 2017 at 12:28:07PM +0100, Tvrtko Ursulin wrote:


On 18/05/2017 18:00, Chris Wilson wrote:

On Thu, May 18, 2017 at 05:20:38PM +0100, Tvrtko Ursulin wrote:


On 18/05/2017 14:37, Chris Wilson wrote:

On Thu, May 18, 2017 at 02:06:35PM +0100, Tvrtko Ursulin wrote:


But this problem in general can also be solved separately from
class-instance addressing via engine feature masking.


But imo all members of a class should have the same features. That would
be my definition of a class!


That sounds very totalitarian! :)) To me a class is a group of some
entities which share some common characteristics - not necessarily
completely uniform.


The problem otherwise is that we then have to define yet another
interface based on features. To me that sounds like too much
duplication that we could avoid from the beginning. Curse the hw for
being asymmetrical!


Hm, I don't see a problem with feature-based engine selection on
top. You still do, because of the desire that classes be equal in
features?


Ok, having another think about it, I agree that the features define a
subclass. Basically it comes down to VCS | HEVC being a subset of the
VCS group and we need a way to express any of VCS | HEVC and any of VCS
cleanly.


Cool, I'll respin with engine features on the top to see how that looks. 
And I am also thinking about dropping the GT discovery bits. More on 
that below.



To sum up what I (and we) talked about in various parts of the thread(s):

Step 1a: New execbuf engine selection uAPI.

  - execbuf class=VCS instance=1

Step 1b: Engine discovery uAPI.

Same as above but userspace can figure out how many VCS engines there
are without PCI probing.

I didn't get much feedback on this one. :(


Mainly because we didn't emphasise that this would be the only way to
discover the parameters for execbuf / svm and it got lost in the idea that
this would replace their device_info tables. We are suggesting that we
can phase out those tables (but we should put some effort into making
sure we discard unwanted ones after init) since we will have to provide
the information anyway.


It has become more unlikely we would get libva (my hopeful first user) 
on board for this in the short/medium term. So as said above, I am 
thinking to drop this for now.


If at some point we go for "step 2" below that would also enable some 
sort of engine enumeration/probing via execbuf null batches.



Step 2: Feature masks for execbuf.

  - execbuf class=VCS instance=0 features=HEVC = OK
  - execbuf class=VCS instance=1 features=HEVC = FAIL

But userspace can use engine discovery to figure out which are the
valid combinations.

This could be a simpler, but less featureful and not very elegant
alternative to step 2.

Otherwise just a prep step for the subsequent steps below.

Step 3a: (One day maybe) userspace selects a class, i915 picks the engine

  - execbuf class=VCS instance=any

Step 3b: userspace selected class and features

  - execbuf class=VCS instance=any features=HEVC

This RFC proposed steps 1a and 1b. The rest we leave for later.

How does that sound? Acceptable?


I want "1c: extensible interface for per-engine settings elsewhere in the
uAPI" as well.


What do you have in mind here, what other uAPI deals with engines?

Regards,

Tvrtko

We also need an answer to the i915_gem_busy_ioctl conundrum - if/when we
should deprecate the engine id being exposed. It's the same question
more or less as for how long (or whether we should) continue to support
I915_EXEC_RING. Plus how to expose these via tracepoints.

The quick answer is while we have less than 16 engines, we don't have to
worry about the uABI breakage.
  

Is the engine discovery useful enough, or what other features
could we put in it to make it more useful for userspace? Potentially
enable dropping PCI id probing altogether and enable libva/mesa/???
to probe everything using i915 ioctls.


To be convenient we have to beat the ease of device_info[PCIID], and
being futureproof. The resistance will be because then they are
dependent on the kernel for features that are not dependent on that
kernel for execution, i.e. being able to use new userspace on old
kernels and remain feature complete. To counter that we need to
have a complete description of a new device the moment we enable it.
That makes it a hard sell. The benefit is basically only a reduction in
RO library mmappings, a very small improvement of start up speed at
best.
-Chris




[Mesa-dev] [RFC 1/2] drm/i915: Engine discovery uAPI

2017-04-18 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Engine discovery uAPI allows userspace to probe for engine
configuration and features without needing to maintain the
internal PCI id based database.

This enables removal of code duplications across userspace
components.

Probing is done via the new DRM_IOCTL_I915_GEM_ENGINE_INFO ioctl
which returns the number and information on the specified engine
class.

Currently only general engine configuration and HEVC feature of
the VCS engine can be probed but the uAPI is designed to be
generic and extensible.

Code is based almost exactly on the earlier proposal on the
topic by Jon Bloomfield. Engine class and instance refactoring
made recently by Daniele Ceraolo Spurio enabled this to be
implemented in an elegant fashion.

To probe the configuration, userspace sets the engine class it wants
to query (in struct drm_i915_gem_engine_info) and provides an array
of drm_i915_engine_info structs which will be filled in by the
driver. Userspace also has to tell i915 how many elements are in
the array, and the driver will report back the total number of
engine instances in any case.
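
As an illustration, probing the video decode class could look roughly
like the sketch below. The ioctl name and the engine_class/info_ptr/rsvd
fields are taken from the implementation further down; the num_engines
field and the contents of struct drm_i915_engine_info are assumptions,
since the i915_drm.h hunk is not visible here.

  /* Userspace sketch only; anything marked "assumed" is not confirmed
   * by the patch excerpt below. */
  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>
  #include <xf86drm.h>
  #include <i915_drm.h>   /* assumes the proposed uAPI additions */

  static void probe_vcs_engines(int fd)
  {
          struct drm_i915_engine_info info[8];
          struct drm_i915_gem_engine_info args;

          memset(&args, 0, sizeof(args));
          memset(info, 0, sizeof(info));

          args.engine_class = DRM_I915_ENGINE_CLASS_VIDEO_DECODE;
          args.num_engines = 8;           /* assumed field: array size we pass in */
          args.info_ptr = (uintptr_t)info;

          /* Could also be called first with num_engines = 0 to learn the
           * instance count before allocating, as described above. */
          if (drmIoctl(fd, DRM_IOCTL_I915_GEM_ENGINE_INFO, &args) == 0)
                  printf("VCS instances reported: %u\n", args.num_engines);
  }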

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_drv.c|  1 +
 drivers/gpu/drm/i915/i915_drv.h|  3 ++
 drivers/gpu/drm/i915/intel_engine_cs.c | 59 ++
 include/uapi/drm/i915_drm.h| 40 +++
 4 files changed, 103 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index cc7393e65e99..b720c9468adf 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2607,6 +2607,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_GETPARAM, 
i915_gem_context_getparam_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_SETPARAM, 
i915_gem_context_setparam_ioctl, DRM_RENDER_ALLOW),
DRM_IOCTL_DEF_DRV(I915_PERF_OPEN, i915_perf_open_ioctl, 
DRM_RENDER_ALLOW),
+   DRM_IOCTL_DEF_DRV(I915_GEM_ENGINE_INFO, i915_gem_engine_info_ioctl, 
DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver driver = {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 357b6c6c2f04..6eed0e854561 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3547,6 +3547,9 @@ i915_gem_context_lookup_timeline(struct i915_gem_context 
*ctx,
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
 struct drm_file *file);
 
+int i915_gem_engine_info_ioctl(struct drm_device *dev, void *data,
+  struct drm_file *file);
+
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
  u64 min_size, u64 alignment,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c 
b/drivers/gpu/drm/i915/intel_engine_cs.c
index 402769d9d840..6deaffd34ae0 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -25,6 +25,7 @@
 #include "i915_drv.h"
 #include "intel_ringbuffer.h"
 #include "intel_lrc.h"
+#include 
 
 struct engine_class_info {
const char *name;
@@ -1189,6 +1190,64 @@ void intel_engines_reset_default_submission(struct 
drm_i915_private *i915)
engine->set_default_submission(engine);
 }
 
+u8 user_class_map[DRM_I915_ENGINE_CLASS_MAX] = {
+   [DRM_I915_ENGINE_CLASS_OTHER] = OTHER_CLASS,
+   [DRM_I915_ENGINE_CLASS_RENDER] = RENDER_CLASS,
+   [DRM_I915_ENGINE_CLASS_COPY] = COPY_ENGINE_CLASS,
+   [DRM_I915_ENGINE_CLASS_VIDEO_DECODE] = VIDEO_DECODE_CLASS,
+   [DRM_I915_ENGINE_CLASS_VIDEO_ENHANCE] = VIDEO_ENHANCEMENT_CLASS,
+};
+
+int i915_gem_engine_info_ioctl(struct drm_device *dev, void *data,
+  struct drm_file *file)
+{
+   struct drm_i915_private *i915 = to_i915(dev);
+   struct drm_i915_gem_engine_info *args = data;
+   enum drm_i915_gem_engine_class engine_class = args->engine_class;
+   struct drm_i915_engine_info __user *user_info =
+   u64_to_user_ptr(args->info_ptr);
+   struct drm_i915_engine_info info;
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u8 class;
+
+   if (args->rsvd)
+   return -EINVAL;
+
+   switch (engine_class) {
+   case DRM_I915_ENGINE_CLASS_OTHER:
+   case DRM_I915_ENGINE_CLASS_RENDER:
+   case DRM_I915_ENGINE_CLASS_COPY:
+   case DRM_I915_ENGINE_CLASS_VIDEO_DECODE:
+   case DRM_I915_ENGINE_CLASS_VIDEO_ENHANCE:
+   if (engine_class >= DRM_I915_ENGINE_CLASS_MAX)
+

[Mesa-dev] [RFC 0/2] New engine discovery and execbuffer2 engine selection uAPI

2017-04-18 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Inspired by the recent introduction of the engine class and instance concept,
and a chat with Chris Wilson about a potential unification of PCI id based
device discovery across multiple userspace components, I have cooked up two
patches to gather some opinions on whether this sort of uAPI would be
interesting for a wider audience.

The first patch is basically an old idea by Jon Bloomfield on how to do engine
discovery, while the second one is just a quick hack to show how a similar
approach could work for execbuffer2.

Both patches are compile tested only and intended only to start a discussion.

Tvrtko Ursulin (2):
  drm/i915: Engine discovery uAPI
  drm/i915: Select engines via class and instance in execbuffer2

 drivers/gpu/drm/i915/i915_drv.c|  1 +
 drivers/gpu/drm/i915/i915_drv.h|  3 ++
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 36 ++
 drivers/gpu/drm/i915/intel_engine_cs.c | 59 ++
 include/uapi/drm/i915_drm.h| 47 +++-
 5 files changed, 145 insertions(+), 1 deletion(-)

-- 
2.9.3



[Mesa-dev] [RFC 2/2] drm/i915: Select engines via class and instance in execbuffer2

2017-04-18 Thread Tvrtko Ursulin
From: Tvrtko Ursulin 

Building on top of the previous patch which exported the concept
of engine classes and instances, we can also use this instead of
the current awkward engine selection uAPI.

This is primarily interesting for the VCS engine selection which
is a) currently done via a disjoint set of flags, and b) the
current I915_EXEC_BSD flag has different semantics depending on
the underlying hardware, which is bad.

The proposed idea here is to reserve 16 bits of flags to pass in
the engine class and instance (8 bits each), and a new flag
named I915_EXEC_CLASS_INSTANCE to tell the kernel this new engine
selection API is in use.

The new uAPI also removes access to the weak VCS engine
balancing as currently existing in the driver.

Example usage to send a command to VCS0:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 0);

Or to send a command to VCS1:

  eb.flags = i915_execbuffer2_engine(DRM_I915_ENGINE_CLASS_VIDEO_DECODE, 1);
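
The i915_execbuffer2_engine() helper used in the examples is not part of
the diff below; a minimal sketch of how it could pack the flags, assuming
the bit layout added to i915_drm.h by this patch (selector flag at bit 18,
instance in bits 19-26, class in bits 27-34), might look like:

  #include <stdint.h>

  /* Sketch only; mirrors the decoding done in
   * eb_select_engine_class_instance() below. */
  static inline uint64_t
  i915_execbuffer2_engine(uint8_t class, uint8_t instance)
  {
          const uint64_t class_instance = ((uint64_t)class << 8) | instance;

          return I915_EXEC_CLASS_INSTANCE |
                 (class_instance << I915_EXEC_CLASS_INSTANCE_SHIFT);
  }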

Signed-off-by: Tvrtko Ursulin 
Cc: Ben Widawsky 
Cc: Chris Wilson 
Cc: Daniel Vetter 
Cc: Joonas Lahtinen 
Cc: Jon Bloomfield 
Cc: Daniel Charles 
Cc: "Rogozhkin, Dmitry V" 
Cc: Oscar Mateo 
Cc: "Gong, Zhipeng" 
Cc: intel-vaapi-me...@lists.01.org
Cc: mesa-dev@lists.freedesktop.org
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 36 ++
 include/uapi/drm/i915_drm.h|  7 +-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..7fc92ec839a1 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1153,6 +1153,10 @@ i915_gem_check_execbuffer(struct 
drm_i915_gem_execbuffer2 *exec)
if (exec->flags & __I915_EXEC_UNKNOWN_FLAGS)
return false;
 
+   if ((exec->flags & I915_EXEC_CLASS_INSTANCE) &&
+   (exec->flags & I915_EXEC_RING_MASK))
+   return false;
+
/* Kernel clipping was a DRI1 misfeature */
if (exec->num_cliprects || exec->cliprects_ptr)
return false;
@@ -1492,6 +1496,35 @@ gen8_dispatch_bsd_engine(struct drm_i915_private 
*dev_priv,
return file_priv->bsd_engine;
 }
 
+extern u8 user_class_map[DRM_I915_ENGINE_CLASS_MAX];
+
+static struct intel_engine_cs *
+eb_select_engine_class_instance(struct drm_i915_private *i915,
+   struct drm_i915_gem_execbuffer2 *args)
+{
+   struct intel_engine_cs *engine;
+   enum intel_engine_id id;
+   u16 class_instance;
+   u8 user_class, class, instance;
+
+   class_instance = (args->flags & I915_EXEC_CLASS_INSTANCE_MASK) >>
+I915_EXEC_CLASS_INSTANCE_SHIFT;
+
+   user_class = class_instance >> 8;
+   instance = class_instance & 0xff;
+
+   if (user_class >= DRM_I915_ENGINE_CLASS_MAX)
+   return NULL;
+   class = user_class_map[user_class];
+
+   for_each_engine(engine, i915, id) {
+   if (engine->class == class && engine->instance == instance)
+   return engine;
+   }
+
+   return NULL;
+}
+
 #define I915_USER_RINGS (4)
 
 static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
@@ -1510,6 +1543,9 @@ eb_select_engine(struct drm_i915_private *dev_priv,
unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK;
struct intel_engine_cs *engine;
 
+   if (args->flags & I915_EXEC_CLASS_INSTANCE)
+   return eb_select_engine_class_instance(dev_priv, args);
+
if (user_ring_id > I915_USER_RINGS) {
DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id);
return NULL;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6058596a9f33..727a6dc4b029 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -906,7 +906,12 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_FENCE_OUT(1<<17)
 
-#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
+#define I915_EXEC_CLASS_INSTANCE   (1<<18)
+
+#define I915_EXEC_CLASS_INSTANCE_SHIFT (19)
+#define I915_EXEC_CLASS_INSTANCE_MASK  (0xffff << I915_EXEC_CLASS_INSTANCE_SHIFT)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(35 << 1))
 
 #define I915_EXEC_CONTEXT_ID_MASK  (0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
2.9.3



Re: [Mesa-dev] [Intel-gfx] [PATCH v3 3/4] drm/i915/uapi: convert i915_query and friend to kernel doc

2021-04-16 Thread Tvrtko Ursulin



On 15/04/2021 16:59, Matthew Auld wrote:

Add a note about the two-step process.

Suggested-by: Daniel Vetter 
Signed-off-by: Matthew Auld 
Cc: Joonas Lahtinen 
Cc: Jordan Justen 
Cc: Daniel Vetter 
Cc: Kenneth Graunke 
Cc: Jason Ekstrand 
Cc: Dave Airlie 
Cc: dri-de...@lists.freedesktop.org
Cc: mesa-dev@lists.freedesktop.org
---
  include/uapi/drm/i915_drm.h | 57 ++---
  1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index d9c954a5a456..ef36f1a0adde 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -2210,14 +2210,23 @@ struct drm_i915_perf_oa_config {
__u64 flex_regs_ptr;
  };
  
+/**

+ * struct drm_i915_query_item - An individual query for the kernel to process.
+ *
+ * The behaviour is determined by the @query_id. Note that exactly what
+ * @data_ptr is also depends on the specific @query_id.
+ */
  struct drm_i915_query_item {
+   /** @query_id: The id for this query */
__u64 query_id;
  #define DRM_I915_QUERY_TOPOLOGY_INFO1
  #define DRM_I915_QUERY_ENGINE_INFO2
  #define DRM_I915_QUERY_PERF_CONFIG  3
  /* Must be kept compact -- no holes and well documented */
  
-	/*

+   /**
+* @length:
+*
 * When set to zero by userspace, this is filled with the size of the
 * data to be written at the data_ptr pointer. The kernel sets this
 * value to a negative value to signal an error on a particular query
@@ -2225,21 +2234,26 @@ struct drm_i915_query_item {
 */
__s32 length;
  
-	/*

+   /**
+* @flags:
+*
 * When query_id == DRM_I915_QUERY_TOPOLOGY_INFO, must be 0.
 *
 * When query_id == DRM_I915_QUERY_PERF_CONFIG, must be one of the
-* following :
-* - DRM_I915_QUERY_PERF_CONFIG_LIST
-* - DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID
-* - DRM_I915_QUERY_PERF_CONFIG_FOR_UUID
+* following:
+*
+*  - DRM_I915_QUERY_PERF_CONFIG_LIST
+*  - DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID
+*  - DRM_I915_QUERY_PERF_CONFIG_FOR_UUID
 */
__u32 flags;
  #define DRM_I915_QUERY_PERF_CONFIG_LIST  1
  #define DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_UUID 2
  #define DRM_I915_QUERY_PERF_CONFIG_DATA_FOR_ID   3
  
-	/*

+   /**
+* @data_ptr:
+*
 * Data will be written at the location pointed by data_ptr when the
 * value of length matches the length of the data to be written by the
 * kernel.
@@ -2247,16 +2261,37 @@ struct drm_i915_query_item {
__u64 data_ptr;
  };
  
+/**

+ * struct drm_i915_query - Supply an array of drm_i915_query_item for the 
kernel
+ * to fill out.
+ *
+ * Note that this is generally a two step process for each drm_i915_query_item
+ * in the array:
+ *
+ * 1.) Call the DRM_IOCTL_I915_QUERY, giving it our array of
+ * drm_i915_query_item, with drm_i915_query_item.length set to zero. The
+ * kernel will then fill in the size, in bytes, which tells userspace how
+ * much memory it needs to allocate for the blob (say for an array of
+ * properties).
+ *
+ * 2.) Next we call DRM_IOCTL_I915_QUERY again, this time with the
+ * drm_i915_query_item.data_ptr equal to our newly allocated blob. Note
+ * that the drm_i915_query_item.length should still be the same as what
+ * the kernel previously set. At this point the kernel can fill in the blob.


I'd also document the one-step alternative where userspace passes in a
buffer equal to or larger than the required size. For many small queries
(most?) this actually works just as well, and with one ioctl only.
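
A rough sketch of both patterns, using only the drm_i915_query /
drm_i915_query_item fields documented in the patch (error handling
trimmed):

  #include <stdint.h>
  #include <stdlib.h>
  #include <xf86drm.h>
  #include <i915_drm.h>

  static void *query_blob(int fd, uint64_t query_id, int32_t *len)
  {
          struct drm_i915_query_item item = { .query_id = query_id };
          struct drm_i915_query q = {
                  .num_items = 1,
                  .items_ptr = (uintptr_t)&item,
          };
          void *data;

          /* Two-step: first call with length == 0 to learn the size... */
          if (drmIoctl(fd, DRM_IOCTL_I915_QUERY, &q) || item.length <= 0)
                  return NULL;

          data = calloc(1, item.length);
          item.data_ptr = (uintptr_t)data;

          /* ...then call again with the allocated buffer. The one-step
           * alternative is to pass a sufficiently large buffer up front,
           * with item.length set to its size, so only one ioctl is needed. */
          if (drmIoctl(fd, DRM_IOCTL_I915_QUERY, &q)) {
                  free(data);
                  return NULL;
          }

          *len = item.length;
          return data;
  }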


Regards,

Tvrtko


+ *
+ */
  struct drm_i915_query {
+   /** @num_items: The number of elements in the @items_ptr array */
__u32 num_items;
  
-	/*

-* Unused for now. Must be cleared to zero.
+   /**
+* @flags: Unused for now. Must be cleared to zero.
 */
__u32 flags;
  
-	/*

-* This points to an array of num_items drm_i915_query_item structures.
+   /**
+* @items_ptr: This points to an array of num_items drm_i915_query_item
+* structures.
 */
__u64 items_ptr;
  };




Re: [Mesa-dev] [PATCH v3 4/4] drm/doc/rfc: i915 DG1 uAPI

2021-04-20 Thread Tvrtko Ursulin



On 19/04/2021 16:19, Jason Ekstrand wrote:

On Mon, Apr 19, 2021 at 7:02 AM Matthew Auld  wrote:


On 16/04/2021 17:38, Jason Ekstrand wrote:

On Thu, Apr 15, 2021 at 11:04 AM Matthew Auld  wrote:


Add an entry for the new uAPI needed for DG1.

v2(Daniel):
- include the overall upstreaming plan
- add a note for mmap, there are differences here for TTM vs i915
- bunch of other suggestions from Daniel
v3:
   (Daniel)
- add a note for set/get caching stuff
- add some more docs for existing query and extensions stuff
- add an actual code example for regions query
- bunch of other stuff
   (Jason)
- uAPI change(!):
  - try a simpler design with the placements extension
  - rather than have a generic setparam which can cover multiple
use cases, have each extension be responsible for one thing
only

Signed-off-by: Matthew Auld 
Cc: Joonas Lahtinen 
Cc: Jordan Justen 
Cc: Daniel Vetter 
Cc: Kenneth Graunke 
Cc: Jason Ekstrand 
Cc: Dave Airlie 
Cc: dri-de...@lists.freedesktop.org
Cc: mesa-dev@lists.freedesktop.org
---
   Documentation/gpu/rfc/i915_gem_lmem.h   | 255 
   Documentation/gpu/rfc/i915_gem_lmem.rst | 139 +
   Documentation/gpu/rfc/index.rst |   4 +
   3 files changed, 398 insertions(+)
   create mode 100644 Documentation/gpu/rfc/i915_gem_lmem.h
   create mode 100644 Documentation/gpu/rfc/i915_gem_lmem.rst

diff --git a/Documentation/gpu/rfc/i915_gem_lmem.h 
b/Documentation/gpu/rfc/i915_gem_lmem.h
new file mode 100644
index ..2a82a452e9f2
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_gem_lmem.h
@@ -0,0 +1,255 @@
+/*
+ * Note that drm_i915_query_item and drm_i915_query are existing bits of uAPI.
+ * For the regions query we are just adding a new query id, so no actual new
+ * ioctl or anything, but including it here for reference.
+ */
+struct drm_i915_query_item {
+#define DRM_I915_QUERY_MEMORY_REGIONS   0xdeadbeaf
+   
+__u64 query_id;
+
+/*
+ * When set to zero by userspace, this is filled with the size of the
+ * data to be written at the data_ptr pointer. The kernel sets this
+ * value to a negative value to signal an error on a particular query
+ * item.
+ */
+__s32 length;
+
+__u32 flags;
+/*
+ * Data will be written at the location pointed by data_ptr when the
+ * value of length matches the length of the data to be written by the
+ * kernel.
+ */
+__u64 data_ptr;
+};
+
+struct drm_i915_query {
+__u32 num_items;
+/*
+ * Unused for now. Must be cleared to zero.
+ */
+__u32 flags;
+/*
+ * This points to an array of num_items drm_i915_query_item structures.
+ */
+__u64 items_ptr;
+};
+
+#define DRM_IOCTL_I915_QUERY   DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, 
struct drm_i915_query)
+
+/**
+ * enum drm_i915_gem_memory_class
+ */
+enum drm_i915_gem_memory_class {
+   /** @I915_MEMORY_CLASS_SYSTEM: system memory */
+   I915_MEMORY_CLASS_SYSTEM = 0,
+   /** @I915_MEMORY_CLASS_DEVICE: device local-memory */
+   I915_MEMORY_CLASS_DEVICE,
+};
+
+/**
+ * struct drm_i915_gem_memory_class_instance
+ */
+struct drm_i915_gem_memory_class_instance {
+   /** @memory_class: see enum drm_i915_gem_memory_class */
+   __u16 memory_class;
+
+   /** @memory_instance: which instance */
+   __u16 memory_instance;
+};
+
+/**
+ * struct drm_i915_memory_region_info
+ *
+ * Describes one region as known to the driver.
+ *
+ * Note that we reserve quite a lot of stuff here for potential future work. As
+ * an example we might want expose the capabilities(see caps) for a given
+ * region, which could include things like if the region is CPU
+ * mappable/accessible etc.


I get caps but I'm seriously at a loss as to what the rest of this
would be used for.  Why are caps and flags both there and separate?
Flags are typically something you set, not query.  Also, what's with
rsvd1 at the end?  This smells of substantial over-building to me.

I thought to myself, "maybe I'm missing a future use-case" so I looked
at the internal tree and none of this is being used there either.
This indicates to me that either I'm missing something and there's
code somewhere I don't know about or, with three years of building on
internal branches, we still haven't proven that any of this is needed.
If it's the latter, which I strongly suspect, maybe we should drop the
unnecessary bits and only add them back in if and when we have proof
that they're useful.


Do you mean just drop caps/flags here, but keep/inflate rsvd0/rsvd1,
which is less opinionated about future unknowns? If so, makes sense to me.


I meant drop flags and rsvd1.  We need rsvd0 for padding and  I can
see some value to caps.  We may want to advertise, for instance, what
mapping coherency types are available per-heap.  But I 

Re: [Mesa-dev] [PATCH v3 4/4] drm/doc/rfc: i915 DG1 uAPI

2021-04-21 Thread Tvrtko Ursulin



On 20/04/2021 18:00, Jason Ekstrand wrote:

On Tue, Apr 20, 2021 at 11:34 AM Tvrtko Ursulin
 wrote:



On 19/04/2021 16:19, Jason Ekstrand wrote:

On Mon, Apr 19, 2021 at 7:02 AM Matthew Auld  wrote:


On 16/04/2021 17:38, Jason Ekstrand wrote:

On Thu, Apr 15, 2021 at 11:04 AM Matthew Auld  wrote:


Add an entry for the new uAPI needed for DG1.

v2(Daniel):
 - include the overall upstreaming plan
 - add a note for mmap, there are differences here for TTM vs i915
 - bunch of other suggestions from Daniel
v3:
(Daniel)
 - add a note for set/get caching stuff
 - add some more docs for existing query and extensions stuff
 - add an actual code example for regions query
 - bunch of other stuff
(Jason)
 - uAPI change(!):
   - try a simpler design with the placements extension
   - rather than have a generic setparam which can cover multiple
 use cases, have each extension be responsible for one thing
 only

Signed-off-by: Matthew Auld 
Cc: Joonas Lahtinen 
Cc: Jordan Justen 
Cc: Daniel Vetter 
Cc: Kenneth Graunke 
Cc: Jason Ekstrand 
Cc: Dave Airlie 
Cc: dri-de...@lists.freedesktop.org
Cc: mesa-dev@lists.freedesktop.org
---
Documentation/gpu/rfc/i915_gem_lmem.h   | 255 
Documentation/gpu/rfc/i915_gem_lmem.rst | 139 +
Documentation/gpu/rfc/index.rst |   4 +
3 files changed, 398 insertions(+)
create mode 100644 Documentation/gpu/rfc/i915_gem_lmem.h
create mode 100644 Documentation/gpu/rfc/i915_gem_lmem.rst

diff --git a/Documentation/gpu/rfc/i915_gem_lmem.h 
b/Documentation/gpu/rfc/i915_gem_lmem.h
new file mode 100644
index ..2a82a452e9f2
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_gem_lmem.h
@@ -0,0 +1,255 @@
+/*
+ * Note that drm_i915_query_item and drm_i915_query are existing bits of uAPI.
+ * For the regions query we are just adding a new query id, so no actual new
+ * ioctl or anything, but including it here for reference.
+ */
+struct drm_i915_query_item {
+#define DRM_I915_QUERY_MEMORY_REGIONS   0xdeadbeaf
+   
+__u64 query_id;
+
+/*
+ * When set to zero by userspace, this is filled with the size of the
+ * data to be written at the data_ptr pointer. The kernel sets this
+ * value to a negative value to signal an error on a particular query
+ * item.
+ */
+__s32 length;
+
+__u32 flags;
+/*
+ * Data will be written at the location pointed by data_ptr when the
+ * value of length matches the length of the data to be written by the
+ * kernel.
+ */
+__u64 data_ptr;
+};
+
+struct drm_i915_query {
+__u32 num_items;
+/*
+ * Unused for now. Must be cleared to zero.
+ */
+__u32 flags;
+/*
+ * This points to an array of num_items drm_i915_query_item structures.
+ */
+__u64 items_ptr;
+};
+
+#define DRM_IOCTL_I915_QUERY   DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, 
struct drm_i915_query)
+
+/**
+ * enum drm_i915_gem_memory_class
+ */
+enum drm_i915_gem_memory_class {
+   /** @I915_MEMORY_CLASS_SYSTEM: system memory */
+   I915_MEMORY_CLASS_SYSTEM = 0,
+   /** @I915_MEMORY_CLASS_DEVICE: device local-memory */
+   I915_MEMORY_CLASS_DEVICE,
+};
+
+/**
+ * struct drm_i915_gem_memory_class_instance
+ */
+struct drm_i915_gem_memory_class_instance {
+   /** @memory_class: see enum drm_i915_gem_memory_class */
+   __u16 memory_class;
+
+   /** @memory_instance: which instance */
+   __u16 memory_instance;
+};
+
+/**
+ * struct drm_i915_memory_region_info
+ *
+ * Describes one region as known to the driver.
+ *
+ * Note that we reserve quite a lot of stuff here for potential future work. As
+ * an example we might want expose the capabilities(see caps) for a given
+ * region, which could include things like if the region is CPU
+ * mappable/accessible etc.


I get caps but I'm seriously at a loss as to what the rest of this
would be used for.  Why are caps and flags both there and separate?
Flags are typically something you set, not query.  Also, what's with
rsvd1 at the end?  This smells of substantial over-building to me.

I thought to myself, "maybe I'm missing a future use-case" so I looked
at the internal tree and none of this is being used there either.
This indicates to me that either I'm missing something and there's
code somewhere I don't know about or, with three years of building on
internal branches, we still haven't proven that any of this is needed.
If it's the latter, which I strongly suspect, maybe we should drop the
unnecessary bits and only add them back in if and when we have proof
that they're useful.


Do you mean just drop caps/flags here, but keep/inflate rsvd0/rsvd1,
which is less opinionated about future unknowns? If so, makes sense to me.


I meant drop

Re: [Mesa-dev] [PATCH v3 4/4] drm/doc/rfc: i915 DG1 uAPI

2021-04-21 Thread Tvrtko Ursulin



On 21/04/2021 14:54, Jason Ekstrand wrote:

On Wed, Apr 21, 2021 at 3:22 AM Tvrtko Ursulin
 wrote:


On 20/04/2021 18:00, Jason Ekstrand wrote:

On Tue, Apr 20, 2021 at 11:34 AM Tvrtko Ursulin
 wrote:



On 19/04/2021 16:19, Jason Ekstrand wrote:

On Mon, Apr 19, 2021 at 7:02 AM Matthew Auld  wrote:


On 16/04/2021 17:38, Jason Ekstrand wrote:

On Thu, Apr 15, 2021 at 11:04 AM Matthew Auld  wrote:


Add an entry for the new uAPI needed for DG1.

v2(Daniel):
  - include the overall upstreaming plan
  - add a note for mmap, there are differences here for TTM vs i915
  - bunch of other suggestions from Daniel
v3:
 (Daniel)
  - add a note for set/get caching stuff
  - add some more docs for existing query and extensions stuff
  - add an actual code example for regions query
  - bunch of other stuff
 (Jason)
  - uAPI change(!):
- try a simpler design with the placements extension
- rather than have a generic setparam which can cover multiple
  use cases, have each extension be responsible for one thing
  only

Signed-off-by: Matthew Auld 
Cc: Joonas Lahtinen 
Cc: Jordan Justen 
Cc: Daniel Vetter 
Cc: Kenneth Graunke 
Cc: Jason Ekstrand 
Cc: Dave Airlie 
Cc: dri-de...@lists.freedesktop.org
Cc: mesa-dev@lists.freedesktop.org
---
 Documentation/gpu/rfc/i915_gem_lmem.h   | 255 
 Documentation/gpu/rfc/i915_gem_lmem.rst | 139 +
 Documentation/gpu/rfc/index.rst |   4 +
 3 files changed, 398 insertions(+)
 create mode 100644 Documentation/gpu/rfc/i915_gem_lmem.h
 create mode 100644 Documentation/gpu/rfc/i915_gem_lmem.rst

diff --git a/Documentation/gpu/rfc/i915_gem_lmem.h 
b/Documentation/gpu/rfc/i915_gem_lmem.h
new file mode 100644
index ..2a82a452e9f2
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_gem_lmem.h
@@ -0,0 +1,255 @@
+/*
+ * Note that drm_i915_query_item and drm_i915_query are existing bits of uAPI.
+ * For the regions query we are just adding a new query id, so no actual new
+ * ioctl or anything, but including it here for reference.
+ */
+struct drm_i915_query_item {
+#define DRM_I915_QUERY_MEMORY_REGIONS   0xdeadbeaf
+   
+__u64 query_id;
+
+/*
+ * When set to zero by userspace, this is filled with the size of the
+ * data to be written at the data_ptr pointer. The kernel sets this
+ * value to a negative value to signal an error on a particular query
+ * item.
+ */
+__s32 length;
+
+__u32 flags;
+/*
+ * Data will be written at the location pointed by data_ptr when the
+ * value of length matches the length of the data to be written by the
+ * kernel.
+ */
+__u64 data_ptr;
+};
+
+struct drm_i915_query {
+__u32 num_items;
+/*
+ * Unused for now. Must be cleared to zero.
+ */
+__u32 flags;
+/*
+ * This points to an array of num_items drm_i915_query_item structures.
+ */
+__u64 items_ptr;
+};
+
+#define DRM_IOCTL_I915_QUERY   DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, 
struct drm_i915_query)
+
+/**
+ * enum drm_i915_gem_memory_class
+ */
+enum drm_i915_gem_memory_class {
+   /** @I915_MEMORY_CLASS_SYSTEM: system memory */
+   I915_MEMORY_CLASS_SYSTEM = 0,
+   /** @I915_MEMORY_CLASS_DEVICE: device local-memory */
+   I915_MEMORY_CLASS_DEVICE,
+};
+
+/**
+ * struct drm_i915_gem_memory_class_instance
+ */
+struct drm_i915_gem_memory_class_instance {
+   /** @memory_class: see enum drm_i915_gem_memory_class */
+   __u16 memory_class;
+
+   /** @memory_instance: which instance */
+   __u16 memory_instance;
+};
+
+/**
+ * struct drm_i915_memory_region_info
+ *
+ * Describes one region as known to the driver.
+ *
+ * Note that we reserve quite a lot of stuff here for potential future work. As
+ * an example we might want expose the capabilities(see caps) for a given
+ * region, which could include things like if the region is CPU
+ * mappable/accessible etc.


I get caps but I'm seriously at a loss as to what the rest of this
would be used for.  Why are caps and flags both there and separate?
Flags are typically something you set, not query.  Also, what's with
rsvd1 at the end?  This smells of substantial over-building to me.

I thought to myself, "maybe I'm missing a future use-case" so I looked
at the internal tree and none of this is being used there either.
This indicates to me that either I'm missing something and there's
code somewhere I don't know about or, with three years of building on
internal branches, we still haven't proven that any of this is needed.
If it's the latter, which I strongly suspect, maybe we should drop the
unnecessary bits and only add them back in if and when we have proof
that they're useful.


Do you mean just drop caps/flags here, 

Re: [Mesa-dev] [PATCH v3 4/4] drm/doc/rfc: i915 DG1 uAPI

2021-04-21 Thread Tvrtko Ursulin



On 21/04/2021 18:17, Jason Ekstrand wrote:

On Wed, Apr 21, 2021 at 9:25 AM Tvrtko Ursulin
 wrote:



On 21/04/2021 14:54, Jason Ekstrand wrote:

On Wed, Apr 21, 2021 at 3:22 AM Tvrtko Ursulin
 wrote:


On 20/04/2021 18:00, Jason Ekstrand wrote:

On Tue, Apr 20, 2021 at 11:34 AM Tvrtko Ursulin
 wrote:



On 19/04/2021 16:19, Jason Ekstrand wrote:

On Mon, Apr 19, 2021 at 7:02 AM Matthew Auld  wrote:


On 16/04/2021 17:38, Jason Ekstrand wrote:

On Thu, Apr 15, 2021 at 11:04 AM Matthew Auld  wrote:


Add an entry for the new uAPI needed for DG1.

v2(Daniel):
   - include the overall upstreaming plan
   - add a note for mmap, there are differences here for TTM vs i915
   - bunch of other suggestions from Daniel
v3:
  (Daniel)
   - add a note for set/get caching stuff
   - add some more docs for existing query and extensions stuff
   - add an actual code example for regions query
   - bunch of other stuff
  (Jason)
   - uAPI change(!):
 - try a simpler design with the placements extension
 - rather than have a generic setparam which can cover multiple
   use cases, have each extension be responsible for one thing
   only

Signed-off-by: Matthew Auld 
Cc: Joonas Lahtinen 
Cc: Jordan Justen 
Cc: Daniel Vetter 
Cc: Kenneth Graunke 
Cc: Jason Ekstrand 
Cc: Dave Airlie 
Cc: dri-de...@lists.freedesktop.org
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_gem_lmem.h   | 255 
  Documentation/gpu/rfc/i915_gem_lmem.rst | 139 +
  Documentation/gpu/rfc/index.rst |   4 +
  3 files changed, 398 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_gem_lmem.h
  create mode 100644 Documentation/gpu/rfc/i915_gem_lmem.rst

diff --git a/Documentation/gpu/rfc/i915_gem_lmem.h 
b/Documentation/gpu/rfc/i915_gem_lmem.h
new file mode 100644
index ..2a82a452e9f2
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_gem_lmem.h
@@ -0,0 +1,255 @@
+/*
+ * Note that drm_i915_query_item and drm_i915_query are existing bits of uAPI.
+ * For the regions query we are just adding a new query id, so no actual new
+ * ioctl or anything, but including it here for reference.
+ */
+struct drm_i915_query_item {
+#define DRM_I915_QUERY_MEMORY_REGIONS   0xdeadbeaf
+   
+__u64 query_id;
+
+/*
+ * When set to zero by userspace, this is filled with the size of the
+ * data to be written at the data_ptr pointer. The kernel sets this
+ * value to a negative value to signal an error on a particular query
+ * item.
+ */
+__s32 length;
+
+__u32 flags;
+/*
+ * Data will be written at the location pointed by data_ptr when the
+ * value of length matches the length of the data to be written by the
+ * kernel.
+ */
+__u64 data_ptr;
+};
+
+struct drm_i915_query {
+__u32 num_items;
+/*
+ * Unused for now. Must be cleared to zero.
+ */
+__u32 flags;
+/*
+ * This points to an array of num_items drm_i915_query_item structures.
+ */
+__u64 items_ptr;
+};
+
+#define DRM_IOCTL_I915_QUERY   DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, 
struct drm_i915_query)
+
+/**
+ * enum drm_i915_gem_memory_class
+ */
+enum drm_i915_gem_memory_class {
+   /** @I915_MEMORY_CLASS_SYSTEM: system memory */
+   I915_MEMORY_CLASS_SYSTEM = 0,
+   /** @I915_MEMORY_CLASS_DEVICE: device local-memory */
+   I915_MEMORY_CLASS_DEVICE,
+};
+
+/**
+ * struct drm_i915_gem_memory_class_instance
+ */
+struct drm_i915_gem_memory_class_instance {
+   /** @memory_class: see enum drm_i915_gem_memory_class */
+   __u16 memory_class;
+
+   /** @memory_instance: which instance */
+   __u16 memory_instance;
+};
+
+/**
+ * struct drm_i915_memory_region_info
+ *
+ * Describes one region as known to the driver.
+ *
+ * Note that we reserve quite a lot of stuff here for potential future work. As
+ * an example we might want expose the capabilities(see caps) for a given
+ * region, which could include things like if the region is CPU
+ * mappable/accessible etc.


I get caps but I'm seriously at a loss as to what the rest of this
would be used for.  Why are caps and flags both there and separate?
Flags are typically something you set, not query.  Also, what's with
rsvd1 at the end?  This smells of substantial over-building to me.

I thought to myself, "maybe I'm missing a future use-case" so I looked
at the internal tree and none of this is being used there either.
This indicates to me that either I'm missing something and there's
code somewhere I don't know about or, with three years of building on
internal branches, we still haven't proven that any of this is needed.
If it's the latter, which I strongly suspect, maybe we should drop the
unnecessary 

Re: [Mesa-dev] [Intel-gfx] [RFC 2/2] drm/doc/rfc: i915 new parallel submission uAPI plan

2021-05-19 Thread Tvrtko Ursulin



On 19/05/2021 12:10, Daniel Vetter wrote:

On Tue, May 18, 2021 at 04:58:30PM -0700, Matthew Brost wrote:

Add entry fpr i915 new parallel submission uAPI plan.

v2:
  (Daniel Vetter):
   - Expand logical order explaination
   - Add dummy header
   - Only allow N BBs in execbuf IOCTL
   - Configure parallel submission per slot not per gem context

Cc: Tvrtko Ursulin 
Cc: Tony Ye 
CC: Carl Zhang 
Cc: Daniel Vetter 
Cc: Jason Ekstrand 
Signed-off-by: Matthew Brost 
---
  Documentation/gpu/rfc/i915_parallel_execbuf.h | 144 ++
  Documentation/gpu/rfc/i915_scheduler.rst  |  53 ++-
  2 files changed, 196 insertions(+), 1 deletion(-)
  create mode 100644 Documentation/gpu/rfc/i915_parallel_execbuf.h

diff --git a/Documentation/gpu/rfc/i915_parallel_execbuf.h 
b/Documentation/gpu/rfc/i915_parallel_execbuf.h
new file mode 100644
index ..8c64b983ccad
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_parallel_execbuf.h
@@ -0,0 +1,144 @@
+#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see 
i915_context_engines_parallel_submit */
+
+/*
+ * i915_context_engines_parallel_submit:
+ *
+ * Setup a slot to allow multiple BBs to be submitted in a single execbuf 
IOCTL.
+ * Those BBs will then be scheduled to run on the GPU in parallel. Multiple
+ * hardware contexts are created internally in the i915 run these BBs. Once a
+ * slot is configured for N BBs only N BBs can be submitted in each execbuf
+ * IOCTL and this is implict behavior (e.g. the user doesn't tell the execbuf
+ * IOCTL there are N BBs, the execbuf IOCTL know how many BBs there are based 
on
+ * the slots configuration).
+ *
+ * Their are two currently defined ways to control the placement of the
+ * hardware contexts on physical engines: default behavior (no flags) and
+ * I915_PARALLEL_IMPLICT_BONDS (a flag). More flags may be added the in the
+ * future as new hardware / use cases arise. Details of how to use this
+ * interface below above the flags.
+ *
+ * Returns -EINVAL if hardware context placement configuration invalid or if 
the
+ * placement configuration isn't supported on the platform / submission
+ * interface.
+ * Returns -ENODEV if extension isn't supported on the platform / submission
+ * inteface.
+ */
+struct i915_context_engines_parallel_submit {
+   struct i915_user_extension base;
+
+   __u16 engine_index; /* slot for parallel engine */
+   __u16 width;/* number of contexts per parallel engine */
+   __u16 num_siblings; /* number of siblings per context */
+   __u16 mbz16;


Ok the big picture looks reasonable now, the flags still confuse me.


+/*
+ * Default placement behvavior (currently unsupported):
+ *
+ * Rather than restricting parallel submission to a single class with a
+ * logically contiguous placement (I915_PARALLEL_IMPLICT_BONDS), add a mode 
that
+ * enables parallel submission across multiple engine classes. In this case 
each
+ * context's logical engine mask indicates where that context can placed. It is
+ * implied in this mode that all contexts have mutual exclusive placement (e.g.
+ * if one context is running CS0 no other contexts can run on CS0).
+ *
+ * Example 1 pseudo code:
+ * CSX[Y] = engine class X, logical instance Y
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=2,
+ * engines=CS0[0],CS0[1],CS1[0],CS1[1])
+ *
+ * Results in the following valid placements:
+ * CS0[0], CS1[0]
+ * CS0[0], CS1[1]
+ * CS0[1], CS1[0]
+ * CS0[1], CS1[1]
+ *
+ * This can also be though of as 2 virtual engines:
+ * VE[0] = CS0[0], CS0[1]
+ * VE[1] = CS1[0], CS1[1]
+ *
+ * Example 2 pseudo code:
+ * CS[X] = generic engine of same class, logical instance X
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=3,
+ * engines=CS[0],CS[1],CS[2],CS[0],CS[1],CS[2])
+ *
+ * Results in the following valid placements:
+ * CS[0], CS[1]
+ * CS[0], CS[2]
+ * CS[1], CS[0]
+ * CS[1], CS[2]
+ * CS[2], CS[0]
+ * CS[2], CS[1]
+ *
+ *
+ * This can also be though of as 2 virtual engines:
+ * VE[0] = CS[0], CS[1], CS[2]
+ * VE[1] = CS[0], CS[1], CS[2]
+
+ * This enables a use case where all engines are created equally, we don't care
+ * where they are scheduled, we just want a certain number of resources, for
+ * those resources to be scheduled in parallel, and possibly across multiple
+ * engine classes.
+ */


So I don't really get what this does compared to setting the flag below.
Is this just about running the batchbuffers the wrong way round, i.e. if
you have (simplest case)

width=2, num_sibglings=1, engines=CS[0], CS[1]

Then both
CS[0], CS[1]
and
CS[1], CS[0]
are possible options for running 2 batches? Iow, the backend is allowed to
run the batchbuffers the wrong way round, which gains us nothing, since we
assume

Re: [Mesa-dev] [Intel-gfx] [RFC 2/2] drm/doc/rfc: i915 new parallel submission uAPI plan

2021-05-20 Thread Tvrtko Ursulin



On 20/05/2021 10:54, Daniel Vetter wrote:

On Wed, May 19, 2021 at 7:19 PM Matthew Brost  wrote:


On Wed, May 19, 2021 at 01:10:04PM +0200, Daniel Vetter wrote:

On Tue, May 18, 2021 at 04:58:30PM -0700, Matthew Brost wrote:

Add entry fpr i915 new parallel submission uAPI plan.

v2:
  (Daniel Vetter):
   - Expand logical order explaination
   - Add dummy header
   - Only allow N BBs in execbuf IOCTL
   - Configure parallel submission per slot not per gem context

Cc: Tvrtko Ursulin 
Cc: Tony Ye 
CC: Carl Zhang 
Cc: Daniel Vetter 
Cc: Jason Ekstrand 
Signed-off-by: Matthew Brost 
---
  Documentation/gpu/rfc/i915_parallel_execbuf.h | 144 ++
  Documentation/gpu/rfc/i915_scheduler.rst  |  53 ++-
  2 files changed, 196 insertions(+), 1 deletion(-)
  create mode 100644 Documentation/gpu/rfc/i915_parallel_execbuf.h

diff --git a/Documentation/gpu/rfc/i915_parallel_execbuf.h 
b/Documentation/gpu/rfc/i915_parallel_execbuf.h
new file mode 100644
index ..8c64b983ccad
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_parallel_execbuf.h
@@ -0,0 +1,144 @@
+#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see 
i915_context_engines_parallel_submit */
+
+/*
+ * i915_context_engines_parallel_submit:
+ *
+ * Setup a slot to allow multiple BBs to be submitted in a single execbuf 
IOCTL.
+ * Those BBs will then be scheduled to run on the GPU in parallel. Multiple
+ * hardware contexts are created internally in the i915 run these BBs. Once a
+ * slot is configured for N BBs only N BBs can be submitted in each execbuf
+ * IOCTL and this is implict behavior (e.g. the user doesn't tell the execbuf
+ * IOCTL there are N BBs, the execbuf IOCTL know how many BBs there are based 
on
+ * the slots configuration).
+ *
+ * Their are two currently defined ways to control the placement of the
+ * hardware contexts on physical engines: default behavior (no flags) and
+ * I915_PARALLEL_IMPLICT_BONDS (a flag). More flags may be added the in the
+ * future as new hardware / use cases arise. Details of how to use this
+ * interface below above the flags.
+ *
+ * Returns -EINVAL if hardware context placement configuration invalid or if 
the
+ * placement configuration isn't supported on the platform / submission
+ * interface.
+ * Returns -ENODEV if extension isn't supported on the platform / submission
+ * inteface.
+ */
+struct i915_context_engines_parallel_submit {
+   struct i915_user_extension base;
+
+   __u16 engine_index; /* slot for parallel engine */
+   __u16 width;/* number of contexts per parallel engine */
+   __u16 num_siblings; /* number of siblings per context */
+   __u16 mbz16;


Ok the big picture looks reasonable now, the flags still confuse me.



Yea, it is a bit confusing.


+/*
+ * Default placement behvavior (currently unsupported):
+ *
+ * Rather than restricting parallel submission to a single class with a
+ * logically contiguous placement (I915_PARALLEL_IMPLICT_BONDS), add a mode 
that
+ * enables parallel submission across multiple engine classes. In this case 
each
+ * context's logical engine mask indicates where that context can placed. It is
+ * implied in this mode that all contexts have mutual exclusive placement (e.g.
+ * if one context is running CS0 no other contexts can run on CS0).
+ *
+ * Example 1 pseudo code:
+ * CSX[Y] = engine class X, logical instance Y
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=2,
+ * engines=CS0[0],CS0[1],CS1[0],CS1[1])
+ *
+ * Results in the following valid placements:
+ * CS0[0], CS1[0]
+ * CS0[0], CS1[1]
+ * CS0[1], CS1[0]
+ * CS0[1], CS1[1]
+ *
+ * This can also be though of as 2 virtual engines:
+ * VE[0] = CS0[0], CS0[1]
+ * VE[1] = CS1[0], CS1[1]
+ *
+ * Example 2 pseudo code:
+ * CS[X] = generic engine of same class, logical instance X
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=3,
+ * engines=CS[0],CS[1],CS[2],CS[0],CS[1],CS[2])
+ *
+ * Results in the following valid placements:
+ * CS[0], CS[1]
+ * CS[0], CS[2]
+ * CS[1], CS[0]
+ * CS[1], CS[2]
+ * CS[2], CS[0]
+ * CS[2], CS[1]
+ *
+ *
+ * This can also be though of as 2 virtual engines:
+ * VE[0] = CS[0], CS[1], CS[2]
+ * VE[1] = CS[0], CS[1], CS[2]
+
+ * This enables a use case where all engines are created equally, we don't care
+ * where they are scheduled, we just want a certain number of resources, for
+ * those resources to be scheduled in parallel, and possibly across multiple
+ * engine classes.
+ */


So I don't really get what this does compared to setting the flag below.
Is this just about running the batchbuffers the wrong way round, i.e. if
you have (simplest case)

width=2, num_sibglings=1, engines=CS[0], CS[1]

Then both
CS[0], CS[1]
and
CS[1], CS[0]
are possible options for running 

Re: [Mesa-dev] [Intel-gfx] [RFC 2/2] drm/doc/rfc: i915 new parallel submission uAPI plan

2021-05-21 Thread Tvrtko Ursulin



On 19/05/2021 00:58, Matthew Brost wrote:

Add entry fpr i915 new parallel submission uAPI plan.

v2:
  (Daniel Vetter):
   - Expand logical order explaination
   - Add dummy header
   - Only allow N BBs in execbuf IOCTL
   - Configure parallel submission per slot not per gem context

Cc: Tvrtko Ursulin 
Cc: Tony Ye 
CC: Carl Zhang 
Cc: Daniel Vetter 
Cc: Jason Ekstrand 
Signed-off-by: Matthew Brost 
---
  Documentation/gpu/rfc/i915_parallel_execbuf.h | 144 ++
  Documentation/gpu/rfc/i915_scheduler.rst  |  53 ++-
  2 files changed, 196 insertions(+), 1 deletion(-)
  create mode 100644 Documentation/gpu/rfc/i915_parallel_execbuf.h

diff --git a/Documentation/gpu/rfc/i915_parallel_execbuf.h 
b/Documentation/gpu/rfc/i915_parallel_execbuf.h
new file mode 100644
index ..8c64b983ccad
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_parallel_execbuf.h
@@ -0,0 +1,144 @@
+#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see 
i915_context_engines_parallel_submit */
+
+/*
+ * i915_context_engines_parallel_submit:
+ *
+ * Setup a slot to allow multiple BBs to be submitted in a single execbuf 
IOCTL.
+ * Those BBs will then be scheduled to run on the GPU in parallel. Multiple
+ * hardware contexts are created internally in the i915 run these BBs. Once a
+ * slot is configured for N BBs only N BBs can be submitted in each execbuf
+ * IOCTL and this is implict behavior (e.g. the user doesn't tell the execbuf
+ * IOCTL there are N BBs, the execbuf IOCTL know how many BBs there are based 
on
+ * the slots configuration).


1)
Expand the term slot here with "slot in the context engine map" at least
once for clarity.


2)
About where execbuf will implicitly be finding batches - I suggest to also
cover the first/last flag here. I know you have it in the readme, but I think
it is good if the uapi header is as self-contained as possible.



+ *
+ * Their are two currently defined ways to control the placement of the
+ * hardware contexts on physical engines: default behavior (no flags) and
+ * I915_PARALLEL_IMPLICT_BONDS (a flag). More flags may be added the in the
+ * future as new hardware / use cases arise. Details of how to use this
+ * interface below above the flags.
+ *
+ * Returns -EINVAL if hardware context placement configuration invalid or if 
the
+ * placement configuration isn't supported on the platform / submission
+ * interface.
+ * Returns -ENODEV if extension isn't supported on the platform / submission
+ * inteface.
+ */
+struct i915_context_engines_parallel_submit {
+   struct i915_user_extension base;
+
+   __u16 engine_index; /* slot for parallel engine */
+   __u16 width;/* number of contexts per parallel engine */
+   __u16 num_siblings; /* number of siblings per context */
+   __u16 mbz16;
+/*
+ * Default placement behvavior (currently unsupported):
+ *
+ * Rather than restricting parallel submission to a single class with a
+ * logically contiguous placement (I915_PARALLEL_IMPLICT_BONDS), add a mode 
that


What do you mean by logically contiguous here? It sounds ambiguous
with respect to logical vs "normal" engine instance numbers.



+ * enables parallel submission across multiple engine classes. In this case 
each
+ * context's logical engine mask indicates where that context can placed. It is
+ * implied in this mode that all contexts have mutual exclusive placement (e.g.
+ * if one context is running CS0 no other contexts can run on CS0).


I think talking about the logical context and its mask is too much
implementation detail at the uapi level. Instead I would suggest a more
userspace programmer centric description.



+ *
+ * Example 1 pseudo code:
+ * CSX[Y] = engine class X, logical instance Y
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=2,
+ * engines=CS0[0],CS0[1],CS1[0],CS1[1])
+ *
+ * Results in the following valid placements:
+ * CS0[0], CS1[0]
+ * CS0[0], CS1[1]
+ * CS0[1], CS1[0]
+ * CS0[1], CS1[1]
+ *
+ * This can also be though of as 2 virtual engines:
+ * VE[0] = CS0[0], CS0[1]
+ * VE[1] = CS1[0], CS1[1]


Ah okay so essentially similar to what I was proposing a year ago. But 
then it is no longer "set_parallel" really. It is one slot in the engine 
map, right, with the idea to super class intel_context in the 
implementation?


So really a wide virtual engine, as opposed to a single one. In which case
I think it makes sense to stay close to the existing naming of the
load_balance extension for consistency. Load_balance_wide?
Load_balance_parallel? Multi?


I also have to say the notation "CS0[0]" - even I, who knows this problem
space, am finding it hard to penetrate what it actually means. (Also,
uppercase IMO makes it hard to read, but maybe it is just me.)


Looking a bit lower below, extension seems to be taking a 2d array of 
class:instance pairs, 

Re: [Mesa-dev] [Intel-gfx] [RFC 2/2] drm/doc/rfc: i915 new parallel submission uAPI plan

2021-05-21 Thread Tvrtko Ursulin



On 20/05/2021 20:41, Daniel Vetter wrote:

On Thu, May 20, 2021 at 11:57:44AM +0100, Tvrtko Ursulin wrote:


On 20/05/2021 10:54, Daniel Vetter wrote:

On Wed, May 19, 2021 at 7:19 PM Matthew Brost  wrote:


On Wed, May 19, 2021 at 01:10:04PM +0200, Daniel Vetter wrote:

On Tue, May 18, 2021 at 04:58:30PM -0700, Matthew Brost wrote:

Add entry fpr i915 new parallel submission uAPI plan.

v2:
   (Daniel Vetter):
- Expand logical order explaination
- Add dummy header
- Only allow N BBs in execbuf IOCTL
- Configure parallel submission per slot not per gem context

Cc: Tvrtko Ursulin 
Cc: Tony Ye 
CC: Carl Zhang 
Cc: Daniel Vetter 
Cc: Jason Ekstrand 
Signed-off-by: Matthew Brost 
---
   Documentation/gpu/rfc/i915_parallel_execbuf.h | 144 ++
   Documentation/gpu/rfc/i915_scheduler.rst  |  53 ++-
   2 files changed, 196 insertions(+), 1 deletion(-)
   create mode 100644 Documentation/gpu/rfc/i915_parallel_execbuf.h

diff --git a/Documentation/gpu/rfc/i915_parallel_execbuf.h 
b/Documentation/gpu/rfc/i915_parallel_execbuf.h
new file mode 100644
index ..8c64b983ccad
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_parallel_execbuf.h
@@ -0,0 +1,144 @@
+#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see 
i915_context_engines_parallel_submit */
+
+/*
+ * i915_context_engines_parallel_submit:
+ *
+ * Setup a slot to allow multiple BBs to be submitted in a single execbuf 
IOCTL.
+ * Those BBs will then be scheduled to run on the GPU in parallel. Multiple
+ * hardware contexts are created internally in the i915 run these BBs. Once a
+ * slot is configured for N BBs only N BBs can be submitted in each execbuf
+ * IOCTL and this is implict behavior (e.g. the user doesn't tell the execbuf
+ * IOCTL there are N BBs, the execbuf IOCTL know how many BBs there are based 
on
+ * the slots configuration).
+ *
+ * Their are two currently defined ways to control the placement of the
+ * hardware contexts on physical engines: default behavior (no flags) and
+ * I915_PARALLEL_IMPLICT_BONDS (a flag). More flags may be added the in the
+ * future as new hardware / use cases arise. Details of how to use this
+ * interface below above the flags.
+ *
+ * Returns -EINVAL if hardware context placement configuration invalid or if 
the
+ * placement configuration isn't supported on the platform / submission
+ * interface.
+ * Returns -ENODEV if extension isn't supported on the platform / submission
+ * inteface.
+ */
+struct i915_context_engines_parallel_submit {
+   struct i915_user_extension base;
+
+   __u16 engine_index; /* slot for parallel engine */
+   __u16 width;/* number of contexts per parallel engine */
+   __u16 num_siblings; /* number of siblings per context */
+   __u16 mbz16;


Ok the big picture looks reasonable now, the flags still confuse me.



Yea, it is a bit confusing.


+/*
+ * Default placement behvavior (currently unsupported):
+ *
+ * Rather than restricting parallel submission to a single class with a
+ * logically contiguous placement (I915_PARALLEL_IMPLICT_BONDS), add a mode 
that
+ * enables parallel submission across multiple engine classes. In this case 
each
+ * context's logical engine mask indicates where that context can placed. It is
+ * implied in this mode that all contexts have mutual exclusive placement (e.g.
+ * if one context is running CS0 no other contexts can run on CS0).
+ *
+ * Example 1 pseudo code:
+ * CSX[Y] = engine class X, logical instance Y
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=2,
+ * engines=CS0[0],CS0[1],CS1[0],CS1[1])
+ *
+ * Results in the following valid placements:
+ * CS0[0], CS1[0]
+ * CS0[0], CS1[1]
+ * CS0[1], CS1[0]
+ * CS0[1], CS1[1]
+ *
+ * This can also be though of as 2 virtual engines:
+ * VE[0] = CS0[0], CS0[1]
+ * VE[1] = CS1[0], CS1[1]
+ *
+ * Example 2 pseudo code:
+ * CS[X] = generic engine of same class, logical instance X
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=3,
+ * engines=CS[0],CS[1],CS[2],CS[0],CS[1],CS[2])
+ *
+ * Results in the following valid placements:
+ * CS[0], CS[1]
+ * CS[0], CS[2]
+ * CS[1], CS[0]
+ * CS[1], CS[2]
+ * CS[2], CS[0]
+ * CS[2], CS[1]
+ *
+ *
+ * This can also be though of as 2 virtual engines:
+ * VE[0] = CS[0], CS[1], CS[2]
+ * VE[1] = CS[0], CS[1], CS[2]
+
+ * This enables a use case where all engines are created equally, we don't care
+ * where they are scheduled, we just want a certain number of resources, for
+ * those resources to be scheduled in parallel, and possibly across multiple
+ * engine classes.
+ */


So I don't really get what this does compared to setting the flag below.
Is this just about running the batchbuffers the wrong way round, i.e. if
you have (simplest case)

wid

Re: [Mesa-dev] [Intel-gfx] [RFC 2/2] drm/doc/rfc: i915 new parallel submission uAPI plan

2021-05-24 Thread Tvrtko Ursulin



On 21/05/2021 17:48, Matthew Brost wrote:

On Fri, May 21, 2021 at 01:00:54PM +0100, Tvrtko Ursulin wrote:


[snip]


+ * enables parallel submission across multiple engine classes. In this case 
each
+ * context's logical engine mask indicates where that context can placed. It is
+ * implied in this mode that all contexts have mutual exclusive placement (e.g.
+ * if one context is running CS0 no other contexts can run on CS0).


I think talk about logical context and its mask is too implementation detail
at the uapi level. Instead I would suggest more userspace programmer centric
description.


Ok, can you give me suggestion? Writing DOC isn't really my strength.


Yeah, not mine either. Maybe we need to hire a technical writer. :)

I think in general I would just talk a bit about how, until now, submission
was along a single engine only and this is adding a wide submission model,
then expand on how it works with more details, and only then talk about
logical contexts if needed.


It depends a bit on whether our userspace clients still predominantly think
in terms of engines or in terms of contexts. I don't have an answer there.


It probably isn't the most important thing, and with a few tweaks of what
you have it can be good enough. The key is simply coming up with diagrams
that are as intuitive as possible, with consistent naming, showing how the
wide engine is built and how it works.



+ *
+ * Example 1 pseudo code:
+ * CSX[Y] = engine class X, logical instance Y
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=2,
+ * engines=CS0[0],CS0[1],CS1[0],CS1[1])
+ *
+ * Results in the following valid placements:
+ * CS0[0], CS1[0]
+ * CS0[0], CS1[1]
+ * CS0[1], CS1[0]
+ * CS0[1], CS1[1]
+ *
+ * This can also be though of as 2 virtual engines:
+ * VE[0] = CS0[0], CS0[1]
+ * VE[1] = CS1[0], CS1[1]


Ah okay so essentially similar to what I was proposing a year ago. But then
it is no longer "set_parallel" really. It is one slot in the engine map,
right, with the idea to super class intel_context in the implementation?



Yes, it is bascially a super class intel_context. In the implementation is
parent-child with the parent having a linked list of child intel_contexts.
  

So really a wide virtual engine, as opposed to single one. In which case I
think it makes sense to stay close to the existing naming of the
load_balance extension for consistency. Load_balance_wide?
Load_balance_parallel? Multi?


I like the way is named but I also don't want to argue about this as I don't
really care. If someone else says this should be renamed, let's do it.


I don't care too much apart from a general desire for more consistency 
and fewer terms in use.



I also have to say the notation "CS0[0]" - I who know this problem space am
finding it hard to penetrate what that actually means. (Also uppercase IMO
makes it hard to read, but maybe it is just me.)



Yea, now I think about it CS0[0] is bad because of using numbers twice. How
about CSX[0] & CSY[1]? I used upper case because in the i915 all engine classes
defines are upper case but agree it might be easier to read it lower case.


What would X and Y represent? Or if I paste this part:

 * VE[0] = CS0[0], CS0[1]
 * VE[1] = CS1[0], CS1[1]

Which index is engine instance and what is the other index?


Looking a bit lower below, extension seems to be taking a 2d array of
class:instance pairs, right? If so then reading these docs in order, or even
just looking further down, I don't think that is explicitly called out
clearly enough.

So I think a paragraph or two explaining clearly how the 2d array of engines
corresponds to the allowed engines for full virtual engine width. Or maybe
just a 2d diagram?

   2-wide virtual engine:
 .engines = [
   /* channel 0 allowed engines: */  [cs0, cs1],
   /* channel 1 allowed engines: */  [cs0, cs1]
  ]

Not sure if that's better.



Yes, it is a 2-d array. Agree the explaination could be better.


Also to be noted, this only allows a uniform number of allowed engines per
channel. I am not saying we need the non-uniform setup today, but with bonds
there isn't this limitation.



Not exactly. You could do something like this.

witdth = 2
siblings = 2
engines = CSX[0], CSX[1], CSY[0], INVALID

This would allow a placement of:

CSX[0], CSY[0]
CSX[1], CSY[0]

In this case the siblings is just a max value of each entry.


Okay, fair - I did not think about that or see it mentioned.

Regards,

Tvrtko


Re: [Mesa-dev] [Intel-gfx] [RFC PATCH 1/2] drm/doc/rfc: i915 GuC submission / DRM scheduler

2021-05-27 Thread Tvrtko Ursulin


On 27/05/2021 00:33, Matthew Brost wrote:

Add entry for i915 GuC submission / DRM scheduler integration plan.
Follow up patch with details of new parallel submission uAPI to come.

v2:
  (Daniel Vetter)
   - Expand explaination of why bonding isn't supported for GuC
 submission
   - CC some of the DRM scheduler maintainers
   - Add priority inheritance / boosting use case
   - Add reasoning for removing in order assumptions
  (Daniel Stone)
   - Add links to priority spec


Where will the outstanding items, like (off the top of my head) error
capture and the open source logging tool, be tracked? I thought here but
maybe not.


Regards,

Tvrtko


Cc: Christian König 
Cc: Luben Tuikov 
Cc: Alex Deucher 
Cc: Steven Price 
Cc: Jon Bloomfield 
Cc: Jason Ekstrand 
Cc: Dave Airlie 
Cc: Daniel Vetter 
Cc: Jason Ekstrand 
Cc: dri-de...@lists.freedesktop.org
Signed-off-by: Matthew Brost 
---
  Documentation/gpu/rfc/i915_scheduler.rst | 85 
  Documentation/gpu/rfc/index.rst  |  4 ++
  2 files changed, 89 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_scheduler.rst

diff --git a/Documentation/gpu/rfc/i915_scheduler.rst 
b/Documentation/gpu/rfc/i915_scheduler.rst
new file mode 100644
index ..7faa46cde088
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_scheduler.rst
@@ -0,0 +1,85 @@
+=
+I915 GuC Submission/DRM Scheduler Section
+=
+
+Upstream plan
+=
+For upstream the overall plan for landing GuC submission and integrating the
+i915 with the DRM scheduler is:
+
+* Merge basic GuC submission
+   * Basic submission support for all gen11+ platforms
+   * Not enabled by default on any current platforms but can be enabled via
+ modparam enable_guc
+   * Lots of rework will need to be done to integrate with DRM scheduler so
+ no need to nit pick everything in the code, it just should be
+ functional, no major coding style / layering errors, and not regress
+ execlists
+   * Update IGTs / selftests as needed to work with GuC submission
+   * Enable CI on supported platforms for a baseline
+   * Rework / get CI healthy for GuC submission in place as needed
+* Merge new parallel submission uAPI
+   * Bonding uAPI completely incompatible with GuC submission, plus it has
+ severe design issues in general, which is why we want to retire it no
+ matter what
+   * New uAPI adds I915_CONTEXT_ENGINES_EXT_PARALLEL context setup step
+ which configures a slot with N contexts
+   * After I915_CONTEXT_ENGINES_EXT_PARALLEL a user can submit N batches to
+ a slot in a single execbuf IOCTL and the batches run on the GPU in
+ parallel
+   * Initially only for GuC submission but execlists can be supported if
+ needed
+* Convert the i915 to use the DRM scheduler
+   * GuC submission backend fully integrated with DRM scheduler
+   * All request queues removed from backend (e.g. all backpressure
+ handled in DRM scheduler)
+   * Resets / cancels hook in DRM scheduler
+   * Watchdog hooks into DRM scheduler
+   * Lots of complexity of the GuC backend can be pulled out once
+ integrated with DRM scheduler (e.g. state machine gets
+ simpler, locking gets simpler, etc...)
+   * Execlist backend will do the minimum required to hook in the DRM
+ scheduler so it can live next to the fully integrated GuC backend
+   * Legacy interface
+   * Features like timeslicing / preemption / virtual engines would
+ be difficult to integrate with the DRM scheduler and these
+ features are not required for GuC submission as the GuC does
+ these things for us
+   * ROI low on fully integrating into DRM scheduler
+   * Fully integrating would add lots of complexity to DRM
+ scheduler
+   * Port i915 priority inheritance / boosting feature in DRM scheduler
+   * Used for i915 page flip, may be useful to other DRM drivers as
+ well
+   * Will be an optional feature in the DRM scheduler
+   * Remove in-order completion assumptions from DRM scheduler
+   * Even when using the DRM scheduler the backends will handle
+ preemption, timeslicing, etc... so it is possible for jobs to
+ finish out of order
+   * Pull out i915 priority levels and use DRM priority levels
+   * Optimize DRM scheduler as needed
+
+New uAPI for basic GuC submission
+=
+No major changes are required to the uAPI for basic GuC submission. The only
+change is a new scheduler attribute: I915_SCHEDULER_CAP_STATIC_PRIORITY_MAP.
+This attribute indicates the 2k i915 user priority level

Re: [Mesa-dev] [Intel-gfx] [RFC PATCH 2/2] drm/doc/rfc: i915 new parallel submission uAPI plan

2021-05-27 Thread Tvrtko Ursulin


On 27/05/2021 00:33, Matthew Brost wrote:

Add entry for i915 new parallel submission uAPI plan.

v2:
  (Daniel Vetter):
   - Expand logical order explaination
   - Add dummy header
   - Only allow N BBs in execbuf IOCTL
   - Configure parallel submission per slot not per gem context
v3:
  (Marcin Ślusarz):
   - Lot's of typos / bad english fixed
  (Tvrtko Ursulin):
   - Consistent pseudo code, clean up wording in descriptions

Cc: Tvrtko Ursulin 
Cc: Tony Ye 
CC: Carl Zhang 
Cc: Daniel Vetter 
Cc: Jason Ekstrand 
Signed-off-by: Matthew Brost 
---
  Documentation/gpu/rfc/i915_parallel_execbuf.h | 145 ++
  Documentation/gpu/rfc/i915_scheduler.rst  |  55 ++-
  2 files changed, 198 insertions(+), 2 deletions(-)
  create mode 100644 Documentation/gpu/rfc/i915_parallel_execbuf.h

diff --git a/Documentation/gpu/rfc/i915_parallel_execbuf.h 
b/Documentation/gpu/rfc/i915_parallel_execbuf.h
new file mode 100644
index ..20de206e3ab4
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_parallel_execbuf.h
@@ -0,0 +1,145 @@
+#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see 
i915_context_engines_parallel_submit */
+
+/*
+ * i915_context_engines_parallel_submit:
+ *
+ * Setup a slot in the context engine map to allow multiple BBs to be submitted
+ * in a single execbuf IOCTL. Those BBs will then be scheduled to run on the 
GPU
+ * in parallel. Multiple hardware contexts are created internally in the i915
+ * run these BBs. Once a slot is configured for N BBs only N BBs can be
+ * submitted in each execbuf IOCTL and this is implicit behavior e.g. The user


"i.e. the user" I think.


+ * doesn't tell the execbuf IOCTL there are N BBs, the execbuf IOCTL know how


knows


+ * many BBs there are based on the slots configuration. The N BBs are the last 
N


"slot's" ? Or maybe better fully qualified as "engine map slot"?


+ * buffer objects for first N if I915_EXEC_BATCH_FIRST is set.


s/for/or/
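
As an aside, a minimal sketch of what submitting the N (here two) batch
buffers through a single execbuf might look like from userspace, assuming a
slot already configured with width=2 as described above; the helper name,
object array and context id are made up for illustration:

#include <stdint.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Illustrative only: the last two objects in objs[] are the batch buffers
 * (or the first two if I915_EXEC_BATCH_FIRST were set). */
static int submit_two_batches(int fd, uint32_t ctx_id,
			      struct drm_i915_gem_exec_object2 *objs,
			      unsigned int count)
{
	struct drm_i915_gem_execbuffer2 execbuf = {
		.buffers_ptr = (uintptr_t)objs,
		.buffer_count = count,	/* includes both batch buffers */
		.flags = 0,		/* index of the parallel slot in the engine map */
		.rsvd1 = ctx_id,	/* context with the parallel slot configured */
	};

	return drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
}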


+ *
+ * There are two currently defined ways to control the placement of the
+ * hardware contexts on physical engines: default behavior (no flags) and
+ * I915_PARALLEL_IMPLICIT_BONDS (a flag). More flags may be added the in the
+ * future as new hardware / use cases arise. Details of how to use this
+ * interface above the flags field in this structure.


"are above", "can be found above" ?


+ *
+ * Returns -EINVAL if hardware context placement configuration is invalid or if
+ * the placement configuration isn't supported on the platform / submission
+ * interface.
+ * Returns -ENODEV if extension isn't supported on the platform / submission
+ * interface.
+ */
+struct i915_context_engines_parallel_submit {
+   struct i915_user_extension base;
+
+   __u16 engine_index; /* slot for parallel engine */
+   __u16 width;/* number of contexts per parallel engine */
+   __u16 num_siblings; /* number of siblings per context */
+   __u16 mbz16;
+/*
+ * Default placement behavior (currently unsupported):
+ *
+ * Allow BBs to be placed on any available engine instance. In this case each
+ * context's engine mask indicates where that context can be placed. It is


Context does not have an engine mask in the uapi so I'd explain this in
terms of a list of engines.
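
For illustration, a minimal sketch of what such a list of engines looks like
with the existing I915_CONTEXT_PARAM_ENGINES uAPI (instance numbers and the
helper name are made up); the parallel extension proposed here would then be
chained in via the extensions pointer to configure one of the map slots:

#include <stdint.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Illustrative only: install an explicit two-entry engine list (there is
 * no engine mask in the uapi) on an existing context. */
static void set_two_video_engines(int fd, uint32_t ctx_id)
{
	static I915_DEFINE_CONTEXT_PARAM_ENGINES(engines, 2) = {
		.engines = {
			{ I915_ENGINE_CLASS_VIDEO, 0 },
			{ I915_ENGINE_CLASS_VIDEO, 1 },
		},
	};
	struct drm_i915_gem_context_param param = {
		.ctx_id = ctx_id,
		.param = I915_CONTEXT_PARAM_ENGINES,
		.size = sizeof(engines),
		.value = (uintptr_t)&engines,
	};

	drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &param);
}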



+ * implied in this mode that all contexts have mutual exclusive placement.


mutually


+ * e.g. If one context is running CSX[0] no other contexts can run on CSX[0]).


s/If/if, I think.

I don't find CSX[0] readable, nor do I think introducing csx as a term is
desirable. Lowercase cs (like cs0) is what I would prefer, both for
readability and because cs = command streamer is likely more obvious as to
what it is. Naming like rcs0 (prefix-cs-number) and similar is exposed in
kernel logs and error state, while csx[] (cs-suffix-number) is not at all.


In placement examples I would refrain from using any prefixes/suffixes 
to designate engine classes and just say cs0/cs1, mentioning the 
same/mixed class requirement separately in the notes if applicable.



+ *
+ * Example 1 pseudo code:
+ * CSX,Y[N] = generic engine class X or Y, logical instance N
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=2,
+ * engines=CSX[0],CSX[1],CSY[0],CSY[1])
+ *
+ * Results in the following valid placements:
+ * CSX[0], CSY[0]
+ * CSX[0], CSY[1]
+ * CSX[1], CSY[0]
+ * CSX[1], CSY[1]
+ *
+ * This can also be thought of as 2 virtual engines described by 2-D array in
+ * the engines the field:


One "the" too many.


+ * VE[0] = CSX[0], CSX[1]
+ * VE[1] = CSY[0], CSY[1]
+ *
+ * Example 2 pseudo code:
+ * CSX[Y] = generic engine of same class X, logical instance N
+ * INVALID = I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE
+ * set_engines(INVALID)
+ * set_parallel(engine_index=0, width=2, num_siblings=3,
+ *

Re: [Intel-gfx] [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-27 Thread Tvrtko Ursulin



On 20/04/2022 18:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 +++
  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h
new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as known to the
+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS
+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+   /** @region: The class:instance pair encoding */
+   struct drm_i915_gem_memory_class_instance region;
+
+   /** @rsvd0: MBZ */
+   __u32 rsvd0;
+
+   /** @probed_size: Memory probed by the driver (-1 = unknown) */
+   __u64 probed_size;
+
+   /** @unallocated_size: Estimate of memory remaining (-1 = unknown) */
+   __u64 unallocated_size;
+
+   union {
+   /** @rsvd1: MBZ */
+   __u64 rsvd1[8];
+   struct {
+   /**
+* @probed_cpu_visible_size: Memory probed by the driver
+* that is CPU accessible. (-1 = unknown).
+*
+* This will be always be <= @probed_size, and the
+* remainder(if there is any) will not be CPU
+* accessible.
+*/
+   __u64 probed_cpu_visible_size;


Would unallocated_cpu_visible_size be useful, to follow the total 
unallocated_size?


Btw, have we ever considered whether unallocated_size should require 
CAP_SYS_ADMIN/PERFMON or something?
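
For context, the query itself is the usual two-pass pattern; a minimal
sketch, noting that probed_cpu_visible_size is from the proposed header
above and not yet part of the released struct:

#include <stdint.h>
#include <stdlib.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Illustrative only: first pass sizes the buffer, second pass fills it. */
static struct drm_i915_query_memory_regions *query_regions(int fd)
{
	struct drm_i915_query_item item = {
		.query_id = DRM_I915_QUERY_MEMORY_REGIONS,
	};
	struct drm_i915_query query = {
		.num_items = 1,
		.items_ptr = (uintptr_t)&item,
	};
	struct drm_i915_query_memory_regions *regions;

	/* First pass: the kernel writes the required size into item.length. */
	if (drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query) || item.length <= 0)
		return NULL;

	regions = calloc(1, item.length);
	if (!regions)
		return NULL;
	item.data_ptr = (uintptr_t)regions;

	/* Second pass: num_regions and the per-region info are written out. */
	if (drmIoctl(fd, DRM_IOCTL_I915_QUERY, &query)) {
		free(regions);
		return NULL;
	}

	return regions;
}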



+   };
+   };
+};
+
+/**
+ * struct __drm_i915_gem_create_ext - Existing gem_create behaviour, with added
+ * extension support using struct i915_user_extension.
+ *
+ * Note that new buffer flags should be added here, at least for the stuff that
+ * is immutable. Previously we would have two ioctls, one to create the object
+ * with gem_create, and another to apply various parameters, however this
+ * creates some ambiguity for the params which are considered immutable. Also 
in
+ * general we're phasing out the various SET/GET ioctls.
+ */
+struct __drm_i915_gem_create_ext {
+   /**
+* @size: Requested size for the object.
+*
+* The (page-aligned) allocated size for the object will be returned.
+*
+* Note that for some devices we have might have further minimum
+* page-size restrictions(larger than 4K), like for device local-memory.
+* However in general the final size here should always reflect any
+* rounding up, if for example using the 
I915_GEM_CREATE_EXT_MEMORY_REGIONS
+* extension to place the object in device local-memory.
+*/
+   __u64 size;
+   /**
+* @handle: Returned handle for the object.
+*
+* Object handles are nonzero.
+*/
+   __u32 handle;
+   /**
+* @flags: Optional flags.
+*
+* Supported values:
+*
+* I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS - Signal to the kernel that
+* the object will need to be accessed via the CPU.
+*
+* Only valid when placing objects in I915_MEMORY_CLASS_DEVICE, and
+* only strictly required on platforms where only some of the device
+* memory is directly visible or mappable through the CPU, like on DG2+.
+*
+* One of the placements MUST also be I915_MEMORY_CLASS_SYSTEM, to
+* ensure we can always spill the allocation to system memory, if we
+* can't place the object in the mappable part of
+* I915_MEMORY_CLASS_DEVICE.
+*
+* Note that since the kernel only supports flat-CCS on objects that can
+* *only* be placed in I915_MEMORY_CLASS_DEVICE, we therefore don't
+* support I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS together with
+* flat-CCS.
+*
+* Without th

Re: [Intel-gfx] [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-28 Thread Tvrtko Ursulin



On 27/04/2022 18:36, Matthew Auld wrote:

On 27/04/2022 09:36, Tvrtko Ursulin wrote:


On 20/04/2022 18:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 +++
  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;
+
+    /** @unallocated_size: Estimate of memory remaining (-1 = 
unknown) */

+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).
+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ */
+    __u64 probed_cpu_visible_size;


Would unallocated_cpu_visible_size be useful, to follow the total 
unallocated_size?


Make sense. But I don't think unallocated_size has actually been 
properly wired up yet. It still just gives the same value as 
probed_size. IIRC for unallocated_size we still need a real 
user/usecase/umd, before wiring that up for real with the existing avail 
tracking. Once we have that we can also add unallocated_cpu_visible_size.


So this does nothing at the moment:

 info.unallocated_size = mr->avail;

Right, it is set to "mem->avail = mem->total;" at region init time and I 
indeed can't find it ever getting modified. Okay.


Btw, have we ever considered whether unallocated_size should require 
CAP_SYS_ADMIN/PERFMON or something?


Not sure. But just in case we do add it for real at some point, why the 
added restriction?


To avoid a side channel, albeit perhaps a very weak one. For engine
utilization we require CAP_SYS_PERFMON, but that is implied by the perf core
API. It's open for discussion. I guess it may make sense to limit it also
because it is questionable whether the field(s) are even useful.







+    };
+    };
+};
+
+/**
+ * struct __drm_i915_gem_create_ext - Existing gem_create behaviour, 
with added

+ * extension support using struct i915_user_extension.
+ *
+ * Note that new buffer flags should be added here, at least for the 
stuff that
+ * is immutable. Previously we would have two ioctls, one to create 
the object
+ * with gem_create, and another to apply various parameters, however 
this
+ * creates some ambiguity for the params which are considered 
immutable. Also in

+ * general we're phasing out the various SET/GET ioctls.
+ */
+struct __drm_i915_gem_create_ext {
+    /**
+ * @size: Requested size for the object.
+ *
+ * The (page-aligned) allocated size for the object will be 
returned.

+ *
+ * Note that for some devices we have might have further minimum
+ * page-size restrictions(larger than 4K), likefor device 
local-memory.

+ * However in general the final size here should always reflect any
+ * rounding up, if for example using the 
I915_GEM_CREATE_EXT_MEMORY_REGIONS

+ * extension to place the object in device local-memory.
+ */
+    __u64 size;
+    /**
+ * @handle: Returned handle for the object.
+ *
+ * Object handles are nonzero.
+ */
+    __u32 handle;
+    /**
+ * @flags: Optional flags.
+ *
+ * Supported values:
+ *
+ * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS - Signal to the 
kernel that

+

Re: [Intel-gfx] [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-04-28 Thread Tvrtko Ursulin



On 28/04/2022 11:25, Matthew Auld wrote:

On 28/04/2022 09:55, Tvrtko Ursulin wrote:


On 27/04/2022 18:36, Matthew Auld wrote:

On 27/04/2022 09:36, Tvrtko Ursulin wrote:


On 20/04/2022 18:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 
+++

  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;
+
+    /** @unallocated_size: Estimate of memory remaining (-1 = 
unknown) */

+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).
+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ */
+    __u64 probed_cpu_visible_size;


Would unallocated_cpu_visible_size be useful, to follow the total 
unallocated_size?


Make sense. But I don't think unallocated_size has actually been 
properly wired up yet. It still just gives the same value as 
probed_size. IIRC for unallocated_size we still need a real 
user/usecase/umd, before wiring that up for real with the existing 
avail tracking. Once we have that we can also add 
unallocated_cpu_visible_size.


So this does nothing at the moment:

  info.unallocated_size = mr->avail;

Right, it is set to "mem->avail = mem->total;" at region init time and 
I indeed can't find it ever getting modified. Okay.


Btw, have we ever considered whether unallocated_size should require 
CAP_SYS_ADMIN/PERFMON or something?


Note sure. But just in case we do add it for real at some point, why 
the added restriction?


To avoid a side channel, albeit perhaps a very weak one. For engine 
utilization we require CAP_SYS_PERFMON, but that is implied by the 
perf core API. It's open for discussion. I guess it may make sense to 
limit it also because it is questionable the field(s) are even useful.







+    };
+    };
+};
+
+/**
+ * struct __drm_i915_gem_create_ext - Existing gem_create 
behaviour, with added

+ * extension support using struct i915_user_extension.
+ *
+ * Note that new buffer flags should be added here, at least for 
the stuff that
+ * is immutable. Previously we would have two ioctls, one to 
create the object
+ * with gem_create, and another to apply various parameters, 
however this
+ * creates some ambiguity for the params which are considered 
immutable. Also in

+ * general we're phasing out the various SET/GET ioctls.
+ */
+struct __drm_i915_gem_create_ext {
+    /**
+ * @size: Requested size for the object.
+ *
+ * The (page-aligned) allocated size for the object will be 
returned.

+ *
+ * Note that for some devices we have might have further minimum
+ * page-size restrictions(larger than 4K), likefor device 
local-memory.
+ * However in general the final size here should always 
reflect any
+ * rounding up, if for example using the 
I915_GEM_CREATE_EXT_MEMORY_REGIONS

+ * extension to place the object in device local-memory.
+ */
+    __u64 size;
+    /**
+ * @handle: Returned handle for the object.
+ *
+ * Object handles are nonzero.
+ */
+    __u32 handle;
+    /**
+ * @flags: Optional flags.
+ *
+ * Supported values:
+ *

Re: [PATCH v2] drm/doc: add rfc section for small BAR uapi

2022-05-03 Thread Tvrtko Ursulin



On 03/05/2022 11:22, Matthew Auld wrote:

On 02/05/2022 09:53, Lionel Landwerlin wrote:

On 02/05/2022 10:54, Lionel Landwerlin wrote:

On 20/04/2022 20:13, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 190 
+++

  Documentation/gpu/rfc/i915_small_bar.rst |  58 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 252 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..7bfd0cf44d35
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,190 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;
+
+    /** @unallocated_size: Estimate of memory remaining (-1 = 
unknown) */

+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).
+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ */
+    __u64 probed_cpu_visible_size;
+    };



Trying to implement userspace support in Vulkan for this, I have an 
additional question about the value of probed_cpu_visible_size.


When is it set to -1?

I'm guessing before there is support for this value it'll be 0 (MBZ).

After after it should either be the entire lmem or something smaller.


-Lionel



Other pain point of this new uAPI, previously we could query the 
unallocated size for each heap.


unallocated_size should always give the same value as probed_size. We 
have the avail tracking, but we don't currently expose that through 
unallocated_size, due to lack of real userspace/user etc.




Now lmem is effectively divided into 2 heaps, but unallocated_size is 
tracking allocation from both parts of lmem.


Yeah, if we ever properly expose the unallocated_size, then we could 
also just add unallocated_cpu_visible_size.




Is adding new I915_MEMORY_CLASS_DEVICE_NON_MAPPABLE out of question?


I don't think it's out of the question...

I guess user-space should be able to get the current flag behaviour just 
by specifying: device, system. And it does give more flexibly to allow 
something like: device, device-nm, smem.


I was also thinking about that option, albeit with both regions under the
existing class, just separate instances with differing "capability" flags.


The downsides I thought were a) it does not really match the underlying
resource, which is one and not two from the backing storage POV, and b) it
allows userspace to potentially be too restrictive with
regions=device-mappable,system (even if only as an innocent mistake),
preventing i915 from managing the space better in cases where multiple
clients happen to fight over it.


The last part is going back to what I was commenting earlier, where I
thought migrating objects which had cpu-access set to any device region
should be allowed. (When the mappable region is oversubscribed.)
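
For reference, a minimal sketch of what the current flag-based proposal
looks like from userspace (the flag and the exact behaviour are from this
RFC, the helper name is made up), with the required system memory placement
included as a spill target:

#include <stdint.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Illustrative only: create an object that must be CPU accessible, placed
 * in lmem if possible and allowed to spill to smem per the documented
 * rules. */
static int create_cpu_accessible_lmem(int fd, uint64_t size, uint32_t *handle)
{
	struct drm_i915_gem_memory_class_instance placements[] = {
		{ I915_MEMORY_CLASS_DEVICE, 0 },
		{ I915_MEMORY_CLASS_SYSTEM, 0 },
	};
	struct drm_i915_gem_create_ext_memory_regions regions = {
		.base.name = I915_GEM_CREATE_EXT_MEMORY_REGIONS,
		.num_regions = 2,
		.regions = (uintptr_t)placements,
	};
	struct drm_i915_gem_create_ext create = {
		.size = size,
		.flags = I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS, /* proposed flag */
		.extensions = (uintptr_t)&regions,
	};
	int ret;

	ret = drmIoctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create);
	if (!ret)
		*handle = create.handle;

	return ret;
}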


Regards,

Tvrtko

We can also drop the probed_cpu_visible_size, which would now just be 
the probed_size with device/device-nm. And if we lack device-nm, then 
the entire thing must be CPU mappable.


One of the downsides though, is that we can no longer easily mix object 
pages from both device + device-nm, which we could previously do when we 
didn't specify the flag. At least according to the current 
design/behaviour for @regions that would not be allowed. I guess some 
kind of new flag like ALLOC_MIXED or so? Although currently that is only 
possible with device + device-nm in ttm/i915.





-Lionel






+    };
+};
+
+/**
+ * struct __drm_i915_gem_cr

Re: [PATCH v3] drm/doc: add rfc section for small BAR uapi

2022-05-17 Thread Tvrtko Ursulin



On 16/05/2022 19:11, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)
v3:
   - Drop the vma query for now.
   - Add unallocated_cpu_visible_size as part of the region query.
   - Improve the docs some more, including documenting the expected
 behaviour on older kernels, since this came up in some offline
 discussion.

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Tvrtko Ursulin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jon Bloomfield 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 164 +++
  Documentation/gpu/rfc/i915_small_bar.rst |  47 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 215 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h
new file mode 100644
index ..4079d287750b
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,164 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as known to the
+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS
+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+   /** @region: The class:instance pair encoding */
+   struct drm_i915_gem_memory_class_instance region;
+
+   /** @rsvd0: MBZ */
+   __u32 rsvd0;
+
+   /** @probed_size: Memory probed by the driver (-1 = unknown) */
+   __u64 probed_size;


Is -1 possible today, or when will it be? For system memory it appears
zeroes are returned today, so that has to stay I think. The question is
whether it effectively means userspace has to consider both 0 and -1 as
unknown.



+
+   /**
+* @unallocated_size: Estimate of memory remaining (-1 = unknown)
+*
+* Note this is only currently tracked for I915_MEMORY_CLASS_DEVICE
+* regions, and also requires CAP_PERFMON or CAP_SYS_ADMIN to get
+* reliable accounting. Without this(or if this an older kernel) the


s/if this an/if this is an/

Also same question as above about -1.


+* value here will always match the @probed_size.
+*/
+   __u64 unallocated_size;
+
+   union {
+   /** @rsvd1: MBZ */
+   __u64 rsvd1[8];
+   struct {
+   /**
+* @probed_cpu_visible_size: Memory probed by the driver
+* that is CPU accessible. (-1 = unknown).


Also the same question about -1. In this case it could be done, since the
field is yet to be added, but I am curious whether it can ever be -1.



+*
+* This will be always be <= @probed_size, and the
+* remainder(if there is any) will not be CPU
+* accessible.
+*
+* On systems without small BAR, the @probed_size will
+* always equal the @probed_cpu_visible_size, since all
+* of it will be CPU accessible.
+*
+* Note that if the value returned here is zero, then
+* this must be an old kernel which lacks the relevant
+* small-bar uAPI support(including


I have noticed you prefer no space before parentheses throughout the text,
so I guess it's just my preference to have it. Very nitpicky even if I am
right, so up to you.
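
To spell out the interpretation implied by the quoted note, a minimal sketch
using the struct from the RFC header above (the helper name is made up; how
-1 should be handled is exactly what is being discussed here):

#include <stdint.h>

/* Illustrative only: zero is documented to mean an old kernel without the
 * small-bar uAPI, in which case everything is CPU accessible. */
static uint64_t region_cpu_visible_size(const struct __drm_i915_memory_region_info *info)
{
	if (info->probed_cpu_visible_size == 0)
		return info->probed_size;

	return info->probed_cpu_visible_size;
}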



+* I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS), but on
+* such systems we should never actually end up with a
+* small BAR configuration, assuming we are able to load
+* the kernel module. Hence it should be safe to treat
+* this the same as when @probed_cpu_visible_size ==
+* @probed_size.
+*/
+   __u64 probed_cpu_visible_size;
+
+   /**
+* @unallocated_cpu_visible_size: Estimate of CPU
+* visible memory remaining (-1 = unknown).
+*
+* Note this is only currently tracked for
+* I915_MEMORY_CLASS_DEVICE regions, and also requires
+* CAP_PERFMON or CAP_

Re: [PATCH v3] drm/doc: add rfc section for small BAR uapi

2022-05-17 Thread Tvrtko Ursulin



On 17/05/2022 09:55, Lionel Landwerlin wrote:

On 17/05/2022 11:29, Tvrtko Ursulin wrote:


On 16/05/2022 19:11, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)
v3:
   - Drop the vma query for now.
   - Add unallocated_cpu_visible_size as part of the region query.
   - Improve the docs some more, including documenting the expected
 behaviour on older kernels, since this came up in some offline
 discussion.

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Tvrtko Ursulin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jon Bloomfield 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 164 +++
  Documentation/gpu/rfc/i915_small_bar.rst |  47 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 215 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..4079d287750b
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,164 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;


Is -1 possible today or when it will be? For system memory it appears 
zeroes are returned today so that has to stay I think. Does it 
effectively mean userspace has to consider both 0 and -1 as unknown is 
the question.



I raised this on v2. As far as I can tell there are no situation where 
we would get -1.


Is it really probed_size=0 on smem?? It's not the case on the internal 
branch.


My bad, I misread the arguments to intel_memory_region_create while grepping:

struct intel_memory_region *i915_gem_shmem_setup(struct drm_i915_private *i915,
 u16 type, u16 instance)
{
return intel_memory_region_create(i915, 0,
  totalram_pages() << PAGE_SHIFT,
  PAGE_SIZE, 0, 0,
  type, instance,
  &shmem_region_ops);

I saw "0, 0" and wrongly assumed that would be the data, since it matched with 
my mental model and the comment against unallocated_size saying it's only tracked for 
device memory.

Although I'd say it is questionable for i915 to return this data. I wonder
if a use case is possible where it would even be wrong, but I don't know. I
guess the cat is out of the bag now.

If the situation is -1 for unknown and some valid size (not zero) I don't think 
there is a problem here.

Regards,

Tvrtko


Anv is not currently handling that case.


I would very much like to not deal with 0 for smem.

It really makes it easier for userspace, rather than having to fish
information from 2 different places, on top of dealing with multiple kernel
versions.



-Lionel





+
+    /**
+ * @unallocated_size: Estimate of memory remaining (-1 = unknown)
+ *
+ * Note this is only currently tracked for I915_MEMORY_CLASS_DEVICE
+ * regions, and also requires CAP_PERFMON or CAP_SYS_ADMIN to get
+ * reliable accounting. Without this(or if this an older kernel) 
the


s/if this an/if this is an/

Also same question as above about -1.


+ * value here will always match the @probed_size.
+ */
+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).


Also question about -1. In this case this could be done since the 
field is yet to be added but I am curious if it ever can be -1.



+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ *
+ * On systems without small BAR, the @probed_size will
+   

Re: [PATCH v3] drm/doc: add rfc section for small BAR uapi

2022-05-17 Thread Tvrtko Ursulin



On 17/05/2022 10:39, Lionel Landwerlin wrote:

On 17/05/2022 12:23, Tvrtko Ursulin wrote:


On 17/05/2022 09:55, Lionel Landwerlin wrote:

On 17/05/2022 11:29, Tvrtko Ursulin wrote:


On 16/05/2022 19:11, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)
v3:
   - Drop the vma query for now.
   - Add unallocated_cpu_visible_size as part of the region query.
   - Improve the docs some more, including documenting the expected
 behaviour on older kernels, since this came up in some offline
 discussion.

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Tvrtko Ursulin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jon Bloomfield 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 164 
+++

  Documentation/gpu/rfc/i915_small_bar.rst |  47 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 215 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..4079d287750b
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,164 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;


Is -1 possible today or when it will be? For system memory it 
appears zeroes are returned today so that has to stay I think. Does 
it effectively mean userspace has to consider both 0 and -1 as 
unknown is the question.



I raised this on v2. As far as I can tell there are no situation 
where we would get -1.


Is it really probed_size=0 on smem?? It's not the case on the 
internal branch.


My bad, I misread the arguments to intel_memory_region_create while 
grepping:


struct intel_memory_region *i915_gem_shmem_setup(struct 
drm_i915_private *i915,

 u16 type, u16 instance)
{
return intel_memory_region_create(i915, 0,
  totalram_pages() << PAGE_SHIFT,
  PAGE_SIZE, 0, 0,
  type, instance,
  &shmem_region_ops);

I saw "0, 0" and wrongly assumed that would be the data, since it 
matched with my mental model and the comment against unallocated_size 
saying it's only tracked for device memory.


Although I'd say it is questionable for i915 to return this data. I 
wonder it use case is possible where it would even be wrong but don't 
know. I guess the cat is out of the bag now.



Not sure how questionable that is. There are a bunch of tools reporting 
the amount of memory available (free, top, htop, etc...).


It might not be totalram_pages() but probably something close to it.


Questionable in that it is a resource the driver does not own, so reporting
it is purely a userspace convenience and a maintenance burden for the
driver. :) Not sure I understand why userspace even wants to know? The only
reliable use could be to decide whether to even try and run a workload? But
in that case too, perhaps users want to run it with swapping, so let them.


There is also the question of memory accounting, and a process could be
running in a container with a limited view of the world (totalram_pages
would be wrong). Although granted, we bypass memory accounting at the
moment. Not sure what the earlier discussion on that was. Probably because
one process can create an object and another can instantiate the backing
store. Whether or not we could, or should, therefore account against the
real creator is the question. But if we one day did, then we'd have to
fiddle with the memory query to stop returning totalram_pages(), and that
would potentially be complicated. Well, "would"... we probably will, since
this is already shipping.


Regards,

Tvrtko



Having a non 0 & non -1 value is useful.


-Lionel




If the situation is -1 for unknown and some valid size (not zero) I 
don't think there is a problem here.


Regards,

Tvrtko


Anv is not currently handling that case.


Re: [PATCH v3] drm/doc: add rfc section for small BAR uapi

2022-05-17 Thread Tvrtko Ursulin



On 17/05/2022 10:12, Matthew Auld wrote:

On 17/05/2022 09:29, Tvrtko Ursulin wrote:


On 16/05/2022 19:11, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)
v3:
   - Drop the vma query for now.
   - Add unallocated_cpu_visible_size as part of the region query.
   - Improve the docs some more, including documenting the expected
 behaviour on older kernels, since this came up in some offline
 discussion.

Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Tvrtko Ursulin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jon Bloomfield 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 164 +++
  Documentation/gpu/rfc/i915_small_bar.rst |  47 +++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 215 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h

new file mode 100644
index ..4079d287750b
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,164 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as 
known to the

+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS

+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+    /** @region: The class:instance pair encoding */
+    struct drm_i915_gem_memory_class_instance region;
+
+    /** @rsvd0: MBZ */
+    __u32 rsvd0;
+
+    /** @probed_size: Memory probed by the driver (-1 = unknown) */
+    __u64 probed_size;


Is -1 possible today or when it will be? For system memory it appears 
zeroes are returned today so that has to stay I think. Does it 
effectively mean userspace has to consider both 0 and -1 as unknown is 
the question.


It looks like it just returns the totalram_pages(). So at the moment 
nothing ever currently returns -1 or 0. Maybe that was a mistake for 
I915_MEMORY_SYSTEM.


Yes I figured out my confusion later.




+
+    /**
+ * @unallocated_size: Estimate of memory remaining (-1 = unknown)
+ *
+ * Note this is only currently tracked for I915_MEMORY_CLASS_DEVICE
+ * regions, and also requires CAP_PERFMON or CAP_SYS_ADMIN to get
+ * reliable accounting. Without this(or if this an older kernel) 
the


s/if this an/if this is an/

Also same question as above about -1.


This should be the same as above, since this will give the same value as 
probed_size, or give the real avail value for lmem.





+ * value here will always match the @probed_size.
+ */
+    __u64 unallocated_size;
+
+    union {
+    /** @rsvd1: MBZ */
+    __u64 rsvd1[8];
+    struct {
+    /**
+ * @probed_cpu_visible_size: Memory probed by the driver
+ * that is CPU accessible. (-1 = unknown).


Also question about -1. In this case this could be done since the 
field is yet to be added but I am curious if it ever can be -1.


I was just going to make this the same as probed_size for system memory. 
But we could use -1 here instead. What do you think? Same for 
unallocated below.


I don't completely follow the "instead" part, when you have already
documented it?


-1 will start appearing with changes that require CAP_SYS_PERFMON right?

Regards,

Tvrtko


+ *
+ * This will be always be <= @probed_size, and the
+ * remainder(if there is any) will not be CPU
+ * accessible.
+ *
+ * On systems without small BAR, the @probed_size will
+ * always equal the @probed_cpu_visible_size, since all
+ * of it will be CPU accessible.
+ *
+ * Note that if the value returned here is zero, then
+ * this must be an old kernel which lacks the relevant
+ * small-bar uAPI support(including


I have noticed you prefer no space before parentheses throughout the 
text so I guess it's just my preference to have it. Very nitpicky even 
if I am right so up to you.


Ok, will change :)




+ * I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS), but on
+ * such systems we should never actually end up with a
+ * small BAR configuration, assuming we are able to load
+ * the kernel module. Hence it should be safe to treat
+ * this the same as when @probed_cpu_visible_size ==
+  

Re: [PATCH v4] drm/doc: add rfc section for small BAR uapi

2022-05-17 Thread Tvrtko Ursulin



On 17/05/2022 11:52, Matthew Auld wrote:

Add an entry for the new uapi needed for small BAR on DG2+.

v2:
   - Some spelling fixes and other small tweaks. (Akeem & Thomas)
   - Rework error capture interactions, including no longer needing
 NEEDS_CPU_ACCESS for objects marked for capture. (Thomas)
   - Add probed_cpu_visible_size. (Lionel)
v3:
   - Drop the vma query for now.
   - Add unallocated_cpu_visible_size as part of the region query.
   - Improve the docs some more, including documenting the expected
 behaviour on older kernels, since this came up in some offline
 discussion.
v4:
   - Various improvements all over. (Tvrtko)


You can ignore my previous reply, the clarifications in v4 read well to me.


Acked-by: Tvrtko Ursulin 

Regards,

Tvrtko



Signed-off-by: Matthew Auld 
Cc: Thomas Hellström 
Cc: Lionel Landwerlin 
Cc: Tvrtko Ursulin 
Cc: Jon Bloomfield 
Cc: Daniel Vetter 
Cc: Jon Bloomfield 
Cc: Jordan Justen 
Cc: Kenneth Graunke 
Cc: Akeem G Abodunrin 
Cc: mesa-dev@lists.freedesktop.org
---
  Documentation/gpu/rfc/i915_small_bar.h   | 189 +++
  Documentation/gpu/rfc/i915_small_bar.rst |  47 ++
  Documentation/gpu/rfc/index.rst  |   4 +
  3 files changed, 240 insertions(+)
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.h
  create mode 100644 Documentation/gpu/rfc/i915_small_bar.rst

diff --git a/Documentation/gpu/rfc/i915_small_bar.h 
b/Documentation/gpu/rfc/i915_small_bar.h
new file mode 100644
index ..c676640b23ef
--- /dev/null
+++ b/Documentation/gpu/rfc/i915_small_bar.h
@@ -0,0 +1,189 @@
+/**
+ * struct __drm_i915_memory_region_info - Describes one region as known to the
+ * driver.
+ *
+ * Note this is using both struct drm_i915_query_item and struct 
drm_i915_query.
+ * For this new query we are adding the new query id 
DRM_I915_QUERY_MEMORY_REGIONS
+ * at &drm_i915_query_item.query_id.
+ */
+struct __drm_i915_memory_region_info {
+   /** @region: The class:instance pair encoding */
+   struct drm_i915_gem_memory_class_instance region;
+
+   /** @rsvd0: MBZ */
+   __u32 rsvd0;
+
+   /**
+* @probed_size: Memory probed by the driver (-1 = unknown)
+*
+* Note that it should not be possible to ever encounter a zero value
+* here, also note that no current region type will ever return -1 here.
+* Although for future region types, this might be a possibility. The
+* same applies to the other size fields.
+*/
+   __u64 probed_size;
+
+   /**
+* @unallocated_size: Estimate of memory remaining (-1 = unknown)
+*
+* Requires CAP_PERFMON or CAP_SYS_ADMIN to get reliable accounting.
+* Without this (or if this is an older kernel) the value here will
+* always equal the @probed_size. Note this is only currently tracked
+* for I915_MEMORY_CLASS_DEVICE regions (for other types the value here
+* will always equal the @probed_size).
+*/
+   __u64 unallocated_size;
+
+   union {
+   /** @rsvd1: MBZ */
+   __u64 rsvd1[8];
+   struct {
+   /**
+* @probed_cpu_visible_size: Memory probed by the driver
+* that is CPU accessible. (-1 = unknown).
+*
+* This will be always be <= @probed_size, and the
+* remainder (if there is any) will not be CPU
+* accessible.
+*
+* On systems without small BAR, the @probed_size will
+* always equal the @probed_cpu_visible_size, since all
+* of it will be CPU accessible.
+*
+* Note this is only tracked for
+* I915_MEMORY_CLASS_DEVICE regions (for other types the
+* value here will always equal the @probed_size).
+*
+* Note that if the value returned here is zero, then
+* this must be an old kernel which lacks the relevant
+* small-bar uAPI support (including
+* I915_GEM_CREATE_EXT_FLAG_NEEDS_CPU_ACCESS), but on
+* such systems we should never actually end up with a
+* small BAR configuration, assuming we are able to load
+* the kernel module. Hence it should be safe to treat
+* this the same as when @probed_cpu_visible_size ==
+* @probed_size.
+*/
+   __u64 probed_cpu_visible_size;
+
+   /**
+* @unallocated_cpu_visible_size: Estimate of CPU
+* visible memory remaining (-