Some limits, such as the maximum number of threads in a work-group, vary depending on the resources (e.g. registers) used by a kernel. OpenCL provides clGetKernelWorkGroupInfo() for querying these kernel-specific limits. To implement this properly, we need a variant of get_compute_param() which takes the compute-state CSO as an argument.
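For context, this is roughly the application-side path that motivates the change (an illustrative sketch only, not part of this patch; it assumes 'dev' and 'kern' are a valid cl_device_id and cl_kernel created earlier):

  #include <CL/cl.h>

  size_t dev_max, kern_max;

  /* device-wide upper bound: */
  clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                  sizeof(dev_max), &dev_max, NULL);

  /* kernel-specific limit, which may be lower for e.g. register-heavy
   * kernels; this is the query that gets routed to the new
   * get_kernel_param() hook below:
   */
  clGetKernelWorkGroupInfo(kern, dev, CL_KERNEL_WORK_GROUP_SIZE,
                           sizeof(kern_max), &kern_max, NULL);

A work-group size passed to clEnqueueNDRangeKernel() has to respect kern_max, so clover needs the per-kernel value rather than the device-wide one.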
Signed-off-by: Rob Clark <robdcl...@gmail.com>
---
 src/gallium/include/pipe/p_defines.h              |  2 +-
 src/gallium/include/pipe/p_screen.h               | 22 ++++++-
 src/gallium/state_trackers/clover/api/kernel.cpp  |  9 ++-
 src/gallium/state_trackers/clover/core/device.cpp | 10 ++++
 src/gallium/state_trackers/clover/core/device.hpp |  5 ++
 src/gallium/state_trackers/clover/core/kernel.cpp | 73 ++++++++++++++++++++---
 src/gallium/state_trackers/clover/core/kernel.hpp |  7 ++-
 7 files changed, 115 insertions(+), 13 deletions(-)

diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 2ae12f12a1e..0fa96c0d412 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -899,7 +899,7 @@ enum pipe_shader_ir
 
 /**
  * Compute-specific implementation capability.  They can be queried
- * using pipe_screen::get_compute_param.
+ * using pipe_screen::get_compute_param or pipe_screen::get_kernel_param.
  */
 enum pipe_compute_cap
 {
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 101e229088b..cf6049bec43 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -117,7 +117,10 @@ struct pipe_screen {
                            enum pipe_video_cap param );
 
    /**
-    * Query a compute-specific capability/parameter/limit.
+    * Query a compute-specific capability/parameter/limit.  Some parameters
+    * may have kernel-specific lower limits based on the resources used by
+    * the kernel.  See pipe_screen::get_kernel_param.
+    *
     * \param ir_type shader IR type for which the param applies, or don't care
     *        if the param is not shader related
     * \param param one of PIPE_COMPUTE_CAP_x
@@ -131,6 +134,23 @@ struct pipe_screen {
                            enum pipe_compute_cap param,
                            void *ret);
 
+   /**
+    * Query a compute kernel-specific limit.  Some parameters may have
+    * kernel-specific lower limits based on the resources used by the
+    * kernel.  See pipe_screen::get_compute_param.
+    *
+    * \param hwcso shader state obj (as returned by
+    *        create_compute_state())
+    * \param param one of PIPE_COMPUTE_CAP_x
+    * \param ret pointer to a preallocated buffer that will be
+    *        initialized to the parameter value, or NULL.
+    * \return size in bytes of the parameter value that would be
+    *         returned.
+    */
+   int (*get_kernel_param)(struct pipe_screen *, void *hwcso,
+                           enum pipe_compute_cap param,
+                           void *ret);
+
    /**
     * Query a timestamp in nanoseconds.  The returned value should match
     * PIPE_QUERY_TIMESTAMP. This function returns immediately and doesn't
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
index b665773d9ec..60ffd01c827 100644
--- a/src/gallium/state_trackers/clover/api/kernel.cpp
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -155,9 +155,12 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
    if (!count(dev, kern.program().devices()))
       throw error(CL_INVALID_DEVICE);
 
+   /* try to ensure the kernel is built, for kernel-specific limits: */
+   kern.build(dev);
+
    switch (param) {
    case CL_KERNEL_WORK_GROUP_SIZE:
-      buf.as_scalar<size_t>() = dev.max_threads_per_block();
+      buf.as_scalar<size_t>() = kern.max_threads_per_block(dev);
       break;
 
    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
@@ -169,7 +172,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
       break;
 
    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
-      buf.as_scalar<size_t>() = dev.subgroup_size();
+      buf.as_scalar<size_t>() = kern.subgroup_size(dev);
       break;
 
    case CL_KERNEL_PRIVATE_MEM_SIZE:
@@ -262,7 +265,7 @@ namespace {
         throw error(CL_INVALID_WORK_GROUP_SIZE);
 
      if (fold(multiplies(), 1u, block_size) >
-         q.device().max_threads_per_block())
+         kern.max_threads_per_block(q.device()))
         throw error(CL_INVALID_WORK_GROUP_SIZE);
 
      return block_size;
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index 70f54c9caed..97e098f65de 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -50,6 +50,16 @@ device::device(clover::platform &platform, pipe_loader_device *ldev) :
          pipe->destroy(pipe);
       throw error(CL_INVALID_DEVICE);
    }
+
+   uint32_t shareable_shaders =
+      pipe->get_param(pipe, PIPE_CAP_SHAREABLE_SHADERS);
+
+   if (shareable_shaders) {
+      /* create dummy context to use for compiling shaders */
+      pctx = pipe->context_create(pipe, NULL, PIPE_CONTEXT_COMPUTE_ONLY);
+   } else {
+      pctx = NULL;
+   }
 }
 
 device::~device() {
diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp
index db791e8cfbe..63cf3abccc4 100644
--- a/src/gallium/state_trackers/clover/core/device.hpp
+++ b/src/gallium/state_trackers/clover/core/device.hpp
@@ -94,6 +94,11 @@ namespace clover {
       clover::platform &platform;
       pipe_screen *pipe;
 
+      /* dummy context for compiling kernels, if the driver supports
+       * shareable compute-state CSOs.
+       */
+      pipe_context *pctx;
+
    private:
       pipe_loader_device *ldev;
    };
diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 9730450ceb9..424e44f4ab4 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -110,6 +110,11 @@ kernel::launch(command_queue &q,
    exec.unbind();
 }
 
+void
+kernel::build(const device &d) {
+   exec.bind_st(d, false);
+}
+
 size_t
 kernel::mem_local() const {
    size_t sz = 0;
@@ -140,11 +145,41 @@ kernel::optimal_block_size(const command_queue &q,
                              grid_size);
 }
 
+namespace {
+   template<typename T>
+   std::vector<T>
+   get_compute_param(pipe_screen *pipe, void *hwcso,
+                     pipe_compute_cap cap) {
+      int sz = pipe->get_kernel_param(pipe, hwcso, cap, NULL);
+      std::vector<T> v(sz / sizeof(T));
+
+      pipe->get_kernel_param(pipe, hwcso, cap, &v.front());
+      return v;
+   }
+}
+
 std::vector<size_t>
 kernel::required_block_size() const {
    return { 0, 0, 0 };
 }
 
+size_t
+kernel::max_threads_per_block(const device &d) const {
+   if (!d.pipe->get_kernel_param || !exec.st)
+      return d.max_threads_per_block();
+   return get_compute_param<uint64_t>(d.pipe, exec.st,
+                                      PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK)[0];
+}
+
+cl_uint
+kernel::subgroup_size(const device &d) const {
+   if (!d.pipe->get_kernel_param || !exec.st)
+      return d.subgroup_size();
+   return get_compute_param<uint32_t>(d.pipe, exec.st,
+                                      PIPE_COMPUTE_CAP_SUBGROUP_SIZE)[0];
+}
+
 kernel::argument_range
 kernel::args() {
    return map(derefs(), _args);
@@ -234,23 +269,47 @@ kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
       }
    }
 
+   const device &d = q->device();
+   return bind_st(d, (_q != q) && !d.pctx);
+}
+
+/* Try to build the compute-state CSO.  If the queue is not known (i.e. NULL),
+ * but the device supports shareable compute-state CSOs, then compile using
+ * the device's dummy context.  This case is for clGetKernelWorkGroupInfo(),
+ * where we need to compile the kernel in order to get kernel-specific
+ * limits.
+ */
+void *
+kernel::exec_context::bind_st(const device &_d, bool force) {
+   pipe_context *pctx = q ? q->pipe : _d.pctx;
+   bool needs_rebuild = force || !st;
+
+   if (!pctx)
+      return NULL;
+
+   if (cs.req_input_mem != input.size())
+      needs_rebuild = true;
+
+   if (cs.req_local_mem != mem_local)
+      needs_rebuild = true;
+
    // Create a new compute state if anything changed.
-   if (!st || q != _q ||
-       cs.req_local_mem != mem_local ||
-       cs.req_input_mem != input.size()) {
+   if (needs_rebuild) {
       if (st)
-         _q->pipe->delete_compute_state(_q->pipe, st);
+         pctx->delete_compute_state(pctx, st);
 
-      cs.ir_type = q->device().ir_format();
+      cs.ir_type = _d.ir_format();
       if (cs.ir_type == PIPE_SHADER_IR_NIR) {
         // driver takes ownership of nir_shader:
-        cs.prog = nir_shader_clone(NULL, (nir_shader *)kern.nir(q->device()));
+        cs.prog = nir_shader_clone(NULL, (nir_shader *)kern.nir(_d));
      } else {
+        auto &m = kern.program().build(_d).binary;
+        auto msec = find(type_equals(module::section::text_executable), m.secs);
        cs.prog = &(msec.data[0]);
      }
      cs.req_local_mem = mem_local;
      cs.req_input_mem = input.size();
-     st = q->pipe->create_compute_state(q->pipe, &cs);
+     st = pctx->create_compute_state(pctx, &cs);
    }
 
    return st;
diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp
index d60b3d6af35..54eb570de92 100644
--- a/src/gallium/state_trackers/clover/core/kernel.hpp
+++ b/src/gallium/state_trackers/clover/core/kernel.hpp
@@ -48,6 +48,7 @@ namespace clover {
 
          void *bind(intrusive_ptr<command_queue> _q,
                     const std::vector<size_t> &grid_offset);
+         void *bind_st(const device &d, bool force);
          void unbind();
 
          kernel &kern;
@@ -60,9 +61,9 @@ namespace clover {
         std::vector<pipe_resource *> g_buffers;
         std::vector<size_t> g_handles;
         size_t mem_local;
+        void *st;
 
      private:
-        void *st;
        pipe_compute_state cs;
     };
 
@@ -120,6 +121,7 @@ namespace clover {
                 const std::vector<size_t> &grid_offset,
                 const std::vector<size_t> &grid_size,
                 const std::vector<size_t> &block_size);
+      void build(const device &d);
 
      size_t mem_local() const;
      size_t mem_private() const;
@@ -132,6 +134,9 @@ namespace clover {
 
      std::vector<size_t> required_block_size() const;
 
+      size_t max_threads_per_block(const device &d) const;
+      cl_uint subgroup_size(const device &d) const;
+
      argument_range args();
      const_argument_range args() const;
-- 
2.14.3

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev