From: Tom Stellard <thomas.stell...@amd.com> This results in huge performance improvements for applications like pyrit, which depend on the implementation to determine the optimal work group size. --- src/gallium/state_trackers/clover/api/kernel.cpp | 9 ++++++++- src/gallium/state_trackers/clover/core/device.cpp | 19 +++++++++++++++++++ src/gallium/state_trackers/clover/core/device.hpp | 2 ++ 3 files changed, 29 insertions(+), 1 deletion(-)
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp index 13113a2..93cfe98 100644 --- a/src/gallium/state_trackers/clover/api/kernel.cpp +++ b/src/gallium/state_trackers/clover/api/kernel.cpp @@ -276,10 +276,17 @@ clEnqueueNDRangeKernel(cl_command_queue q, cl_kernel kern, cl_event *ev) try { auto grid_offset = opt_vector(pgrid_offset, dims, 0); auto grid_size = opt_vector(pgrid_size, dims, 1); - auto block_size = opt_vector(pblock_size, dims, 1); kernel_validate(q, kern, dims, pgrid_offset, pgrid_size, pblock_size, num_deps, deps, ev); + std::vector<size_t> block_size; + if (pblock_size) { + block_size = opt_vector(pblock_size, dims, 1); + } else { + std::vector<size_t> optimal_block_size = + q->dev.optimal_block_size(grid_size.data(), dims); + block_size = opt_vector(optimal_block_size.data(), dims, 1); + } hard_event *hev = new hard_event( *q, CL_COMMAND_NDRANGE_KERNEL, { deps, deps + num_deps }, diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp index 94faeee..710e175 100644 --- a/src/gallium/state_trackers/clover/core/device.cpp +++ b/src/gallium/state_trackers/clover/core/device.cpp @@ -169,6 +169,25 @@ _cl_device_id::max_block_size() const { return { v.begin(), v.end() }; } +std::vector<size_t> +device::optimal_block_size(const size_t *grid_size, unsigned dims) const { + + size_t max_threads = max_threads_per_block(); + std::vector<size_t> max_size = max_block_size(); + size_t threads = 1; + std::vector<size_t> optimal; + + for (unsigned i = 0; i < dims; i++) { + size_t dim_size = max_size[i]; + while (dim_size * threads > max_threads || (grid_size[i] % dim_size != 0)) { + dim_size >>= 1; + } + threads *= dim_size; + optimal.push_back(dim_size); + } + return optimal; +} + std::string _cl_device_id::device_name() const { return pipe->get_name(pipe); diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp index c6e50db..227c558 100644 --- a/src/gallium/state_trackers/clover/core/device.hpp +++ b/src/gallium/state_trackers/clover/core/device.hpp @@ -62,6 +62,8 @@ public: cl_ulong max_mem_alloc_size() const; std::vector<size_t> max_block_size() const; + std::vector<size_t> optimal_block_size(const size_t *grid_size, + unsigned dims) const; std::string device_name() const; std::string vendor_name() const; enum pipe_shader_ir ir_format() const; -- 1.8.1.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev