Daniel P. Berrangé <berra...@redhat.com> writes: > On Fri, Feb 21, 2025 at 04:01:01PM +0000, Alex Bennée wrote: >> While running the new GPU tests it was noted that the proprietary >> nVidia driver barfed when run under the sanitiser: >> >> 2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts >> EOTF mode SDR and colorimetry mode default. >> 2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color >> profile: stock sRGB color profile >> >> and that's the last thing it outputs. >> >> The sanitizer reports that when the framework sends the SIGTERM >> because of the timeout we get a write to a NULL pointer (but >> interesting not this time in an atexit callback): >> >> UndefinedBehaviorSanitizer:DEADLYSIGNAL >> ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address >> 0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0 >> T471863) >> ==471863==The signal is caused by a WRITE memory access. >> ==471863==Hint: address points to the zero page. >> #0 0x7a18ceaafe80 >> (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80) >> (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db) >> #1 0x7a18ce9e72c0 >> (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0) >> (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db) >> #2 0x7a18ce9f11bb >> (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb) >> (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db) >> #3 0x7a18ce6dc9d1 >> (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1) >> (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db) >> #4 0x7a18e7d15326 in vrend_renderer_create_fence >> >> /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26 >> #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd >> >> The #dri-devel channel confirmed: >> >> <digetx> stsquad: nv driver is known to not work with venus, don't use >> it for testing >> >> So lets implement a blocklist to stop users starting a known bad >> setup. > > I don't much like the conceptual idea of blocking usage of QEMU itself > based on current point-in-time bugs in the host OS driver stack, because > it is making an assertion that all future versions of the driver will > also be broken and that's not generally valid. > > If the user chose to use a dodgy graphics driver, they can deal with > the consequences of their choice. > > Skipping only the functional test, without any qemu-system code changes > though is more palettable as that's not a hard block on usage.
Well how do you do one without the other? I don't want to always skip the vulkan testing because some developer setups have broken drivers. Unless you are suggesting something like: -device virtio-vga-gl,hostmem=4G,blob=on,venus=on,ignore-nvidia=on or something like that? > >> >> Reported-by: Peter Maydell <peter.mayd...@linaro.org> >> Signed-off-by: Alex Bennée <alex.ben...@linaro.org> >> Cc: Dmitry Osipenko <dmitry.osipe...@collabora.com> >> --- >> meson.build | 4 + >> include/qemu/host-gpu.h | 23 +++++ >> hw/display/virtio-gpu.c | 4 + >> stubs/host-gpu.c | 17 ++++ >> util/host-gpu.c | 102 ++++++++++++++++++++++ >> stubs/meson.build | 4 + >> tests/functional/test_aarch64_virt_gpu.py | 2 + >> util/meson.build | 2 + >> 8 files changed, 158 insertions(+) >> create mode 100644 include/qemu/host-gpu.h >> create mode 100644 stubs/host-gpu.c >> create mode 100644 util/host-gpu.c >> >> diff --git a/meson.build b/meson.build >> index 4588bfd864..8f4a431445 100644 >> --- a/meson.build >> +++ b/meson.build >> @@ -1373,12 +1373,16 @@ if not get_option('qatzip').auto() or have_system >> endif >> >> virgl = not_found >> +vulkan = not_found >> >> have_vhost_user_gpu = have_tools and host_os == 'linux' and pixman.found() >> if not get_option('virglrenderer').auto() or have_system or >> have_vhost_user_gpu >> virgl = dependency('virglrenderer', >> method: 'pkg-config', >> required: get_option('virglrenderer')) >> + vulkan = dependency('vulkan', >> + method: 'pkg-config', >> + required: get_option('virglrenderer')) >> endif >> rutabaga = not_found >> if not get_option('rutabaga_gfx').auto() or have_system or >> have_vhost_user_gpu >> diff --git a/include/qemu/host-gpu.h b/include/qemu/host-gpu.h >> new file mode 100644 >> index 0000000000..45053c2f77 >> --- /dev/null >> +++ b/include/qemu/host-gpu.h >> @@ -0,0 +1,23 @@ >> +/* >> + * Utility functions to probe host GPU >> + * >> + * Copyright (c) 2025 Linaro Ltd >> + * >> + * SPDX-License-Identifier: GPL-2.0-or-later >> + */ >> +#ifndef HOST_GPU_H >> +#define HOST_GPU_H >> + >> +#include "qapi/error.h" >> + >> +/** >> + * validate_vulkan_backend() - verify working backend >> + * >> + * errp: error pointer >> + * >> + * If the system vulkan implementation is known to not work return >> + * false otherwise true. >> + */ >> +bool validate_vulkan_backend(Error **errp); >> + >> +#endif /* HOST_GPU_H */ >> diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c >> index 11a7a85750..816eedf838 100644 >> --- a/hw/display/virtio-gpu.c >> +++ b/hw/display/virtio-gpu.c >> @@ -32,6 +32,7 @@ >> #include "qemu/module.h" >> #include "qapi/error.h" >> #include "qemu/error-report.h" >> +#include "qemu/host-gpu.h" >> >> #define VIRTIO_GPU_VM_VERSION 1 >> >> @@ -1498,6 +1499,9 @@ void virtio_gpu_device_realize(DeviceState *qdev, >> Error **errp) >> error_setg(errp, "venus requires enabled blob and hostmem >> options"); >> return; >> } >> + if (!validate_vulkan_backend(errp)) { >> + return; >> + } >> #else >> error_setg(errp, "old virglrenderer, venus unsupported"); >> return; >> diff --git a/stubs/host-gpu.c b/stubs/host-gpu.c >> new file mode 100644 >> index 0000000000..7bf76ee4f6 >> --- /dev/null >> +++ b/stubs/host-gpu.c >> @@ -0,0 +1,17 @@ >> +/* >> + * Stub of utility functions to probe host GPU >> + * >> + * Copyright (c) 2025 Linaro Ltd >> + * >> + * SPDX-License-Identifier: GPL-2.0-or-later >> + */ >> + >> +#include "qemu/osdep.h" >> +#include "qapi/error.h" >> +#include "qemu/host-gpu.h" >> + >> +bool validate_vulkan_backend(Error **errp) >> +{ >> + error_setg(errp, "No vulkan library present"); >> + return false; >> +} >> diff --git a/util/host-gpu.c b/util/host-gpu.c >> new file mode 100644 >> index 0000000000..5e7bf2557c >> --- /dev/null >> +++ b/util/host-gpu.c >> @@ -0,0 +1,102 @@ >> +/* >> + * Utility functions to probe host GPU >> + * >> + * Copyright (c) 2025 Linaro Ltd >> + * >> + * SPDX-License-Identifier: GPL-2.0-or-later >> + */ >> + >> +#include "qemu/osdep.h" >> +#include "qapi/error.h" >> +#include "qemu/host-gpu.h" >> + >> +#include <vulkan/vulkan.h> >> + >> +const char *extensions[] = { >> + /* needed to query the driver details */ >> + "VK_KHR_get_physical_device_properties2", >> +}; >> + >> +/* >> + * Info for known broken drivers. Sadly driver version info tends to >> + * be in the driverInfo text field which is free form so tricky to >> + * parse. >> + */ >> +struct VkDriverBlockList { >> + VkDriverId id; >> + const char *reason; >> +}; >> + >> +struct VkDriverBlockList vulkan_blocklist[] = { >> + /* at least 535.183.01 is reported to SEGV in libnvidia-eglcore.so */ >> + { VK_DRIVER_ID_NVIDIA_PROPRIETARY, "proprietary nVidia driver is >> broken" }, >> +}; >> + >> +static bool is_driver_blocked(VkPhysicalDevice dev, Error **errp) >> +{ >> + VkPhysicalDeviceDriverProperties driverProperties = { >> + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES, >> + .pNext = NULL >> + }; >> + VkPhysicalDeviceProperties2 deviceProperties2 = { >> + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2, >> + .pNext = &driverProperties >> + }; >> + VkPhysicalDeviceProperties *deviceProperties = >> &deviceProperties2.properties; >> + >> + vkGetPhysicalDeviceProperties2(dev, &deviceProperties2); >> + >> + for (int i = 0; i < ARRAY_SIZE(vulkan_blocklist); i++) { >> + if (driverProperties.driverID == vulkan_blocklist[i].id) { >> + error_setg(errp, "Blocked GPU %s because %s", >> + deviceProperties->deviceName, >> + vulkan_blocklist[i].reason); >> + return true; >> + } >> + } >> + >> + return false; >> +} >> + >> +bool validate_vulkan_backend(Error **errp) >> +{ >> + VkInstanceCreateInfo instance_info = { >> + VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, >> + NULL, /* pNext extension */ >> + 0, /* VkInstanceCreateFlags */ >> + NULL, /* Application Info */ >> + 0, NULL, /* no Enabled Layers */ >> + ARRAY_SIZE(extensions), extensions, /* Extensions */ >> + }; >> + >> + VkInstance inst; >> + VkResult res; >> + >> + res = vkCreateInstance(&instance_info, NULL, &inst); >> + >> + if ( res == VK_SUCCESS ) { >> + uint32_t count; >> + VkPhysicalDevice *devices; >> + >> + /* Do the enumeration two-step */ >> + vkEnumeratePhysicalDevices(inst, &count, NULL); >> + devices = g_new0(VkPhysicalDevice, count); >> + vkEnumeratePhysicalDevices(inst, &count, devices); >> + >> + for (int i = 0; i < count; i++) { >> + if (is_driver_blocked(devices[i], errp)) { >> + return false; >> + } >> + } >> + } else { >> + error_setg(errp, "Could not initialise a Vulkan instance"); >> + return false; >> + } >> + >> + /* >> + * It would be nice to g_autofree the instance, but returning >> + * false will abort start-up anyway. >> + */ >> + vkDestroyInstance(inst, NULL); >> + return true; >> +} >> diff --git a/stubs/meson.build b/stubs/meson.build >> index b0fee37e05..c18501aa6d 100644 >> --- a/stubs/meson.build >> +++ b/stubs/meson.build >> @@ -89,3 +89,7 @@ if have_system or have_user >> stub_ss.add(files('hotplug-stubs.c')) >> stub_ss.add(files('sysbus.c')) >> endif >> + >> +if not vulkan.found() >> + stubs_ss.add(files('host-gpu.c')) >> +endif >> diff --git a/tests/functional/test_aarch64_virt_gpu.py >> b/tests/functional/test_aarch64_virt_gpu.py >> index 7a8471d1ca..9a0e694049 100755 >> --- a/tests/functional/test_aarch64_virt_gpu.py >> +++ b/tests/functional/test_aarch64_virt_gpu.py >> @@ -79,6 +79,8 @@ def _run_virt_gpu_test(self, gpu_device, weston_cmd, >> weston_pattern): >> self.skipTest("Can't access host DRM render node") >> elif "'type' does not accept value 'egl-headless'" in >> excp.output: >> self.skipTest("egl-headless support is not available") >> + elif "Blocked GPU" in excp.output: >> + self.skipTest("GPU is in block list") >> else: >> self.log.info(f"unhandled launch failure: {excp.output}") >> raise excp >> diff --git a/util/meson.build b/util/meson.build >> index 780b5977a8..7c6cc36e07 100644 >> --- a/util/meson.build >> +++ b/util/meson.build >> @@ -132,3 +132,5 @@ elif cpu in ['ppc', 'ppc64'] >> elif cpu in ['riscv32', 'riscv64'] >> util_ss.add(files('cpuinfo-riscv.c')) >> endif >> + >> +util_ss.add(when: vulkan, if_true: files('host-gpu.c')) >> -- >> 2.39.5 >> > > With regards, > Daniel -- Alex Bennée Virtualisation Tech Lead @ Linaro