Daniel P. Berrangé <berra...@redhat.com> writes:

> On Fri, Feb 21, 2025 at 04:01:01PM +0000, Alex Bennée wrote:
>> While running the new GPU tests it was noted that the proprietary
>> nVidia driver barfed when run under the sanitiser:
>> 
>>   2025-02-20 11:13:08,226: [11:13:07.782] Output 'headless' attempts
>>   EOTF mode SDR and colorimetry mode default.
>>   2025-02-20 11:13:08,227: [11:13:07.784] Output 'headless' using color
>>   profile: stock sRGB color profile
>> 
>>   and that's the last thing it outputs.
>> 
>>   The sanitizer reports that when the framework sends the SIGTERM
>>   because of the timeout we get a write to a NULL pointer (but
>>   interesting not this time in an atexit callback):
>> 
>>   UndefinedBehaviorSanitizer:DEADLYSIGNAL
>>   ==471863==ERROR: UndefinedBehaviorSanitizer: SEGV on unknown address
>>   0x000000000000 (pc 0x7a18ceaafe80 bp 0x000000000000 sp 0x7ffe8e3ff6d0
>>   T471863)
>>   ==471863==The signal is caused by a WRITE memory access.
>>   ==471863==Hint: address points to the zero page.
>>       #0 0x7a18ceaafe80
>>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x16afe80)
>>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>>       #1 0x7a18ce9e72c0
>>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15e72c0)
>>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>>       #2 0x7a18ce9f11bb
>>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x15f11bb)
>>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>>       #3 0x7a18ce6dc9d1
>>   (/lib/x86_64-linux-gnu/libnvidia-eglcore.so.535.183.01+0x12dc9d1)
>>   (BuildId: 24b0d0b90369112e3de888a93eb8d7e00304a6db)
>>       #4 0x7a18e7d15326 in vrend_renderer_create_fence
>>   
>> /usr/src/virglrenderer-1.0.0-1ubuntu2/obj-x86_64-linux-gnu/../src/vrend_renderer.c:10883:26
>>       #5 0x55bfb6621871 in virtio_gpu_virgl_process_cmd
>> 
>> The #dri-devel channel confirmed:
>> 
>>   <digetx> stsquad: nv driver is known to not work with venus, don't use
>>       it for testing
>> 
>> So lets implement a blocklist to stop users starting a known bad
>> setup.
>
> I don't much like the conceptual idea of blocking usage of QEMU itself
> based on current point-in-time bugs in the host OS driver stack, because
> it is making an assertion that all future versions of the driver will
> also be broken and that's not generally valid.
>
> If the user chose to use a dodgy graphics driver, they can deal with
> the consequences of their choice.
>
> Skipping only the functional test, without any qemu-system code changes
> though is more palettable as that's not a hard block on usage.

Well how do you do one without the other? I don't want to always skip the
vulkan testing because some developer setups have broken drivers. Unless
you are suggesting something like:

  -device virtio-vga-gl,hostmem=4G,blob=on,venus=on,ignore-nvidia=on

or something like that?

>
>> 
>> Reported-by: Peter Maydell <peter.mayd...@linaro.org>
>> Signed-off-by: Alex Bennée <alex.ben...@linaro.org>
>> Cc: Dmitry Osipenko <dmitry.osipe...@collabora.com>
>> ---
>>  meson.build                               |   4 +
>>  include/qemu/host-gpu.h                   |  23 +++++
>>  hw/display/virtio-gpu.c                   |   4 +
>>  stubs/host-gpu.c                          |  17 ++++
>>  util/host-gpu.c                           | 102 ++++++++++++++++++++++
>>  stubs/meson.build                         |   4 +
>>  tests/functional/test_aarch64_virt_gpu.py |   2 +
>>  util/meson.build                          |   2 +
>>  8 files changed, 158 insertions(+)
>>  create mode 100644 include/qemu/host-gpu.h
>>  create mode 100644 stubs/host-gpu.c
>>  create mode 100644 util/host-gpu.c
>> 
>> diff --git a/meson.build b/meson.build
>> index 4588bfd864..8f4a431445 100644
>> --- a/meson.build
>> +++ b/meson.build
>> @@ -1373,12 +1373,16 @@ if not get_option('qatzip').auto() or have_system
>>  endif
>>  
>>  virgl = not_found
>> +vulkan = not_found
>>  
>>  have_vhost_user_gpu = have_tools and host_os == 'linux' and pixman.found()
>>  if not get_option('virglrenderer').auto() or have_system or 
>> have_vhost_user_gpu
>>    virgl = dependency('virglrenderer',
>>                       method: 'pkg-config',
>>                       required: get_option('virglrenderer'))
>> +  vulkan = dependency('vulkan',
>> +                      method: 'pkg-config',
>> +                      required: get_option('virglrenderer'))
>>  endif
>>  rutabaga = not_found
>>  if not get_option('rutabaga_gfx').auto() or have_system or 
>> have_vhost_user_gpu
>> diff --git a/include/qemu/host-gpu.h b/include/qemu/host-gpu.h
>> new file mode 100644
>> index 0000000000..45053c2f77
>> --- /dev/null
>> +++ b/include/qemu/host-gpu.h
>> @@ -0,0 +1,23 @@
>> +/*
>> + * Utility functions to probe host GPU
>> + *
>> + * Copyright (c) 2025 Linaro Ltd
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +#ifndef HOST_GPU_H
>> +#define HOST_GPU_H
>> +
>> +#include "qapi/error.h"
>> +
>> +/**
>> + * validate_vulkan_backend() - verify working backend
>> + *
>> + * errp: error pointer
>> + *
>> + * If the system vulkan implementation is known to not work return
>> + * false otherwise true.
>> + */
>> +bool validate_vulkan_backend(Error **errp);
>> +
>> +#endif /* HOST_GPU_H */
>> diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
>> index 11a7a85750..816eedf838 100644
>> --- a/hw/display/virtio-gpu.c
>> +++ b/hw/display/virtio-gpu.c
>> @@ -32,6 +32,7 @@
>>  #include "qemu/module.h"
>>  #include "qapi/error.h"
>>  #include "qemu/error-report.h"
>> +#include "qemu/host-gpu.h"
>>  
>>  #define VIRTIO_GPU_VM_VERSION 1
>>  
>> @@ -1498,6 +1499,9 @@ void virtio_gpu_device_realize(DeviceState *qdev, 
>> Error **errp)
>>              error_setg(errp, "venus requires enabled blob and hostmem 
>> options");
>>              return;
>>          }
>> +        if (!validate_vulkan_backend(errp)) {
>> +            return;
>> +        }
>>      #else
>>          error_setg(errp, "old virglrenderer, venus unsupported");
>>          return;
>> diff --git a/stubs/host-gpu.c b/stubs/host-gpu.c
>> new file mode 100644
>> index 0000000000..7bf76ee4f6
>> --- /dev/null
>> +++ b/stubs/host-gpu.c
>> @@ -0,0 +1,17 @@
>> +/*
>> + * Stub of utility functions to probe host GPU
>> + *
>> + * Copyright (c) 2025 Linaro Ltd
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qapi/error.h"
>> +#include "qemu/host-gpu.h"
>> +
>> +bool validate_vulkan_backend(Error **errp)
>> +{
>> +    error_setg(errp, "No vulkan library present");
>> +    return false;
>> +}
>> diff --git a/util/host-gpu.c b/util/host-gpu.c
>> new file mode 100644
>> index 0000000000..5e7bf2557c
>> --- /dev/null
>> +++ b/util/host-gpu.c
>> @@ -0,0 +1,102 @@
>> +/*
>> + * Utility functions to probe host GPU
>> + *
>> + * Copyright (c) 2025 Linaro Ltd
>> + *
>> + * SPDX-License-Identifier: GPL-2.0-or-later
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "qapi/error.h"
>> +#include "qemu/host-gpu.h"
>> +
>> +#include <vulkan/vulkan.h>
>> +
>> +const char *extensions[] = {
>> +    /* needed to query the driver details */
>> +    "VK_KHR_get_physical_device_properties2",
>> +};
>> +
>> +/*
>> + * Info for known broken drivers. Sadly driver version info tends to
>> + * be in the driverInfo text field which is free form so tricky to
>> + * parse.
>> + */
>> +struct VkDriverBlockList {
>> +    VkDriverId id;
>> +    const char *reason;
>> +};
>> +
>> +struct VkDriverBlockList vulkan_blocklist[] = {
>> +    /* at least 535.183.01 is reported to SEGV in libnvidia-eglcore.so */
>> +    { VK_DRIVER_ID_NVIDIA_PROPRIETARY, "proprietary nVidia driver is 
>> broken" },
>> +};
>> +
>> +static bool is_driver_blocked(VkPhysicalDevice dev, Error **errp)
>> +{
>> +    VkPhysicalDeviceDriverProperties driverProperties = {
>> +        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES,
>> +        .pNext = NULL
>> +    };
>> +    VkPhysicalDeviceProperties2 deviceProperties2 = {
>> +        .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2,
>> +        .pNext = &driverProperties
>> +    };
>> +    VkPhysicalDeviceProperties *deviceProperties = 
>> &deviceProperties2.properties;
>> +
>> +    vkGetPhysicalDeviceProperties2(dev, &deviceProperties2);
>> +
>> +    for (int i = 0; i < ARRAY_SIZE(vulkan_blocklist); i++) {
>> +        if (driverProperties.driverID == vulkan_blocklist[i].id) {
>> +            error_setg(errp, "Blocked GPU %s because %s",
>> +                       deviceProperties->deviceName,
>> +                       vulkan_blocklist[i].reason);
>> +            return true;
>> +        }
>> +    }
>> +
>> +    return false;
>> +}
>> +
>> +bool validate_vulkan_backend(Error **errp)
>> +{
>> +    VkInstanceCreateInfo instance_info = {
>> +        VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,
>> +        NULL, /* pNext extension */
>> +        0,    /* VkInstanceCreateFlags */
>> +        NULL, /* Application Info */
>> +        0, NULL, /* no Enabled Layers */
>> +        ARRAY_SIZE(extensions), extensions, /* Extensions */
>> +    };
>> +
>> +    VkInstance inst;
>> +    VkResult res;
>> +
>> +    res = vkCreateInstance(&instance_info, NULL, &inst);
>> +
>> +    if ( res == VK_SUCCESS ) {
>> +        uint32_t count;
>> +        VkPhysicalDevice *devices;
>> +
>> +        /* Do the enumeration two-step */
>> +        vkEnumeratePhysicalDevices(inst, &count, NULL);
>> +        devices = g_new0(VkPhysicalDevice, count);
>> +        vkEnumeratePhysicalDevices(inst, &count, devices);
>> +
>> +        for (int i = 0; i  < count; i++) {
>> +            if (is_driver_blocked(devices[i], errp)) {
>> +                return false;
>> +            }
>> +        }
>> +    } else {
>> +        error_setg(errp, "Could not initialise a Vulkan instance");
>> +        return false;
>> +    }
>> +
>> +    /*
>> +     * It would be nice to g_autofree the instance, but returning
>> +     * false will abort start-up anyway.
>> +     */
>> +    vkDestroyInstance(inst, NULL);
>> +    return true;
>> +}
>> diff --git a/stubs/meson.build b/stubs/meson.build
>> index b0fee37e05..c18501aa6d 100644
>> --- a/stubs/meson.build
>> +++ b/stubs/meson.build
>> @@ -89,3 +89,7 @@ if have_system or have_user
>>    stub_ss.add(files('hotplug-stubs.c'))
>>    stub_ss.add(files('sysbus.c'))
>>  endif
>> +
>> +if not vulkan.found()
>> +  stubs_ss.add(files('host-gpu.c'))
>> +endif
>> diff --git a/tests/functional/test_aarch64_virt_gpu.py 
>> b/tests/functional/test_aarch64_virt_gpu.py
>> index 7a8471d1ca..9a0e694049 100755
>> --- a/tests/functional/test_aarch64_virt_gpu.py
>> +++ b/tests/functional/test_aarch64_virt_gpu.py
>> @@ -79,6 +79,8 @@ def _run_virt_gpu_test(self, gpu_device,  weston_cmd, 
>> weston_pattern):
>>                  self.skipTest("Can't access host DRM render node")
>>              elif "'type' does not accept value 'egl-headless'" in 
>> excp.output:
>>                  self.skipTest("egl-headless support is not available")
>> +            elif "Blocked GPU" in excp.output:
>> +                self.skipTest("GPU is in block list")
>>              else:
>>                  self.log.info(f"unhandled launch failure: {excp.output}")
>>                  raise excp
>> diff --git a/util/meson.build b/util/meson.build
>> index 780b5977a8..7c6cc36e07 100644
>> --- a/util/meson.build
>> +++ b/util/meson.build
>> @@ -132,3 +132,5 @@ elif cpu in ['ppc', 'ppc64']
>>  elif cpu in ['riscv32', 'riscv64']
>>    util_ss.add(files('cpuinfo-riscv.c'))
>>  endif
>> +
>> +util_ss.add(when: vulkan, if_true: files('host-gpu.c'))
>> -- 
>> 2.39.5
>> 
>
> With regards,
> Daniel

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro

Reply via email to