On Wed, Feb 28, 2018 at 10:05:55AM +0000, Tvrtko Ursulin wrote:
> From: Chris Wilson <ch...@chris-wilson.co.uk>
> 
> CPU hotplug, especially CPU0, can be flaky on commodity hardware.
> 
> To improve test reliability and reponse times when testing larger runs we
> need to handle those cases better.
> 
> Handle failures to off-line a CPU by immediately skipping the test, and
> failures to on-line a CPU by immediately rebooting the machine.
> 
> This patch includes igt_sysrq_reboot implementation from Chris Wilson.
> 
> v2: Halt by default, reboot if env variable IGT_REBOOT_ON_FATAL_ERROR is
>     set. (Petri Latvala)
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@intel.com>
> Cc: Chris Wilson <ch...@chris-wilson.co.uk>
> Cc: Petri Latvala <petri.latv...@intel.com>
> Cc: Tomi Sarvela <tomi.p.sarv...@intel.com>


Reviewed-by: Petri Latvala <petri.latv...@intel.com>

with two nitpicks below.




> ---
>  lib/Makefile.sources |  2 ++
>  lib/igt_core.c       | 23 +++++++++++++++++++++++
>  lib/igt_core.h       |  1 +
>  lib/igt_sysrq.c      | 22 ++++++++++++++++++++++
>  lib/igt_sysrq.h      | 30 ++++++++++++++++++++++++++++++
>  lib/meson.build      |  1 +
>  tests/perf_pmu.c     | 38 +++++++++++++++++++++++++++++++-------
>  7 files changed, 110 insertions(+), 7 deletions(-)
>  create mode 100644 lib/igt_sysrq.c
>  create mode 100644 lib/igt_sysrq.h
> 
> diff --git a/lib/Makefile.sources b/lib/Makefile.sources
> index 5b13ef8896c0..3d37ef1d1984 100644
> --- a/lib/Makefile.sources
> +++ b/lib/Makefile.sources
> @@ -35,6 +35,8 @@ lib_source_list =           \
>       igt_stats.h             \
>       igt_sysfs.c             \
>       igt_sysfs.h             \
> +     igt_sysrq.c             \
> +     igt_sysrq.h             \
>       igt_x86.h               \
>       igt_x86.c               \
>       igt_vgem.c              \
> diff --git a/lib/igt_core.c b/lib/igt_core.c
> index c292343de09e..3fd9f529f09f 100644
> --- a/lib/igt_core.c
> +++ b/lib/igt_core.c
> @@ -70,6 +70,7 @@
>  #include "igt_core.h"
>  #include "igt_aux.h"
>  #include "igt_sysfs.h"
> +#include "igt_sysrq.h"
>  #include "igt_rc.h"
>  
>  #define UNW_LOCAL_ONLY
> @@ -1136,6 +1137,28 @@ void igt_fail(int exitcode)
>       }
>  }
>  
> +/**
> + * igt_fatal_error:
> + *
> + * Stop test execution or optionally, if the IGT_REBOOT_ON_FATAL_ERROR
> + * environment variable is set, reboot the machine.
> + *
> + * Since out test runner (piglit) does support fatal test exit codes, we
> + * implement the default behaviour by waiting endlessly.
> + */
> +void  __attribute__((noreturn)) igt_fatal_error(void)
> +{
> +     if (igt_check_boolean_env_var("IGT_REBOOT_ON_FATAL_ERROR", false)) {
> +             igt_warn("FATAL ERROR - REBOOTING");
> +             igt_sysrq_reboot();
> +     } else {
> +             igt_warn("FATAL ERROR");
> +             for (;;)
> +                     sleep(60);
> +     }
> +}
> +
> +
>  /**
>   * igt_can_fail:
>   *
> diff --git a/lib/igt_core.h b/lib/igt_core.h
> index 7af2b4c109fe..66523a208c31 100644
> --- a/lib/igt_core.h
> +++ b/lib/igt_core.h
> @@ -311,6 +311,7 @@ void __igt_fail_assert(const char *domain, const char 
> *file,
>                      const char *format, ...)
>       __attribute__((noreturn));
>  void igt_exit(void) __attribute__((noreturn));
> +void igt_fatal_error(void) __attribute__((noreturn));
>  
>  /**
>   * igt_ignore_warn:
> diff --git a/lib/igt_sysrq.c b/lib/igt_sysrq.c
> new file mode 100644
> index 000000000000..fe3d2e344ff1
> --- /dev/null
> +++ b/lib/igt_sysrq.c
> @@ -0,0 +1,22 @@
> +#include <unistd.h>
> +#include <fcntl.h>
> +#include <stdlib.h>
> +#include <sys/reboot.h>
> +
> +#include "igt_core.h"
> +
> +#include "igt_sysrq.h"
> +


Docs for igt_sysrq_reboot?


> +void igt_sysrq_reboot(void)
> +{
> +     sync();
> +
> +     /* Try to be nice at first, and if that fails pull the trigger */
> +     if (reboot(RB_AUTOBOOT)) {
> +             int fd = open("/proc/sysrq-trigger", O_WRONLY);
> +             igt_ignore_warn(write(fd, "b", 2));
> +             close(fd);
> +     }
> +
> +     abort();
> +}
> diff --git a/lib/igt_sysrq.h b/lib/igt_sysrq.h
> new file mode 100644
> index 000000000000..422473d2a480
> --- /dev/null
> +++ b/lib/igt_sysrq.h
> @@ -0,0 +1,30 @@
> +/*
> + * Copyright © 2018 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
> DEALINGS
> + * IN THE SOFTWARE.
> + *
> + */
> +
> +#ifndef __IGT_SYSRQ_H__
> +#define __IGT_SYSRQ_H__
> +
> +void igt_sysrq_reboot(void) __attribute__((noreturn));
> +
> +#endif /* __IGT_SYSRQ_H__ */
> diff --git a/lib/meson.build b/lib/meson.build
> index a9e53689b35d..b3b8b14a3f01 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -14,6 +14,7 @@ lib_sources = [
>       'igt_stats.c',
>       'igt_syncobj.c',
>       'igt_sysfs.c',
> +     'igt_sysrq.c',
>       'igt_vgem.c',
>       'igt_x86.c',
>       'instdone.c',
> diff --git a/tests/perf_pmu.c b/tests/perf_pmu.c
> index 3bbb18d2f216..8c75b0641785 100644
> --- a/tests/perf_pmu.c
> +++ b/tests/perf_pmu.c
> @@ -965,6 +965,7 @@ static void cpu_hotplug(int gem_fd)
>       int link[2];
>       int fd, ret;
>       int cur = 0;
> +     char buf;
>  
>       igt_skip_on(IS_BROXTON(intel_get_drm_devid(gem_fd)));
>       igt_require(cpu0_hotplug_support());
> @@ -1011,9 +1012,32 @@ static void cpu_hotplug(int gem_fd)
>                       }
>  
>                       /* Offline followed by online a CPU. */
> -                     igt_assert_eq(write(cpufd, "0", 2), 2);
> +
> +                     ret = write(cpufd, "0", 2);
> +                     if (ret < 0) {
> +                             /*
> +                              * If we failed to offline a CPU we don't want
> +                              * to proceed.
> +                              */
> +                             igt_warn("Failed to offline cpu%u! (%d)\n",
> +                                      cpu, errno);
> +                             igt_assert_eq(write(link[1], "s", 1), 1);
> +                             break;
> +                     }
> +
>                       usleep(1e6);
> -                     igt_assert_eq(write(cpufd, "1", 2), 2);
> +
> +                     ret = write(cpufd, "1", 2);
> +                     if (ret < 0) {
> +                             /*
> +                              * Failed to bring a CPU back online is fatal
> +                              * for the sanity of a test run so reboot
> +                              * immediately.
> +                              */

This is assuming what the user has configured igt_fatal_error() to
do. Just a s/so reboot immediately// maybe?


-- 
Petri Latvala
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

Reply via email to