Should we add a random wait to avoid all the machines retrying at the same time?
Le mer. 26 juin 2024, 13:57, Mukesh Kumar Chaurasiya <mchau...@linux.ibm.com> a écrit : > Sometimes, when booting from a very busy SAN, the access to the > disk can fail and then GRUB will eventually drop to GRUB prompt. > This scenario is more frequent when deploying many machines at > the same time using the same SAN. > This patch aims to force the ofdisk module to retry the open or > read function for network disks after it fails. We use > DEFAULT_RETRY_TIMEOUT, which is 15 seconds to specify the time it'll > retry to access the disk before it definitely fails. The timeout can be > changed by setting the environment variable ofdisk_retry_timeout. > If the environment variable fails to read, GRUB will consider the > default value of 15 seconds. > > Signed-off-by: Diego Domingos <dieg...@linux.vnet.ibm.com> > Signed-off-by: Mukesh Kumar Chaurasiya <mchau...@linux.ibm.com> > --- > docs/grub.texi | 8 ++++ > grub-core/disk/ieee1275/ofdisk.c | 82 ++++++++++++++++++++++++++++++-- > 2 files changed, 87 insertions(+), 3 deletions(-) > > diff --git a/docs/grub.texi b/docs/grub.texi > index f3bdc2564..9514271fc 100644 > --- a/docs/grub.texi > +++ b/docs/grub.texi > @@ -3308,6 +3308,7 @@ These variables have special meaning to GRUB. > * net_default_ip:: > * net_default_mac:: > * net_default_server:: > +* ofdisk_retry_timeout:: > * pager:: > * prefix:: > * pxe_blksize:: > @@ -3738,6 +3739,13 @@ The default is the value of @samp{color_normal} > (@pxref{color_normal}). > @xref{Network}. > > > +@node ofdisk_retry_timeout > +@subsection ofdisk_retry_timeout > + > +The time in seconds till which the GRUB will retry to open or read a disk > in > +case of failure to do so. This value defaults to 15 seconds. 
> + > + > @node pager > @subsection pager > > diff --git a/grub-core/disk/ieee1275/ofdisk.c > b/grub-core/disk/ieee1275/ofdisk.c > index c6cba0c8a..675e039c5 100644 > --- a/grub-core/disk/ieee1275/ofdisk.c > +++ b/grub-core/disk/ieee1275/ofdisk.c > @@ -24,6 +24,9 @@ > #include <grub/ieee1275/ofdisk.h> > #include <grub/i18n.h> > #include <grub/time.h> > +#include <grub/env.h> > + > +#define RETRY_DEFAULT_TIMEOUT 15 > > static char *last_devpath; > static grub_ieee1275_ihandle_t last_ihandle; > @@ -452,7 +455,7 @@ compute_dev_path (const char *name) > } > > static grub_err_t > -grub_ofdisk_open (const char *name, grub_disk_t disk) > +grub_ofdisk_open_real (const char *name, grub_disk_t disk) > { > grub_ieee1275_phandle_t dev; > char *devpath; > @@ -525,6 +528,54 @@ grub_ofdisk_open (const char *name, grub_disk_t disk) > return 0; > } > > +static grub_uint64_t > +grub_ofdisk_disk_timeout (grub_disk_t disk) > +{ > + grub_uint64_t retry = RETRY_DEFAULT_TIMEOUT; > + const char *timeout = grub_env_get ("ofdisk_retry_timeout"); > + const char *timeout_end; > + > + if (grub_strstr (disk->name, "fibre-channel") != NULL || > + grub_strstr (disk->name, "vfc-client") != NULL) > + { > + if (timeout == NULL) > + return retry; > + retry = grub_strtoul (timeout, &timeout_end, 10); > + /* Ignore all errors and return default timeout */ > + if (*timeout == '\0' || > + *timeout_end != '\0') > + return RETRY_DEFAULT_TIMEOUT; > + } > + else > + return 0; > + > + return retry; > +} > + > +static grub_err_t > +grub_ofdisk_open (const char *name, grub_disk_t disk) > +{ > + grub_err_t err; > + grub_uint64_t timeout = grub_get_time_ms () + (grub_ofdisk_disk_timeout > (disk) * 1000); > + grub_uint16_t inc = 0; > + > + do > + { > + err = grub_ofdisk_open_real (name, disk); > + if (err == GRUB_ERR_UNKNOWN_DEVICE) > + grub_dprintf ("ofdisk", "Failed to open disk %s.\n", name); > + if (grub_get_time_ms () >= timeout) > + break; > + grub_dprintf ("ofdisk", "Retry to open disk %s.\n", name); > + /* 
> + * Increase wait time for subsequent requests > + * Cur time is used as a source of randomness > + */ > + grub_millisleep ((32 << ++inc) * (grub_get_time_ms () % 32)); > + } while (1); > + return err; > +} > + > static void > grub_ofdisk_close (grub_disk_t disk) > { > @@ -568,8 +619,8 @@ grub_ofdisk_prepare (grub_disk_t disk, > grub_disk_addr_t sector) > } > > static grub_err_t > -grub_ofdisk_read (grub_disk_t disk, grub_disk_addr_t sector, > - grub_size_t size, char *buf) > +grub_ofdisk_read_real (grub_disk_t disk, grub_disk_addr_t sector, > + grub_size_t size, char *buf) > { > grub_err_t err; > grub_ssize_t actual; > @@ -587,6 +638,31 @@ grub_ofdisk_read (grub_disk_t disk, grub_disk_addr_t > sector, > return 0; > } > > +static grub_err_t > +grub_ofdisk_read (grub_disk_t disk, grub_disk_addr_t sector, > + grub_size_t size, char *buf) > +{ > + grub_err_t err; > + grub_uint64_t timeout = grub_get_time_ms () + (grub_ofdisk_disk_timeout > (disk) * 1000); > + grub_uint16_t inc = 0; > + > + do > + { > + err = grub_ofdisk_read_real (disk, sector, size, buf); > + if (err == GRUB_ERR_UNKNOWN_DEVICE) > + grub_dprintf ("ofdisk", "Failed to read disk %s.\n", > (char*)disk->data); > + if (grub_get_time_ms () >= timeout) > + break; > + grub_dprintf ("ofdisk", "Retry to read disk %s.\n", > (char*)disk->data); > + /* > + * Increase wait time for subsequent requests > + * Cur time is used as a source of randomness > + */ > + grub_millisleep ((32 << ++inc) * (grub_get_time_ms () % 32)); > + } while (1); > + return err; > +} > + > static grub_err_t > grub_ofdisk_write (grub_disk_t disk, grub_disk_addr_t sector, > grub_size_t size, const char *buf) > -- > 2.45.2 > > > _______________________________________________ > Grub-devel mailing list > Grub-devel@gnu.org > https://lists.gnu.org/mailman/listinfo/grub-devel >
_______________________________________________ Grub-devel mailing list Grub-devel@gnu.org https://lists.gnu.org/mailman/listinfo/grub-devel