Grabbing the work_mutex keeps probe_active_paths() from running at the same time as multipath_message(). The only messages that could interfere with probing the paths are "disable_group", "enable_group", and "switch_group". These messages could force multipath to pick a new pathgroup while probe_active_paths() was probing the current pathgroup. If the multipath device has a hardware handler, and it switches active pathgroups while there is outstanding IO to a path device, it's possible that IO to the path will fail, even if the path would be usable if it was in the active pathgroup. To avoid this, do not clear the current pathgroup for the *_group messages while probe_active_paths() is running. Instead set a flag, and probe_active_paths() will clear the current pathgroup when it finishes probing the paths. For this to work correctly, multipath needs to check current_pg before next_pg in choose_pgpath(), but before this patch next_pg was only ever set when current_pg was cleared, so this doesn't change the current behavior when paths aren't being probed. Even with this change, it is still possible to switch pathgroups while the probe is running, but only if all the paths have failed, and the probe function will skip them as well in this case.
If multiple DM_MPATH_PROBE_PATHS requests are received at once, there is no point in repeatedly issuing test IOs. Instead, the later probes should wait for the current probe to complete. If current pathgroup is still the same as the one that was just checked, the other probes should skip probing and just check the number of valid paths. Finally, probing the paths should quit early if the multipath device is trying to suspend, instead of continuing to issue test IOs, delaying the suspend. While this patch will not change the behavior of existing multipath users which don't use the DM_MPATH_PROBE_PATHS ioctl, when that ioctl is used, the behavior of the "disable_group", "enable_group", and "switch_group" messages can change subtly. When these messages return, the next IO to the multipath device will no longer be guaranteed to choose a new pathgroup. Instead, choosing a new pathgroup could be delayed by an in-progress DM_MPATH_PROBE_PATHS ioctl. The userspace multipath tools make no assumptions about what will happen to IOs after sending these messages, so this change will not effect already released versions of them, even if the DM_MPATH_PROBE_PATHS ioctl is run alongside them. Signed-off-by: Benjamin Marzinski <bmarz...@redhat.com> --- drivers/md/dm-mpath.c | 118 ++++++++++++++++++++++++++++-------------- 1 file changed, 78 insertions(+), 40 deletions(-) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 53861ad5dd1d..12b7bcae490c 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -79,6 +79,7 @@ struct multipath { struct pgpath *current_pgpath; struct priority_group *current_pg; struct priority_group *next_pg; /* Switch to this PG if set */ + struct priority_group *last_probed_pg; atomic_t nr_valid_paths; /* Total number of usable paths */ unsigned int nr_priority_groups; @@ -87,6 +88,7 @@ struct multipath { const char *hw_handler_name; char *hw_handler_params; wait_queue_head_t pg_init_wait; /* Wait for pg_init completion */ + wait_queue_head_t probe_wait; /* Wait for probing paths */ unsigned int pg_init_retries; /* Number of times to retry pg_init */ unsigned int pg_init_delay_msecs; /* Number of msecs before pg_init retry */ atomic_t pg_init_in_progress; /* Only one pg_init allowed at once */ @@ -100,6 +102,7 @@ struct multipath { struct bio_list queued_bios; struct timer_list nopath_timer; /* Timeout for queue_if_no_path */ + bool is_suspending; }; /* @@ -132,6 +135,8 @@ static void queue_if_no_path_timeout_work(struct timer_list *t); #define MPATHF_PG_INIT_DISABLED 4 /* pg_init is not currently allowed */ #define MPATHF_PG_INIT_REQUIRED 5 /* pg_init needs calling? */ #define MPATHF_PG_INIT_DELAY_RETRY 6 /* Delay pg_init retry? */ +#define MPATHF_DELAY_PG_SWITCH 7 /* Delay switching pg if it still has paths */ +#define MPATHF_NEED_PG_SWITCH 8 /* Need to switch pgs after the delay has ended */ static bool mpath_double_check_test_bit(int MPATHF_bit, struct multipath *m) { @@ -254,6 +259,7 @@ static int alloc_multipath_stage2(struct dm_target *ti, struct multipath *m) atomic_set(&m->pg_init_count, 0); m->pg_init_delay_msecs = DM_PG_INIT_DELAY_DEFAULT; init_waitqueue_head(&m->pg_init_wait); + init_waitqueue_head(&m->probe_wait); return 0; } @@ -413,13 +419,21 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) goto failed; } + /* Don't change PG until it has no remaining paths */ + pg = READ_ONCE(m->current_pg); + if (pg) { + pgpath = choose_path_in_pg(m, pg, nr_bytes); + if (!IS_ERR_OR_NULL(pgpath)) + return pgpath; + } + /* Were we instructed to switch PG? */ if (READ_ONCE(m->next_pg)) { spin_lock_irqsave(&m->lock, flags); pg = m->next_pg; if (!pg) { spin_unlock_irqrestore(&m->lock, flags); - goto check_current_pg; + goto check_all_pgs; } m->next_pg = NULL; spin_unlock_irqrestore(&m->lock, flags); @@ -427,16 +441,7 @@ static struct pgpath *choose_pgpath(struct multipath *m, size_t nr_bytes) if (!IS_ERR_OR_NULL(pgpath)) return pgpath; } - - /* Don't change PG until it has no remaining paths */ -check_current_pg: - pg = READ_ONCE(m->current_pg); - if (pg) { - pgpath = choose_path_in_pg(m, pg, nr_bytes); - if (!IS_ERR_OR_NULL(pgpath)) - return pgpath; - } - +check_all_pgs: /* * Loop through priority groups until we find a valid path. * First time we skip PGs marked 'bypassed'. @@ -1439,15 +1444,19 @@ static int action_dev(struct multipath *m, dev_t dev, action_fn action) * Temporarily try to avoid having to use the specified PG */ static void bypass_pg(struct multipath *m, struct priority_group *pg, - bool bypassed) + bool bypassed, bool can_be_delayed) { unsigned long flags; spin_lock_irqsave(&m->lock, flags); pg->bypassed = bypassed; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (can_be_delayed && test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } spin_unlock_irqrestore(&m->lock, flags); @@ -1476,8 +1485,12 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) if (--pgnum) continue; - m->current_pgpath = NULL; - m->current_pg = NULL; + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) + set_bit(MPATHF_NEED_PG_SWITCH, &m->flags); + else { + m->current_pgpath = NULL; + m->current_pg = NULL; + } m->next_pg = pg; } spin_unlock_irqrestore(&m->lock, flags); @@ -1507,7 +1520,7 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, bool bypassed) break; } - bypass_pg(m, pg, bypassed); + bypass_pg(m, pg, bypassed, true); return 0; } @@ -1561,7 +1574,7 @@ static void pg_init_done(void *data, int errors) * Probably doing something like FW upgrade on the * controller so try the other pg. */ - bypass_pg(m, pg, true); + bypass_pg(m, pg, true, false); break; case SCSI_DH_RETRY: /* Wait before retrying. */ @@ -1741,7 +1754,11 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone, static void multipath_presuspend(struct dm_target *ti) { struct multipath *m = ti->private; + unsigned long flags; + spin_lock_irqsave(&m->lock, flags); + m->is_suspending = true; + spin_unlock_irqrestore(&m->lock, flags); /* FIXME: bio-based shouldn't need to always disable queue_if_no_path */ if (m->queue_mode == DM_TYPE_BIO_BASED || !dm_noflush_suspending(m->ti)) queue_if_no_path(m, false, true, __func__); @@ -1765,6 +1782,7 @@ static void multipath_resume(struct dm_target *ti) unsigned long flags; spin_lock_irqsave(&m->lock, flags); + m->is_suspending = false; if (test_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags)) { set_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags); clear_bit(MPATHF_SAVED_QUEUE_IF_NO_PATH, &m->flags); @@ -1845,10 +1863,10 @@ static void multipath_status(struct dm_target *ti, status_type_t type, DMEMIT("%u ", m->nr_priority_groups); - if (m->next_pg) - pg_num = m->next_pg->pg_num; - else if (m->current_pg) + if (m->current_pg) pg_num = m->current_pg->pg_num; + else if (m->next_pg) + pg_num = m->next_pg->pg_num; else pg_num = (m->nr_priority_groups ? 1 : 0); @@ -2077,35 +2095,55 @@ static int probe_path(struct pgpath *pgpath) static int probe_active_paths(struct multipath *m) { struct pgpath *pgpath; - struct priority_group *pg; + struct priority_group *pg = NULL; unsigned long flags; int r = 0; - mutex_lock(&m->work_mutex); - spin_lock_irqsave(&m->lock, flags); - if (test_bit(MPATHF_QUEUE_IO, &m->flags)) - pg = NULL; - else - pg = m->current_pg; + if (test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags)) { + wait_event_lock_irq(m->probe_wait, + !test_bit(MPATHF_DELAY_PG_SWITCH, &m->flags), + m->lock); + /* + * if we waited because a probe was already in progress, + * and it probed the current active pathgroup, don't + * reprobe. Just return the number of valid paths + */ + if (m->current_pg == m->last_probed_pg) + goto skip_probe; + } + if (!m->current_pg || m->is_suspending || + test_bit(MPATHF_QUEUE_IO, &m->flags)) + goto skip_probe; + set_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + pg = m->last_probed_pg = m->current_pg; spin_unlock_irqrestore(&m->lock, flags); - if (pg) { - list_for_each_entry(pgpath, &pg->pgpaths, list) { - if (!pgpath->is_active) - continue; + list_for_each_entry(pgpath, &pg->pgpaths, list) { + if (pg != READ_ONCE(m->current_pg) || + READ_ONCE(m->is_suspending)) + goto out; + if (!pgpath->is_active) + continue; - r = probe_path(pgpath); - if (r < 0) - goto out; - } + r = probe_path(pgpath); + if (r < 0) + goto out; } - if (!atomic_read(&m->nr_valid_paths)) - r = -ENOTCONN; - out: - mutex_unlock(&m->work_mutex); + spin_lock_irqsave(&m->lock, flags); + clear_bit(MPATHF_DELAY_PG_SWITCH, &m->flags); + if (test_and_clear_bit(MPATHF_NEED_PG_SWITCH, &m->flags)) { + m->current_pgpath = NULL; + m->current_pg = NULL; + } +skip_probe: + if (r == 0 && !atomic_read(&m->nr_valid_paths)) + r = -ENOTCONN; + spin_unlock_irqrestore(&m->lock, flags); + if (pg) + wake_up(&m->probe_wait); return r; } -- 2.48.1