Add code for path selection. NVMe ANA state is abstracted into enum mpath_access_state so that the same infrastructure can also serve SCSI ALUA. Callbacks .is_disabled, .is_optimized, and .get_access_state are added to query the path access state.
Path selection modes round-robin, NUMA, and queue-depth are added, same as NVMe supports. NVMe has almost like-for-like equivalents here: - __mpath_find_path() -> __nvme_find_path() - mpath_find_path() -> nvme_find_path() and similar for all introduced callee functions. Functions mpath_set_iopolicy() and mpath_get_iopolicy() are added for setting default iopolicy. A separate mpath_iopolicy structure is introduced. There is no iopolicy member included in the mpath_head structure as it may not suit NVMe, where iopolicy is per-subsystem and not per namespace. Signed-off-by: John Garry <[email protected]> --- include/linux/multipath.h | 36 ++++++ lib/multipath.c | 251 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 287 insertions(+) diff --git a/include/linux/multipath.h b/include/linux/multipath.h index be9dd9fb83345..c964a1aba9c42 100644 --- a/include/linux/multipath.h +++ b/include/linux/multipath.h @@ -7,6 +7,22 @@ extern const struct block_device_operations mpath_ops; +enum mpath_iopolicy_e { + MPATH_IOPOLICY_NUMA, + MPATH_IOPOLICY_RR, + MPATH_IOPOLICY_QD, +}; + +struct mpath_iopolicy { + enum mpath_iopolicy_e iopolicy; +}; + +enum mpath_access_state { + MPATH_STATE_OPTIMIZED, + MPATH_STATE_ACTIVE, + MPATH_STATE_INVALID = 0xFF +}; + struct mpath_disk { struct gendisk *disk; struct kref ref; @@ -18,10 +34,16 @@ struct mpath_disk { struct mpath_device { struct list_head siblings; + atomic_t nr_active; struct gendisk *disk; + int numa_node; }; struct mpath_head_template { + bool (*is_disabled)(struct mpath_device *); + bool (*is_optimized)(struct mpath_device *); + enum mpath_access_state (*get_access_state)(struct mpath_device *); + enum mpath_iopolicy_e (*get_iopolicy)(struct mpath_head *); const struct attribute_group **device_groups; }; @@ -50,6 +72,14 @@ static inline struct mpath_disk *mpath_gendisk_to_disk(struct gendisk *disk) return mpath_bd_device_to_disk(disk_to_dev(disk)); } +static inline enum mpath_iopolicy_e mpath_read_iopolicy( + struct 
mpath_iopolicy *mpath_iopolicy) +{ + return READ_ONCE(mpath_iopolicy->iopolicy); +} +void mpath_synchronize(struct mpath_head *mpath_head); +int mpath_set_iopolicy(const char *val, int *iopolicy); +int mpath_get_iopolicy(char *buf, int iopolicy); int mpath_get_head(struct mpath_head *mpath_head); void mpath_put_head(struct mpath_head *mpath_head); struct mpath_head *mpath_alloc_head(void); @@ -66,4 +96,10 @@ static inline bool is_mpath_head(struct gendisk *disk) { return disk->fops == &mpath_ops; } + +static inline bool mpath_qd_iopolicy(struct mpath_iopolicy *mpath_iopolicy) +{ + return mpath_read_iopolicy(mpath_iopolicy) == MPATH_IOPOLICY_QD; +} + #endif // _LIBMULTIPATH_H diff --git a/lib/multipath.c b/lib/multipath.c index 88efb0ae16acb..65a0d2d2bf524 100644 --- a/lib/multipath.c +++ b/lib/multipath.c @@ -6,8 +6,243 @@ #include <linux/module.h> #include <linux/multipath.h> +static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head); + static struct workqueue_struct *mpath_wq; +static const char *mpath_iopolicy_names[] = { + [MPATH_IOPOLICY_NUMA] = "numa", + [MPATH_IOPOLICY_RR] = "round-robin", + [MPATH_IOPOLICY_QD] = "queue-depth", +}; + +int mpath_set_iopolicy(const char *val, int *iopolicy) +{ + if (!val) + return -EINVAL; + if (!strncmp(val, "numa", 4)) + *iopolicy = MPATH_IOPOLICY_NUMA; + else if (!strncmp(val, "round-robin", 11)) + *iopolicy = MPATH_IOPOLICY_RR; + else if (!strncmp(val, "queue-depth", 11)) + *iopolicy = MPATH_IOPOLICY_QD; + else + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL_GPL(mpath_set_iopolicy); + +int mpath_get_iopolicy(char *buf, int iopolicy) +{ + return sprintf(buf, "%s\n", mpath_iopolicy_names[iopolicy]); +} +EXPORT_SYMBOL_GPL(mpath_get_iopolicy); + + +void mpath_synchronize(struct mpath_head *mpath_head) +{ + synchronize_srcu(&mpath_head->srcu); +} +EXPORT_SYMBOL_GPL(mpath_synchronize); + +static bool mpath_path_is_disabled(struct mpath_head *mpath_head, + struct mpath_device *mpath_device) +{ + return 
mpath_head->mpdt->is_disabled(mpath_device);
+}
+
+/*
+ * Scan all usable paths and pick the best one for @node: the closest
+ * optimized path, falling back to the closest merely-active path.
+ * Caches the winner in current_path[node]. Returns NULL if no path
+ * is usable. Caller must hold the SRCU read lock.
+ */
+static struct mpath_device *__mpath_find_path(struct mpath_head *mpath_head,
+		enum mpath_iopolicy_e iopolicy, int node)
+{
+	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+	/*
+	 * NULL-initialize: both pointers are tested below, and with an
+	 * empty list or all paths disabled neither assignment in the
+	 * loop executes — reading them uninitialized would be UB.
+	 */
+	struct mpath_device *mpath_dev_found = NULL,
+			*mpath_dev_fallback = NULL, *mpath_device;
+
+	list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+			srcu_read_lock_held(&mpath_head->srcu)) {
+		if (mpath_path_is_disabled(mpath_head, mpath_device))
+			continue;
+
+		if (mpath_device->numa_node != NUMA_NO_NODE &&
+		    (iopolicy == MPATH_IOPOLICY_NUMA))
+			distance = node_distance(node, mpath_device->numa_node);
+		else
+			distance = LOCAL_DISTANCE;
+
+		switch (mpath_head->mpdt->get_access_state(mpath_device)) {
+		case MPATH_STATE_OPTIMIZED:
+			if (distance < found_distance) {
+				found_distance = distance;
+				mpath_dev_found = mpath_device;
+			}
+			break;
+		case MPATH_STATE_ACTIVE:
+			if (distance < fallback_distance) {
+				fallback_distance = distance;
+				mpath_dev_fallback = mpath_device;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	if (!mpath_dev_found)
+		mpath_dev_found = mpath_dev_fallback;
+
+	if (mpath_dev_found)
+		rcu_assign_pointer(mpath_head->current_path[node],
+				mpath_dev_found);
+
+	return mpath_dev_found;
+}
+
+/* Next sibling in the path list, wrapping to the head (NULL if empty). */
+static struct mpath_device *mpath_next_dev(struct mpath_head *mpath_head,
+		struct mpath_device *mpath_dev)
+{
+	mpath_dev = list_next_or_null_rcu(&mpath_head->dev_list,
+			&mpath_dev->siblings, struct mpath_device,
+			siblings);
+
+	if (mpath_dev)
+		return mpath_dev;
+	return list_first_or_null_rcu(&mpath_head->dev_list,
+			struct mpath_device, siblings);
+}
+
+static struct mpath_device *mpath_round_robin_path(
+		struct mpath_head *mpath_head,
+		enum mpath_iopolicy_e iopolicy)
+{
+	struct mpath_device *mpath_device, *found = NULL;
+	int node = numa_node_id();
+	enum mpath_access_state access_state_old;
+	struct mpath_device *old =
+		srcu_dereference(mpath_head->current_path[node],
+				&mpath_head->srcu);
+
+	if 
(unlikely(!old)) + return __mpath_find_path(mpath_head, iopolicy, node); + + if (list_is_singular(&mpath_head->dev_list)) { + if (mpath_path_is_disabled(mpath_head, old)) + return NULL; + return old; + } + + for (mpath_device = mpath_next_dev(mpath_head, old); + mpath_device && mpath_device != old; + mpath_device = mpath_next_dev(mpath_head, mpath_device)) { + enum mpath_access_state access_state; + + if (mpath_path_is_disabled(mpath_head, mpath_device)) + continue; + access_state = mpath_head->mpdt->get_access_state(mpath_device); + if (access_state == MPATH_STATE_OPTIMIZED) { + found = mpath_device; + goto out; + } + if (access_state == MPATH_STATE_ACTIVE) + found = mpath_device; + } + + /* + * The loop above skips the current path for round-robin semantics. + * Fall back to the current path if either: + * - no other optimized path found and current is optimized, + * - no other usable path found and current is usable. + */ + access_state_old = mpath_head->mpdt->get_access_state(old); + if (!mpath_path_is_disabled(mpath_head, old) && + (access_state_old == MPATH_STATE_OPTIMIZED || + (!found && access_state_old == MPATH_STATE_ACTIVE))) + return old; + + if (!found) + return NULL; +out: + rcu_assign_pointer(mpath_head->current_path[node], found); + + return found; +} + +static struct mpath_device *mpath_queue_depth_path(struct mpath_head *mpath_head) +{ + struct mpath_device *best_opt = NULL, *mpath_device; + struct mpath_device *best_nonopt = NULL; + unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX; + unsigned int depth; + + list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings, + srcu_read_lock_held(&mpath_head->srcu)) { + + if (mpath_path_is_disabled(mpath_head, mpath_device)) + continue; + + depth = atomic_read(&mpath_device->nr_active); + + switch (mpath_head->mpdt->get_access_state(mpath_device)) { + case MPATH_STATE_OPTIMIZED: + if (depth < min_depth_opt) { + min_depth_opt = depth; + best_opt = mpath_device; + } + break; + 
case MPATH_STATE_ACTIVE: + if (depth < min_depth_nonopt) { + min_depth_nonopt = depth; + best_nonopt = mpath_device; + } + break; + default: + break; + } + + if (min_depth_opt == 0) + return best_opt; + } + + return best_opt ? best_opt : best_nonopt; +} + +static inline bool mpath_path_is_optimized(struct mpath_head *mpath_head, + struct mpath_device *mpath_device) +{ + return mpath_head->mpdt->is_optimized(mpath_device); +} + +static struct mpath_device *mpath_numa_path(struct mpath_head *mpath_head, + enum mpath_iopolicy_e iopolicy) +{ + int node = numa_node_id(); + struct mpath_device *mpath_device; + + mpath_device = srcu_dereference(mpath_head->current_path[node], + &mpath_head->srcu); + if (unlikely(!mpath_device)) + return __mpath_find_path(mpath_head, iopolicy, node); + if (unlikely(!mpath_path_is_optimized(mpath_head, mpath_device))) + return __mpath_find_path(mpath_head, iopolicy, node); + return mpath_device; +} + +__maybe_unused +static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head) +{ + enum mpath_iopolicy_e iopolicy = + mpath_head->mpdt->get_iopolicy(mpath_head); + + switch (iopolicy) { + case MPATH_IOPOLICY_QD: + return mpath_queue_depth_path(mpath_head); + case MPATH_IOPOLICY_RR: + return mpath_round_robin_path(mpath_head, iopolicy); + default: + return mpath_numa_path(mpath_head, iopolicy); + } +} + static void mpath_free_head(struct kref *ref) { struct mpath_head *mpath_head = @@ -99,6 +334,7 @@ void mpath_remove_disk(struct mpath_disk *mpath_disk) if (test_and_clear_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags)) { struct gendisk *disk = mpath_disk->disk; + mpath_synchronize(mpath_head); del_gendisk(disk); } } @@ -158,6 +394,21 @@ void mpath_device_set_live(struct mpath_disk *mpath_disk, } queue_work(mpath_wq, &mpath_disk->partition_scan_work); } + + mutex_lock(&mpath_head->lock); + if (mpath_path_is_optimized(mpath_head, mpath_device)) { + int node, srcu_idx; + + srcu_idx = srcu_read_lock(&mpath_head->srcu); + 
for_each_online_node(node) + __mpath_find_path(mpath_head, + mpath_head->mpdt->get_iopolicy(mpath_head), + node); + srcu_read_unlock(&mpath_head->srcu, srcu_idx); + } + mutex_unlock(&mpath_head->lock); + + mpath_synchronize(mpath_head); } EXPORT_SYMBOL_GPL(mpath_device_set_live); -- 2.43.5

