[PATCH 4.19 42/45] cgroup: Include dying leaders with live threads in PROCS iterations
From: Tejun Heo

commit c03cd7738a83b13739f00546166969342c8ff014 upstream.

CSS_TASK_ITER_PROCS currently iterates live group leaders; however,
this means that a process with a dying leader and live threads will be
skipped.  IOW, cgroup.procs might be empty while cgroup.threads isn't,
which is confusing to say the least.

Fix it by making cset track dying tasks and include dying leaders with
live threads in PROCS iteration.

Signed-off-by: Tejun Heo
Reported-and-tested-by: Topi Miettinen
Cc: Oleg Nesterov
Signed-off-by: Greg Kroah-Hartman

---
 include/linux/cgroup-defs.h |    1 +
 include/linux/cgroup.h      |    1 +
 kernel/cgroup/cgroup.c      |   44 ++++++++++++++++++++++++++++++++-------
 3 files changed, 39 insertions(+), 7 deletions(-)

--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -207,6 +207,7 @@ struct css_set {
 	 */
 	struct list_head tasks;
 	struct list_head mg_tasks;
+	struct list_head dying_tasks;
 
 	/* all css_task_iters currently walking this cset */
 	struct list_head task_iters;
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -60,6 +60,7 @@ struct css_task_iter {
 	struct list_head		*task_pos;
 	struct list_head		*tasks_head;
 	struct list_head		*mg_tasks_head;
+	struct list_head		*dying_tasks_head;
 
 	struct css_set			*cur_cset;
 	struct css_set			*cur_dcset;
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -673,6 +673,7 @@ struct css_set init_css_set = {
 	.dom_cset		= &init_css_set,
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
 	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 	.threaded_csets	= LIST_HEAD_INIT(init_css_set.threaded_csets),
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -1145,6 +1146,7 @@ static struct css_set *find_css_set(stru
 	cset->dom_cset = cset;
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
+	INIT_LIST_HEAD(&cset->dying_tasks);
 	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_LIST_HEAD(&cset->threaded_csets);
 	INIT_HLIST_NODE(&cset->hlist);
@@ -4152,15 +4154,18 @@ static void css_task_iter_advance_css_se
 			it->task_pos = NULL;
 			return;
 		}
-	} while (!css_set_populated(cset));
+	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
 
 	if (!list_empty(&cset->tasks))
 		it->task_pos = cset->tasks.next;
-	else
+	else if (!list_empty(&cset->mg_tasks))
 		it->task_pos = cset->mg_tasks.next;
+	else
+		it->task_pos = cset->dying_tasks.next;
 
 	it->tasks_head = &cset->tasks;
 	it->mg_tasks_head = &cset->mg_tasks;
+	it->dying_tasks_head = &cset->dying_tasks;
 
 	/*
 	 * We don't keep css_sets locked across iteration steps and thus
@@ -4199,6 +4204,8 @@ static void css_task_iter_skip(struct cs
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
+	struct task_struct *task;
+
 	lockdep_assert_held(&css_set_lock);
 repeat:
 	if (it->task_pos) {
@@ -4215,17 +4222,32 @@ repeat:
 		if (it->task_pos == it->tasks_head)
 			it->task_pos = it->mg_tasks_head->next;
 		if (it->task_pos == it->mg_tasks_head)
+			it->task_pos = it->dying_tasks_head->next;
+		if (it->task_pos == it->dying_tasks_head)
 			css_task_iter_advance_css_set(it);
 	} else {
 		/* called from start, proceed to the first cset */
 		css_task_iter_advance_css_set(it);
 	}
 
-	/* if PROCS, skip over tasks which aren't group leaders */
-	if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
-	    !thread_group_leader(list_entry(it->task_pos, struct task_struct,
-					    cg_list)))
-		goto repeat;
+	if (!it->task_pos)
+		return;
+
+	task = list_entry(it->task_pos, struct task_struct, cg_list);
+
+	if (it->flags & CSS_TASK_ITER_PROCS) {
+		/* if PROCS, skip over tasks which aren't group leaders */
+		if (!thread_group_leader(task))
+			goto repeat;
+
+		/* and dying leaders w/o live member threads */
+		if (!atomic_read(&task->signal->live))
+			goto repeat;
+	} else {
+		/* skip all dying ones */
+		if (task->flags & PF_EXITING)
+			goto repeat;
+	}
 }
 
 /**
@@ -5682,6 +5704,7 @@ void cgroup_exi
[PATCH 4.19 23/45] net: fix ifindex collision during namespace removal
From: Jiri Pirko

[ Upstream commit 55b40dbf0e76b4bfb9d8b3a16a0208640a9a45df ]

Commit aca51397d014 ("netns: Fix arbitrary net_device-s corruptions on
net_ns stop.") introduced a possibility to hit a BUG in case device is
returning back to init_net and two following conditions are met:
1) dev->ifindex value is used in a name of another "dev%d" device in
   init_net.
2) dev->name is used by another device in init_net.

Under real life circumstances this is hard to get.  Therefore this has
been present happily for over 10 years.

To reproduce:

$ ip a
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
3: enp0s2: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff

$ ip netns add ns1
$ ip -n ns1 link add dummy1ns1 type dummy
$ ip -n ns1 link add dummy2ns1 type dummy
$ ip link set enp0s2 netns ns1
$ ip -n ns1 link set enp0s2 name dummy0
[  100.858894] virtio_net virtio0 dummy0: renamed from enp0s2
$ ip link add dev4 type dummy
$ ip -n ns1 a
1: lo: mtu 65536 qdisc noop state DOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: dummy1ns1: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 16:63:4c:38:3e:ff brd ff:ff:ff:ff:ff:ff
3: dummy2ns1: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether aa:9e:86:dd:6b:5d brd ff:ff:ff:ff:ff:ff
4: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
$ ip a
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
4: dev4: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 5a:e1:4a:b6:ec:f8 brd ff:ff:ff:ff:ff:ff

$ ip netns del ns1
[  158.717795] default_device_exit: failed to move dummy0 to init_net: -17
[  158.719316] ------------[ cut here ]------------
[  158.720591] kernel BUG at net/core/dev.c:9824!
[  158.722260] invalid opcode: [#1] SMP KASAN PTI
[  158.723728] CPU: 0 PID: 56 Comm: kworker/u2:1 Not tainted 5.3.0-rc1+ #18
[  158.725422] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
[  158.727508] Workqueue: netns cleanup_net
[  158.728915] RIP: 0010:default_device_exit.cold+0x1d/0x1f
[  158.730683] Code: 84 e8 18 c9 3e fe 0f 0b e9 70 90 ff ff e8 36 e4 52 fe 89 d9 4c 89 e2 48 c7 c6 80 d6 25 84 48 c7 c7 20 c0 25 84 e8 f4 c8 3e
[  158.736854] RSP: 0018:8880347e7b90 EFLAGS: 00010282
[  158.738752] RAX: 003b RBX: ffef RCX:
[  158.741369] RDX: RSI: 8128013d RDI: ed10068fcf64
[  158.743418] RBP: 888033550170 R08: 003b R09: fbfff0b94b9c
[  158.745626] R10: fbfff0b94b9b R11: 85ca5cdf R12: 888032f28000
[  158.748405] R13: dc00 R14: 8880335501b8 R15: 1110068fcf72
[  158.750638] FS: () GS:88803600() knlGS:
[  158.752944] CS: 0010 DS: ES: CR0: 80050033
[  158.755245] CR2: 7fe8b45d21d0 CR3: 340b4005 CR4: 00360ef0
[  158.757654] DR0: DR1: DR2:
[  158.760012] DR3: DR6: fffe0ff0 DR7: 0400
[  158.762758] Call Trace:
[  158.763882]  ? dev_change_net_namespace+0xbb0/0xbb0
[  158.766148]  ? devlink_nl_cmd_set_doit+0x520/0x520
[  158.768034]  ? dev_change_net_namespace+0xbb0/0xbb0
[  158.769870]  ops_exit_list.isra.0+0xa8/0x150
[  158.771544]  cleanup_net+0x446/0x8f0
[  158.772945]  ? unregister_pernet_operations+0x4a0/0x4a0
[  158.775294]  process_one_work+0xa1a/0x1740
[  158.776896]  ? pwq_dec_nr_in_flight+0x310/0x310
[  158.779143]  ? do_raw_spin_lock+0x11b/0x280
[  158.780848]  worker_thread+0x9e/0x1060
[  158.782500]  ? process_one_work+0x1740/0x1740
[  158.784454]  kthread+0x31b/0x420
[  158.786082]  ? __kthread_create_on_node+0x3f0/0x3f0
[  158.788286]  ret_from_fork+0x3a/0x50
[  158.789871] ---[ end trace defd6c657c71f936 ]---
[  158.792273] RIP: 0010:default_device_exit.cold+0x1d/0x1f
[  158.795478] Code: 84 e8 18 c9 3e fe 0f 0b e9 70 90 ff ff e8 36 e4 52 fe 89 d9 4c 89 e2 48 c7 c6 80 d6 25 84 48 c7 c7 20 c0 25 84 e8 f4 c8 3e
[  158.804854] RSP: 0018:8880347e7b90 EFLAGS: 00010282
[  158.807865] RAX: 00
[PATCH 4.19 40/45] cgroup: Call cgroup_release() before __exit_signal()
From: Tejun Heo

commit 6b115bf58e6f013ca75e7115aabcbd56c20ff31d upstream.

cgroup_release() calls cgroup_subsys->release() which is used by the
pids controller to uncharge its pid.  We want to use it to manage
iteration of dying tasks which requires putting it before
__unhash_process().  Move cgroup_release() above __exit_signal().
While this makes it uncharge before the pid is freed, pid is RCU freed
anyway and the window is very narrow.

Signed-off-by: Tejun Heo
Cc: Oleg Nesterov
Signed-off-by: Greg Kroah-Hartman

---
 kernel/exit.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -194,6 +194,7 @@ repeat:
 	rcu_read_unlock();
 
 	proc_flush_task(p);
+	cgroup_release(p);
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
@@ -219,7 +220,6 @@ repeat:
 	}
 
 	write_unlock_irq(&tasklist_lock);
-	cgroup_release(p);
 	release_thread(p);
 	call_rcu(&p->rcu, delayed_put_task_struct);
[PATCH 4.14 13/33] ife: error out when nla attributes are empty
From: Cong Wang

[ Upstream commit c8ec4632c6ac9cda0e8c3d51aa41eeab66585bd5 ]

act_ife at least requires TCA_IFE_PARMS, so we have to bail out when
there is no attribute passed in.

Reported-by: syzbot+fbb5b288c9cb6a2ee...@syzkaller.appspotmail.com
Fixes: ef6980b6becb ("introduce IFE action")
Cc: Jamal Hadi Salim
Cc: Jiri Pirko
Signed-off-by: Cong Wang
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/sched/act_ife.c |    3 +++
 1 file changed, 3 insertions(+)

--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -459,6 +459,9 @@ static int tcf_ife_init(struct net *net,
 	int ret = 0;
 	int err;
 
+	if (!nla)
+		return -EINVAL;
+
 	err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy, NULL);
 	if (err < 0)
 		return err;
[PATCH 4.19 22/45] net: bridge: mcast: dont delete permanent entries when fast leave is enabled
From: Nikolay Aleksandrov

[ Upstream commit 5c725b6b65067909548ac9ca9bc777098ec9883d ]

When permanent entries were introduced by the commit below, they were
exempt from timing out and thus igmp leave wouldn't affect them unless
fast leave was enabled on the port, which was added before permanent
entries existed.  It shouldn't matter whether fast leave is enabled or
not: if the user added a permanent entry, it shouldn't be deleted on
igmp leave.

Before:
$ echo 1 > /sys/class/net/eth4/brport/multicast_fast_leave
$ bridge mdb add dev br0 port eth4 grp 229.1.1.1 permanent
$ bridge mdb show
dev br0 port eth4 grp 229.1.1.1 permanent

< join and leave 229.1.1.1 on eth4 >

$ bridge mdb show
$

After:
$ echo 1 > /sys/class/net/eth4/brport/multicast_fast_leave
$ bridge mdb add dev br0 port eth4 grp 229.1.1.1 permanent
$ bridge mdb show
dev br0 port eth4 grp 229.1.1.1 permanent

< join and leave 229.1.1.1 on eth4 >

$ bridge mdb show
dev br0 port eth4 grp 229.1.1.1 permanent

Fixes: ccb1c31a7a87 ("bridge: add flags to distinguish permanent mdb entires")
Signed-off-by: Nikolay Aleksandrov
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/bridge/br_multicast.c |    3 +++
 1 file changed, 3 insertions(+)

--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1621,6 +1621,9 @@ br_multicast_leave_group(struct net_brid
 		if (!br_port_group_equal(p, port, src))
 			continue;
 
+		if (p->flags & MDB_PG_FLAGS_PERMANENT)
+			break;
+
 		rcu_assign_pointer(*pp, p->next);
 		hlist_del_init(&p->mglist);
 		del_timer(&p->timer);
[PATCH 4.19 41/45] cgroup: Implement css_task_iter_skip()
From: Tejun Heo

commit b636fd38dc40113f853337a7d2a6885ad23b8811 upstream.

When a task is moved out of a cset, task iterators pointing to the
task are advanced using the normal css_task_iter_advance() call.  This
is fine but we'll be tracking dying tasks on csets and thus moving
tasks from cset->tasks to (to be added) cset->dying_tasks.  When we
remove a task from cset->tasks, if we advance the iterators, they may
move over to the next cset before we had the chance to add the task
back on the dying list, which can allow the task to escape iteration.

This patch separates out skipping from advancing.  Skipping only moves
the affected iterators to the next pointer rather than fully advancing
it and the following advancing will recognize that the cursor has
already been moved forward and do the rest of advancing.  This ensures
that when a task moves from one list to another in its cset, as long
as it moves in the right direction, it's always visible to iteration.

This doesn't cause any visible behavior changes.

Signed-off-by: Tejun Heo
Cc: Oleg Nesterov
Signed-off-by: Greg Kroah-Hartman

---
 include/linux/cgroup.h |    3 ++
 kernel/cgroup/cgroup.c |   60 +++++++++++++++++++++++++----------------------
 2 files changed, 39 insertions(+), 24 deletions(-)

--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -43,6 +43,9 @@
 /* walk all threaded css_sets in the domain */
 #define CSS_TASK_ITER_THREADED		(1U << 1)
 
+/* internal flags */
+#define CSS_TASK_ITER_SKIPPED		(1U << 16)
+
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
 	struct cgroup_subsys		*ss;
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -212,7 +212,8 @@ static struct cftype cgroup_base_files[]
 
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
-static void css_task_iter_advance(struct css_task_iter *it);
+static void css_task_iter_skip(struct css_task_iter *it,
+			       struct task_struct *task);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 					      struct cgroup_subsys *ss);
@@ -775,6 +776,21 @@ static void css_set_update_populated(str
 		cgroup_update_populated(link->cgrp, populated);
 }
 
+/*
+ * @task is leaving, advance task iterators which are pointing to it so
+ * that they can resume at the next position.  Advancing an iterator might
+ * remove it from the list, use safe walk.  See css_task_iter_skip() for
+ * details.
+ */
+static void css_set_skip_task_iters(struct css_set *cset,
+				    struct task_struct *task)
+{
+	struct css_task_iter *it, *pos;
+
+	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+		css_task_iter_skip(it, task);
+}
+
 /**
  * css_set_move_task - move a task from one css_set to another
  * @task: task being moved
@@ -800,22 +816,9 @@ static void css_set_move_task(struct tas
 		css_set_update_populated(to_cset, true);
 
 	if (from_cset) {
-		struct css_task_iter *it, *pos;
-
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
-		/*
-		 * @task is leaving, advance task iterators which are
-		 * pointing to it so that they can resume at the next
-		 * position.  Advancing an iterator might remove it from
-		 * the list, use safe walk.  See css_task_iter_advance*()
-		 * for details.
-		 */
-		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
-					 iters_node)
-			if (it->task_pos == &task->cg_list)
-				css_task_iter_advance(it);
-
+		css_set_skip_task_iters(from_cset, task);
 		list_del_init(&task->cg_list);
 		if (!css_set_populated(from_cset))
 			css_set_update_populated(from_cset, false);
@@ -4183,10 +4186,19 @@ static void css_task_iter_advance_css_se
 	list_add(&it->iters_node, &cset->task_iters);
 }
 
-static void css_task_iter_advance(struct css_task_iter *it)
+static void css_task_iter_skip(struct css_task_iter *it,
+			       struct task_struct *task)
 {
-	struct list_head *next;
+	lockdep_assert_held(&css_set_lock);
+
+	if (it->task_pos == &task->cg_list) {
+		it->task_pos = it->task_pos->next;
+		it->flags |= CSS_TASK_ITER_SKIPPED;
+	}
+}
 
+static void css_task_iter_advance(struct css_task_iter *it)
+{
 	lockdep_assert_held(&css_set_lock);
 repeat:
 	if (it->task_pos) {
@@ -4195,15 +4207,15 @@ repeat:
 		 * consumed first and then ->mg_tasks.  After ->mg_tasks,
 		 * we move onto the next cset.
 		 */
-
[PATCH 4.19 21/45] net: bridge: delete local fdb on device init failure
From: Nikolay Aleksandrov

[ Upstream commit d7bae09fa008c6c9a489580db0a5a12063b97f97 ]

On initialization failure we have to delete the local fdb which was
inserted due to the default pvid creation.  This problem has been
present since the inception of default_pvid.  Note that currently
there are 2 cases:
1) in br_dev_init() when br_multicast_init() fails
2) if register_netdevice() fails after calling ndo_init()

This patch takes care of both since br_vlan_flush() is called on both
occasions.  Also the new fdb delete would be a no-op on normal bridge
device destruction since the local fdb would've been already flushed
by br_dev_delete().  This is not an issue for ports since
nbp_vlan_init() is called last when adding a port, thus nothing can
fail after it.

Reported-by: syzbot+88533dc8b582309bf...@syzkaller.appspotmail.com
Fixes: 5be5a2df40f0 ("bridge: Add filtering support for default_pvid")
Signed-off-by: Nikolay Aleksandrov
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/bridge/br_vlan.c |    5 +++++
 1 file changed, 5 insertions(+)

--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -677,6 +677,11 @@ void br_vlan_flush(struct net_bridge *br
 
 	ASSERT_RTNL();
 
+	/* delete auto-added default pvid local fdb before flushing vlans
+	 * otherwise it will be leaked on bridge device init failure
+	 */
+	br_fdb_delete_by_port(br, NULL, 0, 1);
+
 	vg = br_vlan_group(br);
 	__vlan_flush(vg);
 	RCU_INIT_POINTER(br->vlgrp, NULL);
[PATCH 4.19 36/45] net/mlx5: Fix modify_cq_in alignment
From: Edward Srouji

[ Upstream commit 7a32f2962c56d9d8a836b4469855caeee8766bd4 ]

Fix modify_cq_in alignment to match the device specification.
After this fix the 'cq_umem_valid' field will be in the right offset.

Cc: # 4.19
Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
Signed-off-by: Edward Srouji
Reviewed-by: Yishai Hadas
Signed-off-by: Leon Romanovsky
Signed-off-by: Saeed Mahameed
Signed-off-by: Greg Kroah-Hartman

---
 include/linux/mlx5/mlx5_ifc.h |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5623,7 +5623,12 @@ struct mlx5_ifc_modify_cq_in_bits {
 
 	struct mlx5_ifc_cqc_bits cq_context;
 
-	u8         reserved_at_280[0x600];
+	u8         reserved_at_280[0x60];
+
+	u8         cq_umem_valid[0x1];
+	u8         reserved_at_2e1[0x1f];
+
+	u8         reserved_at_300[0x580];
 
 	u8         pas[0][0x40];
 };
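A quick way to sanity-check the new layout: mlx5_ifc fields are bit
arrays whose reserved_at_X names encode the starting bit offset X in
hex, so the split must land cq_umem_valid at bit 0x2e0 without changing
the struct's total footprint.  A small standalone check (a sketch, not
kernel code; values copied from the hunk above):

#include <assert.h>

/* Verify the reworked modify_cq_in tail keeps the same bit footprint.
 * Offsets/widths are taken from the diff; reserved_at_X encodes the
 * starting bit offset X in hex. */
int main(void)
{
	unsigned end_old = 0x280 + 0x600;   /* old single reserved span */
	unsigned end_new = 0x280 + 0x60     /* reserved_at_280 */
			 + 0x1              /* cq_umem_valid   */
			 + 0x1f             /* reserved_at_2e1 */
			 + 0x580;           /* reserved_at_300 */

	assert(0x280 + 0x60 == 0x2e0);  /* cq_umem_valid starts at 0x2e0 */
	assert(end_old == end_new && end_new == 0x880);
	return 0;
}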
[PATCH 4.19 38/45] r8169: dont use MSI before RTL8168d
From: Heiner Kallweit

[ Upstream commit 003bd5b4a7b4a94b501e3a1e2e7c9df6b2a94ed4 ]

It was reported that after resuming from suspend network fails with
error "do_IRQ: 3.38 No irq handler for vector", see [0].
Enabling WoL can work around the issue, but the only actual fix is to
disable MSI.  So let's mimic the behavior of the vendor driver and
disable MSI on all chip versions before RTL8168d.

[0] https://bugzilla.kernel.org/show_bug.cgi?id=204079

Fixes: 6c6aa15fdea5 ("r8169: improve interrupt handling")
Reported-by: Dušan Dragić
Tested-by: Dušan Dragić
Signed-off-by: Heiner Kallweit
Signed-off-by: Greg Kroah-Hartman

---
 drivers/net/ethernet/realtek/r8169.c |    9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7239,13 +7239,18 @@ static int rtl_alloc_irq(struct rtl8169_
 {
 	unsigned int flags;
 
-	if (tp->mac_version <= RTL_GIGA_MAC_VER_06) {
+	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_06:
 		RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
 		RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable);
 		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
+		/* fall through */
+	case RTL_GIGA_MAC_VER_07 ... RTL_GIGA_MAC_VER_24:
 		flags = PCI_IRQ_LEGACY;
-	} else {
+		break;
+	default:
 		flags = PCI_IRQ_ALL_TYPES;
+		break;
 	}
 
 	return pci_alloc_irq_vectors(tp->pci_dev, 1, 1, flags);
[PATCH 4.19 27/45] net: qualcomm: rmnet: Fix incorrect UL checksum offload logic
From: Subash Abhinov Kasiviswanathan

[ Upstream commit a7cf3d24ee6081930feb4c830a7f6f16ebe31c49 ]

The udp_ip4_ind bit is set only for IPv4 UDP non-fragmented packets
so that the hardware can flip the checksum to 0xFFFF if the computed
checksum is 0 per RFC768.

However, this bit had to be set for IPv6 UDP non-fragmented packets
as well per hardware requirements.  Otherwise, IPv6 UDP packets with
computed checksum as 0 were transmitted by hardware and were dropped
in the network.

In addition to setting this bit for IPv6 UDP, the field is also
appropriately renamed to udp_ind as part of this change.

Fixes: 5eb5f8608ef1 ("net: qualcomm: rmnet: Add support for TX checksum offload")
Cc: Sean Tranchetti
Signed-off-by: Subash Abhinov Kasiviswanathan
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h      |    2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c |   13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -59,7 +59,7 @@ struct rmnet_map_dl_csum_trailer {
 struct rmnet_map_ul_csum_header {
 	__be16 csum_start_offset;
 	u16 csum_insert_offset:14;
-	u16 udp_ip4_ind:1;
+	u16 udp_ind:1;
 	u16 csum_enabled:1;
 } __aligned(1);
 
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -215,9 +215,9 @@ rmnet_map_ipv4_ul_csum_header(void *iphd
 	ul_header->csum_insert_offset = skb->csum_offset;
 	ul_header->csum_enabled = 1;
 	if (ip4h->protocol == IPPROTO_UDP)
-		ul_header->udp_ip4_ind = 1;
+		ul_header->udp_ind = 1;
 	else
-		ul_header->udp_ip4_ind = 0;
+		ul_header->udp_ind = 0;
 
 	/* Changing remaining fields to network order */
 	hdr++;
@@ -248,6 +248,7 @@ rmnet_map_ipv6_ul_csum_header(void *ip6h
 			      struct rmnet_map_ul_csum_header *ul_header,
 			      struct sk_buff *skb)
 {
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)ip6hdr;
 	__be16 *hdr = (__be16 *)ul_header, offset;
 
 	offset = htons((__force u16)(skb_transport_header(skb) -
@@ -255,7 +256,11 @@ rmnet_map_ipv6_ul_csum_header(void *ip6h
 	ul_header->csum_start_offset = offset;
 	ul_header->csum_insert_offset = skb->csum_offset;
 	ul_header->csum_enabled = 1;
-	ul_header->udp_ip4_ind = 0;
+
+	if (ip6h->nexthdr == IPPROTO_UDP)
+		ul_header->udp_ind = 1;
+	else
+		ul_header->udp_ind = 0;
 
 	/* Changing remaining fields to network order */
 	hdr++;
@@ -428,7 +433,7 @@ sw_csum:
 	ul_header->csum_start_offset = 0;
 	ul_header->csum_insert_offset = 0;
 	ul_header->csum_enabled = 0;
-	ul_header->udp_ip4_ind = 0;
+	ul_header->udp_ind = 0;
 
 	priv->stats.csum_sw++;
 }
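The RFC 768 rule the hardware implements here is worth spelling out: in
UDP, a checksum of zero on the wire means "no checksum was computed",
so a genuinely computed result of zero must be transmitted as its
one's-complement equivalent 0xFFFF.  A minimal sketch of that final
fix-up step (plain C, not the driver code):

#include <stdint.h>

/* Final UDP checksum fix-up per RFC 768: 0x0000 on the wire means
 * "checksum unused", so a real result of zero is sent as 0xFFFF
 * (the same value in one's-complement arithmetic). */
static uint16_t udp_csum_finish(uint16_t sum)
{
	return sum == 0 ? 0xFFFF : sum;
}

The udp_ind bit tells the hardware which packets need this flip, which
is why leaving it clear for IPv6 UDP produced on-wire zero checksums
that peers then dropped.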
[PATCH 4.19 31/45] net/smc: do not schedule tx_work in SMC_CLOSED state
From: Ursula Braun

[ Upstream commit f9cedf1a9b1cdcfb0c52edb391d01771e43994a4 ]

The setsockopt options TCP_NODELAY and TCP_CORK may schedule the tx
worker.  Make sure the socket is not yet moved into SMC_CLOSED state
(for instance by a shutdown SHUT_RDWR call).

Reported-by: syzbot+92209502e7aab127c...@syzkaller.appspotmail.com
Reported-by: syzbot+b972214bb803a343f...@syzkaller.appspotmail.com
Fixes: 01d2f7e2cdd31 ("net/smc: sockopts TCP_NODELAY and TCP_CORK")
Signed-off-by: Ursula Braun
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/smc/af_smc.c |    8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1680,14 +1680,18 @@ static int smc_setsockopt(struct socket
 		}
 		break;
 	case TCP_NODELAY:
-		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+		if (sk->sk_state != SMC_INIT &&
+		    sk->sk_state != SMC_LISTEN &&
+		    sk->sk_state != SMC_CLOSED) {
 			if (val && !smc->use_fallback)
 				mod_delayed_work(system_wq, &smc->conn.tx_work,
 						 0);
 		}
 		break;
 	case TCP_CORK:
-		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+		if (sk->sk_state != SMC_INIT &&
+		    sk->sk_state != SMC_LISTEN &&
+		    sk->sk_state != SMC_CLOSED) {
 			if (!val && !smc->use_fallback)
 				mod_delayed_work(system_wq, &smc->conn.tx_work,
 						 0);
[PATCH 4.19 26/45] net: phylink: Fix flow control for fixed-link
From: "Ren� van Dorst" [ Upstream commit 8aace4f3eba2a3ceb431e18683ea0e1ecbade5cd ] In phylink_parse_fixedlink() the pl->link_config.advertising bits are AND with pl->supported, pl->supported is zeroed and only the speed/duplex modes and MII bits are set. So pl->link_config.advertising always loses the flow control/pause bits. By setting Pause and Asym_Pause bits in pl->supported, the flow control work again when devicetree "pause" is set in fixes-link node and the MAC advertise that is supports pause. Results with this patch. Legend: - DT = 'Pause' is set in the fixed-link in devicetree. - validate() = ‘Yes’ means phylink_set(mask, Pause) is set in the validate(). - flow = results reported my link is Up line. +-++---+ | DT | validate() | flow | +-++---+ | Yes | Yes| rx/tx | | No | Yes| off | | Yes | No | off | +-++---+ Fixes: 9525ae83959b ("phylink: add phylink infrastructure") Signed-off-by: René van Dorst Acked-by: Russell King Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phylink.c |2 ++ 1 file changed, 2 insertions(+) --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -226,6 +226,8 @@ static int phylink_parse_fixedlink(struc __ETHTOOL_LINK_MODE_MASK_NBITS, true); linkmode_zero(pl->supported); phylink_set(pl->supported, MII); + phylink_set(pl->supported, Pause); + phylink_set(pl->supported, Asym_Pause); if (s) { __set_bit(s->bit, pl->supported); } else {
[PATCH 4.19 00/45] 4.19.66-stable review
This is the start of the stable review cycle for the 4.19.66 release.
There are 45 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sat 10 Aug 2019 07:03:19 PM UTC.
Anything received after that time might be too late.

The whole patch series can be found in one patch at:
	https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.66-rc1.gz
or in the git tree and branch at:
	git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y
and the diffstat can be found below.

thanks,

greg k-h

-------------
Pseudo-Shortlog of commits:

Greg Kroah-Hartman
    Linux 4.19.66-rc1

Lukas Wunner
    spi: bcm2835: Fix 3-wire mode if DMA is enabled

Tejun Heo
    cgroup: Fix css_task_iter_advance_css_set() cset skip condition

Tejun Heo
    cgroup: css_task_iter_skip()'d iterators must be advanced before accessed

Tejun Heo
    cgroup: Include dying leaders with live threads in PROCS iterations

Tejun Heo
    cgroup: Implement css_task_iter_skip()

Tejun Heo
    cgroup: Call cgroup_release() before __exit_signal()

Arnd Bergmann
    compat_ioctl: pppoe: fix PPPOEIOCSFWD handling

Heiner Kallweit
    r8169: don't use MSI before RTL8168d

Ariel Levkovich
    net/mlx5e: Prevent encap flow counter update async to user query

Edward Srouji
    net/mlx5: Fix modify_cq_in alignment

Alexis Bauvin
    tun: mark small packets as owned by the tap sock

Taras Kondratiuk
    tipc: compat: allow tipc commands without arguments

Claudiu Manoil
    ocelot: Cancel delayed work before wq destruction

Johan Hovold
    NFC: nfcmrvl: fix gpio-handling regression

Ursula Braun
    net/smc: do not schedule tx_work in SMC_CLOSED state

Dmytro Linkin
    net: sched: use temporary variable for actions indexes

Roman Mashak
    net sched: update vlan action for batched events operations

Jia-Ju Bai
    net: sched: Fix a possible null-pointer dereference in dequeue_func()

Subash Abhinov Kasiviswanathan
    net: qualcomm: rmnet: Fix incorrect UL checksum offload logic

René van Dorst
    net: phylink: Fix flow control for fixed-link

Mark Zhang
    net/mlx5: Use reversed order when unregister devices

Qian Cai
    net/mlx5e: always initialize frag->last_in_page

Jiri Pirko
    net: fix ifindex collision during namespace removal

Nikolay Aleksandrov
    net: bridge: mcast: don't delete permanent entries when fast leave is enabled

Nikolay Aleksandrov
    net: bridge: delete local fdb on device init failure

Matteo Croce
    mvpp2: refactor MTU change code

Matteo Croce
    mvpp2: fix panic on module removal

Jiri Pirko
    mlxsw: spectrum: Fix error path in mlxsw_sp_module_init()

Haishuang Yan
    ipip: validate header length in ipip_tunnel_xmit

Haishuang Yan
    ip6_tunnel: fix possible use-after-free on xmit

Haishuang Yan
    ip6_gre: reload ipv6h in prepare_ip6gre_xmit_ipv6

Cong Wang
    ife: error out when nla attributes are empty

Sudarsana Reddy Kalluru
    bnx2x: Disable multi-cos feature.

Gustavo A. R. Silva
    atm: iphase: Fix Spectre v1 vulnerability

Greg Kroah-Hartman
    IB: directly cast the sockaddr union to aockaddr

Sebastian Parschauer
    HID: Add quirk for HP X1200 PIXART OEM mouse

Aaron Armstrong Skomra
    HID: wacom: fix bit shift for Cintiq Companion 2

Dan Williams
    libnvdimm/bus: Fix wait_nvdimm_bus_probe_idle() ABBA deadlock

Dan Williams
    libnvdimm/bus: Prepare the nd_ioctl() path to be re-entrant

Dan Williams
    libnvdimm/region: Register badblocks before namespaces

Dan Williams
    libnvdimm/bus: Prevent duplicate device_unregister() calls

Dan Williams
    drivers/base: Introduce kill_device()

Alexander Duyck
    driver core: Establish order of operations for device_add and device_del via bitflag

Linus Torvalds
    gcc-9: don't warn about uninitialized variable

Hannes Reinecke
    scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure

-------------
Diffstat:

 Makefile                                           |  4 +-
 drivers/atm/iphase.c                               |  8 +-
 drivers/base/base.h                                |  4 +
 drivers/base/core.c                                | 22 +
 drivers/base/dd.c                                  | 22 ++---
 drivers/hid/hid-ids.h                              |  1 +
 drivers/hid/hid-quirks.c                           |  1 +
 drivers/hid/wacom_wac.c                            | 12 +--
 drivers/i2c/i2c-core-base.c                        |  2 +-
 drivers/infiniband/core/sa_query.c                 |  9 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c    |  3 +-
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c    | 46 +++--
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  5 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  4 +-
 .../n
[PATCH 4.19 08/45] libnvdimm/bus: Fix wait_nvdimm_bus_probe_idle() ABBA deadlock
commit ca6bf264f6d856f959c4239cda1047b587745c67 upstream.

A multithreaded namespace creation/destruction stress test currently
deadlocks with the following lockup signature:

    INFO: task ndctl:2924 blocked for more than 122 seconds.
          Tainted: G OE 5.2.0-rc4+ #3382
    "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
    ndctl D0 2924 1176 0x
    Call Trace:
     ? __schedule+0x27e/0x780
     schedule+0x30/0xb0
     wait_nvdimm_bus_probe_idle+0x8a/0xd0 [libnvdimm]
     ? finish_wait+0x80/0x80
     uuid_store+0xe6/0x2e0 [libnvdimm]
     kernfs_fop_write+0xf0/0x1a0
     vfs_write+0xb7/0x1b0
     ksys_write+0x5c/0xd0
     do_syscall_64+0x60/0x240

    INFO: task ndctl:2923 blocked for more than 122 seconds.
          Tainted: G OE 5.2.0-rc4+ #3382
    "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
    ndctl D0 2923 1175 0x
    Call Trace:
     ? __schedule+0x27e/0x780
     ? __mutex_lock+0x489/0x910
     schedule+0x30/0xb0
     schedule_preempt_disabled+0x11/0x20
     __mutex_lock+0x48e/0x910
     ? nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
     ? __lock_acquire+0x23f/0x1710
     ? nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
     nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
     __dax_pmem_probe+0x5e/0x210 [dax_pmem_core]
     ? nvdimm_bus_probe+0x1d0/0x2c0 [libnvdimm]
     dax_pmem_probe+0xc/0x20 [dax_pmem]
     nvdimm_bus_probe+0x90/0x2c0 [libnvdimm]
     really_probe+0xef/0x390
     driver_probe_device+0xb4/0x100

In this sequence an 'nd_dax' device is being probed and trying to take
the lock on its backing namespace to validate that the 'nd_dax' device
indeed has exclusive access to the backing namespace.  Meanwhile,
another thread is trying to update the uuid property of that same
backing namespace.  So one thread is in the probe path trying to
acquire the lock, and the other thread has acquired the lock and tries
to flush the probe path.

Fix this deadlock by not holding the namespace device_lock over the
wait_nvdimm_bus_probe_idle() synchronization step.  In turn this
requires the device_lock to be held on entry to
wait_nvdimm_bus_probe_idle() and subsequently dropped internally to
wait_nvdimm_bus_probe_idle().

Cc:
Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation")
Cc: Vishal Verma
Tested-by: Jane Chu
Link: https://lore.kernel.org/r/156341210094.292348.2384694131126767789.st...@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams
Signed-off-by: Sasha Levin

---
 drivers/nvdimm/bus.c         | 14 +++++++++-----
 drivers/nvdimm/region_devs.c |  4 ++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 5abcdb4faa644..2ba22cd1331b0 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -865,10 +865,12 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
 	do {
 		if (nvdimm_bus->probe_active == 0)
 			break;
-		nvdimm_bus_unlock(&nvdimm_bus->dev);
+		nvdimm_bus_unlock(dev);
+		device_unlock(dev);
 		wait_event(nvdimm_bus->wait,
 				nvdimm_bus->probe_active == 0);
-		nvdimm_bus_lock(&nvdimm_bus->dev);
+		device_lock(dev);
+		nvdimm_bus_lock(dev);
 	} while (true);
 }
 
@@ -994,7 +996,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	case ND_CMD_ARS_START:
 	case ND_CMD_CLEAR_ERROR:
 	case ND_CMD_CALL:
-		dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
+		dev_dbg(dev, "'%s' command while read-only.\n",
 				nvdimm ? nvdimm_cmd_name(cmd)
 				: nvdimm_bus_cmd_name(cmd));
 		return -EPERM;
@@ -1083,7 +1085,8 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		goto out;
 	}
 
-	nvdimm_bus_lock(&nvdimm_bus->dev);
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
 	rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf);
 	if (rc)
 		goto out_unlock;
@@ -1103,7 +1106,8 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		rc = -EFAULT;
 
 out_unlock:
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
 out:
 	kfree(in_env);
 	kfree(out_env);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index e7377f1028ef6..0303296e6d5b6 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -425,10 +425,12 @@ static ssize_t available_size_show(struct device *dev,
 	 * memory nvdimm_bus_lock() is droppe
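The deadlock has the classic ABBA shape: one thread holds the device
lock and sleeps waiting for probe activity to drain, while the probing
thread needs that same device lock to finish.  A schematic of the fix,
using POSIX primitives rather than the libnvdimm code (all names
invented for illustration):

#include <pthread.h>

struct ctx {
	pthread_mutex_t dev_lock;   /* outer lock, taken first  */
	pthread_mutex_t bus_lock;   /* inner lock, taken second */
	pthread_cond_t  idle;       /* signalled when probe_active drops to 0 */
	int probe_active;
};

/* Entered with both locks held.  Sleeping while still holding dev_lock
 * would deadlock against a prober that needs dev_lock to complete, so
 * both locks are released around the wait and retaken in the original
 * order afterwards. */
static void wait_probe_idle(struct ctx *c)
{
	while (c->probe_active != 0) {
		pthread_mutex_unlock(&c->bus_lock);      /* inner lock first */
		/* cond_wait atomically drops dev_lock while sleeping and
		 * reacquires it on wakeup; the thread holds neither lock
		 * while blocked, so the prober can make progress */
		pthread_cond_wait(&c->idle, &c->dev_lock);
		pthread_mutex_lock(&c->bus_lock);        /* retake inner lock */
	}
}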
[PATCH 4.19 07/45] libnvdimm/bus: Prepare the nd_ioctl() path to be re-entrant
commit 6de5d06e657acdbcf9637dac37916a4a5309e0f4 upstream.

In preparation for not holding a lock over the execution of nd_ioctl(),
update the implementation to allow multiple threads to be attempting
ioctls at the same time.  The bus lock still prevents multiple
in-flight ->ndctl() invocations from corrupting each other's state, but
static global staging buffers are moved to the heap.

Reported-by: Vishal Verma
Reviewed-by: Vishal Verma
Tested-by: Vishal Verma
Link: https://lore.kernel.org/r/156341208947.292348.10560140326807607481.st...@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams
Signed-off-by: Sasha Levin

---
 drivers/nvdimm/bus.c | 59 +++++++++++++++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 11cfd23e5aff7..5abcdb4faa644 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -951,20 +951,19 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		int read_only, unsigned int ioctl_cmd, unsigned long arg)
 {
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
-	static char out_env[ND_CMD_MAX_ENVELOPE];
-	static char in_env[ND_CMD_MAX_ENVELOPE];
 	const struct nd_cmd_desc *desc = NULL;
 	unsigned int cmd = _IOC_NR(ioctl_cmd);
 	struct device *dev = &nvdimm_bus->dev;
 	void __user *p = (void __user *) arg;
+	char *out_env = NULL, *in_env = NULL;
 	const char *cmd_name, *dimm_name;
 	u32 in_len = 0, out_len = 0;
 	unsigned int func = cmd;
 	unsigned long cmd_mask;
 	struct nd_cmd_pkg pkg;
 	int rc, i, cmd_rc;
+	void *buf = NULL;
 	u64 buf_len = 0;
-	void *buf;
 
 	if (nvdimm) {
 		desc = nd_cmd_dimm_desc(cmd);
@@ -1004,6 +1003,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	}
 
 	/* process an input envelope */
+	in_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL);
+	if (!in_env)
+		return -ENOMEM;
 	for (i = 0; i < desc->in_num; i++) {
 		u32 in_size, copy;
 
@@ -1011,14 +1013,17 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		if (in_size == UINT_MAX) {
 			dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n",
 					__func__, dimm_name, cmd_name, i);
-			return -ENXIO;
+			rc = -ENXIO;
+			goto out;
 		}
-		if (in_len < sizeof(in_env))
-			copy = min_t(u32, sizeof(in_env) - in_len, in_size);
+		if (in_len < ND_CMD_MAX_ENVELOPE)
+			copy = min_t(u32, ND_CMD_MAX_ENVELOPE - in_len, in_size);
 		else
 			copy = 0;
-		if (copy && copy_from_user(&in_env[in_len], p + in_len, copy))
-			return -EFAULT;
+		if (copy && copy_from_user(&in_env[in_len], p + in_len, copy)) {
+			rc = -EFAULT;
+			goto out;
+		}
 		in_len += in_size;
 	}
 
@@ -1030,6 +1035,12 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	}
 
 	/* process an output envelope */
+	out_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL);
+	if (!out_env) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
 	for (i = 0; i < desc->out_num; i++) {
 		u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i,
 				(u32 *) in_env, (u32 *) out_env, 0);
@@ -1038,15 +1049,18 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		if (out_size == UINT_MAX) {
 			dev_dbg(dev, "%s unknown output size cmd: %s field: %d\n",
 					dimm_name, cmd_name, i);
-			return -EFAULT;
+			rc = -EFAULT;
+			goto out;
 		}
-		if (out_len < sizeof(out_env))
-			copy = min_t(u32, sizeof(out_env) - out_len, out_size);
+		if (out_len < ND_CMD_MAX_ENVELOPE)
+			copy = min_t(u32, ND_CMD_MAX_ENVELOPE - out_len, out_size);
 		else
 			copy = 0;
-		if (copy && copy_from_user(&out_env[out_len],
-					p + in_len + out_len, copy))
-			return -EFAULT;
+		if (copy && copy_from_user(&out_env[out_len],
+					p + in_len + out_len, copy)) {
+			rc = -EFAULT;
+			goto out;
+		}
 		out_len += out_size;
 	}
 
@@ -1054,12 +1068,15 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	if (buf
[PATCH 4.19 15/45] ip6_gre: reload ipv6h in prepare_ip6gre_xmit_ipv6
From: Haishuang Yan

[ Upstream commit 3bc817d665ac6d9de89f59df522ad86f5b5dfc03 ]

Since ip6_tnl_parse_tlv_enc_lim() can call pskb_may_pull(), which may
change skb->data, we need to re-load ipv6h at the right place.

Fixes: 898b29798e36 ("ip6_gre: Refactor ip6gre xmit codes")
Cc: William Tu
Signed-off-by: Haishuang Yan
Acked-by: William Tu
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/ipv6/ip6_gre.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -680,12 +680,13 @@ static int prepare_ip6gre_xmit_ipv6(stru
 			       struct flowi6 *fl6, __u8 *dsfield,
 			       int *encap_limit)
 {
-	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct ipv6hdr *ipv6h;
 	struct ip6_tnl *t = netdev_priv(dev);
 	__u16 offset;
 
 	offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
 	/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+	ipv6h = ipv6_hdr(skb);
 
 	if (offset > 0) {
 		struct ipv6_tlv_tnl_enc_lim *tel;
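The rule behind this class of bug generalizes: pskb_may_pull() may
reallocate skb->head, so every pointer previously derived from
skb->data is potentially stale once it returns.  A hedged sketch of the
safe pattern (illustrative, not the tunnel code):

#include <linux/skbuff.h>
#include <linux/ipv6.h>

/* Derive packet-header pointers only *after* the last call that may
 * pull (and therefore reallocate) the skb data area. */
static int handle_packet_sketch(struct sk_buff *skb)
{
	struct ipv6hdr *ip6h;

	if (!pskb_may_pull(skb, sizeof(*ip6h)))  /* may move skb->data */
		return -EINVAL;

	ip6h = ipv6_hdr(skb);  /* safe: derived after the pull */
	return ip6h->nexthdr;
}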
[PATCH 4.19 05/45] libnvdimm/bus: Prevent duplicate device_unregister() calls
commit 8aac0e2338916e273ccbd438a2b7a1e8c61749f5 upstream.

A multithreaded namespace creation/destruction stress test currently
fails with signatures like the following:

    sysfs group 'power' not found for kobject 'dax1.1'
    RIP: 0010:sysfs_remove_group+0x76/0x80
    Call Trace:
     device_del+0x73/0x370
     device_unregister+0x16/0x50
     nd_async_device_unregister+0x1e/0x30 [libnvdimm]
     async_run_entry_fn+0x39/0x160
     process_one_work+0x23c/0x5e0
     worker_thread+0x3c/0x390

    BUG: kernel NULL pointer dereference, address: 0020
    RIP: 0010:klist_put+0x1b/0x6c
    Call Trace:
     klist_del+0xe/0x10
     device_del+0x8a/0x2c9
     ? __switch_to_asm+0x34/0x70
     ? __switch_to_asm+0x40/0x70
     device_unregister+0x44/0x4f
     nd_async_device_unregister+0x22/0x2d [libnvdimm]
     async_run_entry_fn+0x47/0x15a
     process_one_work+0x1a2/0x2eb
     worker_thread+0x1b8/0x26e

Use the kill_device() helper to atomically resolve the race of
multiple threads issuing kill, device_unregister(), requests.

Reported-by: Jane Chu
Reported-by: Erwin Tsaur
Fixes: 4d88a97aa9e8 ("libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver...")
Cc:
Link: https://github.com/pmem/ndctl/issues/96
Tested-by:
Tested-by: Jane Chu
Link: https://lore.kernel.org/r/156341207846.292348.10435719262819764054.st...@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams
Signed-off-by: Sasha Levin

---
 drivers/nvdimm/bus.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index ee39e2c1644ae..11cfd23e5aff7 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -528,13 +528,38 @@ EXPORT_SYMBOL(nd_device_register);
 
 void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
 {
+	bool killed;
+
 	switch (mode) {
 	case ND_ASYNC:
+		/*
+		 * In the async case this is being triggered with the
+		 * device lock held and the unregistration work needs to
+		 * be moved out of line iff this thread has won the
+		 * race to schedule the deletion.
+		 */
+		if (!kill_device(dev))
+			return;
+
 		get_device(dev);
 		async_schedule_domain(nd_async_device_unregister, dev,
 				&nd_async_domain);
 		break;
 	case ND_SYNC:
+		/*
+		 * In the sync case the device is being unregistered due
+		 * to a state change of the parent. Claim the kill state
+		 * to synchronize against other unregistration requests,
+		 * or otherwise let the async path handle it if the
+		 * unregistration was already queued.
+		 */
+		device_lock(dev);
+		killed = kill_device(dev);
+		device_unlock(dev);
+
+		if (!killed)
+			return;
+
 		nd_synchronize();
 		device_unregister(dev);
 		break;
-- 
2.20.1
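For context, kill_device() comes from the "drivers/base: Introduce
kill_device()" patch earlier in this series; it is essentially a
test-and-set on the device's "dead" flag under the device lock, so
exactly one caller wins.  Roughly (a sketch from memory of the upstream
helper, not text quoted from this series):

/* Must be called with the device lock held; returns true only for the
 * single caller that claims the kill. */
bool kill_device(struct device *dev)
{
	device_lock_assert(dev);

	if (dev->p->dead)
		return false;   /* another thread already claimed the kill */
	dev->p->dead = true;    /* claim it; later callers see false */
	return true;
}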
[PATCH 5.2 47/56] net: phy: fix race in genphy_update_link
From: Heiner Kallweit

[ Upstream commit aa6b1956158f1afc52761137620d4b3f8a058d24 ]

In phy_start_aneg() autoneg is started, and immediately after that
link and autoneg status are read.  As reported in [0] it can happen
that at the time of this read the PHY has reset the "aneg complete"
bit but not yet the "link up" bit, which can result in a false
link-up detection.  To fix this don't report link as up if we're in
aneg mode and the PHY doesn't signal "aneg complete".

[0] https://marc.info/?t=15641350993&r=1&w=2

Fixes: 4950c2ba49cc ("net: phy: fix autoneg mismatch case in genphy_read_status")
Reported-by: liuyonglong
Tested-by: liuyonglong
Signed-off-by: Heiner Kallweit
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 drivers/net/phy/phy_device.c |    6 ++++++
 1 file changed, 6 insertions(+)

--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1730,6 +1730,12 @@ done:
 	phydev->link = status & BMSR_LSTATUS ? 1 : 0;
 	phydev->autoneg_complete = status & BMSR_ANEGCOMPLETE ? 1 : 0;
 
+	/* Consider the case that autoneg was started and "aneg complete"
+	 * bit has been reset, but "link up" bit not yet.
+	 */
+	if (phydev->autoneg == AUTONEG_ENABLE && !phydev->autoneg_complete)
+		phydev->link = 0;
+
 	return 0;
 }
 EXPORT_SYMBOL(genphy_update_link);
[PATCH 4.19 01/45] scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure
From: Hannes Reinecke

commit 023358b136d490ca91735ac6490db3741af5a8bd upstream.

Gcc-9 complains for a memset across pointer boundaries, which happens
as the code tries to allocate a flexible array on the stack.  Turns
out we cannot do this without relying on gcc-isms, so with this patch
we'll embed the fc_rport_priv structure into fcoe_rport, use the
normal 'container_of' cast, and only have to do a memset over one
structure.

Signed-off-by: Hannes Reinecke
Signed-off-by: Martin K. Petersen
Signed-off-by: Greg Kroah-Hartman

---
 drivers/scsi/fcoe/fcoe_ctlr.c |   51 +++++++++++++++++-------------------------
 drivers/scsi/libfc/fc_rport.c |    5 +++-
 include/scsi/libfcoe.h        |    1 +
 3 files changed, 25 insertions(+), 32 deletions(-)

--- a/drivers/scsi/fcoe/fcoe_ctlr.c
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
@@ -2017,7 +2017,7 @@ EXPORT_SYMBOL_GPL(fcoe_wwn_from_mac);
  */
 static inline struct fcoe_rport *fcoe_ctlr_rport(struct fc_rport_priv *rdata)
 {
-	return (struct fcoe_rport *)(rdata + 1);
+	return container_of(rdata, struct fcoe_rport, rdata);
 }
 
 /**
@@ -2281,7 +2281,7 @@ static void fcoe_ctlr_vn_start(struct fc
  */
 static int fcoe_ctlr_vn_parse(struct fcoe_ctlr *fip,
 			      struct sk_buff *skb,
-			      struct fc_rport_priv *rdata)
+			      struct fcoe_rport *frport)
 {
 	struct fip_header *fiph;
 	struct fip_desc *desc = NULL;
@@ -2289,16 +2289,12 @@ static int fcoe_ctlr_vn_parse(struct fco
 	struct fip_wwn_desc *wwn = NULL;
 	struct fip_vn_desc *vn = NULL;
 	struct fip_size_desc *size = NULL;
-	struct fcoe_rport *frport;
 	size_t rlen;
 	size_t dlen;
 	u32 desc_mask = 0;
 	u32 dtype;
 	u8 sub;
 
-	memset(rdata, 0, sizeof(*rdata) + sizeof(*frport));
-	frport = fcoe_ctlr_rport(rdata);
-
 	fiph = (struct fip_header *)skb->data;
 	frport->flags = ntohs(fiph->fip_flags);
 
@@ -2361,15 +2357,17 @@ static int fcoe_ctlr_vn_parse(struct fco
 			if (dlen != sizeof(struct fip_wwn_desc))
 				goto len_err;
 			wwn = (struct fip_wwn_desc *)desc;
-			rdata->ids.node_name = get_unaligned_be64(&wwn->fd_wwn);
+			frport->rdata.ids.node_name =
+				get_unaligned_be64(&wwn->fd_wwn);
 			break;
 		case FIP_DT_VN_ID:
 			if (dlen != sizeof(struct fip_vn_desc))
 				goto len_err;
 			vn = (struct fip_vn_desc *)desc;
 			memcpy(frport->vn_mac, vn->fd_mac, ETH_ALEN);
-			rdata->ids.port_id = ntoh24(vn->fd_fc_id);
-			rdata->ids.port_name = get_unaligned_be64(&vn->fd_wwpn);
+			frport->rdata.ids.port_id = ntoh24(vn->fd_fc_id);
+			frport->rdata.ids.port_name =
+				get_unaligned_be64(&vn->fd_wwpn);
 			break;
 		case FIP_DT_FC4F:
 			if (dlen != sizeof(struct fip_fc4_feat))
@@ -2750,10 +2748,7 @@ static int fcoe_ctlr_vn_recv(struct fcoe
 {
 	struct fip_header *fiph;
 	enum fip_vn2vn_subcode sub;
-	struct {
-		struct fc_rport_priv rdata;
-		struct fcoe_rport frport;
-	} buf;
+	struct fcoe_rport frport = { };
 	int rc, vlan_id = 0;
 
 	fiph = (struct fip_header *)skb->data;
@@ -2769,7 +2764,7 @@ static int fcoe_ctlr_vn_recv(struct fcoe
 		goto drop;
 	}
 
-	rc = fcoe_ctlr_vn_parse(fip, skb, &buf.rdata);
+	rc = fcoe_ctlr_vn_parse(fip, skb, &frport);
 	if (rc) {
 		LIBFCOE_FIP_DBG(fip, "vn_recv vn_parse error %d\n", rc);
 		goto drop;
@@ -2778,19 +2773,19 @@ static int fcoe_ctlr_vn_recv(struct fcoe
 	mutex_lock(&fip->ctlr_mutex);
 	switch (sub) {
 	case FIP_SC_VN_PROBE_REQ:
-		fcoe_ctlr_vn_probe_req(fip, &buf.rdata);
+		fcoe_ctlr_vn_probe_req(fip, &frport.rdata);
 		break;
 	case FIP_SC_VN_PROBE_REP:
-		fcoe_ctlr_vn_probe_reply(fip, &buf.rdata);
+		fcoe_ctlr_vn_probe_reply(fip, &frport.rdata);
 		break;
 	case FIP_SC_VN_CLAIM_NOTIFY:
-		fcoe_ctlr_vn_claim_notify(fip, &buf.rdata);
+		fcoe_ctlr_vn_claim_notify(fip, &frport.rdata);
 		break;
 	case FIP_SC_VN_CLAIM_REP:
-		fcoe_ctlr_vn_claim_resp(fip, &buf.rdata);
+		fcoe_ctlr_vn_claim_resp(fip, &frport.rdata);
 		break;
 	case FIP_SC_VN_BEACON:
-		fcoe_ctlr_vn_beacon(fip, &buf.rdata);
+		fcoe_ctlr_vn_beacon(fip, &frport.rdata);
 		break;
 	default:
 		LIBFCOE_FIP_DBG(fip, "vn_recv unknown subcode %d\n",
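The before/after in fcoe_ctlr_rport() illustrates a general pattern:
(rdata + 1) only finds the fcoe_rport when one happens to be allocated
immediately after a fc_rport_priv, whereas embedding the struct and
using container_of encodes the relationship in the type system.  A
hedged standalone illustration (hypothetical types):

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inner { int x; };
struct outer { long tag; struct inner in; };

/* Recover the enclosing object from a pointer to its member by
 * subtracting the member offset -- no assumptions about what happens
 * to live after `inner` in memory. */
static struct outer *outer_from_inner(struct inner *p)
{
	return container_of(p, struct outer, in);
}

int main(void)
{
	struct outer o = { .tag = 42 };
	return outer_from_inner(&o.in)->tag == 42 ? 0 : 1;
}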
[PATCH 4.19 03/45] driver core: Establish order of operations for device_add and device_del via bitflag
commit 3451a495ef244a88ed6317a035299d835554d579 upstream.

Add an additional bit flag to the device_private struct named "dead".

This additional flag provides a guarantee that when a device_del is
executed on a given interface an async worker will not attempt to
attach the driver following the earlier device_del call.  Previously
this guarantee was not present and could result in the device_del
call attempting to remove a driver from an interface only to have the
async worker attempt to probe the driver later when it finally
completes the asynchronous probe call.

One additional change added was that I pulled the check for
dev->driver out of the __device_attach_driver call and instead placed
it in the __device_attach_async_helper call.  This was motivated by
the fact that the only other caller of this, __device_attach, had
already taken the device_lock() and checked for dev->driver.  Instead
of testing for this twice in this path it makes more sense to just
consolidate the dev->dead and dev->driver checks together into one
set of checks.

Reviewed-by: Dan Williams
Reviewed-by: Rafael J. Wysocki
Signed-off-by: Alexander Duyck
Signed-off-by: Greg Kroah-Hartman
Signed-off-by: Sasha Levin

---
 drivers/base/base.h |  4 ++++
 drivers/base/core.c | 11 +++++++++++
 drivers/base/dd.c   | 22 +++++++++++-----------
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 7a419a7a6235b..559b047de9f75 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -66,6 +66,9 @@ struct driver_private {
  *	probed first.
  * @device - pointer back to the struct device that this structure is
  * associated with.
+ * @dead - This device is currently either in the process of or has been
+ *	removed from the system. Any asynchronous events scheduled for this
+ *	device should exit without taking any action.
  *
  * Nothing outside of the driver core should ever touch these fields.
  */
@@ -76,6 +79,7 @@ struct device_private {
 	struct klist_node knode_bus;
 	struct list_head deferred_probe;
 	struct device *device;
+	u8 dead:1;
 };
 #define to_device_private_parent(obj)	\
 	container_of(obj, struct device_private, knode_parent)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 92e2c32c22270..37a90d72f3736 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2050,6 +2050,17 @@ void device_del(struct device *dev)
 	struct kobject *glue_dir = NULL;
 	struct class_interface *class_intf;
 
+	/*
+	 * Hold the device lock and set the "dead" flag to guarantee that
+	 * the update behavior is consistent with the other bitfields near
+	 * it and that we cannot have an asynchronous probe routine trying
+	 * to run while we are tearing out the bus/class/sysfs from
+	 * underneath the device.
+	 */
+	device_lock(dev);
+	dev->p->dead = true;
+	device_unlock(dev);
+
 	/* Notify clients of device removal.  This call must come
 	 * before dpm_sysfs_remove().
 	 */
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index d48b310c47603..11d24a552ee49 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -725,15 +725,6 @@ static int __device_attach_driver(struct device_driver *drv, void *_data)
 	bool async_allowed;
 	int ret;
 
-	/*
-	 * Check if device has already been claimed. This may
-	 * happen with driver loading, device discovery/registration,
-	 * and deferred probe processing happens all at once with
-	 * multiple threads.
-	 */
-	if (dev->driver)
-		return -EBUSY;
-
 	ret = driver_match_device(drv, dev);
 	if (ret == 0) {
 		/* no match */
@@ -768,6 +759,15 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
 
 	device_lock(dev);
 
+	/*
+	 * Check if device has already been removed or claimed. This may
+	 * happen with driver loading, device discovery/registration,
+	 * and deferred probe processing happens all at once with
+	 * multiple threads.
+	 */
+	if (dev->p->dead || dev->driver)
+		goto out_unlock;
+
 	if (dev->parent)
 		pm_runtime_get_sync(dev->parent);
 
@@ -778,7 +778,7 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
 
 	if (dev->parent)
 		pm_runtime_put(dev->parent);
-
+out_unlock:
 	device_unlock(dev);
 
 	put_device(dev);
@@ -891,7 +891,7 @@ static int __driver_attach(struct device *dev, void *data)
 	if (dev->parent && dev->bus->need_parent_lock)
 		device_lock(dev->parent);
 	device_lock(dev);
-	if (!dev->driver)
+	if (!dev->p->dead && !dev->driver)
 		driver_probe_device(drv, dev);
 	device_unlock(dev);
 	if (dev->parent && dev->bus->need_parent_lock)
-- 
2.20.1
[PATCH 5.2 46/56] hv_sock: Fix hang when a connection is closed
From: Dexuan Cui

[ Upstream commit 8c7885e5690be9a27231ebebf82ef29fbf46c4e4 ]

There is a race condition for an established connection that is being
closed by the guest: the refcnt is 4 at the end of hvs_release()
(Note: here the 'remove_sock' is false):

1 for the initial value;
1 for the sk being in the bound list;
1 for the sk being in the connected list;
1 for the delayed close_work.

After hvs_release() finishes, __vsock_release() -> sock_put(sk) *may*
decrease the refcnt to 3.

Concurrently, hvs_close_connection() runs in another thread:
  calls vsock_remove_sock() to decrease the refcnt by 2;
  calls sock_put() to decrease the refcnt to 0, and free the sk;
  next, the "release_sock(sk)" may hang due to use-after-free.

In the above, after hvs_release() finishes, if hvs_close_connection()
runs faster than "__vsock_release() -> sock_put(sk)", then there is
not any issue, because at the beginning of hvs_close_connection(), the
refcnt is still 4.

The issue can be resolved if an extra reference is taken when the
connection is established.

Fixes: a9eeb998c28d ("hv_sock: Add support for delayed close")
Signed-off-by: Dexuan Cui
Reviewed-by: Sunil Muthuswamy
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/vmw_vsock/hyperv_transport.c |    8 ++++++++
 1 file changed, 8 insertions(+)

--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -311,6 +311,11 @@ static void hvs_close_connection(struct
 	lock_sock(sk);
 	hvs_do_close_lock_held(vsock_sk(sk), true);
 	release_sock(sk);
+
+	/* Release the refcnt for the channel that's opened in
+	 * hvs_open_connection().
+	 */
+	sock_put(sk);
 }
 
 static void hvs_open_connection(struct vmbus_channel *chan)
@@ -378,6 +383,9 @@ static void hvs_open_connection(struct v
 	}
 
 	set_per_channel_state(chan, conn_from_host ? new : sk);
+
+	/* This reference will be dropped by hvs_close_connection(). */
+	sock_hold(conn_from_host ? new : sk);
 	vmbus_set_chn_rescind_callback(chan, hvs_close_connection);
 
 	/* Set the pending send size to max packet size to always get
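The fix applies the standard rule that an asynchronous callback which
dereferences a socket must own a reference taken when the callback was
armed, dropped only after the callback has run.  A schematic of that
pairing (all names invented; not the hv_sock source):

#include <net/sock.h>

struct channel {                        /* hypothetical */
	void (*rescind_cb)(struct sock *sk);
	struct sock *cb_arg;
};

static void on_channel_rescind(struct sock *sk)
{
	/* ... tear down connection state ... */
	sock_put(sk);      /* drop the reference taken at registration */
}

static void arm_rescind_cb(struct channel *chan, struct sock *sk)
{
	sock_hold(sk);     /* the pending callback now owns a reference */
	chan->rescind_cb = on_channel_rescind;
	chan->cb_arg = sk;
}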
[PATCH 5.2 43/56] selftests/bpf: add wrapper scripts for test_xdp_vlan.sh
From: Jesper Dangaard Brouer

[ Upstream commit d35661fcf95d8818c1f9acc818a1bad23dda4e1c ]

In-order to test both native-XDP (xdpdrv) and generic-XDP (xdpgeneric)
create two wrapper test scripts, that start the test_xdp_vlan.sh
script with these modes.

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 tools/testing/selftests/bpf/Makefile                      |    3 ++-
 tools/testing/selftests/bpf/test_xdp_vlan.sh              |    5 ++++-
 tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh |    9 +++++++++
 tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh  |    9 +++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh
 create mode 100755 tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh

--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -51,7 +51,8 @@ TEST_PROGS := test_kmod.sh \
 	test_lirc_mode2.sh \
 	test_skb_cgroup_id.sh \
 	test_flow_dissector.sh \
-	test_xdp_vlan.sh \
+	test_xdp_vlan_mode_generic.sh \
+	test_xdp_vlan_mode_native.sh \
 	test_lwt_ip_encap.sh \
 	test_tcp_check_syncookie.sh \
 	test_tc_tunnel.sh \
--- a/tools/testing/selftests/bpf/test_xdp_vlan.sh
+++ b/tools/testing/selftests/bpf/test_xdp_vlan.sh
@@ -2,7 +2,10 @@
 # SPDX-License-Identifier: GPL-2.0
 # Author: Jesper Dangaard Brouer
 
-TESTNAME=xdp_vlan
+# Allow wrapper scripts to name test
+if [ -z "$TESTNAME" ]; then
+    TESTNAME=xdp_vlan
+fi
 
 # Default XDP mode
 XDP_MODE=xdpgeneric
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Exit on failure
+set -e
+
+# Wrapper script to test generic-XDP
+export TESTNAME=xdp_vlan_mode_generic
+./test_xdp_vlan.sh --mode=xdpgeneric
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Exit on failure
+set -e
+
+# Wrapper script to test native-XDP
+export TESTNAME=xdp_vlan_mode_native
+./test_xdp_vlan.sh --mode=xdpdrv
[PATCH 5.2 42/56] bpf: fix XDP vlan selftests test_xdp_vlan.sh
From: Jesper Dangaard Brouer

[ Upstream commit 4de9c89a4982431c4a02739743fd360dc5581f22 ]

Change BPF selftest test_xdp_vlan.sh to (default) use generic XDP.

This selftest was created together with a fix for generic XDP, in
commit 297249569932 ("net: fix generic XDP to handle if eth header was
mangled").  And was suppose to catch if generic XDP was broken again.

The tests are using veth and assumed that veth driver didn't support
native driver XDP, thus it used the (ip link set) 'xdp' attach that
fell back to generic-XDP.  But veth gained native-XDP support in
948d4f214fde ("veth: Add driver XDP"), which caused this test script
to use native-XDP.

Fixes: 948d4f214fde ("veth: Add driver XDP")
Fixes: 97396ff0bc2d ("selftests/bpf: add XDP selftests for modifying and popping VLAN headers")
Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 tools/testing/selftests/bpf/test_xdp_vlan.sh |   42 ++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

--- a/tools/testing/selftests/bpf/test_xdp_vlan.sh
+++ b/tools/testing/selftests/bpf/test_xdp_vlan.sh
@@ -1,7 +1,12 @@
 #!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Author: Jesper Dangaard Brouer
 
 TESTNAME=xdp_vlan
 
+# Default XDP mode
+XDP_MODE=xdpgeneric
+
 usage() {
   echo "Testing XDP + TC eBPF VLAN manipulations: $TESTNAME"
   echo ""
@@ -9,9 +14,23 @@ usage() {
   echo "  -v | --verbose : Verbose"
   echo "  --flush        : Flush before starting (e.g. after --interactive)"
   echo "  --interactive  : Keep netns setup running after test-run"
+  echo "  --mode=XXX     : Choose XDP mode (xdp | xdpgeneric | xdpdrv)"
   echo ""
 }
 
+valid_xdp_mode()
+{
+	local mode=$1
+
+	case "$mode" in
+		xdpgeneric | xdpdrv | xdp)
+			return 0
+			;;
+		*)
+			return 1
+	esac
+}
+
 cleanup()
 {
 	local status=$?
@@ -37,7 +56,7 @@ cleanup()
 
 # Using external program "getopt" to get --long-options
 OPTIONS=$(getopt -o hvfi: \
---long verbose,flush,help,interactive,debug -- "$@")
+--long verbose,flush,help,interactive,debug,mode: -- "$@")
 if (( $? != 0 )); then
     usage
     echo "selftests: $TESTNAME [FAILED] Error calling getopt, unknown option?"
@@ -60,6 +79,11 @@ while true; do
 		cleanup
 		shift
 		;;
+	    --mode )
+		shift
+		XDP_MODE=$1
+		shift
+		;;
 	    -- )
 		shift
 		break
@@ -81,8 +105,14 @@ if [ "$EUID" -ne 0 ]; then
 	exit 1
 fi
 
-ip link set dev lo xdp off 2>/dev/null > /dev/null
-if [ $? -ne 0 ];then
+valid_xdp_mode $XDP_MODE
+if [ $? -ne 0 ]; then
+	echo "selftests: $TESTNAME [FAILED] unknown XDP mode ($XDP_MODE)"
+	exit 1
+fi
+
+ip link set dev lo xdpgeneric off 2>/dev/null > /dev/null
+if [ $? -ne 0 ]; then
 	echo "selftests: $TESTNAME [SKIP] need ip xdp support"
 	exit 0
 fi
@@ -166,7 +196,7 @@ export FILE=test_xdp_vlan.o
 
 # First test: Remove VLAN by setting VLAN ID 0, using "xdp_vlan_change"
 export XDP_PROG=xdp_vlan_change
-ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
 
 # In ns1: egress use TC to add back VLAN tag 4011
 #  (del cmd)
@@ -187,8 +217,8 @@ ip netns exec ns1 ping -W 2 -c 3 $IPADDR
 # ETH_P_8021Q indication, and this cause overwriting of our changes.
 #
 export XDP_PROG=xdp_vlan_remove_outer2
-ip netns exec ns1 ip link set $DEVNS1 xdp off
-ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE off
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
 
 # Now the namespaces should still be able reach each-other, test with ping:
 ip netns exec ns2 ping -W 2 -c 3 $IPADDR1
[PATCH 5.2 45/56] net: fix bpf_xdp_adjust_head regression for generic-XDP
From: Jesper Dangaard Brouer [ Upstream commit 065af355470519bd184019a93ac579f22b036045 ] When generic-XDP was moved to a later processing step by commit 458bf2f224f0 ("net: core: support XDP generic on stacked devices.") a regression was introduced when using bpf_xdp_adjust_head. The issue is that after this commit the skb->network_header is now changed prior to calling generic XDP and not after. Thus, if the header is changed by XDP (via bpf_xdp_adjust_head), then skb->network_header also needs to be updated again. Fix by calling skb_reset_network_header(). Fixes: 458bf2f224f0 ("net: core: support XDP generic on stacked devices.") Reported-by: Brandon Cazander Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4382,12 +4382,17 @@ static u32 netif_receive_generic_xdp(str act = bpf_prog_run_xdp(xdp_prog, xdp); + /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; - if (off > 0) - __skb_pull(skb, off); - else if (off < 0) - __skb_push(skb, -off); - skb->mac_header += off; + if (off) { + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + skb->mac_header += off; + skb_reset_network_header(skb); + } /* check if bpf_xdp_adjust_tail was used. it can only "shrink" * pckt.
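For context, a minimal sketch of the kind of XDP program that exercises this path (illustrative, not taken from any of these patches): any program calling bpf_xdp_adjust_head() makes xdp->data differ from orig_data in netif_receive_generic_xdp(), which is exactly the case where skb->mac_header, and now skb->network_header, have to be recomputed.

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        SEC("xdp")
        int shrink_head(struct xdp_md *ctx)
        {
                /* Move the packet start 4 bytes forward, e.g. after an
                 * outer VLAN tag has been removed; a real program would
                 * first copy the Ethernet header over the tag. */
                if (bpf_xdp_adjust_head(ctx, 4))
                        return XDP_ABORTED;
                return XDP_PASS;
        }

        char _license[] SEC("license") = "GPL";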
[PATCH 5.2 51/56] net/mlx5: Add missing RDMA_RX capabilities
From: Maor Gottlieb [ Upstream commit 987f6c69dd923069d443f6a37225f5b1630a30f2 ] New flow table type RDMA_RX was added but the MLX5_CAP_FLOW_TABLE_TYPE didn't handle this new flow table type. This means that MLX5_CAP_FLOW_TABLE_TYPE returns an empty capability to this flow table type. Update both the macro and the maximum supported flow table type to RDMA_RX. Fixes: d83eb50e29de ("net/mlx5: Add support in RDMA RX steering") Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -68,7 +68,7 @@ enum fs_flow_table_type { FS_FT_SNIFFER_RX= 0X5, FS_FT_SNIFFER_TX= 0X6, FS_FT_RDMA_RX = 0X7, - FS_FT_MAX_TYPE = FS_FT_SNIFFER_TX, + FS_FT_MAX_TYPE = FS_FT_RDMA_RX, }; enum fs_flow_table_op_mod { @@ -274,7 +274,8 @@ void mlx5_cleanup_fs(struct mlx5_core_de (type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) : \ (type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) : \ (type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) : \ - (BUILD_BUG_ON_ZERO(FS_FT_SNIFFER_TX != FS_FT_MAX_TYPE))\ + (type == FS_FT_RDMA_RX) ? MLX5_CAP_FLOWTABLE_RDMA_RX(mdev, cap) : \ + (BUILD_BUG_ON_ZERO(FS_FT_RDMA_RX != FS_FT_MAX_TYPE))\ ) #endif
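Worth noting: the final arm of that macro is a compile-time guard. Because it evaluates BUILD_BUG_ON_ZERO(FS_FT_RDMA_RX != FS_FT_MAX_TYPE), the next person who adds a flow table type but forgets to extend the macro gets a build failure instead of silently empty capabilities again. A self-contained sketch of the pattern (illustrative names, not mlx5 code):

        #include <linux/build_bug.h>

        enum ft_type { FT_NIC_RX, FT_RDMA_RX, FT_MAX_TYPE = FT_RDMA_RX };

        /* One arm per supported type; the fallback arm is a compile-time
         * assertion, so the build breaks if FT_MAX_TYPE grows without a
         * matching arm being added here. */
        #define FT_CAP(type)                                    \
                ((type) == FT_NIC_RX  ? 1 :                     \
                 (type) == FT_RDMA_RX ? 2 :                     \
                 BUILD_BUG_ON_ZERO(FT_RDMA_RX != FT_MAX_TYPE))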
Re: [RFC PATCH v4 9/9] printk: use a new ringbuffer implementation
On Wed, Aug 7, 2019 at 3:27 PM John Ogness wrote: > > 2. For the CONFIG_PPC_POWERNV powerpc platform, kernel log buffer >registration is no longer available because there is no longer >a single contiguous block of memory to represent all of the >ringbuffer. So this is tangential, but I've actually been wishing for a special "raw dump" format that has absolutely *no* structure to it at all, and is as a result not necessarily strictly reliable, but is a lot more robust. The background for that is that we have a class of bugs that are really hard to debug "in the wild", because people don't have access to serial consoles or any kind of special hardware at all (ie forget things like nvram etc), and when the machine locks up you're happy to just have a reset button (but more likely you have to turn power off and on). End result: a DRAM buffer can work, but is not "reliable". Particularly if you turn power on and off, data retention of DRAM is iffy. But it's possible, at least in theory. So I have a patch that implements a "stupid ring buffer" for this case, with absolutely zero data structures (because in the presence of DRAM corruption, all you can get is "hopefully only slightly garbled ASCII"). It actually does work. It's a complete hack, but I have used this on real hardware to see dumps that happened after the machine could no longer send them to any device. I actually suspect that this kind of "stupid non-structured secondary log" can often be much more useful than the existing nvram special cases - yes the output can be garbled for multi-cpu cases because it not only is lockless, it's lockless without even any data structures - but it also works somewhat reliably when the machine is _really_ borked. Which is exactly when you want a log that isn't just the normal "working machine syslog". NOTE! This is *not* a replacement for a lockless printk. This is very much an _additional_ "low overhead buffer in RAM" for post-mortem analysis when anything fancier doesn't work. So I'm throwing this patch out there in case people have interest in looking at that very special case. Also note how right now the example code just steals a random physical memory area at roughly physical location 12GB - this is a hack and would need to be configurable obviously in real life, but it worked for the machines I tested (which both happened to have 16GB of RAM). Those parts are marked with "// HACK HACK HACK" and just a hardcoded physical address (0x320000000). Linus From 074ea67afcaba37996a615c41685cd72b088f583 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 30 May 2019 19:56:13 -0700 Subject: [PATCH] Trial "power off buffer" for printk data retention This circumvents ACPI and just forces a random physical address (which happens to be at 0x320000000) to contain a 64kB buffer that we take over.
Not-yet-signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 7 ++ include/linux/printk.h | 3 + init/main.c | 11 ++ kernel/printk/Makefile | 2 +- kernel/printk/poweroff_buffer.c | 179 kernel/printk/printk.c | 2 + 6 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 kernel/printk/poweroff_buffer.c diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 08a5f4a131f5..2a1d7d7f3f4f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1042,6 +1042,13 @@ void __init setup_arch(char **cmdline_p) early_gart_iommu_check(); #endif + // HACK HACK HACK + // Magic "this RAM survives boot" fake + e820__range_update(0x320000000, 65536, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__update_table(e820_table); + printk(KERN_INFO "fixed physical RAM map:\n"); + e820__print_table("fake boot-safe buffers"); + /* * partially used pages are not usable - thus * we are rounding upwards: diff --git a/include/linux/printk.h b/include/linux/printk.h index cefd374c47b1..905c47efb98c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,6 +170,9 @@ int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); +void poweroff_buffer_log(const char *buf, size_t len); +void poweroff_buffer_register(char *buf, size_t size); + /* * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ diff --git a/init/main.c b/init/main.c index 66a196c5e4c3..232778603490 100644 --- a/init/main.c +++ b/init/main.c @@ -1100,6 +1100,17 @@ static int __ref kernel_init(void *unused) system_state = SYSTEM_RUNNING; numa_default_policy(); + // + // HACK HACK HACK + // + { + void *base = ioremap_cache(0x320000000,65536); + if (base) + poweroff_buffer_register(base, 65536); + else + printk("ioremap failed\n"); + } + rcu_end_inkernel_boot(); if (ramdisk_execute_command) { diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4d052fc6bcde..7ca11d92f280 100644
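The poweroff_buffer.c body itself is elided above, but the "absolutely zero data structures" idea fits in a few lines. A minimal sketch of what the log side can look like (illustrative; only the two function signatures are taken from the printk.h hunk above, the rest is not the actual 179-line file):

        #include <linux/types.h>

        static char *pob_buf;           /* the ioremap'ed 64kB window */
        static size_t pob_size;
        static size_t pob_pos;          /* deliberately unlocked: concurrent
                                         * writers can only interleave
                                         * characters, never corrupt any
                                         * metadata, because there is none */

        void poweroff_buffer_register(char *buf, size_t size)
        {
                pob_size = size;
                pob_buf = buf;          /* set last so _log() never sees size 0 */
        }

        void poweroff_buffer_log(const char *buf, size_t len)
        {
                if (!pob_buf)
                        return;
                while (len--)
                        pob_buf[pob_pos++ % pob_size] = *buf++;
        }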
[PATCH 5.2 00/56] 5.2.8-stable review
This is the start of the stable review cycle for the 5.2.8 release. There are 56 patches in this series, all will be posted as a response to this one. If anyone has any issues with these being applied, please let me know. Responses should be made by Sat 10 Aug 2019 07:03:19 PM UTC. Anything received after that time might be too late. The whole patch series can be found in one patch at: https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.8-rc1.gz or in the git tree and branch at: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.2.y and the diffstat can be found below. thanks, greg k-h

- Pseudo-Shortlog of commits:

Greg Kroah-Hartman Linux 5.2.8-rc1
Lukas Wunner spi: bcm2835: Fix 3-wire mode if DMA is enabled
Johannes Berg Revert "mac80211: set NETIF_F_LLTX when using intermediate tx queues"
Dhinakaran Pandiyan drm/i915/vbt: Fix VBT parsing for the PSR section
Arnd Bergmann compat_ioctl: pppoe: fix PPPOEIOCSFWD handling
Aya Levin net/mlx5e: Fix matching of speed to PRM link modes
Maor Gottlieb net/mlx5: Add missing RDMA_RX capabilities
Petr Machata mlxsw: spectrum_buffers: Further reduce pool size on Spectrum-2
Colin Ian King rocker: fix memory leaks of fib_work on two error return paths
Ursula Braun net/smc: avoid fallback in case of non-blocking connect
Heiner Kallweit net: phy: fix race in genphy_update_link
Dexuan Cui hv_sock: Fix hang when a connection is closed
Jesper Dangaard Brouer net: fix bpf_xdp_adjust_head regression for generic-XDP
Jesper Dangaard Brouer selftests/bpf: reduce time to execute test_xdp_vlan.sh
Jesper Dangaard Brouer selftests/bpf: add wrapper scripts for test_xdp_vlan.sh
Jesper Dangaard Brouer bpf: fix XDP vlan selftests test_xdp_vlan.sh
Heiner Kallweit r8169: don't use MSI before RTL8168d
Ariel Levkovich net/mlx5e: Prevent encap flow counter update async to user query
Edward Srouji net/mlx5: Fix modify_cq_in alignment
Alexis Bauvin tun: mark small packets as owned by the tap sock
Jon Maloy tipc: fix unitilized skb list crash
Taras Kondratiuk tipc: compat: allow tipc commands without arguments
Claudiu Manoil ocelot: Cancel delayed work before wq destruction
Johan Hovold NFC: nfcmrvl: fix gpio-handling regression
Frode Isaksen net: stmmac: Use netif_tx_napi_add() for TX polling function
Ursula Braun net/smc: do not schedule tx_work in SMC_CLOSED state
Dmytro Linkin net: sched: use temporary variable for actions indexes
Roman Mashak net sched: update vlan action for batched events operations
Jia-Ju Bai net: sched: Fix a possible null-pointer dereference in dequeue_func()
Subash Abhinov Kasiviswanathan net: qualcomm: rmnet: Fix incorrect UL checksum offload logic
Andreas Schwab net: phy: mscc: initialize stats array
René van Dorst net: phylink: Fix flow control for fixed-link
Arseny Solokha net: phylink: don't start and stop SGMII PHYs in SFP modules twice
Hubert Feurstein net: phy: fixed_phy: print gpio error only if gpio node is present
Mark Zhang net/mlx5: Use reversed order when unregister devices
Qian Cai net/mlx5e: always initialize frag->last_in_page
Jiri Pirko net: fix ifindex collision during namespace removal
Nikolay Aleksandrov net: bridge: move default pvid init/deinit to NETDEV_REGISTER/UNREGISTER
Nikolay Aleksandrov net: bridge: mcast: don't delete permanent entries when fast leave is enabled
Nikolay Aleksandrov net: bridge: delete local fdb on device init failure
Matteo Croce mvpp2: refactor MTU change code
Matteo Croce mvpp2: fix panic on module removal
Jiri Pirko mlxsw: spectrum: Fix error path in mlxsw_sp_module_init()
Haishuang Yan ipip: validate header length in ipip_tunnel_xmit
Haishuang Yan ip6_tunnel: fix possible use-after-free on xmit
Haishuang Yan ip6_gre: reload ipv6h in prepare_ip6gre_xmit_ipv6
Cong Wang ife: error out when nla attributes are empty
Arnaud Patard drivers/net/ethernet/marvell/mvmdio.c: Fix non OF case
Sudarsana Reddy Kalluru bnx2x: Disable multi-cos feature.
Gustavo A. R. Silva atm: iphase: Fix Spectre v1 vulnerability
Sebastian Parschauer HID: Add quirk for HP X1200 PIXART OEM mouse
Aaron Armstrong Skomra HID: wacom: fix bit shift for Cintiq Companion 2
Hillf Danton ALSA: usb-audio: Fix gpf in snd_usb_pipe_sanity_check
Takashi Iwai ALSA: usb-audio: Sanity checks for each pipe and EP types
Dan Williams libnvdimm/bus: Fix wait_nvdimm_bus_probe_idle() ABBA deadlock
Dan Williams libnvdimm/bus: Prepare the nd_ioctl() path to be re-entrant
Hannes Reinecke scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure

- Diffstat:

Makefile | 4 +-
drivers/atm/iphase.c
[PATCH 5.2 39/56] net/mlx5: Fix modify_cq_in alignment
From: Edward Srouji [ Upstream commit 7a32f2962c56d9d8a836b4469855caeee8766bd4 ] Fix modify_cq_in alignment to match the device specification. After this fix the 'cq_umem_valid' field will be at the right offset. Cc: # 4.19 Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits") Signed-off-by: Edward Srouji Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- include/linux/mlx5/mlx5_ifc.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5865,10 +5865,12 @@ struct mlx5_ifc_modify_cq_in_bits { struct mlx5_ifc_cqc_bits cq_context; - u8 reserved_at_280[0x40]; + u8 reserved_at_280[0x60]; u8 cq_umem_valid[0x1]; - u8 reserved_at_2c1[0x5bf]; + u8 reserved_at_2e1[0x1f]; + + u8 reserved_at_300[0x580]; u8 pas[0][0x40]; };
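The reserved-field arithmetic checks out (the numbers in the field names are bit offsets, the bracketed sizes bit widths): the old layout placed cq_umem_valid at 0x280 + 0x40 = 0x2c0, the new one at 0x280 + 0x60 = 0x2e0 as the device spec requires, and the total size is unchanged, since 0x2c0 + 0x1 + 0x5bf = 0x2e0 + 0x1 + 0x1f + 0x580 = 0x880, which is where pas[] begins in both layouts.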
[PATCH 5.2 08/56] atm: iphase: Fix Spectre v1 vulnerability
From: "Gustavo A. R. Silva" [ Upstream commit ea443e5e98b5b74e317ef3d26bcaea54931ccdee ] board is controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: drivers/atm/iphase.c:2765 ia_ioctl() warn: potential spectre issue 'ia_dev' [r] (local cap) drivers/atm/iphase.c:2774 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2782 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2816 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2823 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2830 ia_ioctl() warn: potential spectre issue '_ia_dev' [r] (local cap) drivers/atm/iphase.c:2845 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2856 ia_ioctl() warn: possible spectre second half. 'iadev' Fix this by sanitizing board before using it to index ia_dev and _ia_dev Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://lore.kernel.org/lkml/20180423164740.gy17...@dhcp22.suse.cz/ Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/atm/iphase.c |8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "iphase.h" #include "suni.h" #define swap_byte_order(x) (((x & 0xff) << 8) | ((x & 0xff00) >> 8)) @@ -2760,8 +2761,11 @@ static int ia_ioctl(struct atm_dev *dev, } if (copy_from_user(&ia_cmds, arg, sizeof ia_cmds)) return -EFAULT; board = ia_cmds.status; - if ((board < 0) || (board > iadev_count)) - board = 0; + + if ((board < 0) || (board > iadev_count)) + board = 0; + board = array_index_nospec(board, iadev_count + 1); + iadev = ia_dev[board]; switch (ia_cmds.cmd) { case MEMDUMP:
[PATCH 5.2 04/56] ALSA: usb-audio: Sanity checks for each pipe and EP types
[ Upstream commit 801ebf1043ae7b182588554cc9b9ad3c14bc2ab5 ] The recent USB core code performs sanity checks for the given pipe and EP types, and it can be hit by manipulated USB descriptors by syzbot. For making syzbot happier, this patch introduces a local helper for a sanity check in the driver side and calls it at each place before the message handling, so that we can avoid the WARNING splats. Reported-by: syzbot+d952e5e28f5fb7718...@syzkaller.appspotmail.com Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/usb/helper.c | 17 + sound/usb/helper.h | 1 + sound/usb/quirks.c | 18 +++--- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/sound/usb/helper.c b/sound/usb/helper.c index 84aa265dd802c..71d5f540334a2 100644 --- a/sound/usb/helper.c +++ b/sound/usb/helper.c @@ -63,6 +63,20 @@ void *snd_usb_find_csint_desc(void *buffer, int buflen, void *after, u8 dsubtype return NULL; } +/* check the validity of pipe and EP types */ +int snd_usb_pipe_sanity_check(struct usb_device *dev, unsigned int pipe) +{ + static const int pipetypes[4] = { + PIPE_CONTROL, PIPE_ISOCHRONOUS, PIPE_BULK, PIPE_INTERRUPT + }; + struct usb_host_endpoint *ep; + + ep = usb_pipe_endpoint(dev, pipe); + if (usb_pipetype(pipe) != pipetypes[usb_endpoint_type(&ep->desc)]) + return -EINVAL; + return 0; +} + /* * Wrapper for usb_control_msg(). * Allocates a temp buffer to prevent dmaing from/to the stack. @@ -75,6 +89,9 @@ int snd_usb_ctl_msg(struct usb_device *dev, unsigned int pipe, __u8 request, void *buf = NULL; int timeout; + if (snd_usb_pipe_sanity_check(dev, pipe)) + return -EINVAL; + if (size > 0) { buf = kmemdup(data, size, GFP_KERNEL); if (!buf) diff --git a/sound/usb/helper.h b/sound/usb/helper.h index d338bd0e0ca60..6afb70156ec4f 100644 --- a/sound/usb/helper.h +++ b/sound/usb/helper.h @@ -7,6 +7,7 @@ unsigned int snd_usb_combine_bytes(unsigned char *bytes, int size); void *snd_usb_find_desc(void *descstart, int desclen, void *after, u8 dtype); void *snd_usb_find_csint_desc(void *descstart, int desclen, void *after, u8 dsubtype); +int snd_usb_pipe_sanity_check(struct usb_device *dev, unsigned int pipe); int snd_usb_ctl_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size); diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index cf5cff10c08e8..78858918cbc10 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -828,11 +828,13 @@ static int snd_usb_novation_boot_quirk(struct usb_device *dev) static int snd_usb_accessmusic_boot_quirk(struct usb_device *dev) { int err, actual_length; - /* "midi send" enable */ static const u8 seq[] = { 0x4e, 0x73, 0x52, 0x01 }; + void *buf; - void *buf = kmemdup(seq, ARRAY_SIZE(seq), GFP_KERNEL); + if (snd_usb_pipe_sanity_check(dev, usb_sndintpipe(dev, 0x05))) + return -EINVAL; + buf = kmemdup(seq, ARRAY_SIZE(seq), GFP_KERNEL); if (!buf) return -ENOMEM; err = usb_interrupt_msg(dev, usb_sndintpipe(dev, 0x05), buf, @@ -857,7 +859,11 @@ static int snd_usb_accessmusic_boot_quirk(struct usb_device *dev) static int snd_usb_nativeinstruments_boot_quirk(struct usb_device *dev) { - int ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), + int ret; + + if (snd_usb_pipe_sanity_check(dev, usb_sndctrlpipe(dev, 0))) + return -EINVAL; + ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0xaf, USB_TYPE_VENDOR | USB_RECIP_DEVICE, 1, 0, NULL, 0, 1000); @@ -964,6 +970,8 @@ static int snd_usb_axefx3_boot_quirk(struct usb_device *dev) dev_dbg(&dev->dev, "Waiting for Axe-Fx III to boot up...\n"); 
+ if (snd_usb_pipe_sanity_check(dev, usb_sndctrlpipe(dev, 0))) + return -EINVAL; /* If the Axe-Fx III has not fully booted, it will timeout when trying * to enable the audio streaming interface. A more generous timeout is * used here to detect when the Axe-Fx III has finished booting as the @@ -996,6 +1004,8 @@ static int snd_usb_motu_microbookii_communicate(struct usb_device *dev, u8 *buf, { int err, actual_length; + if (snd_usb_pipe_sanity_check(dev, usb_sndintpipe(dev, 0x01))) + return -EINVAL; err = usb_interrupt_msg(dev, usb_sndintpipe(dev, 0x01), buf, *length, &actual_length, 1000); if (err < 0) @@ -1006,6 +1016,8 @@ static int snd_usb_motu_microbookii_communicate(struct usb_device *dev, u8 *buf, memset(buf, 0, buf_size); + if (snd_usb_pipe_sanity_check(dev, usb_rcvintpipe(dev, 0x82))) + return -EINV
[PATCH 5.2 37/56] tipc: fix unitilized skb list crash
From: Jon Maloy [ Upstream commit 2948a1fcd77a8bb11604387e3fc52f0ebf5729e9 ] Our test suite sometimes provokes the following crash: Description of problem:
[ 1092.597234] BUG: unable to handle kernel NULL pointer dereference at 00000000000000e8
[ 1092.605072] PGD 0 P4D 0
[ 1092.607620] Oops: 0000 [#1] SMP PTI
[ 1092.68] CPU: 37 PID: 0 Comm: swapper/37 Kdump: loaded Not tainted 4.18.0-122.el8.x86_64 #1
[ 1092.619724] Hardware name: Dell Inc. PowerEdge R740/08D89F, BIOS 1.3.7 02/08/2018
[ 1092.627215] RIP: 0010:tipc_mcast_filter_msg+0x93/0x2d0 [tipc]
[ 1092.632955] Code: 0f 84 aa 01 00 00 89 cf 4d 01 ca 4c 8b 26 c1 ef 19 83 e7 0f 83 ff 0c 4d 0f 45 d1 41 8b 6a 10 0f cd 4c 39 e6 0f 84 81 01 00 00 <4d> 8b 9c 24 e8 00 00 00 45 8b 13 41 0f ca 44 89 d7 c1 ef 13 83 e7
[...]
[ 1092.730556] Call Trace:
[ 1092.733010] <IRQ>
[ 1092.735034] tipc_sk_filter_rcv+0x7ca/0xb80 [tipc]
[ 1092.739828] ? __kmalloc_node_track_caller+0x1cb/0x290
[ 1092.744974] ? dev_hard_start_xmit+0xa5/0x210
[ 1092.749332] tipc_sk_rcv+0x389/0x640 [tipc]
[ 1092.753519] tipc_sk_mcast_rcv+0x23c/0x3a0 [tipc]
[ 1092.758224] tipc_rcv+0x57a/0xf20 [tipc]
[ 1092.762154] ? ktime_get_real_ts64+0x40/0xe0
[ 1092.766432] ? tpacket_rcv+0x50/0x9f0
[ 1092.770098] tipc_l2_rcv_msg+0x4a/0x70 [tipc]
[ 1092.774452] __netif_receive_skb_core+0xb62/0xbd0
[ 1092.779164] ? enqueue_entity+0xf6/0x630
[ 1092.783084] ? kmem_cache_alloc+0x158/0x1c0
[ 1092.787272] ? __build_skb+0x25/0xd0
[ 1092.790849] netif_receive_skb_internal+0x42/0xf0
[ 1092.795557] napi_gro_receive+0xba/0xe0
[ 1092.799417] mlx5e_handle_rx_cqe+0x83/0xd0 [mlx5_core]
[ 1092.804564] mlx5e_poll_rx_cq+0xd5/0x920 [mlx5_core]
[ 1092.809536] mlx5e_napi_poll+0xb2/0xce0 [mlx5_core]
[ 1092.814415] ? __wake_up_common_lock+0x89/0xc0
[ 1092.818861] net_rx_action+0x149/0x3b0
[ 1092.822616] __do_softirq+0xe3/0x30a
[ 1092.826193] irq_exit+0x100/0x110
[ 1092.829512] do_IRQ+0x85/0xd0
[ 1092.832483] common_interrupt+0xf/0xf
[ 1092.836147] </IRQ>
[ 1092.838255] RIP: 0010:cpuidle_enter_state+0xb7/0x2a0
[ 1092.843221] Code: e8 3e 79 a5 ff 80 7c 24 03 00 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 d7 01 00 00 31 ff e8 a0 6b ab ff fb 66 0f 1f 44 00 00 <48> b8 ff ff ff ff f3 01 00 00 4c 29 f3 ba ff ff ff 7f 48 39 c3 7f
[...]
[ 1092.905196] ? cpuidle_enter_state+0x92/0x2a0
[ 1092.909555] do_idle+0x236/0x280
[ 1092.912785] cpu_startup_entry+0x6f/0x80
[ 1092.916715] start_secondary+0x1a7/0x200
[ 1092.920642] secondary_startup_64+0xb7/0xc0
[...]
The reason is that the skb list tipc_socket::mc_method.deferredq is only initialized for connectionless sockets, while nothing stops arriving multicast messages from being filtered by connection oriented sockets, with subsequent access to the said list. We fix this by initializing the list unconditionally at socket creation. This eliminates the crash, while the message is still dropped further down in tipc_sk_filter_rcv() as it should be. Reported-by: Li Shuang Signed-off-by: Jon Maloy Reviewed-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/socket.c |3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -485,9 +485,8 @@ static int tipc_sk_create(struct net *ne tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) tsk_set_unreliable(tsk, true); - __skb_queue_head_init(&tsk->mc_method.deferredq); } - + __skb_queue_head_init(&tsk->mc_method.deferredq); trace_tipc_sk_create(sk, NULL, TIPC_DUMP_NONE, " "); re
[PATCH 5.2 30/56] net sched: update vlan action for batched events operations
From: Roman Mashak [ Upstream commit b35475c5491a14c8ce7a5046ef7bcda8a860581a ] Add get_fill_size() routine used to calculate the action size when building a batch of events. Fixes: c7e2b9689 ("sched: introduce vlan action") Signed-off-by: Roman Mashak Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_vlan.c |9 + 1 file changed, 9 insertions(+) --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -306,6 +306,14 @@ static int tcf_vlan_search(struct net *n return tcf_idr_search(tn, a, index); } +static size_t tcf_vlan_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_vlan)) + + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_ID */ + + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_PROTOCOL */ + + nla_total_size(sizeof(u8)); /* TCA_VLAN_PUSH_VLAN_PRIORITY */ +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .id = TCA_ID_VLAN, @@ -315,6 +323,7 @@ static struct tc_action_ops act_vlan_ops .init = tcf_vlan_init, .cleanup= tcf_vlan_cleanup, .walk = tcf_vlan_walker, + .get_fill_size = tcf_vlan_get_fill_size, .lookup = tcf_vlan_search, .size = sizeof(struct tcf_vlan), };
[PATCH 5.2 32/56] net/smc: do not schedule tx_work in SMC_CLOSED state
From: Ursula Braun [ Upstream commit f9cedf1a9b1cdcfb0c52edb391d01771e43994a4 ] The setsockopt options TCP_NODELAY and TCP_CORK may schedule the tx worker. Make sure the socket is not yet moved into SMC_CLOSED state (for instance by a shutdown SHUT_RDWR call). Reported-by: syzbot+92209502e7aab127c...@syzkaller.appspotmail.com Reported-by: syzbot+b972214bb803a343f...@syzkaller.appspotmail.com Fixes: 01d2f7e2cdd31 ("net/smc: sockopts TCP_NODELAY and TCP_CORK") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/smc/af_smc.c |8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1741,14 +1741,18 @@ static int smc_setsockopt(struct socket } break; case TCP_NODELAY: - if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { if (val && !smc->use_fallback) mod_delayed_work(system_wq, &smc->conn.tx_work, 0); } break; case TCP_CORK: - if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { if (!val && !smc->use_fallback) mod_delayed_work(system_wq, &smc->conn.tx_work, 0);
[PATCH 5.2 07/56] HID: Add quirk for HP X1200 PIXART OEM mouse
From: Sebastian Parschauer commit 49869d2ea9eecc105a10724c1abf035151a3c4e2 upstream. The PixArt OEM mice are known for disconnecting every minute in runlevel 1 or 3 if they are not always polled. So add quirk ALWAYS_POLL for this one as well. Jonathan Teh (@jonathan-teh) reported and tested the quirk. Reference: https://github.com/sriemer/fix-linux-mouse/issues/15 Signed-off-by: Sebastian Parschauer CC: sta...@vger.kernel.org Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-ids.h|1 + drivers/hid/hid-quirks.c |1 + 2 files changed, 2 insertions(+) --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -568,6 +568,7 @@ #define USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A 0x0b4a #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE 0x134a #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A0x094a +#define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_06410x0641 #define USB_VENDOR_ID_HUION0x256c #define USB_DEVICE_ID_HUION_TABLET 0x006e --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -91,6 +91,7 @@ static const struct hid_device_id hid_qu { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A), HID_QUIRK_ALWAYS_POLL }, + { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0641), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_IDEACOM, USB_DEVICE_ID_IDEACOM_IDC6680), HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_INNOMEDIA, USB_DEVICE_ID_INNEX_GENESIS_ATARI), HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M610X), HID_QUIRK_MULTI_INPUT },
[PATCH 5.2 28/56] net: qualcomm: rmnet: Fix incorrect UL checksum offload logic
From: Subash Abhinov Kasiviswanathan [ Upstream commit a7cf3d24ee6081930feb4c830a7f6f16ebe31c49 ] The udp_ip4_ind bit is set only for IPv4 UDP non-fragmented packets so that the hardware can flip the checksum to 0xFFFF if the computed checksum is 0 per RFC768. However, this bit had to be set for IPv6 UDP non-fragmented packets as well per hardware requirements. Otherwise, IPv6 UDP packets with computed checksum as 0 were transmitted by hardware and were dropped in the network. In addition to setting this bit for IPv6 UDP, the field is also appropriately renamed to udp_ind as part of this change. Fixes: 5eb5f8608ef1 ("net: qualcomm: rmnet: Add support for TX checksum offload") Cc: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h |2 +- drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 13 + 2 files changed, 10 insertions(+), 5 deletions(-) --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h @@ -51,7 +51,7 @@ struct rmnet_map_dl_csum_trailer { struct rmnet_map_ul_csum_header { __be16 csum_start_offset; u16 csum_insert_offset:14; - u16 udp_ip4_ind:1; + u16 udp_ind:1; u16 csum_enabled:1; } __aligned(1); --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -206,9 +206,9 @@ rmnet_map_ipv4_ul_csum_header(void *iphd ul_header->csum_insert_offset = skb->csum_offset; ul_header->csum_enabled = 1; if (ip4h->protocol == IPPROTO_UDP) - ul_header->udp_ip4_ind = 1; + ul_header->udp_ind = 1; else - ul_header->udp_ip4_ind = 0; + ul_header->udp_ind = 0; /* Changing remaining fields to network order */ hdr++; @@ -239,6 +239,7 @@ rmnet_map_ipv6_ul_csum_header(void *ip6h struct rmnet_map_ul_csum_header *ul_header, struct sk_buff *skb) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)ip6hdr; __be16 *hdr = (__be16 *)ul_header, offset; offset = htons((__force u16)(skb_transport_header(skb) - @@ -246,7 +247,11 @@ rmnet_map_ipv6_ul_csum_header(void *ip6h ul_header->csum_start_offset = offset; ul_header->csum_insert_offset = skb->csum_offset; ul_header->csum_enabled = 1; - ul_header->udp_ip4_ind = 0; + + if (ip6h->nexthdr == IPPROTO_UDP) + ul_header->udp_ind = 1; + else + ul_header->udp_ind = 0; /* Changing remaining fields to network order */ hdr++; @@ -419,7 +424,7 @@ sw_csum: ul_header->csum_start_offset = 0; ul_header->csum_insert_offset = 0; ul_header->csum_enabled = 0; - ul_header->udp_ip4_ind = 0; + ul_header->udp_ind = 0; priv->stats.csum_sw++; }
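The RFC 768 rule that the udp_ind bit asks the hardware to apply: a UDP checksum that computes to zero must go on the wire as all ones, because an on-wire zero means "no checksum was generated" (and IPv6 forbids that entirely). In software the kernel does the same fold, roughly like this (a sketch, not rmnet code):

        #include <linux/skbuff.h>
        #include <linux/udp.h>

        static void udp_fold_zero_csum(struct udphdr *uh)
        {
                /* A computed checksum of 0 is transmitted as 0xffff
                 * per RFC 768. */
                if (uh->check == 0)
                        uh->check = CSUM_MANGLED_0;     /* 0xffff */
        }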
[PATCH 5.2 29/56] net: sched: Fix a possible null-pointer dereference in dequeue_func()
From: Jia-Ju Bai [ Upstream commit 051c7b39be4a91f6b7d8c4548444e4b850f1f56c ] In dequeue_func(), there is an if statement on line 74 to check whether skb is NULL: if (skb) When skb is NULL, it is used on line 77: prefetch(&skb->end); Thus, a possible null-pointer dereference may occur. To fix this bug, skb->end is used when skb is not NULL. This bug is found by a static analysis tool STCheck written by us. Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Signed-off-by: Jia-Ju Bai Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_codel.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -71,10 +71,10 @@ static struct sk_buff *dequeue_func(stru struct Qdisc *sch = ctx; struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); - if (skb) + if (skb) { sch->qstats.backlog -= qdisc_pkt_len(skb); - - prefetch(&skb->end); /* we'll need skb_shinfo() */ + prefetch(&skb->end); /* we'll need skb_shinfo() */ + } return skb; }
[PATCH 5.2 36/56] tipc: compat: allow tipc commands without arguments
From: Taras Kondratiuk [ Upstream commit 4da5f0018eef4c0de31675b670c80e82e13e99d1 ] Commit 2753ca5d9009 ("tipc: fix uninit-value in tipc_nl_compat_doit") broke older tipc tools that use compat interface (e.g. tipc-config from tipcutils package): % tipc-config -p operation not supported The commit started to reject TIPC netlink compat messages that do not have attributes. It is too restrictive because some such messages are valid (they don't need any arguments): % grep 'tx none' include/uapi/linux/tipc_config.h #define TIPC_CMD_NOOP 0x0000 /* tx none, rx none */ #define TIPC_CMD_GET_MEDIA_NAMES 0x0002 /* tx none, rx media_name(s) */ #define TIPC_CMD_GET_BEARER_NAMES 0x0003 /* tx none, rx bearer_name(s) */ #define TIPC_CMD_SHOW_PORTS 0x0006 /* tx none, rx ultra_string */ #define TIPC_CMD_GET_REMOTE_MNG 0x4003 /* tx none, rx unsigned */ #define TIPC_CMD_GET_MAX_PORTS 0x4004 /* tx none, rx unsigned */ #define TIPC_CMD_GET_NETID 0x400B /* tx none, rx unsigned */ #define TIPC_CMD_NOT_NET_ADMIN 0xC001 /* tx none, rx none */ This patch relaxes the original fix and rejects messages without arguments only if such arguments are expected by a command (reg_type is non zero). Fixes: 2753ca5d9009 ("tipc: fix uninit-value in tipc_nl_compat_doit") Cc: sta...@vger.kernel.org Signed-off-by: Taras Kondratiuk Acked-by: Ying Xue Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/netlink_compat.c | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -55,6 +55,7 @@ struct tipc_nl_compat_msg { int rep_type; int rep_size; int req_type; + int req_size; struct net *net; struct sk_buff *rep; struct tlv_desc *req; @@ -257,7 +258,8 @@ static int tipc_nl_compat_dumpit(struct int err; struct sk_buff *arg; - if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type)) + if (msg->req_type && (!msg->req_size || + !TLV_CHECK_TYPE(msg->req, msg->req_type))) return -EINVAL; msg->rep = tipc_tlv_alloc(msg->rep_size); @@ -354,7 +356,8 @@ static int tipc_nl_compat_doit(struct ti { int err; - if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type)) + if (msg->req_type && (!msg->req_size || + !TLV_CHECK_TYPE(msg->req, msg->req_type))) return -EINVAL; err = __tipc_nl_compat_doit(cmd, msg); @@ -1288,8 +1291,8 @@ static int tipc_nl_compat_recv(struct sk goto send; } - len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN); - if (!len || !TLV_OK(msg.req, len)) { + msg.req_size = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN); + if (msg.req_size && !TLV_OK(msg.req, msg.req_size)) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); err = -EOPNOTSUPP; goto send;
[PATCH 5.2 27/56] net: phy: mscc: initialize stats array
From: Andreas Schwab [ Upstream commit f972037e71246c5e0916eef835174d58ffc517e4 ] The memory allocated for the stats array may contain arbitrary data. Fixes: e4f9ba642f0b ("net: phy: mscc: add support for VSC8514 PHY.") Fixes: 00d70d8e0e78 ("net: phy: mscc: add support for VSC8574 PHY") Fixes: a5afc1678044 ("net: phy: mscc: add support for VSC8584 PHY") Fixes: f76178dc5218 ("net: phy: mscc: add ethtool statistics counters") Signed-off-by: Andreas Schwab Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/mscc.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) --- a/drivers/net/phy/mscc.c +++ b/drivers/net/phy/mscc.c @@ -2226,8 +2226,8 @@ static int vsc8514_probe(struct phy_devi vsc8531->supp_led_modes = VSC85XX_SUPP_LED_MODES; vsc8531->hw_stats = vsc85xx_hw_stats; vsc8531->nstats = ARRAY_SIZE(vsc85xx_hw_stats); - vsc8531->stats = devm_kmalloc_array(&phydev->mdio.dev, vsc8531->nstats, - sizeof(u64), GFP_KERNEL); + vsc8531->stats = devm_kcalloc(&phydev->mdio.dev, vsc8531->nstats, + sizeof(u64), GFP_KERNEL); if (!vsc8531->stats) return -ENOMEM; @@ -2251,8 +2251,8 @@ static int vsc8574_probe(struct phy_devi vsc8531->supp_led_modes = VSC8584_SUPP_LED_MODES; vsc8531->hw_stats = vsc8584_hw_stats; vsc8531->nstats = ARRAY_SIZE(vsc8584_hw_stats); - vsc8531->stats = devm_kmalloc_array(&phydev->mdio.dev, vsc8531->nstats, - sizeof(u64), GFP_KERNEL); + vsc8531->stats = devm_kcalloc(&phydev->mdio.dev, vsc8531->nstats, + sizeof(u64), GFP_KERNEL); if (!vsc8531->stats) return -ENOMEM; @@ -2281,8 +2281,8 @@ static int vsc8584_probe(struct phy_devi vsc8531->supp_led_modes = VSC8584_SUPP_LED_MODES; vsc8531->hw_stats = vsc8584_hw_stats; vsc8531->nstats = ARRAY_SIZE(vsc8584_hw_stats); - vsc8531->stats = devm_kmalloc_array(&phydev->mdio.dev, vsc8531->nstats, - sizeof(u64), GFP_KERNEL); + vsc8531->stats = devm_kcalloc(&phydev->mdio.dev, vsc8531->nstats, + sizeof(u64), GFP_KERNEL); if (!vsc8531->stats) return -ENOMEM; @@ -2311,8 +2311,8 @@ static int vsc85xx_probe(struct phy_devi vsc8531->supp_led_modes = VSC85XX_SUPP_LED_MODES; vsc8531->hw_stats = vsc85xx_hw_stats; vsc8531->nstats = ARRAY_SIZE(vsc85xx_hw_stats); - vsc8531->stats = devm_kmalloc_array(&phydev->mdio.dev, vsc8531->nstats, - sizeof(u64), GFP_KERNEL); + vsc8531->stats = devm_kcalloc(&phydev->mdio.dev, vsc8531->nstats, + sizeof(u64), GFP_KERNEL); if (!vsc8531->stats) return -ENOMEM;
[PATCH 5.2 22/56] net/mlx5e: always initialize frag->last_in_page
From: Qian Cai [ Upstream commit 60d60c8fbd8d1acf25b041ecd72ae4fa16e9405b ] The commit 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue memory scheme") introduced the undefined behaviour below, because "frag->last_in_page" is only initialized in mlx5e_init_frags_partition() when, if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE) or after bailing out of the loop, for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) As a result, some "frag" entries could have an uninitialized value of "last_in_page". Later, get_frag() obtains those "frag" and checks "frag->last_in_page" in mlx5e_put_rx_frag() and triggers the error during boot. Fix it by always initializing "frag->last_in_page" to "false" in mlx5e_init_frags_partition(). UBSAN: Undefined behaviour in drivers/net/ethernet/mellanox/mlx5/core/en_rx.c:325:12 load of value 170 is not a valid value for type 'bool' (aka '_Bool') Call trace: dump_backtrace+0x0/0x264 show_stack+0x20/0x2c dump_stack+0xb0/0x104 __ubsan_handle_load_invalid_value+0x104/0x128 mlx5e_handle_rx_cqe+0x8e8/0x12cc [mlx5_core] mlx5e_poll_rx_cq+0xca8/0x1a94 [mlx5_core] mlx5e_napi_poll+0x17c/0xa30 [mlx5_core] net_rx_action+0x248/0x940 __do_softirq+0x350/0x7b8 irq_exit+0x200/0x26c __handle_domain_irq+0xc8/0x128 gic_handle_irq+0x138/0x228 el1_irq+0xb8/0x140 arch_cpu_idle+0x1a4/0x348 do_idle+0x114/0x1b0 cpu_startup_entry+0x24/0x28 rest_init+0x1ac/0x1dc arch_call_rest_init+0x10/0x18 start_kernel+0x4d4/0x57c Fixes: 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue memory scheme") Signed-off-by: Qian Cai Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c |5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -340,12 +340,11 @@ static inline u64 mlx5e_get_mpwqe_offset static void mlx5e_init_frags_partition(struct mlx5e_rq *rq) { - struct mlx5e_wqe_frag_info next_frag, *prev; + struct mlx5e_wqe_frag_info next_frag = {}; + struct mlx5e_wqe_frag_info *prev = NULL; int i; next_frag.di = &rq->wqe.di[0]; - next_frag.offset = 0; - prev = NULL; for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) { struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
[PATCH 5.2 21/56] net: fix ifindex collision during namespace removal
From: Jiri Pirko [ Upstream commit 55b40dbf0e76b4bfb9d8b3a16a0208640a9a45df ] Commit aca51397d014 ("netns: Fix arbitrary net_device-s corruptions on net_ns stop.") introduced a possibility to hit a BUG in case a device is returning back to init_net and the two following conditions are met: 1) dev->ifindex value is used in a name of another "dev%d" device in init_net. 2) dev->name is used by another device in init_net. Under real life circumstances this is hard to get. Therefore this has been present happily for over 10 years. To reproduce:
$ ip a
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
3: enp0s2: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
$ ip netns add ns1
$ ip -n ns1 link add dummy1ns1 type dummy
$ ip -n ns1 link add dummy2ns1 type dummy
$ ip link set enp0s2 netns ns1
$ ip -n ns1 link set enp0s2 name dummy0
[ 100.858894] virtio_net virtio0 dummy0: renamed from enp0s2
$ ip link add dev4 type dummy
$ ip -n ns1 a
1: lo: mtu 65536 qdisc noop state DOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: dummy1ns1: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 16:63:4c:38:3e:ff brd ff:ff:ff:ff:ff:ff
3: dummy2ns1: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether aa:9e:86:dd:6b:5d brd ff:ff:ff:ff:ff:ff
4: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
$ ip a
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
4: dev4: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 5a:e1:4a:b6:ec:f8 brd ff:ff:ff:ff:ff:ff
$ ip netns del ns1
[ 158.717795] default_device_exit: failed to move dummy0 to init_net: -17
[ 158.719316] ------------[ cut here ]------------
[ 158.720591] kernel BUG at net/core/dev.c:9824!
[ 158.722260] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 158.723728] CPU: 0 PID: 56 Comm: kworker/u2:1 Not tainted 5.3.0-rc1+ #18
[ 158.725422] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
[ 158.727508] Workqueue: netns cleanup_net
[ 158.728915] RIP: 0010:default_device_exit.cold+0x1d/0x1f
[ 158.730683] Code: 84 e8 18 c9 3e fe 0f 0b e9 70 90 ff ff e8 36 e4 52 fe 89 d9 4c 89 e2 48 c7 c6 80 d6 25 84 48 c7 c7 20 c0 25 84 e8 f4 c8 3e
[...]
[ 158.762758] Call Trace:
[ 158.763882] ? dev_change_net_namespace+0xbb0/0xbb0
[ 158.766148] ? devlink_nl_cmd_set_doit+0x520/0x520
[ 158.768034] ? dev_change_net_namespace+0xbb0/0xbb0
[ 158.769870] ops_exit_list.isra.0+0xa8/0x150
[ 158.771544] cleanup_net+0x446/0x8f0
[ 158.772945] ? unregister_pernet_operations+0x4a0/0x4a0
[ 158.775294] process_one_work+0xa1a/0x1740
[ 158.776896] ? pwq_dec_nr_in_flight+0x310/0x310
[ 158.779143] ? do_raw_spin_lock+0x11b/0x280
[ 158.780848] worker_thread+0x9e/0x1060
[ 158.782500] ? process_one_work+0x1740/0x1740
[ 158.784454] kthread+0x31b/0x420
[ 158.786082] ? __kthread_create_on_node+0x3f0/0x3f0
[ 158.788286] ret_from_fork+0x3a/0x50
[ 158.789871] ---[ end trace defd6c657c71f936 ]---
[...]
[PATCH 5.2 15/56] mlxsw: spectrum: Fix error path in mlxsw_sp_module_init()
From: Jiri Pirko [ Upstream commit 28fe79000e9b0a6f99959869947f1ca305f14599 ] In case sp2 pci driver registration fails, fix the error path to start with sp1 pci driver unregistration. Fixes: c3ab435466d5 ("mlxsw: spectrum: Extend to support Spectrum-2 ASIC") Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -5989,7 +5989,7 @@ static int __init mlxsw_sp_module_init(v return 0; err_sp2_pci_driver_register: - mlxsw_pci_driver_unregister(&mlxsw_sp2_pci_driver); + mlxsw_pci_driver_unregister(&mlxsw_sp1_pci_driver); err_sp1_pci_driver_register: mlxsw_core_driver_unregister(&mlxsw_sp2_driver); err_sp2_core_driver_register:
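The shape of the bug being fixed, reduced to a sketch (illustrative names; register_driver()/unregister_driver() are stand-ins, not real APIs): when the second registration fails, only the first driver is registered, so the matching error label must unregister the first one, not the second.

        static int __init example_module_init(void)
        {
                int err;

                err = register_driver(&sp1_driver);     /* hypothetical */
                if (err)
                        return err;

                err = register_driver(&sp2_driver);     /* hypothetical */
                if (err)
                        goto err_sp2_register;

                return 0;

        err_sp2_register:
                /* sp2 was never registered; undo sp1, the step before it */
                unregister_driver(&sp1_driver);
                return err;
        }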
[PATCH 5.2 11/56] ife: error out when nla attributes are empty
From: Cong Wang [ Upstream commit c8ec4632c6ac9cda0e8c3d51aa41eeab66585bd5 ] act_ife at least requires TCA_IFE_PARMS, so we have to bail out when there is no attribute passed in. Reported-by: syzbot+fbb5b288c9cb6a2ee...@syzkaller.appspotmail.com Fixes: ef6980b6becb ("introduce IFE action") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_ife.c |5 + 1 file changed, 5 insertions(+) --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -481,6 +481,11 @@ static int tcf_ife_init(struct net *net, int ret = 0; int err; + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be passed"); + return -EINVAL; + } + err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy, NULL); if (err < 0)
[PATCH] coresight: Serialize enabling/disabling a link device.
When tracing etm data of multiple threads on multiple cpus through perf interface, some link devices are shared between paths of different cpus. It creates race conditions when different cpus want to enable/disable the same link device at the same time. Example 1: Two cpus want to enable different ports of a coresight funnel, thus calling the funnel enable operation at the same time. But the funnel enable operation isn't reentrant. Example 2: For an enabled coresight dynamic replicator with refcnt=1, one cpu wants to disable it, while another cpu wants to enable it. Ideally we still have an enabled replicator with refcnt=1 at the end. But in reality the result is uncertain. Since coresight devices claim themselves when enabled for self-hosted usage, the race conditions above usually make the link devices not usable after many cycles. To fix the race conditions, this patch adds a spinlock to serialize enabling/disabling a link device. Signed-off-by: Yabin Cui --- drivers/hwtracing/coresight/coresight.c | 8 include/linux/coresight.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c index 55db77f6410b..90f97f4f99b2 100644 --- a/drivers/hwtracing/coresight/coresight.c +++ b/drivers/hwtracing/coresight/coresight.c @@ -256,6 +256,7 @@ static int coresight_enable_link(struct coresight_device *csdev, int ret; int link_subtype; int refport, inport, outport; + unsigned long flags; if (!parent || !child) return -EINVAL; @@ -274,15 +275,18 @@ static int coresight_enable_link(struct coresight_device *csdev, if (refport < 0) return refport; + spin_lock_irqsave(&csdev->spinlock, flags); if (atomic_inc_return(&csdev->refcnt[refport]) == 1) { if (link_ops(csdev)->enable) { ret = link_ops(csdev)->enable(csdev, inport, outport); if (ret) { atomic_dec(&csdev->refcnt[refport]); + spin_unlock_irqrestore(&csdev->spinlock, flags); return ret; } } } + spin_unlock_irqrestore(&csdev->spinlock, flags); csdev->enable = true; @@ -296,6 +300,7 @@ static void coresight_disable_link(struct coresight_device *csdev, int i, nr_conns; int link_subtype; int refport, inport, outport; + unsigned long flags; if (!parent || !child) return; @@ -315,10 +320,12 @@ static void coresight_disable_link(struct coresight_device *csdev, nr_conns = 1; } + spin_lock_irqsave(&csdev->spinlock, flags); if (atomic_dec_return(&csdev->refcnt[refport]) == 0) { if (link_ops(csdev)->disable) link_ops(csdev)->disable(csdev, inport, outport); } + spin_unlock_irqrestore(&csdev->spinlock, flags); for (i = 0; i < nr_conns; i++) if (atomic_read(&csdev->refcnt[i]) != 0) @@ -1225,6 +1232,7 @@ struct coresight_device *coresight_register(struct coresight_desc *desc) csdev->subtype = desc->subtype; csdev->ops = desc->ops; csdev->orphan = false; + spin_lock_init(&csdev->spinlock); csdev->dev.type = &coresight_dev_type[desc->type]; csdev->dev.groups = desc->groups; diff --git a/include/linux/coresight.h b/include/linux/coresight.h index a2b68823717b..dd28d9ab841d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -9,6 +9,7 @@ #include #include #include +#include <linux/spinlock.h> /* Peripheral id registers (0xFD0-0xFEC) */ #define CORESIGHT_PERIPHIDR4 0xfd0 @@ -153,6 +154,7 @@ struct coresight_connection { * activated but not yet enabled. Enabling for a _sink_ * appens when a source has been selected for that it. * @ea:Device attribute for sink representation under PMU directory. * @spinlock: Serialize enabling/disabling this device.
*/ struct coresight_device { struct coresight_platform_data *pdata; @@ -166,6 +168,7 @@ struct coresight_device { /* sink specific fields */ bool activated; /* true only if a sink is part of a path */ struct dev_ext_attribute *ea; + spinlock_t spinlock; }; /* -- 2.22.0.770.g0f2c4a37fd-goog
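One interleaving the new spinlock rules out, spelled out from "Example 2" in the description (an enabled link at refcnt == 1, CPU0 disabling while CPU1 enables):

        /*
         *   CPU0: atomic_dec_return() -> 0    (decides to call ->disable())
         *   CPU1: atomic_inc_return() -> 1    (decides to call ->enable())
         *   CPU1: link_ops(csdev)->enable(csdev, inport, outport)
         *   CPU0: link_ops(csdev)->disable(csdev, inport, outport)
         *
         * Result: refcnt == 1 but the hardware is disabled. Holding the
         * spinlock across both the refcount transition and the
         * enable/disable call makes the pair atomic, so this ordering
         * can no longer occur.
         */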
Re: [bonding][patch] Regarding a bonding lacp issue
Felix wrote: >Dear Maintainers, > >Recently I hit a packet drop issue in the bonding driver on Linux 4.9. Please >see details below. Please take a look to see if my understanding is >correct. Many thanks. > >What is the problem? >The bonding driver starts to send packets even if the Partner(Switch)'s >Collecting bit is not enabled yet. Partner would drop all packets until >its Collecting bit is enabled. > >What is the root cause? >According to the LACP spec, the Actor needs to check the Partner's Sync and >Collecting bits before enabling its Distributing bit and Distributing >function. Please see the picture below. The diagram you reference is found in 802.1AX-2014 figure 6-21, which shows the state diagram for an independent control implementation, i.e., collecting and distributing are managed independently. However, Linux bonding implements coupled control, which is shown in figure 6-22. Here, there is no Partner.Collecting requirement on the state transition from ATTACHED to COLLECTING_DISTRIBUTING. To quote 802.1AX-2014 6.4.15: As independent control is not possible, the coupled control state machine does not wait for the Partner to signal that collection has started before enabling both collection and distribution. Now, that said, I agree that what you're seeing is likely explained by this behavior, and your fix should resolve the immediate problem (that bonding sends packets before the peer has enabled COLLECTING). However, your fix does put bonding out of compliance with the standard, as it does not really implement COLLECTING and DISTRIBUTING as discrete states. In particular, if the peer in your case were to later clear Partner.Collecting, bonding will not react to this as a figure 6-21 independent control implementation would (which isn't a change from current behavior, but currently this isn't expected). So, in my opinion a patch like this should have a comment attached noting that we are deliberately not in compliance with the standard in this specific situation. The proper fix is to implement the separate states of figure 6-21. Lastly, are you able to test and generate a patch against current upstream, instead of 4.9? -J >How to fix? >Please see the following diff. And the patch is attached. > >--- ../origin/linux-4.9.188/drivers/net/bonding/bond_3ad.c 2019-08-07 >00:29:42.0 +0800 >+++ drivers/net/bonding/bond_3ad.c 2019-08-08 23:13:29.015640197 +0800 >@@ -937,6 +937,7 @@ > */ >if ((port->sm_vars & AD_PORT_SELECTED) && >(port->partner_oper.port_state & AD_STATE_SYNCHRONIZATION) && >+ (port->partner_oper.port_state & AD_STATE_COLLECTING) && >!__check_agg_selection_timer(port)) { > if (port->aggregator->is_active) > port->sm_mux_state = > >-- >Thanks, >Felix --- -Jay Vosburgh, jay.vosbu...@canonical.com
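Concretely, the comment Jay asks for could sit right on the new condition, something like this (a sketch against the quoted 4.9 diff, not a tested patch):

        if ((port->sm_vars & AD_PORT_SELECTED) &&
            (port->partner_oper.port_state & AD_STATE_SYNCHRONIZATION) &&
            /* Deliberately stricter than the coupled-control machine of
             * 802.1AX-2014 figure 6-22: also wait for Partner.Collecting,
             * since the partner drops our frames until it collects. We
             * still do not react if the partner later clears this bit,
             * as the independent-control machine of figure 6-21 would.
             */
            (port->partner_oper.port_state & AD_STATE_COLLECTING) &&
            !__check_agg_selection_timer(port)) {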
Re: [PATCH 1/3] mm/mlock.c: convert put_page() to put_user_page*()
On 8/8/19 4:09 AM, Vlastimil Babka wrote: > On 8/8/19 8:21 AM, Michal Hocko wrote: >> On Wed 07-08-19 16:32:08, John Hubbard wrote: >>> On 8/7/19 4:01 AM, Michal Hocko wrote: On Mon 05-08-19 15:20:17, john.hubb...@gmail.com wrote: > From: John Hubbard >>> Actually, I think follow_page_mask() gets all the pages, right? And the >>> get_page() in __munlock_pagevec_fill() is there to allow a >>> pagevec_release() >>> later. >> >> Maybe I am misreading the code (looking at Linus tree) but >> munlock_vma_pages_range >> calls follow_page for the start address and then if not THP tries to >> fill up the pagevec with few more pages (up to end), do the shortcut >> via manual pte walk as an optimization and use generic get_page there. > Yes, I see it finally, thanks. :) > That's true. However, I'm not sure munlocking is where the > put_user_page() machinery is intended to be used anyway? These are > short-term pins for struct page manipulation, not e.g. dirtying of page > contents. Reading commit fc1d8e7cca2d I don't think this case falls > within the reasoning there. Perhaps not all GUP users should be > converted to the planned separate GUP tracking, and instead we should > have a GUP/follow_page_mask() variant that keeps using get_page/put_page? > Interesting. So far, the approach has been to get all the gup callers to release via put_user_page(), but if we add in Jan's and Ira's vaddr_pin_pages() wrapper, then maybe we could leave some sites unconverted. However, in order to do so, we would have to change things so that we have one set of APIs (gup) that do *not* increment a pin count, and another set (vaddr_pin_pages) that do. Is that where we want to go...? I have a tracking patch that only deals with gup/pup. I could post as an RFC, but I think it might just muddy the waters at this point, anyway it's this one: https://github.com/johnhubbard/linux/commit/a0fb73ce0a39c74f0d1fb6bd9d866f660f762eae thanks, -- John Hubbard NVIDIA
Re: [PATCH net] net: phy: rtl8211f: do a double read to get real time link status
On 08.08.2019 08:21, Yonglong Liu wrote: > > > On 2019/8/8 14:11, Heiner Kallweit wrote: >> On 08.08.2019 03:15, Yonglong Liu wrote: >>> >>> >>> On 2019/8/8 0:47, Heiner Kallweit wrote: On 07.08.2019 15:16, Yonglong Liu wrote: > [ 27.232781] hns3 0000:bd:00.3 eth7: net open > [ 27.237303] 8021q: adding VLAN 0 to HW filter on device eth7 > [ 27.242972] IPv6: ADDRCONF(NETDEV_CHANGE): eth7: link becomes ready > [ 27.29] hns3 0000:bd:00.3: invalid speed (-1) > [ 27.253904] hns3 0000:bd:00.3 eth7: failed to adjust link. > [ 27.259379] RTL8211F Gigabit Ethernet mii-0000:bd:00.3:07: PHY state > change UP -> RUNNING > [ 27.924903] hns3 0000:bd:00.3 eth7: link up > [ 28.280479] RTL8211F Gigabit Ethernet mii-0000:bd:00.3:07: PHY state > change RUNNING -> NOLINK > [ 29.208452] hns3 0000:bd:00.3 eth7: link down > [ 32.376745] RTL8211F Gigabit Ethernet mii-0000:bd:00.3:07: PHY state > change NOLINK -> RUNNING > [ 33.208448] hns3 0000:bd:00.3 eth7: link up > [ 35.253821] hns3 0000:bd:00.3 eth7: net stop > [ 35.258270] hns3 0000:bd:00.3 eth7: link down > > When using rtl8211f in polling mode, we may get an invalid speed, > because of reading a fake link up and autoneg complete status > immediately after starting autoneg: > > ifconfig-1176 [007] 27.232763: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x00 val:0x1040 > kworker/u257:1-670 [015] 27.232805: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x04 val:0x01e1 > kworker/u257:1-670 [015] 27.232815: mdio_access: > mii-0000:bd:00.3 write phy:0x07 reg:0x04 val:0x05e1 > kworker/u257:1-670 [015] 27.232869: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x01 val:0x79ad > kworker/u257:1-670 [015] 27.232904: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x09 val:0x0200 > kworker/u257:1-670 [015] 27.232940: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x00 val:0x1040 > kworker/u257:1-670 [015] 27.232949: mdio_access: > mii-0000:bd:00.3 write phy:0x07 reg:0x00 val:0x1240 > kworker/u257:1-670 [015] 27.233003: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x01 val:0x79ad > kworker/u257:1-670 [015] 27.233039: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x0a val:0x3002 > kworker/u257:1-670 [015] 27.233074: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x09 val:0x0200 > kworker/u257:1-670 [015] 27.233110: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x05 val:0x > kworker/u257:1-670 [000] 28.280475: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x01 val:0x7989 > kworker/u257:1-670 [000] 29.304471: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x01 val:0x7989 > > According to the datasheet of rtl8211f, to get the real time > link status, we need to read MII_BMSR twice. > > This patch adds a read_status hook for rtl8211f, and does a fake > phy_read before genphy_read_status(), so that we can get the real link > status in genphy_read_status(). > > Signed-off-by: Yonglong Liu > --- > drivers/net/phy/realtek.c | 13 + > 1 file changed, 13 insertions(+) > Is this an accidental resubmit? Because we discussed this in https://marc.info/?t=15641350993&r=1&w=2 and a fix has been applied already. Heiner . >>> >>> In https://marc.info/?t=15641350993&r=1&w=2 , the invalid speed >>> recurrence rate is almost 100%, and I had tested the solution about >>> 5 times and it worked. But yesterday it happened again suddenly, and then >>> I found that the recurrence rate had dropped to 10%. This time we got 0x79ad >>> after autoneg started, which is not the 0x798d from the last discussion. >>> >>> >>> >> OK, I'll have a look. >> However the approach is wrong. The double read is related to the latching
>> This is done by all PHYs and is not specific to the RTL8211F.
>> Also it's not related to the problem. I assume any sufficient delay would
>> do instead of the read.
>>
>
> So you will send a new patch to fix this problem? I am waiting for it,
> and can do a full test this time.
>

Can you try the following? This delay should give the PHY enough time to
clear both bits before the following read is done.

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index ef7aa738e..32f327a44 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -568,6 +568,11 @@ int phy_start_aneg(struct phy_device *phydev)
 	if (err < 0)
 		goto out_unlock;
 
+	/* The PHY may not yet have cleared aneg-completed and link-up bit
+	 * w/o this delay when the following read is done.
+	 */
+	usleep_range(1000, 2000);
+
Re: [PATCH] ASoC: soc-core: remove error due to probe deferral
On Thu, Aug 08, 2019 at 03:16:53PM +0200, Stefan Agner wrote:
> On 2019-08-08 15:14, Takashi Iwai wrote:
> > Mark Brown wrote:
> > I guess we can use dev_printk() with the conditional level choice.
>
> How about using dev_info always? We get a dev_err message from
> soc_init_dai_link in error cases...
>
> 	ret = soc_init_dai_link(card, dai_link);
> 	if (ret && ret != -EPROBE_DEFER) {
> 		dev_info(card->dev, "ASoC: failed to init link %s: %d\n",
> 			 dai_link->name, ret);
> 	}

Well, if there's adequate error reporting in init_dai_link() it's a bit
different - we can just remove the print entirely regardless of what the
return code is. The point is to ensure that we don't just silently fail.

Unfortunately there are no prints in the probe deferral case there, so they
need adding. That'll actually improve things though, since we can make it
print the name of the thing it's missing, which will be useful to people
trying to figure out what's going on (we used to do that but it got lost in
reshufflings).
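For illustration, a minimal sketch of the conditional-level variant Takashi
suggested, reusing the snippet quoted above; probe deferral is demoted to
info level while real failures remain errors:

	ret = soc_init_dai_link(card, dai_link);
	if (ret) {
		dev_printk(ret == -EPROBE_DEFER ? KERN_INFO : KERN_ERR,
			   card->dev, "ASoC: failed to init link %s: %d\n",
			   dai_link->name, ret);
		return ret;
	}

This keeps a breadcrumb in the log for deferrals without tagging them as
errors.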
[PATCH] coresight: tmc-etr: Remove perf_data check.
When tracing etm data of multiple threads on multiple cpus through the perf
interface, each cpu has a unique etr_perf_buffer while sharing the same etr
device. There is no guarantee that the last cpu to start etm tracing is also
the last one to stop. So the perf_data check is no longer valid.

Signed-off-by: Yabin Cui
---
 drivers/hwtracing/coresight/coresight-tmc-etr.c | 9 -
 drivers/hwtracing/coresight/coresight-tmc.h | 2 --
 2 files changed, 11 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 17006705287a..0418440e0141 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1484,20 +1484,12 @@ tmc_update_etr_buffer(struct coresight_device *csdev,
 		goto out;
 	}
 
-	if (WARN_ON(drvdata->perf_data != etr_perf)) {
-		lost = true;
-		spin_unlock_irqrestore(&drvdata->spinlock, flags);
-		goto out;
-	}
-
 	CS_UNLOCK(drvdata->base);
 	tmc_flush_and_stop(drvdata);
 	tmc_sync_etr_buf(drvdata);
 	CS_LOCK(drvdata->base);
 
-	/* Reset perf specific data */
-	drvdata->perf_data = NULL;
 	spin_unlock_irqrestore(&drvdata->spinlock, flags);
 
 	size = etr_buf->len;
@@ -1556,7 +1548,6 @@ static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, void *data)
 	}
 
 	etr_perf->head = PERF_IDX2OFF(handle->head, etr_perf);
-	drvdata->perf_data = etr_perf;
 
 	/*
 	 * No HW configuration is needed if the sink is already in
diff --git a/drivers/hwtracing/coresight/coresight-tmc.h b/drivers/hwtracing/coresight/coresight-tmc.h
index 1ed50411cc3c..3881a9ee565a 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.h
+++ b/drivers/hwtracing/coresight/coresight-tmc.h
@@ -178,7 +178,6 @@ struct etr_buf {
  *	device configuration register (DEVID)
  * @idr:	Holds etr_bufs allocated for this ETR.
  * @idr_mutex:	Access serialisation for idr.
- * @perf_data:	PERF buffer for ETR.
  * @sysfs_data:	SYSFS buffer for ETR.
  */
 struct tmc_drvdata {
@@ -202,7 +201,6 @@ struct tmc_drvdata {
 	struct idr		idr;
 	struct mutex		idr_mutex;
 	struct etr_buf		*sysfs_buf;
-	void			*perf_data;
 };
 
 struct etr_buf_operations {
-- 
2.22.0.770.g0f2c4a37fd-goog
Re: [PATCH v2 13/15] net: phy: adin: configure downshift on config_init
On 08.08.2019 14:30, Alexandru Ardelean wrote:
> Down-speed auto-negotiation may not always be enabled, in which case the
> PHY won't down-shift to 100 or 10 during auto-negotiation.
>
> This change enables downshift and configures the number of retries to the
> default of 8 (the maximum supported value).
>
> The change has been adapted from the Marvell PHY driver.
>
Instead of a fixed downshift setting (like in the Marvell driver) you may
consider implementing the ethtool phy-tunable ETHTOOL_PHY_DOWNSHIFT. See the
Aquantia PHY driver for an example. Then the user can configure whether they
want downshift and, if so, after how many retries.

> Signed-off-by: Alexandru Ardelean
> ---
> drivers/net/phy/adin.c | 39 +++
> 1 file changed, 39 insertions(+)
[...]

Heiner
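For reference, a sketch of the ETHTOOL_PHY_DOWNSHIFT route suggested above,
modeled on the Aquantia driver; adin_get_downshift()/adin_set_downshift()
are hypothetical helpers that would read/program the vendor downshift
register:

static int adin_get_tunable(struct phy_device *phydev,
			    struct ethtool_tunable *tuna, void *data)
{
	switch (tuna->id) {
	case ETHTOOL_PHY_DOWNSHIFT:
		return adin_get_downshift(phydev, data);
	default:
		return -EOPNOTSUPP;
	}
}

static int adin_set_tunable(struct phy_device *phydev,
			    struct ethtool_tunable *tuna, const void *data)
{
	switch (tuna->id) {
	case ETHTOOL_PHY_DOWNSHIFT:
		return adin_set_downshift(phydev, *(const u8 *)data);
	default:
		return -EOPNOTSUPP;
	}
}

With .get_tunable/.set_tunable wired up in the phy_driver, the retry count
becomes runtime-configurable, e.g. via
"ethtool --set-phy-tunable <dev> downshift on count 4".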
Re: [PATCH v2 02/15] net: phy: adin: hook genphy_read_abilities() to get_features
On 08.08.2019 17:24, Andrew Lunn wrote:
> On Thu, Aug 08, 2019 at 03:30:13PM +0300, Alexandru Ardelean wrote:
>> The ADIN PHYs can operate with Clause 45, however they are not typical for
>> how phylib considers Clause 45 PHYs.
>>
>> If the `features` field & the `get_features` hook are unspecified, and the
>> device wants to operate via Clause 45, it would also try to read features
>> via the `genphy_c45_pma_read_abilities()`, which will try to read PMA regs
>> that are unsupported.
>>
>> Hooking the `genphy_read_abilities()` function to the `get_features` hook
>> will ensure that this does not happen and the PHY features are read
>> correctly regardless of Clause 22 or Clause 45 operation.
>
> I think we need to stop and think about a PHY which supports both C22
> and C45.
>
> How does bus enumeration work? Is it discovered twice? I've always
> considered phydev->is_c45 means everything is c45, not that some
> registers can be accessed via c45. But the driver is mixing c22 and
> c45. Does the driver actually require c45? Are there features which are
> only accessible via C45? What does C45 actually bring us for this
> device?
>
genphy_c45_pma_read_abilities() is only called if phydev->is_c45 is set.
And this flag means that the PHY complies with Clause 45 incl. all the
standard devices like PMA. In the case here only some vendor-specific
registers can be accessed via Clause 45 and therefore is_c45 shouldn't be
set. As a consequence this patch isn't needed.

> Andrew
>
Heiner
Re: [PATCH net] net: phy: rtl8211f: do a double read to get real time link status
> @@ -568,6 +568,11 @@ int phy_start_aneg(struct phy_device *phydev)
> 	if (err < 0)
> 		goto out_unlock;
>
> +	/* The PHY may not yet have cleared aneg-completed and link-up bit
> +	 * w/o this delay when the following read is done.
> +	 */
> +	usleep_range(1000, 2000);
> +

Hi Heiner

Does 802.3 C22 say anything about this? If this PHY is broken with respect
to the standard, I would prefer the workaround to be in the PHY specific
driver code, not generic core code.

Andrew
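For reference, a minimal sketch of the PHY-specific variant under discussion
(essentially Yonglong's original read_status hook, whose diff isn't quoted in
this thread); the function name and placement in drivers/net/phy/realtek.c
are illustrative:

static int rtl8211f_read_status(struct phy_device *phydev)
{
	int ret;

	/* BMSR latches link-down events, and right after an aneg restart
	 * the first read may still report stale link-up/aneg-complete
	 * bits. Do one throwaway read so genphy_read_status() operates
	 * on a current value.
	 */
	ret = phy_read(phydev, MII_BMSR);
	if (ret < 0)
		return ret;

	return genphy_read_status(phydev);
}

The hook would then be wired up via .read_status = rtl8211f_read_status in
the RTL8211F phy_driver entry.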
[RFC PATCH] hugetlbfs: Add hugetlb_cgroup reservation limits
Problem:

Currently tasks attempting to allocate more hugetlb memory than is available
get a failure at mmap/shmget time. This is thanks to Hugetlbfs Reservations
[1]. However, if a task attempts to allocate more hugetlb memory than its
hugetlb_cgroup limit allows, the kernel will allow the mmap/shmget call, but
will SIGBUS the task when it attempts to fault the memory in.

We have developers interested in using hugetlb_cgroups, and they have
expressed dissatisfaction regarding this behavior. We'd like to improve this
behavior such that tasks violating the hugetlb_cgroup limits get an error at
mmap/shmget time, rather than getting SIGBUS'd when they try to fault the
excess memory in.

The underlying problem is that today's hugetlb_cgroup accounting happens at
hugetlb memory *fault* time, rather than at *reservation* time. Thus,
enforcing the hugetlb_cgroup limit only happens at fault time, and the
offending task gets SIGBUS'd.

Proposed Solution:

A new page counter named hugetlb.xMB.reservation_[limit|usage]_in_bytes.
This counter has slightly different semantics than
hugetlb.xMB.[limit|usage]_in_bytes:

- While usage_in_bytes tracks all *faulted* hugetlb memory,
  reservation_usage_in_bytes tracks all *reserved* hugetlb memory.

- If a task attempts to reserve more memory than limit_in_bytes allows, the
  kernel will allow it to do so. But if a task attempts to reserve more
  memory than reservation_limit_in_bytes, the kernel will fail this
  reservation.

This proposal is implemented in this patch, with tests to verify
functionality and show the usage.

Alternatives considered:

1. A new cgroup, instead of only a new page_counter attached to the existing
   hugetlb_cgroup. Adding a new cgroup seemed like a lot of code duplication
   with hugetlb_cgroup. Keeping hugetlb related page counters under
   hugetlb_cgroup seemed cleaner as well.

2. Instead of adding a new counter, we considered adding a sysctl that
   modifies the behavior of hugetlb.xMB.[limit|usage]_in_bytes, to do
   accounting at reservation time rather than fault time. Adding a new
   page_counter seems better as userspace could, if it wants, choose to
   enforce different cgroups differently: one via limit_in_bytes, and
   another via reservation_limit_in_bytes. This could be very useful if
   you're transitioning how hugetlb memory is partitioned on your system one
   cgroup at a time, for example. Also, someone may find usage for both
   limit_in_bytes and reservation_limit_in_bytes concurrently, and this
   approach gives them the option to do so.

Caveats:

1. This support is implemented for cgroups-v1. I have not tried
   hugetlb_cgroups with cgroups v2, and AFAICT it's not supported yet. This
   is largely because we use cgroups-v1 for now. If required, I can add
   hugetlb_cgroup support to cgroups v2 in this patch or a follow up.

2. The most complicated bit of this patch, I believe, is: where to store the
   pointer to the hugetlb_cgroup to uncharge at unreservation time? Normally
   the cgroup pointers hang off the struct page. But, with hugetlb_cgroup
   reservations, one task can reserve a specific page and another task may
   fault it in (I believe), so storing the pointer in struct page is not
   appropriate. The proposed approach here is to store the pointer in the
   resv_map. See patch for details.
[1]: https://www.kernel.org/doc/html/latest/vm/hugetlbfs_reserv.html Signed-off-by: Mina Almasry --- include/linux/hugetlb.h | 10 +- include/linux/hugetlb_cgroup.h| 19 +- mm/hugetlb.c | 256 -- mm/hugetlb_cgroup.c | 153 +- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 4 + .../selftests/vm/charge_reserved_hugetlb.sh | 438 ++ .../selftests/vm/write_hugetlb_memory.sh | 22 + .../testing/selftests/vm/write_to_hugetlbfs.c | 252 ++ 9 files changed, 1087 insertions(+), 68 deletions(-) create mode 100755 tools/testing/selftests/vm/charge_reserved_hugetlb.sh create mode 100644 tools/testing/selftests/vm/write_hugetlb_memory.sh create mode 100644 tools/testing/selftests/vm/write_to_hugetlbfs.c diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edfca42783192..90b3c928d16c1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -46,6 +46,14 @@ struct resv_map { long adds_in_progress; struct list_head region_cache; long region_cache_count; + #ifdef CONFIG_CGROUP_HUGETLB + /* +* On private mappings, the counter to uncharge reservations is stored +* here. If these fields are 0, then the mapping is shared. +*/ + struct page_counter *reservation_counter; + unsigned long pages_per_hpage; +#endif }; extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); @@ -340,7 +348,7 @@ struct hstate { unsigned int surplus
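To make the proposed behavior change concrete, a minimal userspace sketch
(the hugetlbfs mount point and file name are made up): under today's
hugetlb.2MB.limit_in_bytes the charge happens at fault time and the memset()
gets SIGBUS'd, whereas under the proposed reservation_limit_in_bytes the
mmap() itself fails up front:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 512UL * 2 * 1024 * 1024;	/* 512 x 2MB hugepages */
	int fd = open("/mnt/hugetlbfs/testfile", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0)
		return 1;

	/* the hugetlb reservation is made (and, per this proposal,
	 * charged to the cgroup) here */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");	/* fails here with reservation limits */
		return 1;
	}

	memset(p, 0, len);	/* SIGBUS here with fault-time limits only */
	munmap(p, len);
	close(fd);
	return 0;
}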
[ANNOUNCE] 4.14.137-rt64
Hello RT Folks! I'm pleased to announce the 4.14.137-rt64 stable release. This release is just an update to the new stable 4.14.137 version and no RT specific changes have been made. You can get this release via the git tree at: git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git branch: v4.14-rt Head SHA1: b86042812cec9871bba4a0da843cccdc77682ee3 Or to build 4.14.137-rt64 directly, the following patches should be applied: https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.14.tar.xz https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.14.137.xz https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/patch-4.14.137-rt64.patch.xz Enjoy! Tom
Re: [PATCH v2 2/6] thermal: amlogic: Add thermal driver to support G12 SoCs
Hi Kevin, On Thu, Aug 8, 2019 at 4:59 AM Kevin Hilman wrote: > > Martin Blumenstingl writes: > > > Hi Guillaume, > > > > On Mon, Aug 5, 2019 at 2:48 PM guillaume La Roque > > wrote: > >> > >> Hi Martin, > >> > >> again thanks for your review. > > you're welcome - thank you for working on the driver :-) > > > > [...] > >> > The IP block has more functionality, which may be added to this driver > >> > in the future: > >> > - reading up to 16 stored temperature samples > >> > >> it's not working, you can verify it if you check the regmap define in the > >> driver. in fact temp is only write in one register, it's confirmed by > >> amlogic. > > I missed that - so please skip this part > > > > [...] > >> >> +config AMLOGIC_THERMAL > >> > we typically use "MESON" in the Kconfig symbols: > >> > $ grep -c AMLOGIC .config > >> > 1 > >> > $ grep -c MESON .config > >> > 33 > >> > > >> > I also wonder if we should add G12 or G12A so we don't conflict with > >> > upcoming thermal sensors with a different design (assuming that this > >> > will be a thing). > >> > for example we already have three different USB2 PHY drivers > >> > > >> > [...] > >> > >> i check with Neil and for new family it's better to use Amlogic instead of > >> meson. > > can you please share the considerations behind this decision? > > if new drivers should use AMLOGIC_* Kconfig symbols instead of MESON_* > > then we all should know about it > > > >> i don't add G12 because we already know it's same sensors for SM1 SoC > >> family [0]. > > my idea behind this was to avoid conflicts in the future > > in case of the thermal driver we may be fine with using a generic name > > assuming that Amlogic will not switch to a new IP block in the next > > years > > I'm not saying you have to change the name - I'm bringing this up so > > you can decide for yourself based on examples from the past > > > > here are a few examples: > > - when Kevin upstreamed the MMC driver for GX he decided to use > > MMC_MESON_GX for the Kconfig symbol name. it turns out that this is > > smart because there are at least two other MMC controller IPs on the > > 32-bit SoCs. due to him including GX in the name the drivers are easy > > to differentiate (MMC_MESON_MX_SDIO and MMC_MESON_MX_SDHC being the > > other ones, while the latter is not upstream yet) > > - when Carlo upstreamed the eFuse driver he decided to use MESON_EFUSE > > for the Kconfig symbol name. I found out much later that the 32-bit > > SoCs use a different IP (or at least direct register access instead of > > going through Secure Monitor). the driver for the 32-bit SoCs now uses > > MESON_MX_EFUSE. if you don't know which driver applies where then it's > > easy to mix up MESON_EFUSE and MESON_MX_EFUSE > > - when Jerome upstreamed the ALSA driver for AXG (which is also used > > on G12A and G12B) he decided to use the SND_MESON_AXG_* prefix for the > > Kconfig symbol names. in my opinion this was a good choice because GXM > > and everything earlier (including the 32-bit SoCs) use a different > > audio IP block. we won't have a Kconfig symbol name clash when a > > driver for the "older" SoCs is upstreamed > > - (there are more examples, Meson8b USB PHY driver, Meson8b DWMAC > > glue, ... - just like there's many examples where the IP block is > > mostly compatible with older generations: SAR ADC, RNG, SPI, ...) > > While these are all good examples, you can see it can go both ways, so > there's really no way know up front what is the "right" way. We only > know after the fact. 
> Unfortunately, we simply have no visibility into
> future chips and where IP blocks may be shared or not (there are other
> examples where vendors add a new version of an IP *and* keep the old
> version. ;)
>
> Even having worked inside a (different) SoC vendor and having some
> knowledge about what IPs are shared, it's difficult to get this right.

Right. The fact that the IP block in SM1 will be backwards compatible (or
even the same) means that it has a longer life-span than some of the USB
PHY IP, so I'm fine either way.

Martin
Re: [PATCH] clk: fix devm_platform_ioremap_resource.cocci warnings
Quoting Julia Lawall (2019-08-08 09:10:53) > From: kbuild test robot > > drivers/clk/bcm/clk-bcm63xx-gate.c:174:1-9: WARNING: Use > devm_platform_ioremap_resource for hw -> regs > > Use devm_platform_ioremap_resource helper which wraps > platform_get_resource() and devm_ioremap_resource() together. > > Generated by: scripts/coccinelle/api/devm_platform_ioremap_resource.cocci > > Fixes: 1c099779c1e2 ("clk: add BCM63XX gated clock controller driver") Is it fixing anything? As far as I can tell it's reducing lines of code with another function. > CC: Jonas Gorski > Signed-off-by: kbuild test robot > Signed-off-by: Julia Lawall
Re: [PATCH] clk: fix devm_platform_ioremap_resource.cocci warnings
On Thu, 8 Aug 2019, Stephen Boyd wrote: > Quoting Julia Lawall (2019-08-08 09:10:53) > > From: kbuild test robot > > > > drivers/clk/bcm/clk-bcm63xx-gate.c:174:1-9: WARNING: Use > > devm_platform_ioremap_resource for hw -> regs > > > > Use devm_platform_ioremap_resource helper which wraps > > platform_get_resource() and devm_ioremap_resource() together. > > > > Generated by: scripts/coccinelle/api/devm_platform_ioremap_resource.cocci > > > > Fixes: 1c099779c1e2 ("clk: add BCM63XX gated clock controller driver") > > Is it fixing anything? As far as I can tell it's reducing lines of code > with another function. No, it doesn't fix anything. julia > > > CC: Jonas Gorski > > Signed-off-by: kbuild test robot > > Signed-off-by: Julia Lawall > >
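For readers unfamiliar with the helper, a sketch of the transformation the
semantic patch performs, using the hw->regs assignment from
clk-bcm63xx-gate.c as the example:

	/* before: two calls plus a local struct resource pointer */
	struct resource *r;

	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	hw->regs = devm_ioremap_resource(&pdev->dev, r);

	/* after: one call with the same devm lifetime and the same
	 * error-pointer semantics on failure */
	hw->regs = devm_platform_ioremap_resource(pdev, 0);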
Re: [PATCH v2 1/4] dt-bindings: soundwire: add slave bindings
On Thu, Aug 08, 2019 at 05:48:56PM +0100, Srinivas Kandagatla wrote:
> On 08/08/2019 16:58, Pierre-Louis Bossart wrote:
> > > +- sdw-instance-id: Should be ('Instance ID') from SoundWire
> > > +		       Enumeration Address. Instance ID is for the cases
> > > +		       where multiple Devices of the same type or Class
> > > +		       are attached to the bus.
> > so it is actually required if you have a single Slave device? Or is it
> > only required when you have more than 1 device of the same type?
> This is mandatory for any slave device!

If it's mandatory the wording is a bit unclear. How about something like:

    Should be ('Instance ID') from the SoundWire Enumeration Address.
    This must always be provided. If multiple devices with the same type
    or class are attached to the bus, each instance must have a distinct
    value.
[PATCH v4 0/2] sched: Improve load balancing on AMD EPYC
This is another version of the AMD EPYC load balancing patch. The difference with this one is that now it fixes the following ia64 build error, reported by 0day: mm/page_alloc.o: In function `get_page_from_freelist': page_alloc.c:(.text+0x7850): undefined reference to `node_reclaim_distance' page_alloc.c:(.text+0x7931): undefined reference to `node_reclaim_distance' Matt Fleming (2): ia64: Make NUMA select SMP sched/topology: Improve load balancing on AMD EPYC arch/ia64/Kconfig | 1 + arch/x86/kernel/cpu/amd.c | 5 + include/linux/topology.h | 14 ++ kernel/sched/topology.c | 3 ++- mm/khugepaged.c | 2 +- mm/page_alloc.c | 2 +- 6 files changed, 24 insertions(+), 3 deletions(-) -- 2.13.7
[PATCH RT 02/19] kthread: add a global worker thread.
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 0532e87d9d44795221aa921ba7024bde689cc894 ] Add kthread_schedule_work() which uses a global kthread for all its jobs. Split the cgroup include to avoid recussive includes from interrupt.h. Fixup everything that fails to build (and did not include all header). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: include/linux/blk-cgroup.h include/linux/kthread.h kernel/kthread.c --- drivers/block/loop.c | 2 +- drivers/spi/spi-rockchip.c | 1 + include/linux/blk-cgroup.h | 1 + include/linux/kthread-cgroup.h | 17 + include/linux/kthread.h| 8 init/main.c| 1 + kernel/kthread.c | 13 + 7 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 include/linux/kthread-cgroup.h diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bd447de4a5b8..2a07dfc9b3ae 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index fdcf3076681b..b56619418cea 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -22,6 +22,7 @@ #include #include #include +#include #define DRIVER_NAME "rockchip-spi" diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 8bbc3716507a..a9454ad4de06 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -20,6 +20,7 @@ #include #include #include +#include /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH(INT_MAX / 2) diff --git a/include/linux/kthread-cgroup.h b/include/linux/kthread-cgroup.h new file mode 100644 index ..53d34bca9d72 --- /dev/null +++ b/include/linux/kthread-cgroup.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KTHREAD_CGROUP_H +#define _LINUX_KTHREAD_CGROUP_H +#include +#include + +#ifdef CONFIG_BLK_CGROUP +void kthread_associate_blkcg(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *kthread_blkcg(void); +#else +static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } +static inline struct cgroup_subsys_state *kthread_blkcg(void) +{ + return NULL; +} +#endif +#endif diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 4e663f407bd7..59b85b01fb8b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -199,4 +199,12 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +extern struct kthread_worker kthread_global_worker; +void kthread_init_global_worker(void); + +static inline bool kthread_schedule_work(struct kthread_work *work) +{ + return kthread_queue_work(&kthread_global_worker, work); +} + #endif /* _LINUX_KTHREAD_H */ diff --git a/init/main.c b/init/main.c index f32aebb5ce54..18c1297b2889 100644 --- a/init/main.c +++ b/init/main.c @@ -1059,6 +1059,7 @@ static noinline void __init kernel_init_freeable(void) smp_prepare_cpus(setup_max_cpus); workqueue_init(); + kthread_init_global_worker(); init_mm_internals(); diff --git a/kernel/kthread.c b/kernel/kthread.c index 430fd79cd3fe..44498522e5d5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1161,3 +1161,16 @@ void kthread_destroy_worker(struct kthread_worker *worker) kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); + +DEFINE_KTHREAD_WORKER(kthread_global_worker); 
+EXPORT_SYMBOL(kthread_global_worker); + +__init void kthread_init_global_worker(void) +{ + kthread_global_worker.task = kthread_create(kthread_worker_fn, + &kthread_global_worker, + "kswork"); + if (WARN_ON(IS_ERR(kthread_global_worker.task))) + return; + wake_up_process(kthread_global_worker.task); +} -- 2.14.1
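A minimal usage sketch of the helper added above, with a made-up work item;
the callback runs in the global "kswork" worker created in
kthread_init_global_worker():

static void my_work_fn(struct kthread_work *work)
{
	/* runs in the global worker thread, preemptible context */
}

static DEFINE_KTHREAD_WORK(my_work, my_work_fn);

	/* queue from a driver path instead of a dedicated worker */
	kthread_schedule_work(&my_work);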
[PATCH RT 04/19] genirq: Handle missing work_struct in irq_set_affinity_notifier()
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit bbc4d2a7d6ff54ba923640d9a42c7bef7185fe98 ] The backported stable commit 59c39840f5abf ("genirq: Prevent use-after-free and work list corruption") added cancel_work_sync() on a work_struct element which is not available in RT. Replace cancel_work_sync() with kthread_cancel_work_sync() on RT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: kernel/irq/manage.c --- kernel/irq/manage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3d5b33fe874b..071691963f7b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -352,7 +352,9 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { -#ifndef CONFIG_PREEMPT_RT_BASE +#ifdef CONFIG_PREEMPT_RT_BASE + kthread_cancel_work_sync(¬ify->work); +#else cancel_work_sync(&old_notify->work); #endif kref_put(&old_notify->kref, old_notify->release); -- 2.14.1
[PATCH v4 2/2] sched/topology: Improve load balancing on AMD EPYC
SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init() for any
sched domains with a NUMA distance greater than 2 hops (RECLAIM_DISTANCE).
The idea being that it's expensive to balance across domains that far apart.

However, as is rather unfortunately explained in commit 32e45ff43eaf ("mm:
increase RECLAIM_DISTANCE to 30") the value for RECLAIM_DISTANCE is based on
node distance tables from 2011-era hardware.

Current AMD EPYC machines have the following NUMA node distances:

node distances:
node   0   1   2   3   4   5   6   7
  0:  10  16  16  16  32  32  32  32
  1:  16  10  16  16  32  32  32  32
  2:  16  16  10  16  32  32  32  32
  3:  16  16  16  10  32  32  32  32
  4:  32  32  32  32  10  16  16  16
  5:  32  32  32  32  16  10  16  16
  6:  32  32  32  32  16  16  10  16
  7:  32  32  32  32  16  16  16  10

where 2 hops is 32.

The result is that the scheduler fails to load balance properly across NUMA
nodes on different sockets -- 2 hops apart.

For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4
(CPUs 32-39) like so,

  $ numactl -C 0-7,32-39 ./spinner 16

causes all threads to fork and remain on node 0 until the active balancer
kicks in after a few seconds and forcibly moves some threads to node 4.

Override node_reclaim_distance for AMD Zen.

Signed-off-by: Matt Fleming
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Mel Gorman
Cc: suravee.suthikulpa...@amd.com
Cc: Borislav Petkov
Cc: thomas.lenda...@amd.com
---
 arch/x86/kernel/cpu/amd.c | 5 +
 include/linux/topology.h | 14 ++
 kernel/sched/topology.c | 3 ++-
 mm/khugepaged.c | 2 +-
 mm/page_alloc.c | 2 +-
 5 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8d4e50428b68..ceeb8afc7cf3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -824,6 +825,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
 {
 	set_cpu_cap(c, X86_FEATURE_ZEN);
 
+#ifdef CONFIG_NUMA
+	node_reclaim_distance = 32;
+#endif
+
 	/*
 	 * Fix erratum 1076: CPB feature bit not being set in CPUID.
 	 * Always set it, except when running under a hypervisor.
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 47a3e3c08036..579522ec446c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -59,6 +59,20 @@ int arch_update_cpu_topology(void);
  */
 #define RECLAIM_DISTANCE 30
 #endif
+
+/*
+ * The following tunable allows platforms to override the default node
+ * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
+ * sufficiently fast that the default value actually hurts
+ * performance.
+ *
+ * AMD EPYC machines use this because even though the 2-hop distance
+ * is 32 (3.2x slower than a local memory access) performance actually
+ * *improves* if allowed to reclaim memory and load balance tasks
+ * between NUMA nodes 2-hops apart.
+ */
+extern int __read_mostly node_reclaim_distance;
+
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8f83e8e3ea9a..b5667a273bf6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1284,6 +1284,7 @@ static int sched_domains_curr_level;
 int			sched_max_numa_distance;
 static int		*sched_domains_numa_distance;
 static struct cpumask	***sched_domains_numa_masks;
+int __read_mostly	node_reclaim_distance = RECLAIM_DISTANCE;
 #endif
 
 /*
@@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
 
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b23215..ccede2425c3f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid)
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (!khugepaged_node_load[i])
 			continue;
-		if (node_distance(nid, i) > RECLAIM_DISTANCE)
+		if (node_distance(nid, i) > node_reclaim_distance)
 			return true;
 	}
 	return false;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 272c6de1bf4e..0d54cd2c43a4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3522,7 +3522,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 
 static bool zone_allows_recla
[PATCH 1/2] ia64: Make NUMA select SMP
While it does make sense to allow CONFIG_NUMA and !CONFIG_SMP in theory, it doesn't make much sense in practice. Follow other architectures and make CONFIG_NUMA select CONFIG_SMP. The motivation for this patch is to allow a new NUMA variable to be initialised in kernel/sched/topology.c. Signed-off-by: Matt Fleming Cc: Tony Luck Cc: Rik van Riel Cc: Peter Zijlstra --- arch/ia64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 7468d8e50467..997baba02b70 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -389,6 +389,7 @@ config NUMA depends on !IA64_HP_SIM && !FLATMEM default y if IA64_SGI_SN2 select ACPI_NUMA if ACPI + select SMP help Say Y to compile the kernel to support NUMA (Non-Uniform Memory Access). This option is for configuring high-end multiprocessor -- 2.13.7
[PATCH RT 01/19] kthread: Use __RAW_SPIN_LOCK_UNLOCK to initialize kthread_worker lock
From: Tom Zanussi v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- commit 2a9060beefcf (kthread: convert worker lock to raw spinlock) forgot to update KTHREAD_WORKER_INIT() to use __RAW_SPIN_LOCK_UNLOCKED() instead of just __SPIN_LOCK_UNLOCKED() when it converted the lock to raw. Change it so that e.g. DEFINE_KTHREAD_WORKER() users don't error out. Signed-off-by: Tom Zanussi --- include/linux/kthread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 4e0449df82c3..4e663f407bd7 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -105,7 +105,7 @@ struct kthread_delayed_work { }; #define KTHREAD_WORKER_INIT(worker){ \ - .lock = __SPIN_LOCK_UNLOCKED((worker).lock),\ + .lock = __RAW_SPIN_LOCK_UNLOCKED((worker).lock),\ .work_list = LIST_HEAD_INIT((worker).work_list),\ .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\ } -- 2.14.1
[PATCH RT 16/19] futex: Make the futex_hash_bucket lock raw
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit f646521aadedab78801c9befe193e2e8a0c99298 ] Since commit 1a1fb985f2e2b ("futex: Handle early deadlock return correctly") we can deadlock while we attempt to acquire the HB lock if we fail to acquire the lock. The RT waiter (for the futex lock) is still enqueued and acquiring the HB lock may build up a lock chain which leads to a deadlock if the owner of the lock futex-lock holds the HB lock. Make the hash bucket lock raw so it does not participate in the lockchain. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: kernel/futex.c --- kernel/futex.c | 89 +- 1 file changed, 45 insertions(+), 44 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index ec90130cd809..0548070cda89 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -240,7 +240,7 @@ struct futex_q { struct plist_node list; struct task_struct *task; - spinlock_t *lock_ptr; + raw_spinlock_t *lock_ptr; union futex_key key; struct futex_pi_state *pi_state; struct rt_mutex_waiter *rt_waiter; @@ -261,7 +261,7 @@ static const struct futex_q futex_q_init = { */ struct futex_hash_bucket { atomic_t waiters; - spinlock_t lock; + raw_spinlock_t lock; struct plist_head chain; } cacheline_aligned_in_smp; @@ -926,7 +926,7 @@ void exit_pi_state_list(struct task_struct *curr) } raw_spin_unlock_irq(&curr->pi_lock); - spin_lock(&hb->lock); + raw_spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* @@ -936,7 +936,7 @@ void exit_pi_state_list(struct task_struct *curr) if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); + raw_spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } @@ -948,7 +948,7 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); + raw_spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); @@ -1442,7 +1442,7 @@ static void __unqueue_futex(struct futex_q *q) { struct futex_hash_bucket *hb; - if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + if (WARN_ON_SMP(!q->lock_ptr || !raw_spin_is_locked(q->lock_ptr)) || WARN_ON(plist_node_empty(&q->list))) return; @@ -1570,21 +1570,21 @@ static inline void double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { if (hb1 <= hb2) { - spin_lock(&hb1->lock); + raw_spin_lock(&hb1->lock); if (hb1 < hb2) - spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); } else { /* hb1 > hb2 */ - spin_lock(&hb2->lock); - spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&hb2->lock); + raw_spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); } } static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { - spin_unlock(&hb1->lock); + raw_spin_unlock(&hb1->lock); if (hb1 != hb2) - spin_unlock(&hb2->lock); + raw_spin_unlock(&hb2->lock); } /* @@ -1612,7 +1612,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!hb_waiters_pending(hb)) goto out_put_key; - spin_lock(&hb->lock); + raw_spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex (&this->key, &key)) { @@ -1631,7 +1631,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, 
int nr_wake, u32 bitset) } } - spin_unlock(&hb->lock); + raw_spin_unlock(&hb->lock); wake_up_q(&wake_q); out_put_key: put_futex_key(&key); @@ -2236,7 +2236,8 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies smp_mb(); (A) */ + raw_spin_lock(&hb->lock); + return hb; } @@ -2244,7 +2245,7 @@ static inline void queue_unlock(struct futex_hash_bucket *hb) __rel
[PATCH RT 17/19] futex: Delay deallocation of pi_state
From: Thomas Gleixner v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit d7c7cf8cb68b7df17e6e50be1f25f35d83e686c7 ] On -RT we can't invoke kfree() in a non-preemptible context. Defer the deallocation of pi_state to preemptible context. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/futex.c | 55 --- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0548070cda89..5f1cfa2f02b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -822,13 +822,13 @@ static void get_pi_state(struct futex_pi_state *pi_state) * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. */ -static void put_pi_state(struct futex_pi_state *pi_state) +static struct futex_pi_state *__put_pi_state(struct futex_pi_state *pi_state) { if (!pi_state) - return; + return NULL; if (!atomic_dec_and_test(&pi_state->refcount)) - return; + return NULL; /* * If pi_state->owner is NULL, the owner is most probably dying @@ -848,9 +848,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); } - if (current->pi_state_cache) { - kfree(pi_state); - } else { + if (!current->pi_state_cache) { /* * pi_state->list is already empty. * clear pi_state->owner. @@ -859,6 +857,30 @@ static void put_pi_state(struct futex_pi_state *pi_state) pi_state->owner = NULL; atomic_set(&pi_state->refcount, 1); current->pi_state_cache = pi_state; + pi_state = NULL; + } + return pi_state; +} + +static void put_pi_state(struct futex_pi_state *pi_state) +{ + kfree(__put_pi_state(pi_state)); +} + +static void put_pi_state_atomic(struct futex_pi_state *pi_state, + struct list_head *to_free) +{ + if (__put_pi_state(pi_state)) + list_add(&pi_state->list, to_free); +} + +static void free_pi_state_list(struct list_head *to_free) +{ + struct futex_pi_state *p, *next; + + list_for_each_entry_safe(p, next, to_free, list) { + list_del(&p->list); + kfree(p); } } @@ -893,6 +915,7 @@ void exit_pi_state_list(struct task_struct *curr) struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; + LIST_HEAD(to_free); if (!futex_cmpxchg_enabled) return; @@ -937,7 +960,7 @@ void exit_pi_state_list(struct task_struct *curr) /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); raw_spin_unlock(&hb->lock); - put_pi_state(pi_state); + put_pi_state_atomic(pi_state, &to_free); continue; } @@ -956,6 +979,8 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); + + free_pi_state_list(&to_free); } #endif @@ -1938,6 +1963,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + LIST_HEAD(to_free); if (nr_wake < 0 || nr_requeue < 0) return -EINVAL; @@ -2175,7 +2201,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, * object. */ this->pi_state = NULL; - put_pi_state(pi_state); + put_pi_state_atomic(pi_state, &to_free); /* * We stop queueing more waiters and let user * space deal with the mess. @@ -2192,7 +2218,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We * need to drop it here again. 
*/ - put_pi_state(pi_state); + put_pi_state_atomic(pi_state, &to_free); out_unlock: double_unlock_hb(hb1, hb2); @@ -2213,6 +2239,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, out_put_key1: put_futex_key(&key1); out: + free_pi_state_list(&to_free); return ret ? ret : task_count; } @@ -2350,13 +2377,16 @@ static int unqueue_me(struct futex_q *q) static void unqueue_me_pi(struct futex_q *q)
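The change follows a generic RT pattern worth spelling out; a sketch with a
made-up object type: collect objects on a local list while the raw lock is
held, then kfree() them once the context is preemptible again:

struct obj {
	struct list_head list;
	/* ... payload ... */
};

static void drop_obj(struct obj *o, raw_spinlock_t *lock)
{
	LIST_HEAD(to_free);
	struct obj *p, *next;

	raw_spin_lock(lock);
	/* kfree() may take sleeping locks on RT, so only defer here */
	list_add(&o->list, &to_free);
	raw_spin_unlock(lock);

	list_for_each_entry_safe(p, next, &to_free, list) {
		list_del(&p->list);
		kfree(p);
	}
}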
[PATCH RT 15/19] Revert "futex: workaround migrate_disable/enable in different context"
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit a71221d81cc4873891ae44f3aa02df596079b786 ] Drop the RT fixup, the futex code will be changed to avoid the need for the workaround. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: kernel/futex.c --- kernel/futex.c | 19 --- 1 file changed, 19 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index cb7e212fba0f..ec90130cd809 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2893,14 +2893,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, * before __rt_mutex_start_proxy_lock() is done. */ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); - /* -* the migrate_disable() here disables migration in the in_atomic() fast -* path which is enabled again in the following spin_unlock(). We have -* one migrate_disable() pending in the slow-path which is reversed -* after the raw_spin_unlock_irq() where we leave the atomic context. -*/ - migrate_disable(); - spin_unlock(q.lock_ptr); /* * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter @@ -2909,7 +2901,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, */ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); - migrate_enable(); if (ret) { if (ret == 1) @@ -3058,21 +3049,11 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) * rt_waiter. Also see the WARN in wake_futex_pi(). */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - /* -* Magic trickery for now to make the RT migrate disable -* logic happy. The following spin_unlock() happens with -* interrupts disabled so the internal migrate_enable() -* won't undo the migrate_disable() which was issued when -* locking hb->lock. -*/ - migrate_disable(); spin_unlock(&hb->lock); /* drops pi_state->pi_mutex.wait_lock */ ret = wake_futex_pi(uaddr, uval, pi_state); - migrate_enable(); - put_pi_state(pi_state); /* -- 2.14.1
[PATCH RT 07/19] locking/lockdep: Don't complain about incorrect name for no validate class
From: Sebastian Andrzej Siewior

v4.14.137-rt65-rc1 stable review patch.
If anyone has any objections, please let me know.

---

[ Upstream commit 978315462d3ea3cf6cfacd34c563ec1eb02a3aa5 ]

It is possible to ignore the validation for a certain lock by using
lockdep_set_novalidate_class() on it. Each invocation will assign a new
name to the class it created for __lockdep_no_validate__. That means that
once lockdep_set_novalidate_class() has been used on two locks then
class->name won't match lock->name for the first lock, triggering the
warning.

So ignore a changed, non-matching ->name pointer for the special
__lockdep_no_validate__ class.

Signed-off-by: Sebastian Andrzej Siewior
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Will Deacon
Link: http://lkml.kernel.org/r/20190517212234.32611-1-bige...@linutronix.de
Signed-off-by: Ingo Molnar
Signed-off-by: Tom Zanussi
---
 kernel/locking/lockdep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e576d234f3ea..f194de27123d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -719,7 +719,8 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 			 * Huh! same key, different name? Did someone trample
 			 * on some memory? We're most confused.
 			 */
-			WARN_ON_ONCE(class->name != lock->name);
+			WARN_ON_ONCE(class->name != lock->name &&
+				     lock->key != &__lockdep_no_validate__);
 
 			return class;
 		}
 	}
-- 
2.14.1
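For illustration, a sketch of the (hypothetical) usage pattern that trips
the check being relaxed: lockdep_set_novalidate_class() stringifies the lock
argument as the class name, so two call sites hand the shared
__lockdep_no_validate__ key two different names:

static spinlock_t lock_a;
static spinlock_t lock_b;

static void init_locks(void)
{
	spin_lock_init(&lock_a);
	spin_lock_init(&lock_b);
	lockdep_set_novalidate_class(&lock_a);	/* class name "&lock_a" */
	lockdep_set_novalidate_class(&lock_b);	/* same key, name "&lock_b" */
}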
[PATCH RT 13/19] Revert "futex: Fix bug on when a requeued RT task times out"
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit f1a170cb3289a48df26cae3c60d77608f7a988bb ] Drop the RT fixup, the futex code will be changed to avoid the need for the workaround. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/locking/rtmutex.c| 31 +-- kernel/locking/rtmutex_common.h | 1 - 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1177f2815040..62914dde3f1c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -144,8 +144,7 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) { - return waiter && waiter != PI_WAKEUP_INPROGRESS && - waiter != PI_REQUEUE_INPROGRESS; + return waiter && waiter != PI_WAKEUP_INPROGRESS; } /* @@ -2358,34 +2357,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (try_to_take_rt_mutex(lock, task, NULL)) return 1; -#ifdef CONFIG_PREEMPT_RT_FULL - /* -* In PREEMPT_RT there's an added race. -* If the task, that we are about to requeue, times out, -* it can set the PI_WAKEUP_INPROGRESS. This tells the requeue -* to skip this task. But right after the task sets -* its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then -* block on the spin_lock(&hb->lock), which in RT is an rtmutex. -* This will replace the PI_WAKEUP_INPROGRESS with the actual -* lock that it blocks on. We *must not* place this task -* on this proxy lock in that case. -* -* To prevent this race, we first take the task's pi_lock -* and check if it has updated its pi_blocked_on. If it has, -* we assume that it woke up and we return -EAGAIN. -* Otherwise, we set the task's pi_blocked_on to -* PI_REQUEUE_INPROGRESS, so that if the task is waking up -* it will know that we are in the process of requeuing it. -*/ - raw_spin_lock(&task->pi_lock); - if (task->pi_blocked_on) { - raw_spin_unlock(&task->pi_lock); - return -EAGAIN; - } - task->pi_blocked_on = PI_REQUEUE_INPROGRESS; - raw_spin_unlock(&task->pi_lock); -#endif - /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, RT_MUTEX_FULL_CHAINWALK); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 2a157c78e18c..53ca0242101a 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -132,7 +132,6 @@ enum rtmutex_chainwalk { * PI-futex support (proxy locking functions, etc.): */ #define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) -#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, -- 2.14.1
[PATCH RT 09/19] rcu: Don't allow to change rcu_normal_after_boot on RT
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit c6c058c10577815a2491ce661876cff00a4c3b15 ] On RT rcu_normal_after_boot is enabled by default. Don't allow to disable it on RT because the "expedited rcu" would introduce latency spikes. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/rcu/update.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2006a09680aa..307592810f6b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,9 @@ module_param(rcu_expedited, int, 0); extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL); +#ifndef CONFIG_PREEMPT_RT_FULL module_param(rcu_normal_after_boot, int, 0); +#endif #endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -- 2.14.1
[PATCH RT 19/19] Linux 4.14.137-rt65-rc1
From: Tom Zanussi v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- Signed-off-by: Tom Zanussi --- localversion-rt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/localversion-rt b/localversion-rt index 10474042df49..03188f3e7d8d 100644 --- a/localversion-rt +++ b/localversion-rt @@ -1 +1 @@ --rt64 +-rt65-rc1 -- 2.14.1
[PATCH RT 12/19] Revert "futex: Ensure lock/unlock symetry versus pi_lock and hash bucket lock"
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 6a773b70cf105b46298ed3b44e77c102ce31d9ec ] Drop the RT fixup, the futex code will be changed to avoid the need for the workaround. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/futex.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index ad0abb0e339f..07b148ad703a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -936,9 +936,7 @@ void exit_pi_state_list(struct task_struct *curr) if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - raw_spin_unlock_irq(&curr->pi_lock); spin_unlock(&hb->lock); - raw_spin_lock_irq(&curr->pi_lock); put_pi_state(pi_state); continue; } -- 2.14.1
[PATCH RT 11/19] sched/core: Drop a preempt_disable_rt() statement
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 761126efdcbe3fa3e99c9079fa0ad6eca2f251f2 ] The caller holds a lock which already disables preemption. Drop the preempt_disable_rt() statement in get_nohz_timer_target(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: kernel/sched/core.c --- kernel/sched/core.c | 9 ++--- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d2cc0715114..17da1c1aba56 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -583,14 +583,11 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(void) { - int i, cpu; + int i, cpu = smp_processor_id(); struct sched_domain *sd; - preempt_disable_rt(); - cpu = smp_processor_id(); - if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) - goto preempt_en_rt; + return cpu; rcu_read_lock(); for_each_domain(cpu, sd) { @@ -609,8 +606,6 @@ int get_nohz_timer_target(void) cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); -preempt_en_rt: - preempt_enable_rt(); return cpu; } -- 2.14.1
[PATCH RT 14/19] Revert "rtmutex: Handle the various new futex race conditions"
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 9e0265c21af4d6388d47dcd5ce20f76ec3a2e468 ] Drop the RT fixup, the futex code will be changed to avoid the need for the workaround. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/futex.c | 77 - kernel/locking/rtmutex.c| 36 --- kernel/locking/rtmutex_common.h | 2 -- 3 files changed, 21 insertions(+), 94 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 07b148ad703a..cb7e212fba0f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2165,16 +2165,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; - } else if (ret == -EAGAIN) { - /* -* Waiter was woken by timeout or -* signal and has set pi_blocked_on to -* PI_WAKEUP_INPROGRESS before we -* tried to enqueue it on the rtmutex. -*/ - this->pi_state = NULL; - put_pi_state(pi_state); - continue; } else if (ret) { /* * rt_mutex_start_proxy_lock() detected a @@ -3253,7 +3243,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct hrtimer_sleeper timeout, *to = NULL; struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb, *hb2; + struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; @@ -3311,55 +3301,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); - /* -* On RT we must avoid races with requeue and trying to block -* on two mutexes (hb->lock and uaddr2's rtmutex) by -* serializing access to pi_blocked_on with pi_lock. -*/ - raw_spin_lock_irq(¤t->pi_lock); - if (current->pi_blocked_on) { - /* -* We have been requeued or are in the process of -* being requeued. -*/ - raw_spin_unlock_irq(¤t->pi_lock); - } else { - /* -* Setting pi_blocked_on to PI_WAKEUP_INPROGRESS -* prevents a concurrent requeue from moving us to the -* uaddr2 rtmutex. After that we can safely acquire -* (and possibly block on) hb->lock. -*/ - current->pi_blocked_on = PI_WAKEUP_INPROGRESS; - raw_spin_unlock_irq(¤t->pi_lock); - - spin_lock(&hb->lock); - - /* -* Clean up pi_blocked_on. We might leak it otherwise -* when we succeeded with the hb->lock in the fast -* path. -*/ - raw_spin_lock_irq(¤t->pi_lock); - current->pi_blocked_on = NULL; - raw_spin_unlock_irq(¤t->pi_lock); - - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out_put_keys; - } + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; /* -* In order to be here, we have either been requeued, are in -* the process of being requeued, or requeue successfully -* acquired uaddr2 on our behalf. If pi_blocked_on was -* non-null above, we may be racing with a requeue. Do not -* rely on q->lock_ptr to be hb2->lock until after blocking on -* hb->lock or hb2->lock. The futex_requeue dropped our key1 -* reference and incremented our key2 reference count. +* In order for us to be here, we know our q.key == key2, and since +* we took the hb->lock above, we also know that futex_requeue() has +* completed and we no longer have to concern ourselves with a wakeup +* race with the atomic proxy lock acquisition by the requeue code. 
The +* futex_requeue dropped our key1 reference and incremented our key2 +* reference count. */ - hb2 = hash_futex(&key2); /* Check if the requeue code acquired the second futex for us. *
[PATCH RT 18/19] mm/zswap: Do not disable preemption in zswap_frontswap_store()
From: "Luis Claudio R. Goncalves" v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 4e4cf4be79635e67144632d9135286381acbc95a ] Zswap causes "BUG: scheduling while atomic" by blocking on a rt_spin_lock() with preemption disabled. The preemption is disabled by get_cpu_var() in zswap_frontswap_store() to protect the access of the zswap_dstmem percpu variable. Use get_locked_var() to protect the percpu zswap_dstmem variable, making the code preemptive. As get_cpu_ptr() also disables preemption, replace it by this_cpu_ptr() and remove the counterpart put_cpu_ptr(). Steps to Reproduce: 1. # grubby --args "zswap.enabled=1" --update-kernel DEFAULT 2. # reboot 3. Calculate the amount o memory to be used by the test: ---> grep MemAvailable /proc/meminfo ---> Add 25% ~ 50% to that value 4. # stress --vm 1 --vm-bytes ${MemAvailable+25%} --timeout 240s Usually, in less than 5 minutes the backtrace listed below appears, followed by a kernel panic: | BUG: scheduling while atomic: kswapd1/181/0x0002 | | Preemption disabled at: | [] zswap_frontswap_store+0x21a/0x6e1 | | Kernel panic - not syncing: scheduling while atomic | CPU: 14 PID: 181 Comm: kswapd1 Kdump: loaded Not tainted 5.0.14-rt9 #1 | Hardware name: AMD Pence/Pence, BIOS WPN2321X_Weekly_12_03_21 03/19/2012 | Call Trace: | panic+0x106/0x2a7 | __schedule_bug.cold+0x3f/0x51 | __schedule+0x5cb/0x6f0 | schedule+0x43/0xd0 | rt_spin_lock_slowlock_locked+0x114/0x2b0 | rt_spin_lock_slowlock+0x51/0x80 | zbud_alloc+0x1da/0x2d0 | zswap_frontswap_store+0x31a/0x6e1 | __frontswap_store+0xab/0x130 | swap_writepage+0x39/0x70 | pageout.isra.0+0xe3/0x320 | shrink_page_list+0xa8e/0xd10 | shrink_inactive_list+0x251/0x840 | shrink_node_memcg+0x213/0x770 | shrink_node+0xd9/0x450 | balance_pgdat+0x2d5/0x510 | kswapd+0x218/0x470 | kthread+0xfb/0x130 | ret_from_fork+0x27/0x50 Cc: stable...@vger.kernel.org Reported-by: Ping Fang Signed-off-by: Luis Claudio R. 
Goncalves Reviewed-by: Daniel Bristot de Oliveira Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: mm/zswap.c --- mm/zswap.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index ebb0bc88c5f7..a2b4e14f851c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -953,6 +954,8 @@ static int zswap_shrink(void) return ret; } +/* protect zswap_dstmem from concurrency */ +static DEFINE_LOCAL_IRQ_LOCK(zswap_dstmem_lock); /* * frontswap hooks **/ @@ -1016,12 +1019,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* compress */ - dst = get_cpu_var(zswap_dstmem); - tfm = *get_cpu_ptr(entry->pool->tfm); + dst = get_locked_var(zswap_dstmem_lock, zswap_dstmem); + tfm = *this_cpu_ptr(entry->pool->tfm); src = kmap_atomic(page); ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); kunmap_atomic(src); - put_cpu_ptr(entry->pool->tfm); if (ret) { ret = -EINVAL; goto put_dstmem; @@ -1045,7 +1047,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); - put_cpu_var(zswap_dstmem); + put_locked_var(zswap_dstmem_lock, zswap_dstmem); /* populate entry */ entry->offset = offset; @@ -1072,7 +1074,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, return 0; put_dstmem: - put_cpu_var(zswap_dstmem); + put_locked_var(zswap_dstmem_lock, zswap_dstmem); zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); -- 2.14.1
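A condensed sketch of the local-lock idiom the patch applies, with made-up
names and the per-CPU buffer allocation elided; on non-RT kernels the local
lock collapses to the usual preemption protection, while on RT it is a
sleeping lock, so the section stays preemptible:

static DEFINE_PER_CPU(u8 *, my_dstmem);
static DEFINE_LOCAL_IRQ_LOCK(my_dstmem_lock);

static void compress_into_percpu_buf(void)
{
	u8 *dst = get_locked_var(my_dstmem_lock, my_dstmem);

	/* exclusive use of this CPU's buffer without preempt_disable() */

	put_locked_var(my_dstmem_lock, my_dstmem);
}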
Re: [PATCH 01/26] drm/dp_mst: Move link address dumping into a function
On Wed, Jul 17, 2019 at 09:42:24PM -0400, Lyude Paul wrote: > Since we're about to be calling this from multiple places. Also it makes > things easier to read! > > Cc: Juston Li > Cc: Imre Deak > Cc: Ville Syrjälä > Cc: Harry Wentland > Signed-off-by: Lyude Paul Reviewed-by: Daniel Vetter > --- > drivers/gpu/drm/drm_dp_mst_topology.c | 35 ++- > 1 file changed, 23 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c > b/drivers/gpu/drm/drm_dp_mst_topology.c > index 0984b9a34d55..998081b9b205 100644 > --- a/drivers/gpu/drm/drm_dp_mst_topology.c > +++ b/drivers/gpu/drm/drm_dp_mst_topology.c > @@ -2013,6 +2013,28 @@ static void drm_dp_queue_down_tx(struct > drm_dp_mst_topology_mgr *mgr, > mutex_unlock(&mgr->qlock); > } > > +static void > +drm_dp_dump_link_address(struct drm_dp_link_address_ack_reply *reply) > +{ > + struct drm_dp_link_addr_reply_port *port_reply; > + int i; > + > + for (i = 0; i < reply->nports; i++) { > + port_reply = &reply->ports[i]; > + DRM_DEBUG_KMS("port %d: input %d, pdt: %d, pn: %d, dpcd_rev: > %02x, mcs: %d, ddps: %d, ldps %d, sdp %d/%d\n", > + i, > + port_reply->input_port, > + port_reply->peer_device_type, > + port_reply->port_number, > + port_reply->dpcd_revision, > + port_reply->mcs, > + port_reply->ddps, > + port_reply->legacy_device_plug_status, > + port_reply->num_sdp_streams, > + port_reply->num_sdp_stream_sinks); > + } > +} > + > static void drm_dp_send_link_address(struct drm_dp_mst_topology_mgr *mgr, >struct drm_dp_mst_branch *mstb) > { > @@ -2038,18 +2060,7 @@ static void drm_dp_send_link_address(struct > drm_dp_mst_topology_mgr *mgr, > DRM_DEBUG_KMS("link address nak received\n"); > } else { > DRM_DEBUG_KMS("link address reply: %d\n", > txmsg->reply.u.link_addr.nports); > - for (i = 0; i < txmsg->reply.u.link_addr.nports; i++) { > - DRM_DEBUG_KMS("port %d: input %d, pdt: %d, pn: > %d, dpcd_rev: %02x, mcs: %d, ddps: %d, ldps %d, sdp %d/%d\n", i, > - > txmsg->reply.u.link_addr.ports[i].input_port, > - > txmsg->reply.u.link_addr.ports[i].peer_device_type, > - > txmsg->reply.u.link_addr.ports[i].port_number, > - > txmsg->reply.u.link_addr.ports[i].dpcd_revision, > -txmsg->reply.u.link_addr.ports[i].mcs, > -txmsg->reply.u.link_addr.ports[i].ddps, > - > txmsg->reply.u.link_addr.ports[i].legacy_device_plug_status, > - > txmsg->reply.u.link_addr.ports[i].num_sdp_streams, > - > txmsg->reply.u.link_addr.ports[i].num_sdp_stream_sinks); > - } > + drm_dp_dump_link_address(&txmsg->reply.u.link_addr); > > drm_dp_check_mstb_guid(mstb, > txmsg->reply.u.link_addr.guid); > > -- > 2.21.0 > -- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
[PATCH RT 06/19] sched/completion: Fix a lockup in wait_for_completion()
From: Corey Minyard

v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know.

---

[ Upstream commit f0837746a7e258abb35e65defc432ca66786347f ]

Consider following race:

 T0                    T1                       T2
 wait_for_completion()
  do_wait_for_common()
   __prepare_to_swait()
    schedule()
                       complete()
                        x->done++ (0 -> 1)
                        raw_spin_lock_irqsave()
                         swake_up_locked()      wait_for_completion()
                          wake_up_process(T0)
                          list_del_init()
                        raw_spin_unlock_irqrestore()
                                                raw_spin_lock_irq(&x->wait.lock)
 raw_spin_lock_irq(&x->wait.lock)               x->done != UINT_MAX, 1 -> 0
                                                raw_spin_unlock_irq(&x->wait.lock)
                                                return 1
 while (!x->done && timeout),
 continue loop, not enqueued
 on &x->wait

Basically, the problem is that the original wait queues used in completions did not remove the item from the queue in the wakeup function, but swake_up_locked() does. Fix it by adding the thread to the wait queue inside the do loop. The design of swait detects if it is already in the list and doesn't do the list add again.

Cc: stable...@vger.kernel.org
Fixes: a04ff6b4ec4ee7e ("completion: Use simple wait queues")
Signed-off-by: Corey Minyard
Acked-by: Steven Rostedt (VMware)
[bigeasy: shorten commit message ]
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Tom Zanussi
---
 kernel/sched/completion.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0fe2982e46a0..ac6d5efcd6ff 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -80,12 +80,12 @@ do_wait_for_common(struct completion *x,
 	if (!x->done) {
 		DECLARE_SWAITQUEUE(wait);
 
-		__prepare_to_swait(&x->wait, &wait);
 		do {
 			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
+			__prepare_to_swait(&x->wait, &wait);
 			__set_current_state(state);
 			raw_spin_unlock_irq(&x->wait.lock);
 			timeout = action(timeout);
--
2.14.1
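In other words, after the fix the waiter re-queues itself on every iteration, which is harmless when it is still queued and essential after swake_up_locked() has dequeued it. A simplified sketch of the resulting do_wait_for_common() loop (timeout plumbing and the final finish step elided):

	do {
		if (signal_pending_state(state, current)) {
			timeout = -ERESTARTSYS;
			break;
		}
		/* re-add ourselves; a no-op if we are still on the list */
		__prepare_to_swait(&x->wait, &wait);
		__set_current_state(state);
		raw_spin_unlock_irq(&x->wait.lock);
		timeout = action(timeout);
		raw_spin_lock_irq(&x->wait.lock);
	} while (!x->done && timeout);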
Re: [PATCH] rtc: bd70528: fix link error
Hello All, On Thu, 2019-08-08 at 10:29 +0800, Yuehaibing wrote: > On 2019/7/9 13:25, Vaittinen, Matti wrote: > > Hello Arnd, > > > > (Added Randy who also sent a patch to fix this) > > > > On Mon, 2019-07-08 at 18:10 +0200, Arnd Bergmann wrote: > > > On Mon, Jul 8, 2019 at 3:24 PM Vaittinen, Matti > > > wrote: > > > > > > > On Mon, 2019-07-08 at 14:41 +0200, Arnd Bergmann wrote: > > > > > With CONFIG_BD70528_WATCHDOG=m, a built-in rtc driver cannot > > > > > call > > > > > into the low-level functions that are part of the watchdog > > > > > module: > > > > > > > > > > drivers/rtc/rtc-bd70528.o: In function `bd70528_set_time': > > > > > rtc-bd70528.c:(.text+0x22c): undefined reference to > > > > > `bd70528_wdt_lock' > > > > > rtc-bd70528.c:(.text+0x2a8): undefined reference to > > > > > `bd70528_wdt_unlock' > > > > > drivers/rtc/rtc-bd70528.o: In function > > > > > `bd70528_set_rtc_based_timers': > > > > > rtc-bd70528.c:(.text+0x50c): undefined reference to > > > > > `bd70528_wdt_set' > > > > > > > > > > Add a Kconfig dependency on this driver, but still allow > > > > > compile- > > > > > testing > > > > > without it. > > > > > > > > > > Fixes: 32a4a4ebf768 ("rtc: bd70528: Initial support for ROHM > > > > > bd70528 > > > > > RTC") > > > > > Signed-off-by: Arnd Bergmann > > > > > --- > > > > > drivers/rtc/Kconfig | 2 ++ > > > > > 1 file changed, 2 insertions(+) > > > > > > > > > > diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig > > > > > index 3bfc04a86529..7b071cc74422 100644 > > > > > --- a/drivers/rtc/Kconfig > > > > > +++ b/drivers/rtc/Kconfig > > > > > @@ -498,8 +498,10 @@ config RTC_DRV_M41T80_WDT > > > > > help > > > > > If you say Y here you will get support for the > > > > > watchdog timer in the ST M41T60 and M41T80 RTC chips > > > > > series. > > > > > + > > > > > config RTC_DRV_BD70528 > > > > > tristate "ROHM BD70528 PMIC RTC" > > > > > + depends on BD70528_WATCHDOG || (COMPILE_TEST && > > > > > !BD70528_WATCHDOG) > > > > > > > > I am not fan of this. There may well be use-cases where it is > > > > desirable > > > > to leave the watchdog out but still compile in the RTC. This is > > > > why > > > > we > > > > have static inline stubs in the header for cases where WDG is > > > > not > > > > compiled in. (RTC does not need to stop WDG if WDG driver is > > > > not > > > > included) > > > > > > > > Adding dependency from RTC to MFD for BD70528 should be done - > > > > this > > > > will avoid most of the issues (And there has been few patches > > > > sent > > > > for > > > > this already). But that's still not complete solution because > > > > configuring RTC and MFD to be built in-kernel and WDG as a > > > > module > > > > will > > > > cause errors again. > > > > > > > > Is there a way to force WDG in-kernel if RTC is in-kernel? (Or > > > > disallow configuring RTC in-kernel if WDG is a module - while > > > > still > > > > allow RTC to be built without WDG? > > > > > > We could make this > > > > > > depends on BD70528_WATCHDOG || !BD70528_WATCHDOG > > > > > > which would allow building with or without watchdog, even when > > > not > > > compile-testing, but still disallow the combination of > > > . > > > > Thanks for teaching me Arnd! That is clever :) We need something > > like > > > > depends on MFD_ROHM_BD70528 && (BD70528_WATCHDOG || > > !BD70528_WATCHDOG) > > > > (I'm not sure if parenthesis are Ok and respected in Kconfig). 
I would never have thought of BD70528_WATCHDOG || !BD70528_WATCHDOG - it looks awkward at first sight but indeed - depends on BD70528_WATCHDOG disallows BD70528_WATCHDOG=m with RTC_DRV_BD70528=y while !BD70528_WATCHDOG allows BD70528_WATCHDOG=n. Brilliant and exactly what we need :) Thanks a bunch!

> > Hello Vaittinen,
> >
> > the issue still exists in linux-next 20190807, any plan?

Sorry folks. I thought Arnd would send a new patch - I didn't want to steal his work ;) I will be back to my normal life next week so I will send a patch on Monday if the issue is still open!

Br, Matti
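Putting the thread's two pieces together, the resulting entry would presumably read as follows (a sketch; Kconfig does accept the parentheses Matti was unsure about):

	config RTC_DRV_BD70528
		tristate "ROHM BD70528 PMIC RTC"
		depends on MFD_ROHM_BD70528 && (BD70528_WATCHDOG || !BD70528_WATCHDOG)

The seemingly tautological second term forbids exactly one combination, RTC_DRV_BD70528=y with BD70528_WATCHDOG=m, while still permitting BD70528_WATCHDOG=n.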
[PATCH RT 10/19] pci/switchtec: fix stream_open.cocci warnings
From: kbuild test robot v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 9462c69e29307adc95c289f50839d5d683973891 ] drivers/pci/switch/switchtec.c:395:1-17: ERROR: switchtec_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. Generated by: scripts/coccinelle/api/stream_open.cocci Cc: Kirill Smelkov Cc: Julia Lawall Fixes: 8a29a3bae2a2 ("pci/switchtec: Don't use completion's wait queue") Cc: stable...@vger.kernel.org # where it applies to Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1904131849350.2536@hadrien Signed-off-by: kbuild test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- drivers/pci/switch/switchtec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 69875a196ad8..2b6641c9e868 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -625,7 +625,7 @@ static int switchtec_dev_open(struct inode *inode, struct file *filp) return PTR_ERR(stuser); filp->private_data = stuser; - nonseekable_open(inode, filp); + stream_open(inode, filp); dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser); -- 2.14.1
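For context, the conversion the coccinelle rule asks for is mechanical; a minimal sketch of the pattern in an open handler (illustrative, not from the switchtec driver):

	static int example_open(struct inode *inode, struct file *filp)
	{
		/*
		 * Like nonseekable_open(), but additionally marks the file
		 * as stream-like (FMODE_STREAM), so read() and write() do
		 * not serialize on f_pos and cannot deadlock each other.
		 */
		return stream_open(inode, filp);
	}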
[PATCH RT 00/19] Linux v4.14.137-rt65-rc1
From: Tom Zanussi Dear RT Folks, This is the RT stable review cycle of patch 4.14.137-rt65-rc1. Please scream at me if I messed something up. Please test the patches too. The -rc release will be uploaded to kernel.org and will be deleted when the final release is out. This is just a review release (or release candidate). The pre-releases will not be pushed to the git repository, only the final release is. If all goes well, this patch will be converted to the next main release on 2019-08-15. To build 4.14.137-rt65-rc1 directly, the following patches should be applied: https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.14.tar.xz https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.14.137.xz https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/patch-4.14.137-rt65-rc1.patch.xz You can also build from 4.14.137-rt64 by applying the incremental patch: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/incr/patch-4.14.137-rt64-rt65-rc1.patch.xz Enjoy, -- Tom Corey Minyard (1): sched/completion: Fix a lockup in wait_for_completion() Luis Claudio R. Goncalves (1): mm/zswap: Do not disable preemption in zswap_frontswap_store() Sebastian Andrzej Siewior (13): kthread: add a global worker thread. genirq: Do not invoke the affinity callback via a workqueue on RT genirq: Handle missing work_struct in irq_set_affinity_notifier() locking/rwsem: Rename rwsem_rt.h to rwsem-rt.h locking/lockdep: Don't complain about incorrect name for no validate class arm: imx6: cpuidle: Use raw_spinlock_t rcu: Don't allow to change rcu_normal_after_boot on RT sched/core: Drop a preempt_disable_rt() statement Revert "futex: Ensure lock/unlock symetry versus pi_lock and hash bucket lock" Revert "futex: Fix bug on when a requeued RT task times out" Revert "rtmutex: Handle the various new futex race conditions" Revert "futex: workaround migrate_disable/enable in different context" futex: Make the futex_hash_bucket lock raw Thomas Gleixner (1): futex: Delay deallocation of pi_state Tom Zanussi (2): kthread: Use __RAW_SPIN_LOCK_UNLOCK to initialize kthread_worker lock Linux 4.14.137-rt65-rc1 kbuild test robot (1): pci/switchtec: fix stream_open.cocci warnings arch/arm/mach-imx/cpuidle-imx6q.c| 10 +- drivers/block/loop.c | 2 +- drivers/pci/switch/switchtec.c | 2 +- drivers/spi/spi-rockchip.c | 1 + include/linux/blk-cgroup.h | 1 + include/linux/interrupt.h| 5 +- include/linux/kthread-cgroup.h | 17 +++ include/linux/kthread.h | 10 +- include/linux/{rwsem_rt.h => rwsem-rt.h} | 0 include/linux/rwsem.h| 2 +- init/main.c | 1 + kernel/futex.c | 232 +-- kernel/irq/manage.c | 23 +-- kernel/kthread.c | 13 ++ kernel/locking/lockdep.c | 3 +- kernel/locking/rtmutex.c | 65 + kernel/locking/rtmutex_common.h | 3 - kernel/rcu/update.c | 2 + kernel/sched/completion.c| 2 +- kernel/sched/core.c | 9 +- localversion-rt | 2 +- mm/zswap.c | 12 +- 22 files changed, 179 insertions(+), 238 deletions(-) create mode 100644 include/linux/kthread-cgroup.h rename include/linux/{rwsem_rt.h => rwsem-rt.h} (100%) -- 2.14.1
[PATCH RT 08/19] arm: imx6: cpuidle: Use raw_spinlock_t
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 40d0332ec8312e9c090f0a5414d9c90e12b13611 ] The idle call back is invoked with disabled interrupts and requires raw_spinlock_t locks to work. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- arch/arm/mach-imx/cpuidle-imx6q.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/mach-imx/cpuidle-imx6q.c b/arch/arm/mach-imx/cpuidle-imx6q.c index 326e870d7123..d9ac80aa1eb0 100644 --- a/arch/arm/mach-imx/cpuidle-imx6q.c +++ b/arch/arm/mach-imx/cpuidle-imx6q.c @@ -17,22 +17,22 @@ #include "hardware.h" static int num_idle_cpus = 0; -static DEFINE_SPINLOCK(cpuidle_lock); +static DEFINE_RAW_SPINLOCK(cpuidle_lock); static int imx6q_enter_wait(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - spin_lock(&cpuidle_lock); + raw_spin_lock(&cpuidle_lock); if (++num_idle_cpus == num_online_cpus()) imx6_set_lpm(WAIT_UNCLOCKED); - spin_unlock(&cpuidle_lock); + raw_spin_unlock(&cpuidle_lock); cpu_do_idle(); - spin_lock(&cpuidle_lock); + raw_spin_lock(&cpuidle_lock); if (num_idle_cpus-- == num_online_cpus()) imx6_set_lpm(WAIT_CLOCKED); - spin_unlock(&cpuidle_lock); + raw_spin_unlock(&cpuidle_lock); return index; } -- 2.14.1
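Background for the change, as a sketch with illustrative names: on PREEMPT_RT a spinlock_t becomes a sleeping rt_mutex-based lock, which must not be taken where interrupts are disabled, as in the idle callback; raw_spinlock_t keeps the classic spinning behavior on all configurations:

	static DEFINE_SPINLOCK(may_sleep_on_rt);   /* rt_mutex under PREEMPT_RT */
	static DEFINE_RAW_SPINLOCK(always_spins);  /* real spinlock everywhere */

	static void idle_callback(void)
	{
		/* IRQs are off here; only the raw lock is legal on -rt */
		raw_spin_lock(&always_spins);
		/* keep the critical section short and bounded */
		raw_spin_unlock(&always_spins);
	}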
[PATCH RT 05/19] locking/rwsem: Rename rwsem_rt.h to rwsem-rt.h
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit fc7a6bdcce83ce162c32d991f0ac8e56ea260f5b ] Rename rwsem_rt.h to rwsem-rt.h to remain consistent with rwsem-rt.c. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- include/linux/{rwsem_rt.h => rwsem-rt.h} | 0 include/linux/rwsem.h| 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename include/linux/{rwsem_rt.h => rwsem-rt.h} (100%) diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem-rt.h similarity index 100% rename from include/linux/rwsem_rt.h rename to include/linux/rwsem-rt.h diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 513df11a364e..ac0857d60e04 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -21,7 +21,7 @@ #endif #ifdef CONFIG_PREEMPT_RT_FULL -#include +#include #else /* PREEMPT_RT_FULL */ struct rw_semaphore; -- 2.14.1
[PATCH RT 03/19] genirq: Do not invoke the affinity callback via a workqueue on RT
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 2122adbe011cdc0eb62ad62494e181005b23c76a ] Joe Korty reported, that __irq_set_affinity_locked() schedules a workqueue while holding a rawlock which results in a might_sleep() warning. This patch uses swork_queue() instead. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: include/linux/interrupt.h kernel/irq/manage.c --- include/linux/interrupt.h | 5 ++--- kernel/irq/manage.c | 19 --- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0f25fa19b2d8..233e3c027f53 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -230,7 +230,6 @@ extern void resume_device_irqs(void); * struct irq_affinity_notify - context for notification of IRQ affinity changes * @irq: Interrupt to which notification applies * @kref: Reference count, for internal use - * @swork: Swork item, for internal use * @work: Work item, for internal use * @notify:Function to be called on change. This will be * called in process context. @@ -243,7 +242,7 @@ struct irq_affinity_notify { unsigned int irq; struct kref kref; #ifdef CONFIG_PREEMPT_RT_BASE - struct swork_event swork; + struct kthread_work work; #else struct work_struct work; #endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9415590661c..3d5b33fe874b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -228,7 +228,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, kref_get(&desc->affinity_notify->kref); #ifdef CONFIG_PREEMPT_RT_BASE - swork_queue(&desc->affinity_notify->swork); + kthread_schedule_work(&desc->affinity_notify->work); #else schedule_work(&desc->affinity_notify->work); #endif @@ -293,21 +293,11 @@ static void _irq_affinity_notify(struct irq_affinity_notify *notify) } #ifdef CONFIG_PREEMPT_RT_BASE -static void init_helper_thread(void) -{ - static int init_sworker_once; - - if (init_sworker_once) - return; - if (WARN_ON(swork_get())) - return; - init_sworker_once = 1; -} -static void irq_affinity_notify(struct swork_event *swork) +static void irq_affinity_notify(struct kthread_work *work) { struct irq_affinity_notify *notify = - container_of(swork, struct irq_affinity_notify, swork); + container_of(work, struct irq_affinity_notify, work); _irq_affinity_notify(notify); } @@ -350,8 +340,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) notify->irq = irq; kref_init(¬ify->kref); #ifdef CONFIG_PREEMPT_RT_BASE - INIT_SWORK(¬ify->swork, irq_affinity_notify); - init_helper_thread(); + kthread_init_work(¬ify->work, irq_affinity_notify); #else INIT_WORK(¬ify->work, irq_affinity_notify); #endif -- 2.14.1
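A condensed sketch of the kthread_work pattern the patch moves to (names are illustrative; kthread_schedule_work() is the -rt tree's helper from the "kthread: add a global worker thread" patch queued in this same series):

	static void my_affinity_notify(struct kthread_work *work)
	{
		/* runs in the worker kthread: preemptible task context */
	}

	static struct kthread_work my_work;

	static void setup(void)
	{
		kthread_init_work(&my_work, my_affinity_notify);
	}

	static void hot_path(void)	/* may hold a raw spinlock */
	{
		/* only queues the work and wakes the worker; does not sleep */
		kthread_schedule_work(&my_work);
	}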
Re: [PATCH v2] pci: Kconfig: select PCI_MSI_IRQ_DOMAIN by default on RISC-V
Hi Paul, Wesley, On Thu, Jul 25, 2019 at 02:28:07PM -0700, Paul Walmsley wrote: > From: Wesley Terpstra > > This is part of adding support for RISC-V systems with PCIe host > controllers that support message-signaled interrupts. > > Signed-off-by: Wesley Terpstra > [paul.walms...@sifive.com: wrote patch description; split this > patch from the arch/riscv patch] > Signed-off-by: Paul Walmsley > --- > drivers/pci/Kconfig | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig > index 2ab92409210a..beb3408a0272 100644 > --- a/drivers/pci/Kconfig > +++ b/drivers/pci/Kconfig > @@ -52,7 +52,7 @@ config PCI_MSI > If you don't know what to do here, say Y. > > config PCI_MSI_IRQ_DOMAIN > - def_bool ARC || ARM || ARM64 || X86 > + def_bool ARC || ARM || ARM64 || X86 || RISCV The other arches listed here either supply their own include/asm/msi.h or generate it: $ ls arch/*/include/asm/msi.h arch/x86/include/asm/msi.h $ grep msi.h arch/*/include/asm/Kbuild arch/arc/include/asm/Kbuild:generic-y += msi.h arch/arm64/include/asm/Kbuild:generic-y += msi.h arch/arm/include/asm/Kbuild:generic-y += msi.h arch/mips/include/asm/Kbuild:generic-y += msi.h arch/powerpc/include/asm/Kbuild:generic-y += msi.h arch/sparc/include/asm/Kbuild:generic-y += msi.h For example, see f8430eae9f1b ("PCI/MSI: Enable PCI_MSI_IRQ_DOMAIN support for ARC") be091d468a0a ("arm64: PCI/MSI: Use asm-generic/msi.h") 0ab089c2548c ("ARM: Add msi.h to Kbuild") I didn't look into the details of msi.h generation, but I assume RISC-V needs to do something similar? If so, I think that should be part of this patch to avoid issues. If CONFIG_GENERIC_MSI_IRQ_DOMAIN is defined, include/linux/msi.h #includes and I don't see where that would come from. > depends on PCI_MSI > select GENERIC_MSI_IRQ_DOMAIN Bjorn
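If RISC-V follows the precedent Bjorn quotes for arm, arm64, mips and friends, the missing piece would presumably be a one-line Kbuild addition (untested sketch):

	# arch/riscv/include/asm/Kbuild
	generic-y += msi.h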
Re: [PATCH 1/2] genirq: introduce update_irq_devid()
On Thu, 8 Aug 2019, Ben Luo wrote: > +int update_irq_devid(unsigned int irq, void *dev_id, void *new_dev_id) > +{ > + struct irq_desc *desc = irq_to_desc(irq); > + struct irqaction *action, **action_ptr; > + unsigned long flags; > + > + WARN(in_interrupt(), > + "Trying to update IRQ %d from IRQ context!\n", irq); This is broken. The function needs to return on that condition. Actually it cannot even be called from non-preemptible code. What's worse is that if the interrupt in question is handled concurrently, then it will either see the old or the new dev_id and because the interrupt handler loop runs with desc->lock dropped even more crap can happen because dev_id can be subject to load and store tearing. Staring at that, I see that there is the same issue in setup_irq() and free_irq(). It's actually worse there. I'll have a look. > + /* > + * There can be multiple actions per IRQ descriptor, find the right > + * one based on the dev_id: > + */ > + action_ptr = &desc->action; > + for (;;) { > + action = *action_ptr; > + > + if (!action) { > + WARN(1, "Trying to update already-free IRQ %d\n", irq); That's wrong in two aspects: 1) The warn should be outside of the locked region. 2) Just having the irq number is not useful for debugging either when the interrupt is shared. > + raw_spin_unlock_irqrestore(&desc->lock, flags); > + chip_bus_sync_unlock(desc); > + return -ENXIO; > + } > + > + if (action->dev_id == dev_id) { > + action->dev_id = new_dev_id; > + break; > + } > + action_ptr = &action->next; > + } > + > + raw_spin_unlock_irqrestore(&desc->lock, flags); > + chip_bus_sync_unlock(desc); > + > + /* > + * Make sure it's not being used on another CPU: > + * There is a risk of UAF for old *dev_id, if it is > + * freed in a short time after this func returns > + */ > + synchronize_irq(irq); > + > + return 0; > +} > +EXPORT_SYMBOL(update_irq_devid); EXPORT_SYMBOL_GPL() please. Thanks, tglx
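Concretely, making the first warning bail out instead of falling through could look like this (a sketch; WARN() returns the value of its condition, and the errno choice here is illustrative):

	if (WARN(in_interrupt(),
		 "Trying to update IRQ %d from IRQ context!\n", irq))
		return -EBUSY;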
Re: [PATCH net] net: phy: rtl8211f: do a double read to get real time link status
On 08.08.2019 21:40, Andrew Lunn wrote:
>> @@ -568,6 +568,11 @@ int phy_start_aneg(struct phy_device *phydev)
>>  	if (err < 0)
>>  		goto out_unlock;
>>
>> +	/* The PHY may not yet have cleared aneg-completed and link-up bit
>> +	 * w/o this delay when the following read is done.
>> +	 */
>> +	usleep_range(1000, 2000);
>> +
>
> Hi Heiner
>
> Does 802.3 C22 say anything about this?
>
C22 says: "The Auto-Negotiation process shall be restarted by setting bit 0.9 to a logic one. This bit is self-clearing, and a PHY shall return a value of one in bit 0.9 until the Auto-Negotiation process has been initiated." Maybe we should read bit 0.9 in genphy_update_link() after having read BMSR and report aneg-complete and link-up as false (regardless of their current value) if 0.9 is set.

> If this PHY is broken with respect to the standard, I would prefer the
> workaround is in the PHY specific driver code, not generic core code.
>
Based on the C22 statement above the PHY may not be broken, and the typical time between two MDIO accesses is sufficient for the PHY to clear the bits. I think of MDIO bus access functions in network chips that have a 10us-20us delay after each MDIO access. On HNS3 this may not be the case.

> Andrew
>
Heiner
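Heiner's suggestion, sketched against genphy_update_link() (illustrative, not a tested patch; MII_BMCR and BMCR_ANRESTART are the mii.h names for register 0 and bit 0.9, and the phydev fields assumed here follow this era's phylib):

	int bmcr = phy_read(phydev, MII_BMCR);

	if (bmcr < 0)
		return bmcr;

	/*
	 * Bit 0.9 is self-clearing but stays set until autoneg has
	 * actually been initiated; until then, treat the BMSR status
	 * bits as stale.
	 */
	if (bmcr & BMCR_ANRESTART) {
		phydev->link = 0;
		phydev->autoneg_complete = 0;
		return 0;
	}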
Re: i2c: imx: support slave mode for imx I2C driver
On Thu, Aug 08, 2019 at 11:53:43AM +0800, Biwen Li wrote: > The patch supports slave mode for imx I2C driver > > Signed-off-by: Biwen Li Wow, this is much simpler than the other approach flying around: http://patchwork.ozlabs.org/patch/1124048/ Can this one be master and slave on the same bus, too? CCing the author of the other patch. > --- > drivers/i2c/busses/i2c-imx.c | 199 --- > 1 file changed, 185 insertions(+), 14 deletions(-) > > diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c > index b1b8b938d7f4..f7583a9fa56f 100644 > --- a/drivers/i2c/busses/i2c-imx.c > +++ b/drivers/i2c/busses/i2c-imx.c > @@ -202,6 +202,9 @@ struct imx_i2c_struct { > struct pinctrl_state *pinctrl_pins_gpio; > > struct imx_i2c_dma *dma; > +#if IS_ENABLED(CONFIG_I2C_SLAVE) > + struct i2c_client *slave; > +#endif /* CONFIG_I2C_SLAVE */ > }; > > static const struct imx_i2c_hwdata imx1_i2c_hwdata = { > @@ -583,23 +586,40 @@ static void i2c_imx_stop(struct imx_i2c_struct *i2c_imx) > imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); > } > > -static irqreturn_t i2c_imx_isr(int irq, void *dev_id) > +/* Clear interrupt flag bit */ > +static void i2c_imx_clr_if_bit(struct imx_i2c_struct *i2c_imx) > { > - struct imx_i2c_struct *i2c_imx = dev_id; > - unsigned int temp; > + unsigned int status; > > - temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > - if (temp & I2SR_IIF) { > - /* save status register */ > - i2c_imx->i2csr = temp; > - temp &= ~I2SR_IIF; > - temp |= (i2c_imx->hwdata->i2sr_clr_opcode & I2SR_IIF); > - imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2SR); > - wake_up(&i2c_imx->queue); > - return IRQ_HANDLED; > - } > + status = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > + status &= ~I2SR_IIF; > + status |= (i2c_imx->hwdata->i2sr_clr_opcode & I2SR_IIF); > + imx_i2c_write_reg(status, i2c_imx, IMX_I2C_I2SR); > +} > + > +/* Clear arbitration lost bit */ > +static void i2c_imx_clr_al_bit(struct imx_i2c_struct *i2c_imx) > +{ > + unsigned int status; > + > + status = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > + status &= ~I2SR_IAL; > + imx_i2c_write_reg(status, i2c_imx, IMX_I2C_I2SR); > +} > > - return IRQ_NONE; > +static irqreturn_t i2c_imx_master_isr(struct imx_i2c_struct *i2c_imx) > +{ > + unsigned int status; > + > + dev_dbg(&i2c_imx->adapter.dev, "<%s>: master interrupt\n", __func__); > + > + /* Save status register */ > + status = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > + i2c_imx->i2csr = status | I2SR_IIF; > + > + wake_up(&i2c_imx->queue); > + > + return IRQ_HANDLED; > } > > static int i2c_imx_dma_write(struct imx_i2c_struct *i2c_imx, > @@ -1043,11 +1063,162 @@ static u32 i2c_imx_func(struct i2c_adapter *adapter) > | I2C_FUNC_SMBUS_READ_BLOCK_DATA; > } > > +#if IS_ENABLED(CONFIG_I2C_SLAVE) > +static void i2c_imx_slave_init(struct imx_i2c_struct *i2c_imx) > +{ > + unsigned int temp; > + > + dev_dbg(&i2c_imx->adapter.dev, "<%s>\n", __func__); > + > + /* Set slave addr. 
*/ > + imx_i2c_write_reg((i2c_imx->slave->addr << 1), i2c_imx, IMX_I2C_IADR); > + > + /* Disable i2c module */ > + temp = i2c_imx->hwdata->i2cr_ien_opcode > + ^ I2CR_IEN; > + imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); > + > + /* Reset status register */ > + imx_i2c_write_reg(i2c_imx->hwdata->i2sr_clr_opcode, i2c_imx, > + IMX_I2C_I2SR); > + > + /* Enable module and enable interrupt from i2c module */ > + temp = i2c_imx->hwdata->i2cr_ien_opcode > + | I2CR_IIEN; > + imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); > + > + /* Wait controller to be stable */ > + usleep_range(50, 150); > +} > + > +static irqreturn_t i2c_imx_slave_isr(struct imx_i2c_struct *i2c_imx) > +{ > + unsigned int status, ctl; > + u8 value; > + > + if (!i2c_imx->slave) { > + dev_err(&i2c_imx->adapter.dev, "cannot deal with slave irq, i2c_imx->slave is null"); > + return IRQ_NONE; > + } > + > + status = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > + ctl = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR); > + if (status & I2SR_IAL) { /* Arbitration lost */ > + i2c_imx_clr_al_bit(i2c_imx); > + } else if (status & I2SR_IAAS) { /* Addressed as a slave */ > + if (status & I2SR_SRW) { /* Master wants to read from us */ > + dev_dbg(&i2c_imx->adapter.dev, "read requested"); > + i2c_slave_event(i2c_imx->slave, I2C_SLAVE_READ_REQUESTED, &value); > + > + /* Slave transmit */ > + ctl |= I2CR_MTX; > + imx_i2c_write_reg(ctl, i2c_imx, IMX_I2C_I2CR); > + > + /* Send data */ > + imx_i2c
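For reference, the i2c_slave_event() calls above follow the generic slave framework contract; a backend's event handler has roughly this shape (a sketch of the framework usage, not part of the patch):

	static int my_slave_cb(struct i2c_client *client,
			       enum i2c_slave_event event, u8 *val)
	{
		switch (event) {
		case I2C_SLAVE_WRITE_REQUESTED:	/* master will write to us */
			break;
		case I2C_SLAVE_WRITE_RECEIVED:	/* *val holds one received byte */
			break;
		case I2C_SLAVE_READ_REQUESTED:	/* place first byte to send in *val */
		case I2C_SLAVE_READ_PROCESSED:	/* place next byte to send in *val */
			*val = 0xff;		/* illustrative payload */
			break;
		case I2C_SLAVE_STOP:		/* stop condition seen on the bus */
			break;
		}
		return 0;
	}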
Re: [PATCH v2 2/2] nvme-pci: Allow PCI bus-level PM to be used if ASPM is disabled
On Thu, Aug 08, 2019 at 01:39:54PM -0500, Bjorn Helgaas wrote:
> On Thu, Aug 08, 2019 at 04:47:45PM +0200, Rafael J. Wysocki wrote:
> > On Thu, Aug 8, 2019 at 3:43 PM Bjorn Helgaas wrote:
> >
> > > IIUC the NVMe device will go to the desired package idle state if
> > > the link is in L0s or L1, but not if the link is in L0. I don't
> > > understand that connection; AFAIK that would be something outside
> > > the scope of the PCIe spec.
> >
> > Yes, it is outside of the PCIe spec.
> >
> > No, this is not about the NVMe device, it is about the Intel SoC
> > (System-on-a-Chip) the platform is based on.
>
> Ah. So this problem could occur with any device, not just NVMe? If
> so, how do you address that? Obviously you don't want to patch all
> drivers this way.

We discovered this when using an NVMe protocol specific power setting, so that part is driver specific. We just have to ensure device generic dependencies are met in order to achieve our power target. So in that sense, I think you would need to patch all drivers if they're also using protocol specific settings incorrectly. Granted, the NVMe specification doesn't detail what PCIe settings may prevent NVMe power management from hitting the objective, but I think ASPM enabled makes sense.
Re: [PATCH] arch/x86/kernel/cpu/umwait.c - remove unused variable
Valdis, On Thu, 8 Aug 2019, Valdis Klētnieks wrote: I really appreciate your work, but can you please refrain from using file names as prefixes? git log $FILE gives you usually a pretty good hint what the proper prefix is: bd9a0c97e53c ("x86/umwait: Add sysfs interface to control umwait maximum time") ff4b353f2ef9 ("x86/umwait: Add sysfs interface to control umwait C0.2 state") bd688c69b7e6 ("x86/umwait: Initialize umwait control values") See? > We get a warning when building with W=1: Please avoid 'We/I' in changelogs. > CC arch/x86/kernel/cpu/umwait.o > arch/x86/kernel/cpu/umwait.c: In function 'umwait_init': > arch/x86/kernel/cpu/umwait.c:183:6: warning: variable 'ret' set but not used > [-Wunused-but-set-variable] > 183 | int ret; > | ^~~ > > And indeed, we don't do anything with it, so clean it up. Well, the question is whether removing the variable is the right thing to do. > Signed-off-by: Valdis Kletnieks > > diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c > index 6a204e7336c1..3d1d3952774a 100644 > --- a/arch/x86/kernel/cpu/umwait.c > +++ b/arch/x86/kernel/cpu/umwait.c > @@ -180,12 +180,11 @@ static struct attribute_group umwait_attr_group = { > static int __init umwait_init(void) > { > struct device *dev; > - int ret; > > if (!boot_cpu_has(X86_FEATURE_WAITPKG)) > return -ENODEV; > > - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", > + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", > umwait_cpu_online, NULL); If that fails then umwait is broken. So instead of removing it, this should actually check the return code and act accordingly. Fenghua? > register_syscore_ops(&umwait_syscore_ops); Thanks, tglx
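Acting on the return code, as suggested, would look something like this (a sketch; note that cpuhp_setup_state() with CPUHP_AP_ONLINE_DYN returns a positive dynamic state number on success, so only negative values are errors):

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
				umwait_cpu_online, NULL);
	if (ret < 0)
		return ret;

	register_syscore_ops(&umwait_syscore_ops);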
[ANNOUNCE] 4.4.188-rt185
Hello RT Folks!

I'm pleased to announce the 4.4.188-rt185 stable release. This release is just an update to the new stable 4.4.188 version and no RT specific changes have been made.

The known issue from last time is now resolved. The missing patch for -rt is now also part of stable: 1ab1512366d4 ("mm, vmstat: make quiet_vmstat lighter"). There was a patch missing, fece2f828ffe ("vmstat: Remove BUG_ON from vmstat_update"). With this patch the NVIDIA boards (at least Tegra K1) should work again.

You can get this release via the git tree at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git

  branch: v4.4-rt
  Head SHA1: bc22d8bc8f5566ba4fe13115fb11d843d140f37c

Or to build 4.4.188-rt185 directly, the following patches should be applied:

  https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.4.tar.xz
  https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.4.188.xz
  https://www.kernel.org/pub/linux/kernel/projects/rt/4.4/patch-4.4.188-rt185.patch.xz

Enjoy!
Daniel
RE: [PATCH v2 2/2] nvme-pci: Allow PCI bus-level PM to be used if ASPM is disabled
> This is more meaningful to you than to most people because "applying
> the standard PCI PM" doesn't tell us what that means in terms of the
> device. Presumably it has something to do with a D-state transition?
> I *assume* a suspend might involve the D0 -> D3hot transition you
> mention below?
>
> > The reason for doing that was a (reportedly) widespread failure to
> > take the PCIe link down during D0 -> D3hot transitions of NVMe
> > devices,
>
> I don't know any of the details, but "failure to take the link down
> during D0 -> D3hot transitions" is phrased as though it might be a
> hardware erratum. If this *is* related to an NVMe erratum, that would
> explain why you only need to patch the nvme driver, and it would be
> useful to mention that in the commit log, since otherwise it sounds
> like something that might be needed in other drivers, too.

NVMe is special in this case in that there is other logic being put in place to set the drive's power state explicitly. I would also mention that this alternate flow is quicker for S0ix resume, since NVMe doesn't go through its shutdown routine. The unanimous feedback from vendors was to avoid NVMe shutdown and to instead use SetFeatures to go into the deepest power state over S0ix.

> According to PCIe r5.0 sec 5.3.2, the only legal link states for D3hot
> are L1, L2/L3 Ready. So if you put a device in D3hot and its link
> stays in L0, that sounds like a defect. Is that what happens?
>
> Obviously I'm still confused. I think it would help if you could
> describe the problem in terms of the specific PCIe states involved
> (D0, D3hot, L0, L1, L2, L3, etc) because then the spec would help
> explain what's happening.

Before that commit, the flow for NVMe S0ix was:
 * Delete IO SQ/CQ
 * Shutdown NVMe controller
 * Save PCI registers
 * Go into D3hot
 * Read PMCSR

A functioning drive had the link at L1.2 and NVMe power state at PS4 at this point. Resuming looked like this:
 * Restore PCI registers
 * Enable NVMe controller
 * Configure NVMe controller (IO queues, features, etc).

After that commit the flow for NVMe S0ix is:
 * Use NVMe SetFeatures to put drive into low power mode (PS3 or PS4)
 * Save PCI config registers
 * ASPM is used to bring link into L1.2

The resume flow is:
 * Restore PCI registers

"Non-functioning" drives consumed too much power with the old flow. The root cause varied from manufacturer to manufacturer. The two I know off hand: one instance is that when the PM status register is read after the device is in L1.2 from D3, it causes the link to go to L0 and then stay there. Another instance I heard of is that the drive isn't able to service a D3hot request when NVMe was already shut down.
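For the curious, the "SetFeatures to go into the deepest power state" step maps to the NVMe Power Management feature (Feature ID 0x02); with the nvme core's helper it would look roughly like this (a sketch, not the actual patch):

	/* ask the controller to enter power state 'ps' (e.g. PS3/PS4) */
	ret = nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps,
				NULL, 0, NULL);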
Re: [PATCH] mm/oom: Add killed process selection information
[please do not top-post]

On Thu 08-08-19 12:21:30, Edward Chron wrote:
> It is helpful to the admin that looks at the kill message and records this
> information. OOMs can come in bunches.
> Knowing how much resource the oom selected process was using at the time of
> the OOM event is very useful, these fields document key process and system
> memory/swap values and can be quite helpful.

I do agree and we already print that information. rss, with a breakdown into anonymous, file-backed and shmem, is usually a large part of the OOM victim's footprint. It is not complete information because there might be a lot of memory hidden behind other resources (open files etc.). We do not print that information because it is not considered in the oom selection. It is also not guaranteed to be freed upon the task exit.

> Also can't you disable printing the oom eligible task list? For systems
> with very large numbers of oom eligible processes that would seem to be
> very desirable.

Yes that is indeed the case. But how does the oom_score and oom_score_adj alone, without comparing it to other eligible tasks, help in isolation?

[...]

> I'm not sure that change would be supported upstream but again in our
> experience we've found it helpful, since you asked.

Could you be more specific about how that information is useful except for recording it? I am all for giving useful information in the OOM report but I would like to hear a sound justification for each additional piece of information. E.g. this helped us to understand why the task has been selected - this is usually the dump_tasks portion of the report because it gives a picture of what the OOM killer sees when choosing whom to kill. Then we have the summary to give us an estimation of how much memory will get freed when the victim dies - rss is a very rough estimation. But is a portion of the overall memory or oom_score{_adj} important to print as well? Those are relative values. Say you get memory-usage:10%, oom_score:42 and oom_score_adj:0. What are you going to tell from that information?
--
Michal Hocko
SUSE Labs
Re: BUG: soft lockup in tcp_delack_timer
On Thu, 8 Aug 2019, syzbot wrote: Cc+ Eric, net-dev > Hello, > > syzbot found the following crash on: > > HEAD commit:0d8b3265 Add linux-next specific files for 20190729 > git tree: linux-next > console output: https://syzkaller.appspot.com/x/log.txt?x=1101fdc860 > kernel config: https://syzkaller.appspot.com/x/.config?x=ae96f3b8a7e885f7 > dashboard link: https://syzkaller.appspot.com/bug?extid=2d55fb97f42947bbcddd > compiler: gcc (GCC) 9.0.0 20181231 (experimental) > > Unfortunately, I don't have any reproducer for this crash yet. > > IMPORTANT: if you fix the bug, please add the following tag to the commit: > Reported-by: syzbot+2d55fb97f42947bbc...@syzkaller.appspotmail.com > > net_ratelimit: 2 callbacks suppressed > TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies. > Check SNMP counters. > watchdog: BUG: soft lockup - CPU#0 stuck for 122s! [swapper/0:0] > Modules linked in: > irq event stamp: 92022 > hardirqs last enabled at (92021): [] > tick_nohz_idle_exit+0x181/0x2e0 kernel/time/tick-sched.c:1180 > hardirqs last disabled at (92022): [] > __schedule+0x1dd/0x15b0 kernel/sched/core.c:3862 > softirqs last enabled at (90810): [] > __do_softirq+0x6cd/0x98c kernel/softirq.c:319 > softirqs last disabled at (90703): [] invoke_softirq > kernel/softirq.c:373 [inline] > softirqs last disabled at (90703): [] irq_exit+0x19b/0x1e0 > kernel/softirq.c:413 > CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc2-next-20190729 #54 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google > 01/01/2011 > RIP: 0010:cpu_relax arch/x86/include/asm/processor.h:656 [inline] > RIP: 0010:virt_spin_lock arch/x86/include/asm/qspinlock.h:84 [inline] > RIP: 0010:native_queued_spin_lock_slowpath+0x132/0x9f0 > kernel/locking/qspinlock.c:325 > Code: 00 00 00 48 8b 45 d0 65 48 33 04 25 28 00 00 00 0f 85 37 07 00 00 48 81 > c4 98 00 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 f3 90 73 ff ff ff 8b 45 > 98 4c 8d 65 d8 3d 00 01 00 00 0f 84 e5 00 00 > RSP: 0018:8880ae809b48 EFLAGS: 0202 ORIG_RAX: ff13 > RAX: RBX: 8880621cd088 RCX: 8158f117 > RDX: RSI: 0004 RDI: 8880621cd088 > RBP: 8880ae809c08 R08: 11100c439a11 R09: ed100c439a12 > R10: ed100c439a11 R11: 8880621cd08b R12: 0001 > R13: 0003 R14: ed100c439a11 R15: 0001 > FS: () GS:8880ae80() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 01541e88 CR3: 68089000 CR4: 001406f0 > Call Trace: > > pv_queued_spin_lock_slowpath arch/x86/include/asm/paravirt.h:642 [inline] > queued_spin_lock_slowpath arch/x86/include/asm/qspinlock.h:50 [inline] > queued_spin_lock include/asm-generic/qspinlock.h:81 [inline] > do_raw_spin_lock+0x20e/0x2e0 kernel/locking/spinlock_debug.c:113 > __raw_spin_lock include/linux/spinlock_api_smp.h:143 [inline] > _raw_spin_lock+0x37/0x40 kernel/locking/spinlock.c:151 > spin_lock include/linux/spinlock.h:338 [inline] > tcp_delack_timer+0x2b/0x2a0 net/ipv4/tcp_timer.c:318 > call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1322 > expire_timers kernel/time/timer.c:1366 [inline] > __run_timers kernel/time/timer.c:1685 [inline] > __run_timers kernel/time/timer.c:1653 [inline] > run_timer_softirq+0x697/0x17a0 kernel/time/timer.c:1698 > __do_softirq+0x262/0x98c kernel/softirq.c:292 > invoke_softirq kernel/softirq.c:373 [inline] > irq_exit+0x19b/0x1e0 kernel/softirq.c:413 > exiting_irq arch/x86/include/asm/apic.h:536 [inline] > smp_apic_timer_interrupt+0x1a3/0x610 arch/x86/kernel/apic/apic.c:1095 > apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:828 > > RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61 
> Code: c8 75 6e fa eb 8a 90 90 90 90 90 90 e9 07 00 00 00 0f 00 2d c4 b2 49 00 > f4 c3 66 90 e9 07 00 00 00 0f 00 2d b4 b2 49 00 fb f4 90 55 48 89 e5 41 > 57 41 56 41 55 41 54 53 e8 8e 56 21 fa e8 29 > RSP: 0018:88c07ce8 EFLAGS: 0282 ORIG_RAX: ff13 > RAX: 111a5e87 RBX: 88c7a1c0 RCX: 1134bca6 > RDX: dc00 RSI: 81779dee RDI: 873e794c > RBP: 88c07d18 R08: 88c7a1c0 R09: fbfff118f439 > R10: fbfff118f438 R11: 88c7a1c7 R12: dc00 > R13: 89a5b340 R14: R15: > arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571 > default_idle_call+0x84/0xb0 kernel/sched/idle.c:94 > cpuidle_idle_call kernel/sched/idle.c:154 [inline] > do_idle+0x413/0x760 kernel/sched/idle.c:263 > cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:354 > rest_init+0x245/0x37b init/main.c:451 > arch_call_rest_init+0xe/0x1b > start_kernel+0x912/0x951 init/main.c:785 > x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:472 > x86_64_start_kernel+0x77/0x7b arch/x86/kernel/head64.c:453 > secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:
Re: [PATCH V2 01/10] x86/CPU: Expose if cache is inclusive of lower level caches
Hi Borislav, On 8/8/2019 1:13 AM, Borislav Petkov wrote: > On Thu, Aug 08, 2019 at 10:08:41AM +0200, Borislav Petkov wrote: >> Ok, tglx and I talked it over a bit on IRC: so your 1/10 patch is pretty >> close - just leave out the generic struct cacheinfo bits and put the >> cache inclusivity property in a static variable there. > > ... and by "there" I mean arch/x86/kernel/cpu/cacheinfo.c which contains > all cache properties etc on x86 and is the proper place to put stuff > like that. With the goal of following these guidelines exactly I came up with the below that is an incremental diff on top of what this review started out as. Some changes to highlight that may be of concern: * In your previous email you do mention that this will be a "single bit of information". Please note that I did not specifically use an actual bit to capture this information but an unsigned int (I am very aware that you also commented on this initially). If you do mean that this should be stored as an actual bit, could you please help me by elaborating how you would like to see this implemented? * Please note that I moved the initialization to init_intel_cacheinfo() to be specific to Intel. I did so because from what I understand there are some AMD platforms for which this information cannot be determined and I thought it simpler to make it specific to Intel with the new single static variable. * Please note that while this is a single global static variable it will be set over and over for each CPU on the system. diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index 86b63c7feab7..97be5141bb4b 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -5,4 +5,6 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); +unsigned int cacheinfo_intel_l3_inclusive(void); + #endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 733874f84f41..247b6a9b5c88 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -187,6 +187,7 @@ struct _cpuid4_info_regs { }; static unsigned short num_cache_leaves; +static unsigned l3_inclusive; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: @@ -745,6 +746,11 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c) num_cache_leaves = find_num_cache_leaves(c); } +unsigned int cacheinfo_intel_l3_inclusive(void) +{ + return l3_inclusive; +} + void init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ @@ -795,6 +801,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l3_id = c->apicid & ~((1 << index_msb) - 1); + l3_inclusive = this_leaf.edx.split.inclusive; break; default: break; @@ -1010,13 +1017,6 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->physical_line_partition = base->ebx.split.physical_line_partition + 1; - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD && -boot_cpu_has(X86_FEATURE_TOPOEXT)) || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || - boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - this_leaf->attributes |= CACHE_INCLUSIVE_SET; - this_leaf->inclusive = base->edx.split.inclusive; - } this_leaf->priv = base->nb; } What do you think? Reinette
Re: [PATCH] HID: apple: Fix stuck function keys when using FN
Hi Benjamin, On Mon, 8 Jul 2019 at 22:35, João Moreno wrote: > > Hi Benjamin, > > No worries, also pretty busy over here. Didn't mean to press. > > On Mon, 1 Jul 2019 at 10:32, Benjamin Tissoires > wrote: > > > > Hi João, > > > > On Sun, Jun 30, 2019 at 10:15 PM João Moreno wrote: > > > > > > Hi Jiri & Benjamin, > > > > > > Let me know if you need something else to get this patch moving forward. > > > This > > > fixes an issue I hit daily, it would be great to get it fixed. > > > > Sorry for the delay, I am very busy with internal corporate stuff, and > > I tried setting up a new CI system at home, and instead of spending a > > couple of ours, I am down to 2 weeks of hard work, without possibility > > to switch to the new right now :( > > Anyway. > > > > > > > > Thanks. > > > > > > On Mon, 10 Jun 2019 at 23:31, Joao Moreno wrote: > > > > > > > > This fixes an issue in which key down events for function keys would be > > > > repeatedly emitted even after the user has raised the physical key. For > > > > example, the driver fails to emit the F5 key up event when going through > > > > the following steps: > > > > - fnmode=1: hold FN, hold F5, release FN, release F5 > > > > - fnmode=2: hold F5, hold FN, release F5, release FN > > > > Ouch :/ > > > > Right?! > > > > > > > > > The repeated F5 key down events can be easily verified using xev. > > > > > > > > Signed-off-by: Joao Moreno > > > > --- > > > > drivers/hid/hid-apple.c | 21 +++-- > > > > 1 file changed, 11 insertions(+), 10 deletions(-) > > > > > > > > diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c > > > > index 1cb41992aaa1..81867a6fa047 100644 > > > > --- a/drivers/hid/hid-apple.c > > > > +++ b/drivers/hid/hid-apple.c > > > > @@ -205,20 +205,21 @@ static int hidinput_apple_event(struct hid_device > > > > *hid, struct input_dev *input, > > > > trans = apple_find_translation (table, usage->code); > > > > > > > > if (trans) { > > > > - if (test_bit(usage->code, asc->pressed_fn)) > > > > - do_translate = 1; > > > > - else if (trans->flags & APPLE_FLAG_FKEY) > > > > - do_translate = (fnmode == 2 && > > > > asc->fn_on) || > > > > - (fnmode == 1 && !asc->fn_on); > > > > + int fn_on = value ? asc->fn_on : > > > > + test_bit(usage->code, asc->pressed_fn); > > > > + > > > > + if (!value) > > > > + clear_bit(usage->code, asc->pressed_fn); > > > > + else if (asc->fn_on) > > > > + set_bit(usage->code, asc->pressed_fn); > > > > I have the feeling that this is not the correct fix here. > > > > I might be wrong, but the following sequence might also mess up the > > driver state, depending on how the reports are emitted: > > - hold FN, hold F4, hold F5, release F4, release FN, release F5 > > > > I believe this should be fine. Following the code: > > - hold FN, sets asc->fn_on to true > - hold F4, in the trans block fn_on will be true and we'll set the F4 > bit in the bitmap > - hold F5, in the trans block fn_on will be true and we'll set the F5 bit > - release F4, in the trans block fn_on will be true (because of the bitmap) > and > we'll clear the F4 bit > - release FN, asc->fn_on will be false, but it doesn't matter since... > - release F5, in the trans block we'll look into the bitmap (instead > of asc->fn_on), > so fn_on will be true and we'll clear the F5 bit > > I tested it in practice using my changes: > > Interestingly the Apple keyboard doesn't seem to emit an even for F5 when F4 > is > pressed, seems like a hardware limitation. But F6 does work. 
So, when I > execute > these events in that order, everything works as it should: xev reports > the following: > > KeyPress F4 > KeyPress F6 > KeyRelease F4 > KeyRelease F6 > > > The reason is that the driver only considers you have one key pressed > > with the modifier, and as the code changed its state based on the last > > value. > > > > I believe the bitmap takes care of storing the FN state per key press. The > trick I did was to check on the global `asc->fn_on` state only when a key > is pressed, but check on the bitmap instead when it's released. > > Let me know what you think. Am I missing something here? > > Cheers, > João. > > > IMO a better fix would: > > > > - keep the existing `trans` mapping lookout > > - whenever a `trans` mapping gets found: > > * get both translated and non-translated currently reported values > > (`test_bit(keycode, input_dev->key)`) > > * if one of them is set to true, then consider the keycode to be the > > one of the key (no matter fn_on) > > -> deal with `value` with the corrected keycode > > * if the key was not pressed: > > -> chose the keycode based on `fn_on` and `fnmode` states > > and re
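The press/release bookkeeping being debated is small enough to model outside the kernel; a toy userspace rendition of the patch's logic (illustrative only) that walks the fnmode=2 failure sequence:

	#include <stdbool.h>
	#include <stdio.h>

	static bool fn_on;            /* models asc->fn_on */
	static bool pressed_fn[256];  /* models the asc->pressed_fn bitmap */

	/* returns whether this event should use the FN translation */
	static bool handle(int code, bool down)
	{
		bool fn = down ? fn_on : pressed_fn[code];

		if (!down)
			pressed_fn[code] = false;	/* clear_bit() */
		else if (fn_on)
			pressed_fn[code] = true;	/* set_bit() */

		return fn;
	}

	int main(void)
	{
		/* fnmode=2 case: hold F5, hold FN, release F5, release FN */
		printf("F5 down translated: %d\n", handle(63, true));  /* 0 */
		fn_on = true;                                  /* FN pressed */
		printf("F5 up   translated: %d\n", handle(63, false)); /* 0 */
		fn_on = false;                                 /* FN released */
		return 0;
	}

The key-up consults the per-key bitmap rather than the current FN state, so it reports the same translation as the key-down and no key is left stuck.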
Re: [PATCH V2 09/10] x86/resctrl: Pseudo-lock portions of multiple resources
Hi Borislav, On 8/8/2019 1:44 AM, Borislav Petkov wrote: > On Wed, Aug 07, 2019 at 12:23:29PM -0700, Reinette Chatre wrote: >> I do not fully understand this proposal. All those goto labels take care >> of the the different failures that can be encountered during the >> initialization of the pseudo-lock region. Each initialization failure is >> associated with a goto where it jumps to the cleanup path. The >> initialization starts with the constraining of the c-states >> (initializing plr->pm_reqs), but if I move that I think it will not >> reduce the goto labels, just change the order because of the other >> initialization done (plr->size, plr->line_size, plr->cpu). > > Here's one possible way to do it, pasting the whole function here as it > is easier to read it this way than an incremental diff ontop. > > You basically cache all attributes in local variables and assign them to > the plr struct only on success, at the end. This way, no goto labels and > the C-states constraining, i.e., the most expensive operation, happens > last, only after all the other simpler checks have succeeded. And you > don't have to call pseudo_lock_cstates_relax() prematurely, when one of > those easier checks fail. > > Makes sense? It does. This looks much better. Thank you very much. > > Btw, I've marked the cpu_online() check with "CPU hotplug > lock?!?" question because I don't see you holding that lock with > get_online_cpus()/put_online_cpus(). There is a locking order dependency between cpu_hotplug_lock and rdtgroup_mutex (cpu_hotplug_lock before rdtgroup_mutex) that has to be maintained. To do so in this flow you will find cpus_read_lock() in rdtgroup_schemata_write(), so quite a distance from where it is needed. Perhaps I should add a comment at the location where the lock is required to document where the lock is obtained? > static int pseudo_lock_l2_l3_portions_valid(struct pseudo_lock_region *plr, > struct pseudo_lock_portion *l2_p, > struct pseudo_lock_portion *l3_p) > { > unsigned int l2_size, l3_size, size, line_size, cpu; > struct rdt_domain *l2_d, *l3_d; > > l2_d = rdt_find_domain(l2_p->r, l2_p->d_id, NULL); > if (IS_ERR_OR_NULL(l2_d)) { > rdt_last_cmd_puts("Cannot locate L2 cache domain\n"); > return -1; > } > > l3_d = rdt_find_domain(l3_p->r, l3_p->d_id, NULL); > if (IS_ERR_OR_NULL(l3_d)) { > rdt_last_cmd_puts("Cannot locate L3 cache domain\n"); > return -1; > } > > if (!cpumask_subset(&l2_d->cpu_mask, &l3_d->cpu_mask)) { > rdt_last_cmd_puts("L2 and L3 caches need to be in same > hierarchy\n"); > return -1; > } > > l2_size = rdtgroup_cbm_to_size(l2_p->r, l2_d, l2_p->cbm); > l3_size = rdtgroup_cbm_to_size(l3_p->r, l3_d, l3_p->cbm); > > if (l2_size > l3_size) { > rdt_last_cmd_puts("L3 cache portion has to be same size or > larger than L2 cache portion\n"); > return -1; > } > > size = l2_size; > > l2_size = get_cache_line_size(cpumask_first(&l2_d->cpu_mask), > l2_p->r->cache_level); > l3_size = get_cache_line_size(cpumask_first(&l3_d->cpu_mask), > l3_p->r->cache_level); > if (l2_size != l3_size) { > rdt_last_cmd_puts("L2 and L3 caches have different coherency > cache line sizes\n"); > return -1; > } > > line_size = l2_size; > > cpu = cpumask_first(&l2_d->cpu_mask); > > /* >* CPU hotplug lock?!? 
>*/ > if (!cpu_online(cpu)) { > rdt_last_cmd_printf("CPU %u associated with cache not > online\n", cpu); > return -1; > } > > if (!get_cache_inclusive(cpu, l3_p->r->cache_level)) { > rdt_last_cmd_puts("L3 cache not inclusive\n"); > return -1; > } > > /* >* All checks passed, constrain C-states: >*/ > if (pseudo_lock_cstates_constrain(plr, &l2_d->cpu_mask)) { > rdt_last_cmd_puts("Cannot limit C-states\n"); > pseudo_lock_cstates_relax(plr); > return -1; > } > > plr->line_size = line_size; > plr->size = size; > plr->cpu= cpu; > > return 0; > } > Thank you very much Reinette
Re: [PATCH RFC v1 1/2] rcu/tree: Add basic support for kfree_rcu batching
On Thu, Aug 08, 2019 at 11:11:12AM -0700, Paul E. McKenney wrote: > On Thu, Aug 08, 2019 at 07:26:10PM +0900, Byungchul Park wrote: > > On Wed, Aug 07, 2019 at 05:45:04AM -0400, Joel Fernandes wrote: > > > On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote: > > > > [snip] > > > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote: > > > > Of course, I am hoping that a later patch uses an array of pointers > > > > built > > > > at kfree_rcu() time, similar to Rao's patch (with or without kfree_bulk) > > > > in order to reduce per-object cache-miss overhead. This would make it > > > > easier for callback invocation to keep up with multi-CPU kfree_rcu() > > > > floods. > > > > > > I think Byungchul tried an experiment with array of pointers and wasn't > > > immediately able to see a benefit. Perhaps his patch needs a bit more > > > polish > > > or another test-case needed to show benefit due to cache-misses, and the > > > perf > > > tool could be used to show if cache misses were reduced. For this initial > > > pass, we decided to keep it without the array optimization. > > > > I'm still seeing no improvement with kfree_bulk(). > > > > I've been thinking I could see improvement with kfree_bulk() because: > > > >1. As you guys said, the number of cache misses will be reduced. > >2. We can save (N - 1) irq-disable instructions while N kfrees. > >3. As Joel said, saving/restoring CPU status that kfree() does inside > > is not required. > > > > But even with the following patch applied, the result was same as just > > batching test. We might need to get kmalloc objects from random > > addresses to maximize the result when using kfree_bulk() and this is > > even closer to real practical world too. > > > > And the second and third reasons doesn't seem to work as much as I > > expected. > > > > Do you have any idea? Or what do you think about it? > > I would not expect kfree_batch() to help all that much unless the > pre-grace-period kfree_rcu() code segregated the objects on a per-slab > basis. You mean kfree_bulk() instead of kfree_batch() right? I agree with you, would be nice to do per-slab optimization in the future. Also, I am thinking that whenever we do per-slab optimization, then the kmem_cache_free_bulk() can be optimized further. If all pointers are on the same slab, then we can just do virt_to_cache on the first pointer and avoid repeated virt_to_cache() calls. That might also give a benefit -- but I could be missing something. Right now kmem_cache_free_bulk() just looks like a kmem_cache_free() in a loop except the small benefit of not disabling/enabling IRQs across each __cache_free, and the reduced cache miss benefit of using the array. thanks, - Joel [snip]
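For reference, the bulk interface under discussion has the signature kfree_bulk(size_t nr, void **p); a batching layer's flush step might look roughly like this (a sketch; a real kfree_rcu() batch must of course only be flushed after a grace period has elapsed for every pointer in it):

	static void flush_kfree_batch(void **objs, size_t nr)
	{
		/*
		 * One call frees all nr objects: a single IRQ-disable/enable
		 * pair instead of one per object, plus the cache-locality
		 * wins discussed above when objects share slabs.
		 */
		kfree_bulk(nr, objs);
	}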
Re: [PATCH] drm/i915: Remove redundant user_access_end() from __copy_from_user() error path
On Tue, Aug 6, 2019 at 5:59 AM Josh Poimboeuf wrote: > > On Mon, Aug 05, 2019 at 09:29:53PM +0200, Sedat Dilek wrote: > > On Wed, Jul 31, 2019 at 2:25 PM Sedat Dilek wrote: > > > > > > On Fri, Jul 26, 2019 at 9:30 PM Chris Wilson > > > wrote: > > > > > > > > Quoting Thomas Gleixner (2019-07-26 20:18:32) > > > > > On Fri, 26 Jul 2019, Chris Wilson wrote: > > > > > > Quoting Thomas Gleixner (2019-07-25 22:55:45) > > > > > > > On Thu, 25 Jul 2019, Josh Poimboeuf wrote: > > > > > > > > > > > > > > > Objtool reports: > > > > > > > > > > > > > > > > drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: > > > > > > > > objtool: .altinstr_replacement+0x36: redundant UACCESS disable > > > > > > > > > > > > > > > > __copy_from_user() already does both STAC and CLAC, so the > > > > > > > > user_access_end() in its error path adds an extra unnecessary > > > > > > > > CLAC. > > > > > > > > > > > > > > > > Fixes: 0b2c8f8b6b0c ("i915: fix missing user_access_end() in > > > > > > > > page fault exception case") > > > > > > > > Reported-by: Thomas Gleixner > > > > > > > > Reported-by: Sedat Dilek > > > > > > > > Acked-by: Peter Zijlstra (Intel) > > > > > > > > Tested-by: Nick Desaulniers > > > > > > > > Tested-by: Sedat Dilek > > > > > > > > Link: https://github.com/ClangBuiltLinux/linux/issues/617 > > > > > > > > Signed-off-by: Josh Poimboeuf > > > > > > > > > > > > > > Reviewed-by: Thomas Gleixner > > > > > > > > > > > > Which tree do you plan to apply it to? I can put in drm-intel, and > > > > > > with > > > > > > the fixes tag it will percolate through to 5.3 and beyond, but if > > > > > > you > > > > > > want to apply it directly to squash the build warnings, feel free. > > > > > > > > > > It would be nice to get it into 5.3. I can route it linuxwards if you > > > > > give > > > > > an Acked-by, but I'm happy to hand it to you :) > > > > > > > > Acked-by: Chris Wilson > > > > > > Thomas did you take this through tip tree after Chris' ACK? > > > > > > > Hi, > > > > Gentle ping... > > Thomas and Chris: Will someone of you pick this up? > > As "objtool: Improve UACCESS coverage" [1] went trough tip tree I > > highly appreciate to do so with this one. > > I think Thomas has gone on holiday, so hopefully Chris can pick it up > after all. tglx just picked up 2 other patches of mine, bumping just in case he's not picking up patches while on vacation. ;) -- Thanks, ~Nick Desaulniers
[tip:perf/urgent] perf bench numa: Fix cpu0 binding
Commit-ID: 6bbfe4e602691b90ac866712bd4c43c51e546a60 Gitweb: https://git.kernel.org/tip/6bbfe4e602691b90ac866712bd4c43c51e546a60 Author: Jiri Olsa AuthorDate: Thu, 1 Aug 2019 16:26:42 +0200 Committer: Arnaldo Carvalho de Melo CommitDate: Thu, 1 Aug 2019 11:34:13 -0300 perf bench numa: Fix cpu0 binding Michael reported an issue with perf bench numa failing with binding to cpu0 with '-0' option. # perf bench numa mem -p 3 -t 1 -P 512 -s 100 -zZcm0 --thp 1 -M 1 -ddd # Running 'numa/mem' benchmark: # Running main, "perf bench numa numa-mem -p 3 -t 1 -P 512 -s 100 -zZcm0 --thp 1 -M 1 -ddd" binding to node 0, mask: 0001 => -1 perf: bench/numa.c:356: bind_to_memnode: Assertion `!(ret)' failed. Aborted (core dumped) This happens when the cpu0 is not part of node0, which is the benchmark assumption and we can see that's not the case for some powerpc servers. Using correct node for cpu0 binding. Reported-by: Michael Petlan Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Satheesh Rajendran Link: http://lkml.kernel.org/r/20190801142642.28004-1-jo...@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/numa.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index a640ca7aaada..513cb2f2fa32 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -379,8 +379,10 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags, /* Allocate and initialize all memory on CPU#0: */ if (init_cpu0) { - orig_mask = bind_to_node(0); - bind_to_memnode(0); + int node = numa_node_of_cpu(0); + + orig_mask = bind_to_node(node); + bind_to_memnode(node); } bytes = bytes0 + HPSIZE;
[tip:perf/urgent] perf annotate: Fix printing of unaugmented disassembled instructions from BPF
Commit-ID: 85127775a65fc58e69af0c44513937d471ccbe7b Gitweb: https://git.kernel.org/tip/85127775a65fc58e69af0c44513937d471ccbe7b Author: Arnaldo Carvalho de Melo AuthorDate: Tue, 6 Aug 2019 11:24:09 -0300 Committer: Arnaldo Carvalho de Melo CommitDate: Thu, 8 Aug 2019 15:40:56 -0300 perf annotate: Fix printing of unaugmented disassembled instructions from BPF The code to disassemble BPF programs uses binutil's disassembling routines, and those use in turn fprintf to print to a memstream FILE, adding a newline at the end of each line, which ends up confusing the TUI routines called from: annotate_browser__write() annotate_line__write() annotate_browser__printf() ui_browser__vprintf() SLsmg_vprintf() The SLsmg_vprintf() function in the slang library gets confused with the terminating newline, so make the disasm_line__parse() function that parses the lines produced by the BPF specific disassembler (that uses binutil's libopcodes) and the lines produced by the objdump based disassembler used for everything else (and that doesn't adds this terminating newline) trim the end of the line in addition of the beginning. This way when disasm_line->ops.raw, i.e. for instructions without a special scnprintf() method, we'll not have that \n getting in the way of filling the screen right after the instruction with spaces to avoid leaving what was on the screen before and thus garbling the annotation screen, breaking scrolling, etc. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Song Liu Fixes: 6987561c9e86 ("perf annotate: Enable annotation of BPF programs") Link: https://lkml.kernel.org/n/tip-unbr5a5efakobfr6rhxq9...@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ac9ad2330f93..163536720149 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1122,7 +1122,7 @@ static int disasm_line__parse(char *line, const char **namep, char **rawp) goto out; (*rawp)[0] = tmp; - *rawp = skip_spaces(*rawp); + *rawp = strim(*rawp); return 0;
Re: [PATCH 2/2 v2] tracing: Document the stack trace algorithm in the comments
On Wed, Aug 07, 2019 at 01:28:28PM -0400, Steven Rostedt wrote: > From: "Steven Rostedt (VMware)" > > As the max stack tracer algorithm is not that easy to understand from the > code, add comments that explain the algorithm and mentions how > ARCH_RET_ADDR_AFTER_LOCAL_VARS affects it. > > Link: http://lkml.kernel.org/r/20190806123455.487ac...@gandalf.local.home > Acked-by: Joel Fernandes (Google) thanks!! - Joel > Suggested-by: Joel Fernandes > Signed-off-by: Steven Rostedt (VMware) > --- > kernel/trace/trace_stack.c | 98 ++ > 1 file changed, 98 insertions(+) > > diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c > index 40e4a88eea8f..f94a2fc567de 100644 > --- a/kernel/trace/trace_stack.c > +++ b/kernel/trace/trace_stack.c > @@ -53,6 +53,104 @@ static void print_max_stack(void) > } > } > > +/* > + * The stack tracer looks for a maximum stack at each call from a function. > It > + * registers a callback from ftrace, and in that callback it examines the > stack > + * size. It determines the stack size from the variable passed in, which is > the > + * address of a local variable in the stack_trace_call() callback function. > + * The stack size is calculated by the address of the local variable to the > top > + * of the current stack. If that size is smaller than the currently saved max > + * stack size, nothing more is done. > + * > + * If the size of the stack is greater than the maximum recorded size, then > the > + * following algorithm takes place. > + * > + * For architectures (like x86) that store the function's return address > before > + * saving the function's local variables, the stack will look something like > + * this: > + * > + * [ top of stack ] > + *0: sys call entry frame > + * 10: return addr to entry code > + * 11: start of sys_foo frame > + * 20: return addr to sys_foo > + * 21: start of kernel_func_bar frame > + * 30: return addr to kernel_func_bar > + * 31: [ do trace stack here ] > + * > + * The save_stack_trace() is called returning all the functions it finds in > the > + * current stack. Which would be (from the bottom of the stack to the top): > + * > + * return addr to kernel_func_bar > + * return addr to sys_foo > + * return addr to entry code > + * > + * Now to figure out how much each of these functions' local variable size > is, > + * a search of the stack is made to find these values. When a match is made, > it > + * is added to the stack_dump_trace[] array. The offset into the stack is > saved > + * in the stack_trace_index[] array. The above example would show: > + * > + *stack_dump_trace[]| stack_trace_index[] > + *--+ --- > + * return addr to kernel_func_bar | 30 > + * return addr to sys_foo | 20 > + * return addr to entry| 10 > + * > + * The print_max_stack() function above, uses these values to print the size > of > + * each function's portion of the stack. > + * > + * for (i = 0; i < nr_entries; i++) { > + * size = i == nr_entries - 1 ? 
stack_trace_index[i] : > + *stack_trace_index[i] - stack_trace_index[i+1] > + * print "%d %d %d %s\n", i, stack_trace_index[i], size, > stack_dump_trace[i]); > + * } > + * > + * The above shows > + * > + * depth size location > + * - > + * 030 10 kernel_func_bar > + * 120 10 sys_foo > + * 210 10 entry code > + * > + * Now for architectures that might save the return address after the > functions > + * local variables (saving the link register before calling nested > functions), > + * this will cause the stack to look a little different: > + * > + * [ top of stack ] > + * 0: sys call entry frame > + * 10: start of sys_foo_frame > + * 19: return addr to entry code << lr saved before calling kernel_func_bar > + * 20: start of kernel_func_bar frame > + * 29: return addr to sys_foo_frame << lr saved before calling next function > + * 30: [ do trace stack here ] > + * > + * Although the functions returned by save_stack_trace() may be the same, the > + * placement in the stack will be different. Using the same algorithm as > above > + * would yield: > + * > + *stack_dump_trace[]| stack_trace_index[] > + *--+ --- > + * return addr to kernel_func_bar | 30 > + * return addr to sys_foo | 29 > + * return addr to entry| 19 > + * > + * Where the mapping is off by one: > + * > + * kernel_func_bar stack frame size is 29 - 19 not 30 - 29! > + * > + * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS the > + * values in stack_trace_index[] are shifted by one to and the number of > + * stack trace entries is decremented by one. > + * > + *stack_dump_trace[]| stack_trace_index[] > + *