[PATCH 4.19 42/45] cgroup: Include dying leaders with live threads in PROCS iterations
From: Tejun Heo

commit c03cd7738a83b13739f00546166969342c8ff014 upstream.

CSS_TASK_ITER_PROCS currently iterates live group leaders; however,
this means that a process with a dying leader and live threads will be
skipped.  IOW, cgroup.procs might be empty while cgroup.threads isn't,
which is confusing to say the least.

Fix it by making cset track dying tasks and include dying leaders with
live threads in PROCS iteration.

Signed-off-by: Tejun Heo
Reported-and-tested-by: Topi Miettinen
Cc: Oleg Nesterov
Signed-off-by: Greg Kroah-Hartman

---
 include/linux/cgroup-defs.h |    1 +
 include/linux/cgroup.h      |    1 +
 kernel/cgroup/cgroup.c      |   44 ++++++++++++++++++++++++++++++++-------
 3 files changed, 39 insertions(+), 7 deletions(-)

--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -207,6 +207,7 @@ struct css_set {
 	 */
 	struct list_head tasks;
 	struct list_head mg_tasks;
+	struct list_head dying_tasks;
 
 	/* all css_task_iters currently walking this cset */
 	struct list_head task_iters;
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -60,6 +60,7 @@ struct css_task_iter {
 	struct list_head		*task_pos;
 	struct list_head		*tasks_head;
 	struct list_head		*mg_tasks_head;
+	struct list_head		*dying_tasks_head;
 
 	struct css_set			*cur_cset;
 	struct css_set			*cur_dcset;
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -673,6 +673,7 @@ struct css_set init_css_set = {
 	.dom_cset		= &init_css_set,
 	.tasks			= LIST_HEAD_INIT(init_css_set.tasks),
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
+	.dying_tasks		= LIST_HEAD_INIT(init_css_set.dying_tasks),
 	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 	.threaded_csets	= LIST_HEAD_INIT(init_css_set.threaded_csets),
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
@@ -1145,6 +1146,7 @@ static struct css_set *find_css_set(stru
 	cset->dom_cset = cset;
 	INIT_LIST_HEAD(&cset->tasks);
 	INIT_LIST_HEAD(&cset->mg_tasks);
+	INIT_LIST_HEAD(&cset->dying_tasks);
 	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_LIST_HEAD(&cset->threaded_csets);
 	INIT_HLIST_NODE(&cset->hlist);
@@ -4152,15 +4154,18 @@ static void css_task_iter_advance_css_se
 			it->task_pos = NULL;
 			return;
 		}
-	} while (!css_set_populated(cset));
+	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
 
 	if (!list_empty(&cset->tasks))
 		it->task_pos = cset->tasks.next;
-	else
+	else if (!list_empty(&cset->mg_tasks))
 		it->task_pos = cset->mg_tasks.next;
+	else
+		it->task_pos = cset->dying_tasks.next;
 
 	it->tasks_head = &cset->tasks;
 	it->mg_tasks_head = &cset->mg_tasks;
+	it->dying_tasks_head = &cset->dying_tasks;
 
 	/*
 	 * We don't keep css_sets locked across iteration steps and thus
@@ -4199,6 +4204,8 @@ static void css_task_iter_skip(struct cs
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
+	struct task_struct *task;
+
 	lockdep_assert_held(&css_set_lock);
 repeat:
 	if (it->task_pos) {
@@ -4215,17 +4222,32 @@ repeat:
 		if (it->task_pos == it->tasks_head)
 			it->task_pos = it->mg_tasks_head->next;
 		if (it->task_pos == it->mg_tasks_head)
+			it->task_pos = it->dying_tasks_head->next;
+		if (it->task_pos == it->dying_tasks_head)
 			css_task_iter_advance_css_set(it);
 	} else {
 		/* called from start, proceed to the first cset */
 		css_task_iter_advance_css_set(it);
 	}
 
-	/* if PROCS, skip over tasks which aren't group leaders */
-	if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos &&
-	    !thread_group_leader(list_entry(it->task_pos, struct task_struct,
-					    cg_list)))
-		goto repeat;
+	if (!it->task_pos)
+		return;
+
+	task = list_entry(it->task_pos, struct task_struct, cg_list);
+
+	if (it->flags & CSS_TASK_ITER_PROCS) {
+		/* if PROCS, skip over tasks which aren't group leaders */
+		if (!thread_group_leader(task))
+			goto repeat;
+
+		/* and dying leaders w/o live member threads */
+		if (!atomic_read(&task->signal->live))
+			goto repeat;
+	} else {
+		/* skip all dying ones */
+		if (task->flags & PF_EXITING)
+			goto repeat;
+	}
 }
 
 /**
@@ -5682,6 +5704,7 @@ void cgroup_exi
[PATCH 4.19 23/45] net: fix ifindex collision during namespace removal
From: Jiri Pirko

[ Upstream commit 55b40dbf0e76b4bfb9d8b3a16a0208640a9a45df ]

Commit aca51397d014 ("netns: Fix arbitrary net_device-s corruptions on
net_ns stop.") introduced a possibility to hit a BUG in case device is
returning back to init_net and two following conditions are met:
1) dev->ifindex value is used in a name of another "dev%d" device in
   init_net.
2) dev->name is used by another device in init_net.

Under real life circumstances this is hard to get.  Therefore this has
been present happily for over 10 years.

To reproduce:

$ ip a
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
3: enp0s2: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff

$ ip netns add ns1
$ ip -n ns1 link add dummy1ns1 type dummy
$ ip -n ns1 link add dummy2ns1 type dummy
$ ip link set enp0s2 netns ns1
$ ip -n ns1 link set enp0s2 name dummy0
[  100.858894] virtio_net virtio0 dummy0: renamed from enp0s2
$ ip link add dev4 type dummy
$ ip -n ns1 a
1: lo: mtu 65536 qdisc noop state DOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: dummy1ns1: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 16:63:4c:38:3e:ff brd ff:ff:ff:ff:ff:ff
3: dummy2ns1: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether aa:9e:86:dd:6b:5d brd ff:ff:ff:ff:ff:ff
4: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
$ ip a
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
4: dev4: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 5a:e1:4a:b6:ec:f8 brd ff:ff:ff:ff:ff:ff

$ ip netns del ns1
[  158.717795] default_device_exit: failed to move dummy0 to init_net: -17
[  158.719316] ------------[ cut here ]------------
[  158.720591] kernel BUG at net/core/dev.c:9824!
[  158.722260] invalid opcode: [#1] SMP KASAN PTI
[  158.723728] CPU: 0 PID: 56 Comm: kworker/u2:1 Not tainted 5.3.0-rc1+ #18
[  158.725422] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
[  158.727508] Workqueue: netns cleanup_net
[  158.728915] RIP: 0010:default_device_exit.cold+0x1d/0x1f
[  158.730683] Code: 84 e8 18 c9 3e fe 0f 0b e9 70 90 ff ff e8 36 e4 52 fe 89 d9 4c 89 e2 48 c7 c6 80 d6 25 84 48 c7 c7 20 c0 25 84 e8 f4 c8 3e
[  158.736854] RSP: 0018:8880347e7b90 EFLAGS: 00010282
[  158.738752] RAX: 003b RBX: ffef RCX:
[  158.741369] RDX: RSI: 8128013d RDI: ed10068fcf64
[  158.743418] RBP: 888033550170 R08: 003b R09: fbfff0b94b9c
[  158.745626] R10: fbfff0b94b9b R11: 85ca5cdf R12: 888032f28000
[  158.748405] R13: dc00 R14: 8880335501b8 R15: 1110068fcf72
[  158.750638] FS: () GS:88803600() knlGS:
[  158.752944] CS: 0010 DS: ES: CR0: 80050033
[  158.755245] CR2: 7fe8b45d21d0 CR3: 340b4005 CR4: 00360ef0
[  158.757654] DR0: DR1: DR2:
[  158.760012] DR3: DR6: fffe0ff0 DR7: 0400
[  158.762758] Call Trace:
[  158.763882]  ? dev_change_net_namespace+0xbb0/0xbb0
[  158.766148]  ? devlink_nl_cmd_set_doit+0x520/0x520
[  158.768034]  ? dev_change_net_namespace+0xbb0/0xbb0
[  158.769870]  ops_exit_list.isra.0+0xa8/0x150
[  158.771544]  cleanup_net+0x446/0x8f0
[  158.772945]  ? unregister_pernet_operations+0x4a0/0x4a0
[  158.775294]  process_one_work+0xa1a/0x1740
[  158.776896]  ? pwq_dec_nr_in_flight+0x310/0x310
[  158.779143]  ? do_raw_spin_lock+0x11b/0x280
[  158.780848]  worker_thread+0x9e/0x1060
[  158.782500]  ? process_one_work+0x1740/0x1740
[  158.784454]  kthread+0x31b/0x420
[  158.786082]  ? __kthread_create_on_node+0x3f0/0x3f0
[  158.788286]  ret_from_fork+0x3a/0x50
[  158.789871] ---[ end trace defd6c657c71f936 ]---
[  158.792273] RIP: 0010:default_device_exit.cold+0x1d/0x1f
[  158.795478] Code: 84 e8 18 c9 3e fe 0f 0b e9 70 90 ff ff e8 36 e4 52 fe 89 d9 4c 89 e2 48 c7 c6 80 d6 25 84 48 c7 c7 20 c0 25 84 e8 f4 c8 3e
[  158.804854] RSP: 0018:8880347e7b90 EFLAGS: 00010282
[  158.807865] RAX: 00
[PATCH 4.19 40/45] cgroup: Call cgroup_release() before __exit_signal()
From: Tejun Heo

commit 6b115bf58e6f013ca75e7115aabcbd56c20ff31d upstream.

cgroup_release() calls cgroup_subsys->release() which is used by the
pids controller to uncharge its pid.  We want to use it to manage
iteration of dying tasks which requires putting it before
__unhash_process().  Move cgroup_release() above __exit_signal().
While this makes it uncharge before the pid is freed, pid is RCU freed
anyway and the window is very narrow.

Signed-off-by: Tejun Heo
Cc: Oleg Nesterov
Signed-off-by: Greg Kroah-Hartman

---
 kernel/exit.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -194,6 +194,7 @@ repeat:
 	rcu_read_unlock();
 
 	proc_flush_task(p);
+	cgroup_release(p);
 
 	write_lock_irq(&tasklist_lock);
 	ptrace_release_task(p);
@@ -219,7 +220,6 @@ repeat:
 	}
 
 	write_unlock_irq(&tasklist_lock);
-	cgroup_release(p);
 	release_thread(p);
 	call_rcu(&p->rcu, delayed_put_task_struct);
[PATCH 4.14 13/33] ife: error out when nla attributes are empty
From: Cong Wang

[ Upstream commit c8ec4632c6ac9cda0e8c3d51aa41eeab66585bd5 ]

act_ife at least requires TCA_IFE_PARMS, so we have to bail out when
there is no attribute passed in.

Reported-by: syzbot+fbb5b288c9cb6a2ee...@syzkaller.appspotmail.com
Fixes: ef6980b6becb ("introduce IFE action")
Cc: Jamal Hadi Salim
Cc: Jiri Pirko
Signed-off-by: Cong Wang
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/sched/act_ife.c |    3 +++
 1 file changed, 3 insertions(+)

--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -459,6 +459,9 @@ static int tcf_ife_init(struct net *net,
 	int ret = 0;
 	int err;
 
+	if (!nla)
+		return -EINVAL;
+
 	err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy, NULL);
 	if (err < 0)
 		return err;
[PATCH 4.19 22/45] net: bridge: mcast: dont delete permanent entries when fast leave is enabled
From: Nikolay Aleksandrov

[ Upstream commit 5c725b6b65067909548ac9ca9bc777098ec9883d ]

When permanent entries were introduced by the commit below, they were
exempt from timing out and thus igmp leave wouldn't affect them unless
fast leave was enabled on the port, which was added before permanent
entries existed.  It shouldn't matter whether fast leave is enabled or
not: if the user added a permanent entry, it shouldn't be deleted on
igmp leave.

Before:
$ echo 1 > /sys/class/net/eth4/brport/multicast_fast_leave
$ bridge mdb add dev br0 port eth4 grp 229.1.1.1 permanent
$ bridge mdb show
dev br0 port eth4 grp 229.1.1.1 permanent

< join and leave 229.1.1.1 on eth4 >

$ bridge mdb show
$

After:
$ echo 1 > /sys/class/net/eth4/brport/multicast_fast_leave
$ bridge mdb add dev br0 port eth4 grp 229.1.1.1 permanent
$ bridge mdb show
dev br0 port eth4 grp 229.1.1.1 permanent

< join and leave 229.1.1.1 on eth4 >

$ bridge mdb show
dev br0 port eth4 grp 229.1.1.1 permanent

Fixes: ccb1c31a7a87 ("bridge: add flags to distinguish permanent mdb entires")
Signed-off-by: Nikolay Aleksandrov
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/bridge/br_multicast.c |    3 +++
 1 file changed, 3 insertions(+)

--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1621,6 +1621,9 @@ br_multicast_leave_group(struct net_brid
 		if (!br_port_group_equal(p, port, src))
 			continue;
 
+		if (p->flags & MDB_PG_FLAGS_PERMANENT)
+			break;
+
 		rcu_assign_pointer(*pp, p->next);
 		hlist_del_init(&p->mglist);
 		del_timer(&p->timer);
[PATCH 4.19 41/45] cgroup: Implement css_task_iter_skip()
From: Tejun Heo

commit b636fd38dc40113f853337a7d2a6885ad23b8811 upstream.

When a task is moved out of a cset, task iterators pointing to the
task are advanced using the normal css_task_iter_advance() call.  This
is fine but we'll be tracking dying tasks on csets and thus moving
tasks from cset->tasks to (to be added) cset->dying_tasks.  When we
remove a task from cset->tasks, if we advance the iterators, they may
move over to the next cset before we had the chance to add the task
back on the dying list, which can allow the task to escape iteration.

This patch separates out skipping from advancing.  Skipping only moves
the affected iterators to the next pointer rather than fully advancing
it and the following advancing will recognize that the cursor has
already been moved forward and do the rest of advancing.  This ensures
that when a task moves from one list to another in its cset, as long
as it moves in the right direction, it's always visible to iteration.

This doesn't cause any visible behavior changes.

Signed-off-by: Tejun Heo
Cc: Oleg Nesterov
Signed-off-by: Greg Kroah-Hartman

---
 include/linux/cgroup.h |    3 ++
 kernel/cgroup/cgroup.c |   60 +++++++++++++++++++++++++----------------------
 2 files changed, 39 insertions(+), 24 deletions(-)

--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -43,6 +43,9 @@
 /* walk all threaded css_sets in the domain */
 #define CSS_TASK_ITER_THREADED		(1U << 1)
 
+/* internal flags */
+#define CSS_TASK_ITER_SKIPPED		(1U << 16)
+
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
 	struct cgroup_subsys		*ss;
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -212,7 +212,8 @@ static struct cftype cgroup_base_files[]
 
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
-static void css_task_iter_advance(struct css_task_iter *it);
+static void css_task_iter_skip(struct css_task_iter *it,
+			       struct task_struct *task);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 					      struct cgroup_subsys *ss);
@@ -775,6 +776,21 @@ static void css_set_update_populated(str
 		cgroup_update_populated(link->cgrp, populated);
 }
 
+/*
+ * @task is leaving, advance task iterators which are pointing to it so
+ * that they can resume at the next position.  Advancing an iterator might
+ * remove it from the list, use safe walk.  See css_task_iter_skip() for
+ * details.
+ */
+static void css_set_skip_task_iters(struct css_set *cset,
+				    struct task_struct *task)
+{
+	struct css_task_iter *it, *pos;
+
+	list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
+		css_task_iter_skip(it, task);
+}
+
 /**
  * css_set_move_task - move a task from one css_set to another
  * @task: task being moved
@@ -800,22 +816,9 @@ static void css_set_move_task(struct tas
 		css_set_update_populated(to_cset, true);
 
 	if (from_cset) {
-		struct css_task_iter *it, *pos;
-
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
-		/*
-		 * @task is leaving, advance task iterators which are
-		 * pointing to it so that they can resume at the next
-		 * position.  Advancing an iterator might remove it from
-		 * the list, use safe walk.  See css_task_iter_advance*()
-		 * for details.
-		 */
-		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
-					 iters_node)
-			if (it->task_pos == &task->cg_list)
-				css_task_iter_advance(it);
-
+		css_set_skip_task_iters(from_cset, task);
 		list_del_init(&task->cg_list);
 		if (!css_set_populated(from_cset))
 			css_set_update_populated(from_cset, false);
@@ -4183,10 +4186,19 @@ static void css_task_iter_advance_css_se
 	list_add(&it->iters_node, &cset->task_iters);
 }
 
-static void css_task_iter_advance(struct css_task_iter *it)
+static void css_task_iter_skip(struct css_task_iter *it,
+			       struct task_struct *task)
 {
-	struct list_head *next;
+	lockdep_assert_held(&css_set_lock);
+
+	if (it->task_pos == &task->cg_list) {
+		it->task_pos = it->task_pos->next;
+		it->flags |= CSS_TASK_ITER_SKIPPED;
+	}
+}
 
+static void css_task_iter_advance(struct css_task_iter *it)
+{
 	lockdep_assert_held(&css_set_lock);
 repeat:
 	if (it->task_pos) {
@@ -4195,15 +4207,15 @@ repeat:
 		 * consumed first and then ->mg_tasks.  After ->mg_tasks,
 		 * we move onto the next cset.
 		 */
-
[PATCH 4.19 21/45] net: bridge: delete local fdb on device init failure
From: Nikolay Aleksandrov

[ Upstream commit d7bae09fa008c6c9a489580db0a5a12063b97f97 ]

On initialization failure we have to delete the local fdb which was
inserted due to the default pvid creation.  This problem has been
present since the inception of default_pvid.  Note that currently
there are 2 cases:
1) in br_dev_init() when br_multicast_init() fails
2) if register_netdevice() fails after calling ndo_init()

This patch takes care of both since br_vlan_flush() is called on both
occasions.  Also the new fdb delete would be a no-op on normal bridge
device destruction since the local fdb would've been already flushed
by br_dev_delete().  This is not an issue for ports since
nbp_vlan_init() is called last when adding a port, thus nothing can
fail after it.

Reported-by: syzbot+88533dc8b582309bf...@syzkaller.appspotmail.com
Fixes: 5be5a2df40f0 ("bridge: Add filtering support for default_pvid")
Signed-off-by: Nikolay Aleksandrov
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/bridge/br_vlan.c |    5 +++++
 1 file changed, 5 insertions(+)

--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -677,6 +677,11 @@ void br_vlan_flush(struct net_bridge *br
 
 	ASSERT_RTNL();
 
+	/* delete auto-added default pvid local fdb before flushing vlans
+	 * otherwise it will be leaked on bridge device init failure
+	 */
+	br_fdb_delete_by_port(br, NULL, 0, 1);
+
 	vg = br_vlan_group(br);
 	__vlan_flush(vg);
 	RCU_INIT_POINTER(br->vlgrp, NULL);
[PATCH 4.19 36/45] net/mlx5: Fix modify_cq_in alignment
From: Edward Srouji

[ Upstream commit 7a32f2962c56d9d8a836b4469855caeee8766bd4 ]

Fix modify_cq_in alignment to match the device specification.
After this fix the 'cq_umem_valid' field will be in the right offset.

Cc: # 4.19
Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits")
Signed-off-by: Edward Srouji
Reviewed-by: Yishai Hadas
Signed-off-by: Leon Romanovsky
Signed-off-by: Saeed Mahameed
Signed-off-by: Greg Kroah-Hartman

---
 include/linux/mlx5/mlx5_ifc.h |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -5623,7 +5623,12 @@ struct mlx5_ifc_modify_cq_in_bits {
 
 	struct mlx5_ifc_cqc_bits cq_context;
 
-	u8         reserved_at_280[0x600];
+	u8         reserved_at_280[0x60];
+
+	u8         cq_umem_valid[0x1];
+	u8         reserved_at_2e1[0x1f];
+
+	u8         reserved_at_300[0x580];
 
 	u8         pas[0][0x40];
 };
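A quick way to sanity-check the new layout: mlx5_ifc fields are bit
arrays whose reserved_at_X names encode the starting bit offset X in
hex, so the split must land cq_umem_valid at bit 0x2e0 without changing
the struct's total footprint.  A small standalone check (a sketch, not
kernel code; values copied from the hunk above):

#include <assert.h>

/* Verify the reworked modify_cq_in tail keeps the same bit footprint.
 * Offsets/widths are taken from the diff; reserved_at_X encodes the
 * starting bit offset X in hex. */
int main(void)
{
	unsigned end_old = 0x280 + 0x600;   /* old single reserved span */
	unsigned end_new = 0x280 + 0x60     /* reserved_at_280 */
			 + 0x1              /* cq_umem_valid   */
			 + 0x1f             /* reserved_at_2e1 */
			 + 0x580;           /* reserved_at_300 */

	assert(0x280 + 0x60 == 0x2e0);  /* cq_umem_valid starts at 0x2e0 */
	assert(end_old == end_new && end_new == 0x880);
	return 0;
}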
[PATCH 4.19 38/45] r8169: dont use MSI before RTL8168d
From: Heiner Kallweit

[ Upstream commit 003bd5b4a7b4a94b501e3a1e2e7c9df6b2a94ed4 ]

It was reported that after resuming from suspend network fails with
error "do_IRQ: 3.38 No irq handler for vector", see [0].
Enabling WoL can work around the issue, but the only actual fix is to
disable MSI.  So let's mimic the behavior of the vendor driver and
disable MSI on all chip versions before RTL8168d.

[0] https://bugzilla.kernel.org/show_bug.cgi?id=204079

Fixes: 6c6aa15fdea5 ("r8169: improve interrupt handling")
Reported-by: Dušan Dragić
Tested-by: Dušan Dragić
Signed-off-by: Heiner Kallweit
Signed-off-by: Greg Kroah-Hartman

---
 drivers/net/ethernet/realtek/r8169.c |    9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -7239,13 +7239,18 @@ static int rtl_alloc_irq(struct rtl8169_
 {
 	unsigned int flags;
 
-	if (tp->mac_version <= RTL_GIGA_MAC_VER_06) {
+	switch (tp->mac_version) {
+	case RTL_GIGA_MAC_VER_02 ... RTL_GIGA_MAC_VER_06:
 		RTL_W8(tp, Cfg9346, Cfg9346_Unlock);
 		RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~MSIEnable);
 		RTL_W8(tp, Cfg9346, Cfg9346_Lock);
+		/* fall through */
+	case RTL_GIGA_MAC_VER_07 ... RTL_GIGA_MAC_VER_24:
 		flags = PCI_IRQ_LEGACY;
-	} else {
+		break;
+	default:
 		flags = PCI_IRQ_ALL_TYPES;
+		break;
 	}
 
 	return pci_alloc_irq_vectors(tp->pci_dev, 1, 1, flags);
[PATCH 4.19 27/45] net: qualcomm: rmnet: Fix incorrect UL checksum offload logic
From: Subash Abhinov Kasiviswanathan

[ Upstream commit a7cf3d24ee6081930feb4c830a7f6f16ebe31c49 ]

The udp_ip4_ind bit is set only for IPv4 UDP non-fragmented packets
so that the hardware can flip the checksum to 0xFFFF if the computed
checksum is 0 per RFC768.

However, this bit had to be set for IPv6 UDP non-fragmented packets
as well per hardware requirements.  Otherwise, IPv6 UDP packets with
computed checksum as 0 were transmitted by hardware and were dropped
in the network.

In addition to setting this bit for IPv6 UDP, the field is also
appropriately renamed to udp_ind as part of this change.

Fixes: 5eb5f8608ef1 ("net: qualcomm: rmnet: Add support for TX checksum offload")
Cc: Sean Tranchetti
Signed-off-by: Subash Abhinov Kasiviswanathan
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h      |    2 +-
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c |   13 +++++++++----
 2 files changed, 10 insertions(+), 5 deletions(-)

--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
@@ -59,7 +59,7 @@ struct rmnet_map_dl_csum_trailer {
 struct rmnet_map_ul_csum_header {
 	__be16 csum_start_offset;
 	u16 csum_insert_offset:14;
-	u16 udp_ip4_ind:1;
+	u16 udp_ind:1;
 	u16 csum_enabled:1;
 } __aligned(1);
 
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
@@ -215,9 +215,9 @@ rmnet_map_ipv4_ul_csum_header(void *iphd
 	ul_header->csum_insert_offset = skb->csum_offset;
 	ul_header->csum_enabled = 1;
 	if (ip4h->protocol == IPPROTO_UDP)
-		ul_header->udp_ip4_ind = 1;
+		ul_header->udp_ind = 1;
 	else
-		ul_header->udp_ip4_ind = 0;
+		ul_header->udp_ind = 0;
 
 	/* Changing remaining fields to network order */
 	hdr++;
@@ -248,6 +248,7 @@ rmnet_map_ipv6_ul_csum_header(void *ip6h
 			      struct rmnet_map_ul_csum_header *ul_header,
 			      struct sk_buff *skb)
 {
+	struct ipv6hdr *ip6h = (struct ipv6hdr *)ip6hdr;
 	__be16 *hdr = (__be16 *)ul_header, offset;
 
 	offset = htons((__force u16)(skb_transport_header(skb) -
@@ -255,7 +256,11 @@ rmnet_map_ipv6_ul_csum_header(void *ip6h
 	ul_header->csum_start_offset = offset;
 	ul_header->csum_insert_offset = skb->csum_offset;
 	ul_header->csum_enabled = 1;
-	ul_header->udp_ip4_ind = 0;
+
+	if (ip6h->nexthdr == IPPROTO_UDP)
+		ul_header->udp_ind = 1;
+	else
+		ul_header->udp_ind = 0;
 
 	/* Changing remaining fields to network order */
 	hdr++;
@@ -428,7 +433,7 @@ sw_csum:
 	ul_header->csum_start_offset = 0;
 	ul_header->csum_insert_offset = 0;
 	ul_header->csum_enabled = 0;
-	ul_header->udp_ip4_ind = 0;
+	ul_header->udp_ind = 0;
 
 	priv->stats.csum_sw++;
 }
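The RFC 768 rule the hardware implements here is worth spelling out: in
UDP, a checksum of zero on the wire means "no checksum was computed",
so a genuinely computed result of zero must be transmitted as its
one's-complement equivalent 0xFFFF.  A minimal sketch of that final
fix-up step (plain C, not the driver code):

#include <stdint.h>

/* Final UDP checksum fix-up per RFC 768: 0x0000 on the wire means
 * "checksum unused", so a real result of zero is sent as 0xFFFF
 * (the same value in one's-complement arithmetic). */
static uint16_t udp_csum_finish(uint16_t sum)
{
	return sum == 0 ? 0xFFFF : sum;
}

The udp_ind bit tells the hardware which packets need this flip, which
is why leaving it clear for IPv6 UDP produced on-wire zero checksums
that peers then dropped.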
[PATCH 4.19 31/45] net/smc: do not schedule tx_work in SMC_CLOSED state
From: Ursula Braun

[ Upstream commit f9cedf1a9b1cdcfb0c52edb391d01771e43994a4 ]

The setsockopt options TCP_NODELAY and TCP_CORK may schedule the tx
worker.  Make sure the socket is not yet moved into SMC_CLOSED state
(for instance by a shutdown SHUT_RDWR call).

Reported-by: syzbot+92209502e7aab127c...@syzkaller.appspotmail.com
Reported-by: syzbot+b972214bb803a343f...@syzkaller.appspotmail.com
Fixes: 01d2f7e2cdd31 ("net/smc: sockopts TCP_NODELAY and TCP_CORK")
Signed-off-by: Ursula Braun
Signed-off-by: Karsten Graul
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/smc/af_smc.c |    8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1680,14 +1680,18 @@ static int smc_setsockopt(struct socket
 		}
 		break;
 	case TCP_NODELAY:
-		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+		if (sk->sk_state != SMC_INIT &&
+		    sk->sk_state != SMC_LISTEN &&
+		    sk->sk_state != SMC_CLOSED) {
 			if (val && !smc->use_fallback)
 				mod_delayed_work(system_wq, &smc->conn.tx_work,
 						 0);
 		}
 		break;
 	case TCP_CORK:
-		if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) {
+		if (sk->sk_state != SMC_INIT &&
+		    sk->sk_state != SMC_LISTEN &&
+		    sk->sk_state != SMC_CLOSED) {
 			if (!val && !smc->use_fallback)
 				mod_delayed_work(system_wq, &smc->conn.tx_work,
 						 0);
[PATCH 4.19 26/45] net: phylink: Fix flow control for fixed-link
From: "Ren� van Dorst" [ Upstream commit 8aace4f3eba2a3ceb431e18683ea0e1ecbade5cd ] In phylink_parse_fixedlink() the pl->link_config.advertising bits are AND with pl->supported, pl->supported is zeroed and only the speed/duplex modes and MII bits are set. So pl->link_config.advertising always loses the flow control/pause bits. By setting Pause and Asym_Pause bits in pl->supported, the flow control work again when devicetree "pause" is set in fixes-link node and the MAC advertise that is supports pause. Results with this patch. Legend: - DT = 'Pause' is set in the fixed-link in devicetree. - validate() = ‘Yes’ means phylink_set(mask, Pause) is set in the validate(). - flow = results reported my link is Up line. +-++---+ | DT | validate() | flow | +-++---+ | Yes | Yes| rx/tx | | No | Yes| off | | Yes | No | off | +-++---+ Fixes: 9525ae83959b ("phylink: add phylink infrastructure") Signed-off-by: René van Dorst Acked-by: Russell King Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phylink.c |2 ++ 1 file changed, 2 insertions(+) --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -226,6 +226,8 @@ static int phylink_parse_fixedlink(struc __ETHTOOL_LINK_MODE_MASK_NBITS, true); linkmode_zero(pl->supported); phylink_set(pl->supported, MII); + phylink_set(pl->supported, Pause); + phylink_set(pl->supported, Asym_Pause); if (s) { __set_bit(s->bit, pl->supported); } else {
[PATCH 4.19 00/45] 4.19.66-stable review
This is the start of the stable review cycle for the 4.19.66 release.
There are 45 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sat 10 Aug 2019 07:03:19 PM UTC.
Anything received after that time might be too late.

The whole patch series can be found in one patch at:
	https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.66-rc1.gz
or in the git tree and branch at:
	git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y
and the diffstat can be found below.

thanks,

greg k-h

-------------
Pseudo-Shortlog of commits:

Greg Kroah-Hartman
    Linux 4.19.66-rc1

Lukas Wunner
    spi: bcm2835: Fix 3-wire mode if DMA is enabled

Tejun Heo
    cgroup: Fix css_task_iter_advance_css_set() cset skip condition

Tejun Heo
    cgroup: css_task_iter_skip()'d iterators must be advanced before accessed

Tejun Heo
    cgroup: Include dying leaders with live threads in PROCS iterations

Tejun Heo
    cgroup: Implement css_task_iter_skip()

Tejun Heo
    cgroup: Call cgroup_release() before __exit_signal()

Arnd Bergmann
    compat_ioctl: pppoe: fix PPPOEIOCSFWD handling

Heiner Kallweit
    r8169: don't use MSI before RTL8168d

Ariel Levkovich
    net/mlx5e: Prevent encap flow counter update async to user query

Edward Srouji
    net/mlx5: Fix modify_cq_in alignment

Alexis Bauvin
    tun: mark small packets as owned by the tap sock

Taras Kondratiuk
    tipc: compat: allow tipc commands without arguments

Claudiu Manoil
    ocelot: Cancel delayed work before wq destruction

Johan Hovold
    NFC: nfcmrvl: fix gpio-handling regression

Ursula Braun
    net/smc: do not schedule tx_work in SMC_CLOSED state

Dmytro Linkin
    net: sched: use temporary variable for actions indexes

Roman Mashak
    net sched: update vlan action for batched events operations

Jia-Ju Bai
    net: sched: Fix a possible null-pointer dereference in dequeue_func()

Subash Abhinov Kasiviswanathan
    net: qualcomm: rmnet: Fix incorrect UL checksum offload logic

René van Dorst
    net: phylink: Fix flow control for fixed-link

Mark Zhang
    net/mlx5: Use reversed order when unregister devices

Qian Cai
    net/mlx5e: always initialize frag->last_in_page

Jiri Pirko
    net: fix ifindex collision during namespace removal

Nikolay Aleksandrov
    net: bridge: mcast: don't delete permanent entries when fast leave is enabled

Nikolay Aleksandrov
    net: bridge: delete local fdb on device init failure

Matteo Croce
    mvpp2: refactor MTU change code

Matteo Croce
    mvpp2: fix panic on module removal

Jiri Pirko
    mlxsw: spectrum: Fix error path in mlxsw_sp_module_init()

Haishuang Yan
    ipip: validate header length in ipip_tunnel_xmit

Haishuang Yan
    ip6_tunnel: fix possible use-after-free on xmit

Haishuang Yan
    ip6_gre: reload ipv6h in prepare_ip6gre_xmit_ipv6

Cong Wang
    ife: error out when nla attributes are empty

Sudarsana Reddy Kalluru
    bnx2x: Disable multi-cos feature.

Gustavo A. R. Silva
    atm: iphase: Fix Spectre v1 vulnerability

Greg Kroah-Hartman
    IB: directly cast the sockaddr union to aockaddr

Sebastian Parschauer
    HID: Add quirk for HP X1200 PIXART OEM mouse

Aaron Armstrong Skomra
    HID: wacom: fix bit shift for Cintiq Companion 2

Dan Williams
    libnvdimm/bus: Fix wait_nvdimm_bus_probe_idle() ABBA deadlock

Dan Williams
    libnvdimm/bus: Prepare the nd_ioctl() path to be re-entrant

Dan Williams
    libnvdimm/region: Register badblocks before namespaces

Dan Williams
    libnvdimm/bus: Prevent duplicate device_unregister() calls

Dan Williams
    drivers/base: Introduce kill_device()

Alexander Duyck
    driver core: Establish order of operations for device_add and device_del via bitflag

Linus Torvalds
    gcc-9: don't warn about uninitialized variable

Hannes Reinecke
    scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure

-------------
Diffstat:

 Makefile                                           |  4 +-
 drivers/atm/iphase.c                               |  8 +-
 drivers/base/base.h                                |  4 +
 drivers/base/core.c                                | 22 +
 drivers/base/dd.c                                  | 22 ++---
 drivers/hid/hid-ids.h                              |  1 +
 drivers/hid/hid-quirks.c                           |  1 +
 drivers/hid/wacom_wac.c                            | 12 +--
 drivers/i2c/i2c-core-base.c                        |  2 +-
 drivers/infiniband/core/sa_query.c                 |  9 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c    |  3 +-
 drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c    | 46 +++--
 drivers/net/ethernet/mellanox/mlx5/core/dev.c      |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c  |  5 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_tc.c    |  4 +-
 .../n
[PATCH 4.19 08/45] libnvdimm/bus: Fix wait_nvdimm_bus_probe_idle() ABBA deadlock
commit ca6bf264f6d856f959c4239cda1047b587745c67 upstream.

A multithreaded namespace creation/destruction stress test currently
deadlocks with the following lockup signature:

    INFO: task ndctl:2924 blocked for more than 122 seconds.
          Tainted: G OE 5.2.0-rc4+ #3382
    "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
    ndctl D0 2924 1176 0x
    Call Trace:
     ? __schedule+0x27e/0x780
     schedule+0x30/0xb0
     wait_nvdimm_bus_probe_idle+0x8a/0xd0 [libnvdimm]
     ? finish_wait+0x80/0x80
     uuid_store+0xe6/0x2e0 [libnvdimm]
     kernfs_fop_write+0xf0/0x1a0
     vfs_write+0xb7/0x1b0
     ksys_write+0x5c/0xd0
     do_syscall_64+0x60/0x240

    INFO: task ndctl:2923 blocked for more than 122 seconds.
          Tainted: G OE 5.2.0-rc4+ #3382
    "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
    ndctl D0 2923 1175 0x
    Call Trace:
     ? __schedule+0x27e/0x780
     ? __mutex_lock+0x489/0x910
     schedule+0x30/0xb0
     schedule_preempt_disabled+0x11/0x20
     __mutex_lock+0x48e/0x910
     ? nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
     ? __lock_acquire+0x23f/0x1710
     ? nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
     nvdimm_namespace_common_probe+0x95/0x4d0 [libnvdimm]
     __dax_pmem_probe+0x5e/0x210 [dax_pmem_core]
     ? nvdimm_bus_probe+0x1d0/0x2c0 [libnvdimm]
     dax_pmem_probe+0xc/0x20 [dax_pmem]
     nvdimm_bus_probe+0x90/0x2c0 [libnvdimm]
     really_probe+0xef/0x390
     driver_probe_device+0xb4/0x100

In this sequence an 'nd_dax' device is being probed and trying to take
the lock on its backing namespace to validate that the 'nd_dax' device
indeed has exclusive access to the backing namespace.  Meanwhile,
another thread is trying to update the uuid property of that same
backing namespace.  So one thread is in the probe path trying to
acquire the lock, and the other thread has acquired the lock and tries
to flush the probe path.

Fix this deadlock by not holding the namespace device_lock over the
wait_nvdimm_bus_probe_idle() synchronization step.  In turn this
requires the device_lock to be held on entry to
wait_nvdimm_bus_probe_idle() and subsequently dropped internally to
wait_nvdimm_bus_probe_idle().

Cc:
Fixes: bf9bccc14c05 ("libnvdimm: pmem label sets and namespace instantiation")
Cc: Vishal Verma
Tested-by: Jane Chu
Link: https://lore.kernel.org/r/156341210094.292348.2384694131126767789.st...@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams
Signed-off-by: Sasha Levin

---
 drivers/nvdimm/bus.c         | 14 +++++++++-----
 drivers/nvdimm/region_devs.c |  4 ++++
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 5abcdb4faa644..2ba22cd1331b0 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -865,10 +865,12 @@ void wait_nvdimm_bus_probe_idle(struct device *dev)
 	do {
 		if (nvdimm_bus->probe_active == 0)
 			break;
-		nvdimm_bus_unlock(&nvdimm_bus->dev);
+		nvdimm_bus_unlock(dev);
+		device_unlock(dev);
 		wait_event(nvdimm_bus->wait,
 				nvdimm_bus->probe_active == 0);
-		nvdimm_bus_lock(&nvdimm_bus->dev);
+		device_lock(dev);
+		nvdimm_bus_lock(dev);
 	} while (true);
 }
 
@@ -994,7 +996,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	case ND_CMD_ARS_START:
 	case ND_CMD_CLEAR_ERROR:
 	case ND_CMD_CALL:
-		dev_dbg(&nvdimm_bus->dev, "'%s' command while read-only.\n",
+		dev_dbg(dev, "'%s' command while read-only.\n",
 				nvdimm ? nvdimm_cmd_name(cmd)
 				: nvdimm_bus_cmd_name(cmd));
 		return -EPERM;
@@ -1083,7 +1085,8 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		goto out;
 	}
 
-	nvdimm_bus_lock(&nvdimm_bus->dev);
+	device_lock(dev);
+	nvdimm_bus_lock(dev);
 	rc = nd_cmd_clear_to_send(nvdimm_bus, nvdimm, func, buf);
 	if (rc)
 		goto out_unlock;
@@ -1103,7 +1106,8 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		rc = -EFAULT;
 
 out_unlock:
-	nvdimm_bus_unlock(&nvdimm_bus->dev);
+	nvdimm_bus_unlock(dev);
+	device_unlock(dev);
 out:
 	kfree(in_env);
 	kfree(out_env);
diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index e7377f1028ef6..0303296e6d5b6 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -425,10 +425,12 @@ static ssize_t available_size_show(struct device *dev,
 	 * memory nvdimm_bus_lock() is droppe
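The deadlock has the classic ABBA shape: one thread holds the device
lock and sleeps waiting for probe activity to drain, while the probing
thread needs that same device lock to finish.  A schematic of the fix,
using POSIX primitives rather than the libnvdimm code (all names
invented for illustration):

#include <pthread.h>

struct ctx {
	pthread_mutex_t dev_lock;   /* outer lock, taken first  */
	pthread_mutex_t bus_lock;   /* inner lock, taken second */
	pthread_cond_t  idle;       /* signalled when probe_active drops to 0 */
	int probe_active;
};

/* Entered with both locks held.  Sleeping while still holding dev_lock
 * would deadlock against a prober that needs dev_lock to complete, so
 * both locks are released around the wait and retaken in the original
 * order afterwards. */
static void wait_probe_idle(struct ctx *c)
{
	while (c->probe_active != 0) {
		pthread_mutex_unlock(&c->bus_lock);      /* inner lock first */
		/* cond_wait atomically drops dev_lock while sleeping and
		 * reacquires it on wakeup; the thread holds neither lock
		 * while blocked, so the prober can make progress */
		pthread_cond_wait(&c->idle, &c->dev_lock);
		pthread_mutex_lock(&c->bus_lock);        /* retake inner lock */
	}
}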
[PATCH 4.19 07/45] libnvdimm/bus: Prepare the nd_ioctl() path to be re-entrant
commit 6de5d06e657acdbcf9637dac37916a4a5309e0f4 upstream.

In preparation for not holding a lock over the execution of nd_ioctl(),
update the implementation to allow multiple threads to be attempting
ioctls at the same time.  The bus lock still prevents multiple
in-flight ->ndctl() invocations from corrupting each other's state, but
static global staging buffers are moved to the heap.

Reported-by: Vishal Verma
Reviewed-by: Vishal Verma
Tested-by: Vishal Verma
Link: https://lore.kernel.org/r/156341208947.292348.10560140326807607481.st...@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams
Signed-off-by: Sasha Levin

---
 drivers/nvdimm/bus.c | 59 +++++++++++++++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index 11cfd23e5aff7..5abcdb4faa644 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -951,20 +951,19 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		int read_only, unsigned int ioctl_cmd, unsigned long arg)
 {
 	struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
-	static char out_env[ND_CMD_MAX_ENVELOPE];
-	static char in_env[ND_CMD_MAX_ENVELOPE];
 	const struct nd_cmd_desc *desc = NULL;
 	unsigned int cmd = _IOC_NR(ioctl_cmd);
 	struct device *dev = &nvdimm_bus->dev;
 	void __user *p = (void __user *) arg;
+	char *out_env = NULL, *in_env = NULL;
 	const char *cmd_name, *dimm_name;
 	u32 in_len = 0, out_len = 0;
 	unsigned int func = cmd;
 	unsigned long cmd_mask;
 	struct nd_cmd_pkg pkg;
 	int rc, i, cmd_rc;
+	void *buf = NULL;
 	u64 buf_len = 0;
-	void *buf;
 
 	if (nvdimm) {
 		desc = nd_cmd_dimm_desc(cmd);
@@ -1004,6 +1003,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	}
 
 	/* process an input envelope */
+	in_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL);
+	if (!in_env)
+		return -ENOMEM;
 	for (i = 0; i < desc->in_num; i++) {
 		u32 in_size, copy;
 
@@ -1011,14 +1013,17 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		if (in_size == UINT_MAX) {
 			dev_err(dev, "%s:%s unknown input size cmd: %s field: %d\n",
 					__func__, dimm_name, cmd_name, i);
-			return -ENXIO;
+			rc = -ENXIO;
+			goto out;
 		}
-		if (in_len < sizeof(in_env))
-			copy = min_t(u32, sizeof(in_env) - in_len, in_size);
+		if (in_len < ND_CMD_MAX_ENVELOPE)
+			copy = min_t(u32, ND_CMD_MAX_ENVELOPE - in_len, in_size);
 		else
 			copy = 0;
-		if (copy && copy_from_user(&in_env[in_len], p + in_len, copy))
-			return -EFAULT;
+		if (copy && copy_from_user(&in_env[in_len], p + in_len, copy)) {
+			rc = -EFAULT;
+			goto out;
+		}
 		in_len += in_size;
 	}
 
@@ -1030,6 +1035,12 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	}
 
 	/* process an output envelope */
+	out_env = kzalloc(ND_CMD_MAX_ENVELOPE, GFP_KERNEL);
+	if (!out_env) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
 	for (i = 0; i < desc->out_num; i++) {
 		u32 out_size = nd_cmd_out_size(nvdimm, cmd, desc, i,
 				(u32 *) in_env, (u32 *) out_env, 0);
@@ -1038,15 +1049,18 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 		if (out_size == UINT_MAX) {
 			dev_dbg(dev, "%s unknown output size cmd: %s field: %d\n",
 					dimm_name, cmd_name, i);
-			return -EFAULT;
+			rc = -EFAULT;
+			goto out;
 		}
-		if (out_len < sizeof(out_env))
-			copy = min_t(u32, sizeof(out_env) - out_len, out_size);
+		if (out_len < ND_CMD_MAX_ENVELOPE)
+			copy = min_t(u32, ND_CMD_MAX_ENVELOPE - out_len, out_size);
 		else
 			copy = 0;
-		if (copy && copy_from_user(&out_env[out_len],
-					p + in_len + out_len, copy))
-			return -EFAULT;
+		if (copy && copy_from_user(&out_env[out_len],
+					p + in_len + out_len, copy)) {
+			rc = -EFAULT;
+			goto out;
+		}
 		out_len += out_size;
 	}
 
@@ -1054,12 +1068,15 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
 	if (buf
[PATCH 4.19 15/45] ip6_gre: reload ipv6h in prepare_ip6gre_xmit_ipv6
From: Haishuang Yan

[ Upstream commit 3bc817d665ac6d9de89f59df522ad86f5b5dfc03 ]

Since ip6_tnl_parse_tlv_enc_lim() can call pskb_may_pull(), which may
change skb->data, we need to re-load ipv6h at the right place.

Fixes: 898b29798e36 ("ip6_gre: Refactor ip6gre xmit codes")
Cc: William Tu
Signed-off-by: Haishuang Yan
Acked-by: William Tu
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/ipv6/ip6_gre.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -680,12 +680,13 @@ static int prepare_ip6gre_xmit_ipv6(stru
 			       struct flowi6 *fl6, __u8 *dsfield,
 			       int *encap_limit)
 {
-	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+	struct ipv6hdr *ipv6h;
 	struct ip6_tnl *t = netdev_priv(dev);
 	__u16 offset;
 
 	offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb));
 	/* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */
+	ipv6h = ipv6_hdr(skb);
 
 	if (offset > 0) {
 		struct ipv6_tlv_tnl_enc_lim *tel;
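The rule behind this class of bug generalizes: pskb_may_pull() may
reallocate skb->head, so every pointer previously derived from
skb->data is potentially stale once it returns.  A hedged sketch of the
safe pattern (illustrative, not the tunnel code):

#include <linux/skbuff.h>
#include <linux/ipv6.h>

/* Derive packet-header pointers only *after* the last call that may
 * pull (and therefore reallocate) the skb data area. */
static int handle_packet_sketch(struct sk_buff *skb)
{
	struct ipv6hdr *ip6h;

	if (!pskb_may_pull(skb, sizeof(*ip6h)))  /* may move skb->data */
		return -EINVAL;

	ip6h = ipv6_hdr(skb);  /* safe: derived after the pull */
	return ip6h->nexthdr;
}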
[PATCH 4.19 05/45] libnvdimm/bus: Prevent duplicate device_unregister() calls
commit 8aac0e2338916e273ccbd438a2b7a1e8c61749f5 upstream.

A multithreaded namespace creation/destruction stress test currently
fails with signatures like the following:

    sysfs group 'power' not found for kobject 'dax1.1'
    RIP: 0010:sysfs_remove_group+0x76/0x80
    Call Trace:
     device_del+0x73/0x370
     device_unregister+0x16/0x50
     nd_async_device_unregister+0x1e/0x30 [libnvdimm]
     async_run_entry_fn+0x39/0x160
     process_one_work+0x23c/0x5e0
     worker_thread+0x3c/0x390

    BUG: kernel NULL pointer dereference, address: 0020
    RIP: 0010:klist_put+0x1b/0x6c
    Call Trace:
     klist_del+0xe/0x10
     device_del+0x8a/0x2c9
     ? __switch_to_asm+0x34/0x70
     ? __switch_to_asm+0x40/0x70
     device_unregister+0x44/0x4f
     nd_async_device_unregister+0x22/0x2d [libnvdimm]
     async_run_entry_fn+0x47/0x15a
     process_one_work+0x1a2/0x2eb
     worker_thread+0x1b8/0x26e

Use the kill_device() helper to atomically resolve the race of
multiple threads issuing kill, device_unregister(), requests.

Reported-by: Jane Chu
Reported-by: Erwin Tsaur
Fixes: 4d88a97aa9e8 ("libnvdimm, nvdimm: dimm driver and base libnvdimm device-driver...")
Cc:
Link: https://github.com/pmem/ndctl/issues/96
Tested-by:
Tested-by: Jane Chu
Link: https://lore.kernel.org/r/156341207846.292348.10435719262819764054.st...@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Dan Williams
Signed-off-by: Sasha Levin

---
 drivers/nvdimm/bus.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
index ee39e2c1644ae..11cfd23e5aff7 100644
--- a/drivers/nvdimm/bus.c
+++ b/drivers/nvdimm/bus.c
@@ -528,13 +528,38 @@ EXPORT_SYMBOL(nd_device_register);
 
 void nd_device_unregister(struct device *dev, enum nd_async_mode mode)
 {
+	bool killed;
+
 	switch (mode) {
 	case ND_ASYNC:
+		/*
+		 * In the async case this is being triggered with the
+		 * device lock held and the unregistration work needs to
+		 * be moved out of line iff this thread has won the
+		 * race to schedule the deletion.
+		 */
+		if (!kill_device(dev))
+			return;
+
 		get_device(dev);
 		async_schedule_domain(nd_async_device_unregister, dev,
 				&nd_async_domain);
 		break;
 	case ND_SYNC:
+		/*
+		 * In the sync case the device is being unregistered due
+		 * to a state change of the parent. Claim the kill state
+		 * to synchronize against other unregistration requests,
+		 * or otherwise let the async path handle it if the
+		 * unregistration was already queued.
+		 */
+		device_lock(dev);
+		killed = kill_device(dev);
+		device_unlock(dev);
+
+		if (!killed)
+			return;
+
 		nd_synchronize();
 		device_unregister(dev);
 		break;
-- 
2.20.1
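For context, kill_device() comes from the "drivers/base: Introduce
kill_device()" patch earlier in this series; it is essentially a
test-and-set on the device's "dead" flag under the device lock, so
exactly one caller wins.  Roughly (a sketch from memory of the upstream
helper, not text quoted from this series):

/* Must be called with the device lock held; returns true only for the
 * single caller that claims the kill. */
bool kill_device(struct device *dev)
{
	device_lock_assert(dev);

	if (dev->p->dead)
		return false;   /* another thread already claimed the kill */
	dev->p->dead = true;    /* claim it; later callers see false */
	return true;
}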
[PATCH 5.2 47/56] net: phy: fix race in genphy_update_link
From: Heiner Kallweit

[ Upstream commit aa6b1956158f1afc52761137620d4b3f8a058d24 ]

In phy_start_aneg() autoneg is started, and immediately after that
link and autoneg status are read.  As reported in [0] it can happen
that at the time of this read the PHY has reset the "aneg complete"
bit but not yet the "link up" bit, which can result in a false
link-up detection.  To fix this don't report link as up if we're in
aneg mode and the PHY doesn't signal "aneg complete".

[0] https://marc.info/?t=15641350993&r=1&w=2

Fixes: 4950c2ba49cc ("net: phy: fix autoneg mismatch case in genphy_read_status")
Reported-by: liuyonglong
Tested-by: liuyonglong
Signed-off-by: Heiner Kallweit
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 drivers/net/phy/phy_device.c |    6 ++++++
 1 file changed, 6 insertions(+)

--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1730,6 +1730,12 @@ done:
 	phydev->link = status & BMSR_LSTATUS ? 1 : 0;
 	phydev->autoneg_complete = status & BMSR_ANEGCOMPLETE ? 1 : 0;
 
+	/* Consider the case that autoneg was started and "aneg complete"
+	 * bit has been reset, but "link up" bit not yet.
+	 */
+	if (phydev->autoneg == AUTONEG_ENABLE && !phydev->autoneg_complete)
+		phydev->link = 0;
+
 	return 0;
 }
 EXPORT_SYMBOL(genphy_update_link);
[PATCH 4.19 01/45] scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure
From: Hannes Reinecke

commit 023358b136d490ca91735ac6490db3741af5a8bd upstream.

Gcc-9 complains for a memset across pointer boundaries, which happens
as the code tries to allocate a flexible array on the stack.  Turns
out we cannot do this without relying on gcc-isms, so with this patch
we'll embed the fc_rport_priv structure into fcoe_rport, use the
normal 'container_of' cast, and only have to do a memset over one
structure.

Signed-off-by: Hannes Reinecke
Signed-off-by: Martin K. Petersen
Signed-off-by: Greg Kroah-Hartman

---
 drivers/scsi/fcoe/fcoe_ctlr.c |   51 +++++++++++++++++-------------------------
 drivers/scsi/libfc/fc_rport.c |    5 +++-
 include/scsi/libfcoe.h        |    1 +
 3 files changed, 25 insertions(+), 32 deletions(-)

--- a/drivers/scsi/fcoe/fcoe_ctlr.c
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c
@@ -2017,7 +2017,7 @@ EXPORT_SYMBOL_GPL(fcoe_wwn_from_mac);
  */
 static inline struct fcoe_rport *fcoe_ctlr_rport(struct fc_rport_priv *rdata)
 {
-	return (struct fcoe_rport *)(rdata + 1);
+	return container_of(rdata, struct fcoe_rport, rdata);
 }
 
 /**
@@ -2281,7 +2281,7 @@ static void fcoe_ctlr_vn_start(struct fc
  */
 static int fcoe_ctlr_vn_parse(struct fcoe_ctlr *fip,
 			      struct sk_buff *skb,
-			      struct fc_rport_priv *rdata)
+			      struct fcoe_rport *frport)
 {
 	struct fip_header *fiph;
 	struct fip_desc *desc = NULL;
@@ -2289,16 +2289,12 @@ static int fcoe_ctlr_vn_parse(struct fco
 	struct fip_wwn_desc *wwn = NULL;
 	struct fip_vn_desc *vn = NULL;
 	struct fip_size_desc *size = NULL;
-	struct fcoe_rport *frport;
 	size_t rlen;
 	size_t dlen;
 	u32 desc_mask = 0;
 	u32 dtype;
 	u8 sub;
 
-	memset(rdata, 0, sizeof(*rdata) + sizeof(*frport));
-	frport = fcoe_ctlr_rport(rdata);
-
 	fiph = (struct fip_header *)skb->data;
 	frport->flags = ntohs(fiph->fip_flags);
 
@@ -2361,15 +2357,17 @@ static int fcoe_ctlr_vn_parse(struct fco
 			if (dlen != sizeof(struct fip_wwn_desc))
 				goto len_err;
 			wwn = (struct fip_wwn_desc *)desc;
-			rdata->ids.node_name = get_unaligned_be64(&wwn->fd_wwn);
+			frport->rdata.ids.node_name =
+				get_unaligned_be64(&wwn->fd_wwn);
 			break;
 		case FIP_DT_VN_ID:
 			if (dlen != sizeof(struct fip_vn_desc))
 				goto len_err;
 			vn = (struct fip_vn_desc *)desc;
 			memcpy(frport->vn_mac, vn->fd_mac, ETH_ALEN);
-			rdata->ids.port_id = ntoh24(vn->fd_fc_id);
-			rdata->ids.port_name = get_unaligned_be64(&vn->fd_wwpn);
+			frport->rdata.ids.port_id = ntoh24(vn->fd_fc_id);
+			frport->rdata.ids.port_name =
+				get_unaligned_be64(&vn->fd_wwpn);
 			break;
 		case FIP_DT_FC4F:
 			if (dlen != sizeof(struct fip_fc4_feat))
@@ -2750,10 +2748,7 @@ static int fcoe_ctlr_vn_recv(struct fcoe
 {
 	struct fip_header *fiph;
 	enum fip_vn2vn_subcode sub;
-	struct {
-		struct fc_rport_priv rdata;
-		struct fcoe_rport frport;
-	} buf;
+	struct fcoe_rport frport = { };
 	int rc, vlan_id = 0;
 
 	fiph = (struct fip_header *)skb->data;
@@ -2769,7 +2764,7 @@ static int fcoe_ctlr_vn_recv(struct fcoe
 		goto drop;
 	}
 
-	rc = fcoe_ctlr_vn_parse(fip, skb, &buf.rdata);
+	rc = fcoe_ctlr_vn_parse(fip, skb, &frport);
 	if (rc) {
 		LIBFCOE_FIP_DBG(fip, "vn_recv vn_parse error %d\n", rc);
 		goto drop;
@@ -2778,19 +2773,19 @@ static int fcoe_ctlr_vn_recv(struct fcoe
 	mutex_lock(&fip->ctlr_mutex);
 	switch (sub) {
 	case FIP_SC_VN_PROBE_REQ:
-		fcoe_ctlr_vn_probe_req(fip, &buf.rdata);
+		fcoe_ctlr_vn_probe_req(fip, &frport.rdata);
 		break;
 	case FIP_SC_VN_PROBE_REP:
-		fcoe_ctlr_vn_probe_reply(fip, &buf.rdata);
+		fcoe_ctlr_vn_probe_reply(fip, &frport.rdata);
 		break;
 	case FIP_SC_VN_CLAIM_NOTIFY:
-		fcoe_ctlr_vn_claim_notify(fip, &buf.rdata);
+		fcoe_ctlr_vn_claim_notify(fip, &frport.rdata);
 		break;
 	case FIP_SC_VN_CLAIM_REP:
-		fcoe_ctlr_vn_claim_resp(fip, &buf.rdata);
+		fcoe_ctlr_vn_claim_resp(fip, &frport.rdata);
 		break;
 	case FIP_SC_VN_BEACON:
-		fcoe_ctlr_vn_beacon(fip, &buf.rdata);
+		fcoe_ctlr_vn_beacon(fip, &frport.rdata);
 		break;
 	default:
 		LIBFCOE_FIP_DBG(fip, "vn_recv unknown subcode %d\n",
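The before/after in fcoe_ctlr_rport() illustrates a general pattern:
(rdata + 1) only finds the fcoe_rport when one happens to be allocated
immediately after a fc_rport_priv, whereas embedding the struct and
using container_of encodes the relationship in the type system.  A
hedged standalone illustration (hypothetical types):

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inner { int x; };
struct outer { long tag; struct inner in; };

/* Recover the enclosing object from a pointer to its member by
 * subtracting the member offset -- no assumptions about what happens
 * to live after `inner` in memory. */
static struct outer *outer_from_inner(struct inner *p)
{
	return container_of(p, struct outer, in);
}

int main(void)
{
	struct outer o = { .tag = 42 };
	return outer_from_inner(&o.in)->tag == 42 ? 0 : 1;
}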
[PATCH 4.19 03/45] driver core: Establish order of operations for device_add and device_del via bitflag
commit 3451a495ef244a88ed6317a035299d835554d579 upstream.

Add an additional bit flag to the device_private struct named "dead".

This additional flag provides a guarantee that when a device_del is
executed on a given interface an async worker will not attempt to
attach the driver following the earlier device_del call.  Previously
this guarantee was not present and could result in the device_del
call attempting to remove a driver from an interface only to have the
async worker attempt to probe the driver later when it finally
completes the asynchronous probe call.

One additional change added was that I pulled the check for
dev->driver out of the __device_attach_driver call and instead placed
it in the __device_attach_async_helper call.  This was motivated by
the fact that the only other caller of this, __device_attach, had
already taken the device_lock() and checked for dev->driver.  Instead
of testing for this twice in this path it makes more sense to just
consolidate the dev->dead and dev->driver checks together into one
set of checks.

Reviewed-by: Dan Williams
Reviewed-by: Rafael J. Wysocki
Signed-off-by: Alexander Duyck
Signed-off-by: Greg Kroah-Hartman
Signed-off-by: Sasha Levin

---
 drivers/base/base.h |  4 ++++
 drivers/base/core.c | 11 +++++++++++
 drivers/base/dd.c   | 22 +++++++++++-----------
 3 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 7a419a7a6235b..559b047de9f75 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -66,6 +66,9 @@ struct driver_private {
  *	probed first.
  * @device - pointer back to the struct device that this structure is
  * associated with.
+ * @dead - This device is currently either in the process of or has been
+ *	removed from the system. Any asynchronous events scheduled for this
+ *	device should exit without taking any action.
  *
  * Nothing outside of the driver core should ever touch these fields.
  */
@@ -76,6 +79,7 @@ struct device_private {
 	struct klist_node knode_bus;
 	struct list_head deferred_probe;
 	struct device *device;
+	u8 dead:1;
 };
 #define to_device_private_parent(obj)	\
 	container_of(obj, struct device_private, knode_parent)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 92e2c32c22270..37a90d72f3736 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2050,6 +2050,17 @@ void device_del(struct device *dev)
 	struct kobject *glue_dir = NULL;
 	struct class_interface *class_intf;
 
+	/*
+	 * Hold the device lock and set the "dead" flag to guarantee that
+	 * the update behavior is consistent with the other bitfields near
+	 * it and that we cannot have an asynchronous probe routine trying
+	 * to run while we are tearing out the bus/class/sysfs from
+	 * underneath the device.
+	 */
+	device_lock(dev);
+	dev->p->dead = true;
+	device_unlock(dev);
+
 	/* Notify clients of device removal.  This call must come
 	 * before dpm_sysfs_remove().
 	 */
diff --git a/drivers/base/dd.c b/drivers/base/dd.c
index d48b310c47603..11d24a552ee49 100644
--- a/drivers/base/dd.c
+++ b/drivers/base/dd.c
@@ -725,15 +725,6 @@ static int __device_attach_driver(struct device_driver *drv, void *_data)
 	bool async_allowed;
 	int ret;
 
-	/*
-	 * Check if device has already been claimed. This may
-	 * happen with driver loading, device discovery/registration,
-	 * and deferred probe processing happens all at once with
-	 * multiple threads.
-	 */
-	if (dev->driver)
-		return -EBUSY;
-
 	ret = driver_match_device(drv, dev);
 	if (ret == 0) {
 		/* no match */
@@ -768,6 +759,15 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
 
 	device_lock(dev);
 
+	/*
+	 * Check if device has already been removed or claimed. This may
+	 * happen with driver loading, device discovery/registration,
+	 * and deferred probe processing happens all at once with
+	 * multiple threads.
+	 */
+	if (dev->p->dead || dev->driver)
+		goto out_unlock;
+
 	if (dev->parent)
 		pm_runtime_get_sync(dev->parent);
 
@@ -778,7 +778,7 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie)
 
 	if (dev->parent)
 		pm_runtime_put(dev->parent);
-
+out_unlock:
 	device_unlock(dev);
 
 	put_device(dev);
@@ -891,7 +891,7 @@ static int __driver_attach(struct device *dev, void *data)
 	if (dev->parent && dev->bus->need_parent_lock)
 		device_lock(dev->parent);
 	device_lock(dev);
-	if (!dev->driver)
+	if (!dev->p->dead && !dev->driver)
 		driver_probe_device(drv, dev);
 	device_unlock(dev);
 	if (dev->parent && dev->bus->need_parent_lock)
-- 
2.20.1
[PATCH 5.2 46/56] hv_sock: Fix hang when a connection is closed
From: Dexuan Cui

[ Upstream commit 8c7885e5690be9a27231ebebf82ef29fbf46c4e4 ]

There is a race condition for an established connection that is being
closed by the guest: the refcnt is 4 at the end of hvs_release()
(Note: here the 'remove_sock' is false):

1 for the initial value;
1 for the sk being in the bound list;
1 for the sk being in the connected list;
1 for the delayed close_work.

After hvs_release() finishes, __vsock_release() -> sock_put(sk) *may*
decrease the refcnt to 3.

Concurrently, hvs_close_connection() runs in another thread:
  calls vsock_remove_sock() to decrease the refcnt by 2;
  calls sock_put() to decrease the refcnt to 0, and free the sk;
  next, the "release_sock(sk)" may hang due to use-after-free.

In the above, after hvs_release() finishes, if hvs_close_connection()
runs faster than "__vsock_release() -> sock_put(sk)", then there is
not any issue, because at the beginning of hvs_close_connection(), the
refcnt is still 4.

The issue can be resolved if an extra reference is taken when the
connection is established.

Fixes: a9eeb998c28d ("hv_sock: Add support for delayed close")
Signed-off-by: Dexuan Cui
Reviewed-by: Sunil Muthuswamy
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 net/vmw_vsock/hyperv_transport.c |    8 ++++++++
 1 file changed, 8 insertions(+)

--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -311,6 +311,11 @@ static void hvs_close_connection(struct
 	lock_sock(sk);
 	hvs_do_close_lock_held(vsock_sk(sk), true);
 	release_sock(sk);
+
+	/* Release the refcnt for the channel that's opened in
+	 * hvs_open_connection().
+	 */
+	sock_put(sk);
 }
 
 static void hvs_open_connection(struct vmbus_channel *chan)
@@ -378,6 +383,9 @@ static void hvs_open_connection(struct v
 	}
 
 	set_per_channel_state(chan, conn_from_host ? new : sk);
+
+	/* This reference will be dropped by hvs_close_connection(). */
+	sock_hold(conn_from_host ? new : sk);
 	vmbus_set_chn_rescind_callback(chan, hvs_close_connection);
 
 	/* Set the pending send size to max packet size to always get
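The fix applies the standard rule that an asynchronous callback which
dereferences a socket must own a reference taken when the callback was
armed, dropped only after the callback has run.  A schematic of that
pairing (all names invented; not the hv_sock source):

#include <net/sock.h>

struct channel {                        /* hypothetical */
	void (*rescind_cb)(struct sock *sk);
	struct sock *cb_arg;
};

static void on_channel_rescind(struct sock *sk)
{
	/* ... tear down connection state ... */
	sock_put(sk);      /* drop the reference taken at registration */
}

static void arm_rescind_cb(struct channel *chan, struct sock *sk)
{
	sock_hold(sk);     /* the pending callback now owns a reference */
	chan->rescind_cb = on_channel_rescind;
	chan->cb_arg = sk;
}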
[PATCH 5.2 43/56] selftests/bpf: add wrapper scripts for test_xdp_vlan.sh
From: Jesper Dangaard Brouer

[ Upstream commit d35661fcf95d8818c1f9acc818a1bad23dda4e1c ]

In-order to test both native-XDP (xdpdrv) and generic-XDP (xdpgeneric)
create two wrapper test scripts, that start the test_xdp_vlan.sh
script with these modes.

Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 tools/testing/selftests/bpf/Makefile                      |    3 ++-
 tools/testing/selftests/bpf/test_xdp_vlan.sh              |    5 ++++-
 tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh |    9 +++++++++
 tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh  |    9 +++++++++
 4 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100755 tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh
 create mode 100755 tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh

--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -51,7 +51,8 @@ TEST_PROGS := test_kmod.sh \
 	test_lirc_mode2.sh \
 	test_skb_cgroup_id.sh \
 	test_flow_dissector.sh \
-	test_xdp_vlan.sh \
+	test_xdp_vlan_mode_generic.sh \
+	test_xdp_vlan_mode_native.sh \
 	test_lwt_ip_encap.sh \
 	test_tcp_check_syncookie.sh \
 	test_tc_tunnel.sh \
--- a/tools/testing/selftests/bpf/test_xdp_vlan.sh
+++ b/tools/testing/selftests/bpf/test_xdp_vlan.sh
@@ -2,7 +2,10 @@
 # SPDX-License-Identifier: GPL-2.0
 # Author: Jesper Dangaard Brouer
 
-TESTNAME=xdp_vlan
+# Allow wrapper scripts to name test
+if [ -z "$TESTNAME" ]; then
+    TESTNAME=xdp_vlan
+fi
 
 # Default XDP mode
 XDP_MODE=xdpgeneric
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan_mode_generic.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Exit on failure
+set -e
+
+# Wrapper script to test generic-XDP
+export TESTNAME=xdp_vlan_mode_generic
+./test_xdp_vlan.sh --mode=xdpgeneric
--- /dev/null
+++ b/tools/testing/selftests/bpf/test_xdp_vlan_mode_native.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# Exit on failure
+set -e
+
+# Wrapper script to test native-XDP
+export TESTNAME=xdp_vlan_mode_native
+./test_xdp_vlan.sh --mode=xdpdrv
[PATCH 5.2 42/56] bpf: fix XDP vlan selftests test_xdp_vlan.sh
From: Jesper Dangaard Brouer

[ Upstream commit 4de9c89a4982431c4a02739743fd360dc5581f22 ]

Change BPF selftest test_xdp_vlan.sh to (default) use generic XDP.

This selftest was created together with a fix for generic XDP, in
commit 297249569932 ("net: fix generic XDP to handle if eth header was
mangled").  And was suppose to catch if generic XDP was broken again.

The tests are using veth and assumed that veth driver didn't support
native driver XDP, thus it used the (ip link set) 'xdp' attach that
fell back to generic-XDP.  But veth gained native-XDP support in
948d4f214fde ("veth: Add driver XDP"), which caused this test script
to use native-XDP.

Fixes: 948d4f214fde ("veth: Add driver XDP")
Fixes: 97396ff0bc2d ("selftests/bpf: add XDP selftests for modifying and popping VLAN headers")
Signed-off-by: Jesper Dangaard Brouer
Signed-off-by: David S. Miller
Signed-off-by: Greg Kroah-Hartman

---
 tools/testing/selftests/bpf/test_xdp_vlan.sh |   42 ++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

--- a/tools/testing/selftests/bpf/test_xdp_vlan.sh
+++ b/tools/testing/selftests/bpf/test_xdp_vlan.sh
@@ -1,7 +1,12 @@
 #!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Author: Jesper Dangaard Brouer
 
 TESTNAME=xdp_vlan
 
+# Default XDP mode
+XDP_MODE=xdpgeneric
+
 usage() {
   echo "Testing XDP + TC eBPF VLAN manipulations: $TESTNAME"
   echo ""
@@ -9,9 +14,23 @@ usage() {
   echo "  -v | --verbose : Verbose"
   echo "  --flush        : Flush before starting (e.g. after --interactive)"
   echo "  --interactive  : Keep netns setup running after test-run"
+  echo "  --mode=XXX     : Choose XDP mode (xdp | xdpgeneric | xdpdrv)"
   echo ""
 }
 
+valid_xdp_mode()
+{
+	local mode=$1
+
+	case "$mode" in
+		xdpgeneric | xdpdrv | xdp)
+			return 0
+			;;
+		*)
+			return 1
+	esac
+}
+
 cleanup()
 {
 	local status=$?
@@ -37,7 +56,7 @@ cleanup()
 
 # Using external program "getopt" to get --long-options
 OPTIONS=$(getopt -o hvfi: \
---long verbose,flush,help,interactive,debug -- "$@")
+--long verbose,flush,help,interactive,debug,mode: -- "$@")
 if (( $? != 0 )); then
     usage
     echo "selftests: $TESTNAME [FAILED] Error calling getopt, unknown option?"
@@ -60,6 +79,11 @@ while true; do
 		cleanup
 		shift
 		;;
+	    --mode )
+		shift
+		XDP_MODE=$1
+		shift
+		;;
 	    -- )
 		shift
 		break
@@ -81,8 +105,14 @@ if [ "$EUID" -ne 0 ]; then
 	exit 1
 fi
 
-ip link set dev lo xdp off 2>/dev/null > /dev/null
-if [ $? -ne 0 ];then
+valid_xdp_mode $XDP_MODE
+if [ $? -ne 0 ]; then
+	echo "selftests: $TESTNAME [FAILED] unknown XDP mode ($XDP_MODE)"
+	exit 1
+fi
+
+ip link set dev lo xdpgeneric off 2>/dev/null > /dev/null
+if [ $? -ne 0 ]; then
 	echo "selftests: $TESTNAME [SKIP] need ip xdp support"
 	exit 0
 fi
@@ -166,7 +196,7 @@ export FILE=test_xdp_vlan.o
 
 # First test: Remove VLAN by setting VLAN ID 0, using "xdp_vlan_change"
 export XDP_PROG=xdp_vlan_change
-ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
 
 # In ns1: egress use TC to add back VLAN tag 4011
 #  (del cmd)
@@ -187,8 +217,8 @@ ip netns exec ns1 ping -W 2 -c 3 $IPADDR
 # ETH_P_8021Q indication, and this cause overwriting of our changes.
 #
 export XDP_PROG=xdp_vlan_remove_outer2
-ip netns exec ns1 ip link set $DEVNS1 xdp off
-ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE off
+ip netns exec ns1 ip link set $DEVNS1 $XDP_MODE object $FILE section $XDP_PROG
 
 # Now the namespaces should still be able reach each-other, test with ping:
 ip netns exec ns2 ping -W 2 -c 3 $IPADDR1
[PATCH 5.2 45/56] net: fix bpf_xdp_adjust_head regression for generic-XDP
From: Jesper Dangaard Brouer [ Upstream commit 065af355470519bd184019a93ac579f22b036045 ] When generic-XDP was moved to a later processing step by commit 458bf2f224f0 ("net: core: support XDP generic on stacked devices.") a regression was introduced when using bpf_xdp_adjust_head. The issue is that after this commit the skb->network_header is now changed prior to calling generic XDP and not after. Thus, if the header is changed by XDP (via bpf_xdp_adjust_head), then skb->network_header also needs to be updated again. Fix by calling skb_reset_network_header(). Fixes: 458bf2f224f0 ("net: core: support XDP generic on stacked devices.") Reported-by: Brandon Cazander Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 15 ++- 1 file changed, 10 insertions(+), 5 deletions(-) --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4382,12 +4382,17 @@ static u32 netif_receive_generic_xdp(str act = bpf_prog_run_xdp(xdp_prog, xdp); + /* check if bpf_xdp_adjust_head was used */ off = xdp->data - orig_data; - if (off > 0) - __skb_pull(skb, off); - else if (off < 0) - __skb_push(skb, -off); - skb->mac_header += off; + if (off) { + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + skb->mac_header += off; + skb_reset_network_header(skb); + } /* check if bpf_xdp_adjust_tail was used. it can only "shrink" * pckt.
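For context, a minimal sketch of the kind of XDP program that exercises this path (illustrative, not taken from any of these patches): any program calling bpf_xdp_adjust_head() makes xdp->data differ from orig_data in netif_receive_generic_xdp(), which is exactly the case where skb->mac_header, and now skb->network_header, have to be recomputed.

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        SEC("xdp")
        int shrink_head(struct xdp_md *ctx)
        {
                /* Move the packet start 4 bytes forward, e.g. after an
                 * outer VLAN tag has been removed; a real program would
                 * first copy the Ethernet header over the tag. */
                if (bpf_xdp_adjust_head(ctx, 4))
                        return XDP_ABORTED;
                return XDP_PASS;
        }

        char _license[] SEC("license") = "GPL";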
[PATCH 5.2 51/56] net/mlx5: Add missing RDMA_RX capabilities
From: Maor Gottlieb [ Upstream commit 987f6c69dd923069d443f6a37225f5b1630a30f2 ] New flow table type RDMA_RX was added but the MLX5_CAP_FLOW_TABLE_TYPE didn't handle this new flow table type. This means that MLX5_CAP_FLOW_TABLE_TYPE returns an empty capability to this flow table type. Update both the macro and the maximum supported flow table type to RDMA_RX. Fixes: d83eb50e29de ("net/mlx5: Add support in RDMA RX steering") Signed-off-by: Maor Gottlieb Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.h |5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.h @@ -68,7 +68,7 @@ enum fs_flow_table_type { FS_FT_SNIFFER_RX= 0X5, FS_FT_SNIFFER_TX= 0X6, FS_FT_RDMA_RX = 0X7, - FS_FT_MAX_TYPE = FS_FT_SNIFFER_TX, + FS_FT_MAX_TYPE = FS_FT_RDMA_RX, }; enum fs_flow_table_op_mod { @@ -274,7 +274,8 @@ void mlx5_cleanup_fs(struct mlx5_core_de (type == FS_FT_FDB) ? MLX5_CAP_ESW_FLOWTABLE_FDB(mdev, cap) : \ (type == FS_FT_SNIFFER_RX) ? MLX5_CAP_FLOWTABLE_SNIFFER_RX(mdev, cap) : \ (type == FS_FT_SNIFFER_TX) ? MLX5_CAP_FLOWTABLE_SNIFFER_TX(mdev, cap) : \ - (BUILD_BUG_ON_ZERO(FS_FT_SNIFFER_TX != FS_FT_MAX_TYPE))\ + (type == FS_FT_RDMA_RX) ? MLX5_CAP_FLOWTABLE_RDMA_RX(mdev, cap) : \ + (BUILD_BUG_ON_ZERO(FS_FT_RDMA_RX != FS_FT_MAX_TYPE))\ ) #endif
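Worth noting: the final arm of that macro is a compile-time guard. Because it evaluates BUILD_BUG_ON_ZERO(FS_FT_RDMA_RX != FS_FT_MAX_TYPE), the next person who adds a flow table type but forgets to extend the macro gets a build failure instead of silently empty capabilities again. A self-contained sketch of the pattern (illustrative names, not mlx5 code):

        #include <linux/build_bug.h>

        enum ft_type { FT_NIC_RX, FT_RDMA_RX, FT_MAX_TYPE = FT_RDMA_RX };

        /* One arm per supported type; the fallback arm is a compile-time
         * assertion, so the build breaks if FT_MAX_TYPE grows without a
         * matching arm being added here. */
        #define FT_CAP(type)                                    \
                ((type) == FT_NIC_RX  ? 1 :                     \
                 (type) == FT_RDMA_RX ? 2 :                     \
                 BUILD_BUG_ON_ZERO(FT_RDMA_RX != FT_MAX_TYPE))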
Re: [RFC PATCH v4 9/9] printk: use a new ringbuffer implementation
On Wed, Aug 7, 2019 at 3:27 PM John Ogness wrote: > > 2. For the CONFIG_PPC_POWERNV powerpc platform, kernel log buffer >registration is no longer available because there is no longer >a single contiguous block of memory to represent all of the >ringbuffer. So this is tangential, but I've actually been wishing for a special "raw dump" format that has absolutely *no* structure to it at all, and is as a result not necessarily strictly reliable, but is a lot more robust. The background for that is that we have a class of bugs that are really hard to debug "in the wild", because people don't have access to serial consoles or any kind of special hardware at all (ie forget things like nvram etc), and when the machine locks up you're happy to just have a reset button (but more likely you have to turn power off and on). End result: a DRAM buffer can work, but is not "reliable". Particularly if you turn power on and off, data retention of DRAM is iffy. But it's possible, at least in theory. So I have a patch that implements a "stupid ring buffer" for this case, with absolutely zero data structures (because in the presence of DRAM corruption, all you can get is "hopefully only slightly garbled ASCII"). It actually does work. It's a complete hack, but I have used this on real hardware to see dumps that happened after the machine could no longer send them to any device. I actually suspect that this kind of "stupid non-structured secondary log" can often be much more useful than the existing nvram special cases - yes the output can be garbled for multi-cpu cases because it not only is lockless, it's lockless without even any data structures - but it also works somewhat reliably when the machine is _really_ borked. Which is exactly when you want a log that isn't just the normal "working machine syslog". NOTE! This is *not* a replacement for a lockless printk. This is very much an _additional_ "low overhead buffer in RAM" for post-mortem analysis when anything fancier doesn't work. So I'm throwing this patch out there in case people have interest in looking at that very special case. Also note how right now the example code just steals a random physical memory area at roughly physical location 12GB - this is a hack and would need to be configurable obviously in real life, but it worked for the machines I tested (which both happened to have 16GB of RAM). Those parts are marked with "// HACK HACK HACK" and just a hardcoded physical address (0x320000000). Linus From 074ea67afcaba37996a615c41685cd72b088f583 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 30 May 2019 19:56:13 -0700 Subject: [PATCH] Trial "power off buffer" for printk data retention This circumvents ACPI and just forces a random physical address (which happens to be at 0x320000000) to contain a 64kB buffer that we take over.
Not-yet-signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 7 ++ include/linux/printk.h | 3 + init/main.c | 11 ++ kernel/printk/Makefile | 2 +- kernel/printk/poweroff_buffer.c | 179 kernel/printk/printk.c | 2 + 6 files changed, 203 insertions(+), 1 deletion(-) create mode 100644 kernel/printk/poweroff_buffer.c diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 08a5f4a131f5..2a1d7d7f3f4f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1042,6 +1042,13 @@ void __init setup_arch(char **cmdline_p) early_gart_iommu_check(); #endif + // HACK HACK HACK + // Magic "this RAM survives boot" fake + e820__range_update(0x320000000, 65536, E820_TYPE_RAM, E820_TYPE_RESERVED); + e820__update_table(e820_table); + printk(KERN_INFO "fixed physical RAM map:\n"); + e820__print_table("fake boot-safe buffers"); + /* * partially used pages are not usable - thus * we are rounding upwards: diff --git a/include/linux/printk.h b/include/linux/printk.h index cefd374c47b1..905c47efb98c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -170,6 +170,9 @@ int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); +void poweroff_buffer_log(const char *buf, size_t len); +void poweroff_buffer_register(char *buf, size_t size); + /* * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! */ diff --git a/init/main.c b/init/main.c index 66a196c5e4c3..232778603490 100644 --- a/init/main.c +++ b/init/main.c @@ -1100,6 +1100,17 @@ static int __ref kernel_init(void *unused) system_state = SYSTEM_RUNNING; numa_default_policy(); + // + // HACK HACK HACK + // + { + void *base = ioremap_cache(0x320000000,65536); + if (base) + poweroff_buffer_register(base, 65536); + else + printk("ioremap failed\n"); + } + rcu_end_inkernel_boot(); if (ramdisk_execute_command) { diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4d052fc6bcde..7ca11d92f280 100644
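The poweroff_buffer.c body itself is elided above, but the "absolutely zero data structures" idea fits in a few lines. A minimal sketch of what the log side can look like (illustrative; only the two function signatures are taken from the printk.h hunk above, the rest is not the actual 179-line file):

        #include <linux/types.h>

        static char *pob_buf;           /* the ioremap'ed 64kB window */
        static size_t pob_size;
        static size_t pob_pos;          /* deliberately unlocked: concurrent
                                         * writers can only interleave
                                         * characters, never corrupt any
                                         * metadata, because there is none */

        void poweroff_buffer_register(char *buf, size_t size)
        {
                pob_size = size;
                pob_buf = buf;          /* set last so _log() never sees size 0 */
        }

        void poweroff_buffer_log(const char *buf, size_t len)
        {
                if (!pob_buf)
                        return;
                while (len--)
                        pob_buf[pob_pos++ % pob_size] = *buf++;
        }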
[PATCH 5.2 00/56] 5.2.8-stable review
This is the start of the stable review cycle for the 5.2.8 release. There are 56 patches in this series, all will be posted as a response to this one. If anyone has any issues with these being applied, please let me know. Responses should be made by Sat 10 Aug 2019 07:03:19 PM UTC. Anything received after that time might be too late. The whole patch series can be found in one patch at: https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.8-rc1.gz or in the git tree and branch at: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.2.y and the diffstat can be found below. thanks, greg k-h

- Pseudo-Shortlog of commits:

Greg Kroah-Hartman Linux 5.2.8-rc1
Lukas Wunner spi: bcm2835: Fix 3-wire mode if DMA is enabled
Johannes Berg Revert "mac80211: set NETIF_F_LLTX when using intermediate tx queues"
Dhinakaran Pandiyan drm/i915/vbt: Fix VBT parsing for the PSR section
Arnd Bergmann compat_ioctl: pppoe: fix PPPOEIOCSFWD handling
Aya Levin net/mlx5e: Fix matching of speed to PRM link modes
Maor Gottlieb net/mlx5: Add missing RDMA_RX capabilities
Petr Machata mlxsw: spectrum_buffers: Further reduce pool size on Spectrum-2
Colin Ian King rocker: fix memory leaks of fib_work on two error return paths
Ursula Braun net/smc: avoid fallback in case of non-blocking connect
Heiner Kallweit net: phy: fix race in genphy_update_link
Dexuan Cui hv_sock: Fix hang when a connection is closed
Jesper Dangaard Brouer net: fix bpf_xdp_adjust_head regression for generic-XDP
Jesper Dangaard Brouer selftests/bpf: reduce time to execute test_xdp_vlan.sh
Jesper Dangaard Brouer selftests/bpf: add wrapper scripts for test_xdp_vlan.sh
Jesper Dangaard Brouer bpf: fix XDP vlan selftests test_xdp_vlan.sh
Heiner Kallweit r8169: don't use MSI before RTL8168d
Ariel Levkovich net/mlx5e: Prevent encap flow counter update async to user query
Edward Srouji net/mlx5: Fix modify_cq_in alignment
Alexis Bauvin tun: mark small packets as owned by the tap sock
Jon Maloy tipc: fix unitilized skb list crash
Taras Kondratiuk tipc: compat: allow tipc commands without arguments
Claudiu Manoil ocelot: Cancel delayed work before wq destruction
Johan Hovold NFC: nfcmrvl: fix gpio-handling regression
Frode Isaksen net: stmmac: Use netif_tx_napi_add() for TX polling function
Ursula Braun net/smc: do not schedule tx_work in SMC_CLOSED state
Dmytro Linkin net: sched: use temporary variable for actions indexes
Roman Mashak net sched: update vlan action for batched events operations
Jia-Ju Bai net: sched: Fix a possible null-pointer dereference in dequeue_func()
Subash Abhinov Kasiviswanathan net: qualcomm: rmnet: Fix incorrect UL checksum offload logic
Andreas Schwab net: phy: mscc: initialize stats array
René van Dorst net: phylink: Fix flow control for fixed-link
Arseny Solokha net: phylink: don't start and stop SGMII PHYs in SFP modules twice
Hubert Feurstein net: phy: fixed_phy: print gpio error only if gpio node is present
Mark Zhang net/mlx5: Use reversed order when unregister devices
Qian Cai net/mlx5e: always initialize frag->last_in_page
Jiri Pirko net: fix ifindex collision during namespace removal
Nikolay Aleksandrov net: bridge: move default pvid init/deinit to NETDEV_REGISTER/UNREGISTER
Nikolay Aleksandrov net: bridge: mcast: don't delete permanent entries when fast leave is enabled
Nikolay Aleksandrov net: bridge: delete local fdb on device init failure
Matteo Croce mvpp2: refactor MTU change code
Matteo Croce mvpp2: fix panic on module removal
Jiri Pirko mlxsw: spectrum: Fix error path in mlxsw_sp_module_init()
Haishuang Yan ipip: validate header length in ipip_tunnel_xmit
Haishuang Yan ip6_tunnel: fix possible use-after-free on xmit
Haishuang Yan ip6_gre: reload ipv6h in prepare_ip6gre_xmit_ipv6
Cong Wang ife: error out when nla attributes are empty
Arnaud Patard drivers/net/ethernet/marvell/mvmdio.c: Fix non OF case
Sudarsana Reddy Kalluru bnx2x: Disable multi-cos feature.
Gustavo A. R. Silva atm: iphase: Fix Spectre v1 vulnerability
Sebastian Parschauer HID: Add quirk for HP X1200 PIXART OEM mouse
Aaron Armstrong Skomra HID: wacom: fix bit shift for Cintiq Companion 2
Hillf Danton ALSA: usb-audio: Fix gpf in snd_usb_pipe_sanity_check
Takashi Iwai ALSA: usb-audio: Sanity checks for each pipe and EP types
Dan Williams libnvdimm/bus: Fix wait_nvdimm_bus_probe_idle() ABBA deadlock
Dan Williams libnvdimm/bus: Prepare the nd_ioctl() path to be re-entrant
Hannes Reinecke scsi: fcoe: Embed fc_rport_priv in fcoe_rport structure

- Diffstat:

Makefile | 4 +-
drivers/atm/iphase.c
[PATCH 5.2 39/56] net/mlx5: Fix modify_cq_in alignment
From: Edward Srouji [ Upstream commit 7a32f2962c56d9d8a836b4469855caeee8766bd4 ] Fix modify_cq_in alignment to match the device specification. After this fix the 'cq_umem_valid' field will be at the right offset. Cc: # 4.19 Fixes: bd37197554eb ("net/mlx5: Update mlx5_ifc with DEVX UID bits") Signed-off-by: Edward Srouji Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Saeed Mahameed Signed-off-by: Greg Kroah-Hartman --- include/linux/mlx5/mlx5_ifc.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -5865,10 +5865,12 @@ struct mlx5_ifc_modify_cq_in_bits { struct mlx5_ifc_cqc_bits cq_context; - u8 reserved_at_280[0x40]; + u8 reserved_at_280[0x60]; u8 cq_umem_valid[0x1]; - u8 reserved_at_2c1[0x5bf]; + u8 reserved_at_2e1[0x1f]; + + u8 reserved_at_300[0x580]; u8 pas[0][0x40]; };
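The reserved-field arithmetic checks out (the numbers in the field names are bit offsets, the bracketed sizes bit widths): the old layout placed cq_umem_valid at 0x280 + 0x40 = 0x2c0, the new one at 0x280 + 0x60 = 0x2e0 as the device spec requires, and the total size is unchanged, since 0x2c0 + 0x1 + 0x5bf = 0x2e0 + 0x1 + 0x1f + 0x580 = 0x880, which is where pas[] begins in both layouts.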
[PATCH 5.2 08/56] atm: iphase: Fix Spectre v1 vulnerability
From: "Gustavo A. R. Silva" [ Upstream commit ea443e5e98b5b74e317ef3d26bcaea54931ccdee ] board is controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: drivers/atm/iphase.c:2765 ia_ioctl() warn: potential spectre issue 'ia_dev' [r] (local cap) drivers/atm/iphase.c:2774 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2782 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2816 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2823 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2830 ia_ioctl() warn: potential spectre issue '_ia_dev' [r] (local cap) drivers/atm/iphase.c:2845 ia_ioctl() warn: possible spectre second half. 'iadev' drivers/atm/iphase.c:2856 ia_ioctl() warn: possible spectre second half. 'iadev' Fix this by sanitizing board before using it to index ia_dev and _ia_dev Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://lore.kernel.org/lkml/20180423164740.gy17...@dhcp22.suse.cz/ Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/atm/iphase.c |8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "iphase.h" #include "suni.h" #define swap_byte_order(x) (((x & 0xff) << 8) | ((x & 0xff00) >> 8)) @@ -2760,8 +2761,11 @@ static int ia_ioctl(struct atm_dev *dev, } if (copy_from_user(&ia_cmds, arg, sizeof ia_cmds)) return -EFAULT; board = ia_cmds.status; - if ((board < 0) || (board > iadev_count)) - board = 0; + + if ((board < 0) || (board > iadev_count)) + board = 0; + board = array_index_nospec(board, iadev_count + 1); + iadev = ia_dev[board]; switch (ia_cmds.cmd) { case MEMDUMP:
[PATCH 5.2 04/56] ALSA: usb-audio: Sanity checks for each pipe and EP types
[ Upstream commit 801ebf1043ae7b182588554cc9b9ad3c14bc2ab5 ] The recent USB core code performs sanity checks for the given pipe and EP types, and it can be hit by manipulated USB descriptors by syzbot. For making syzbot happier, this patch introduces a local helper for a sanity check in the driver side and calls it at each place before the message handling, so that we can avoid the WARNING splats. Reported-by: syzbot+d952e5e28f5fb7718...@syzkaller.appspotmail.com Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/usb/helper.c | 17 + sound/usb/helper.h | 1 + sound/usb/quirks.c | 18 +++--- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/sound/usb/helper.c b/sound/usb/helper.c index 84aa265dd802c..71d5f540334a2 100644 --- a/sound/usb/helper.c +++ b/sound/usb/helper.c @@ -63,6 +63,20 @@ void *snd_usb_find_csint_desc(void *buffer, int buflen, void *after, u8 dsubtype return NULL; } +/* check the validity of pipe and EP types */ +int snd_usb_pipe_sanity_check(struct usb_device *dev, unsigned int pipe) +{ + static const int pipetypes[4] = { + PIPE_CONTROL, PIPE_ISOCHRONOUS, PIPE_BULK, PIPE_INTERRUPT + }; + struct usb_host_endpoint *ep; + + ep = usb_pipe_endpoint(dev, pipe); + if (usb_pipetype(pipe) != pipetypes[usb_endpoint_type(&ep->desc)]) + return -EINVAL; + return 0; +} + /* * Wrapper for usb_control_msg(). * Allocates a temp buffer to prevent dmaing from/to the stack. @@ -75,6 +89,9 @@ int snd_usb_ctl_msg(struct usb_device *dev, unsigned int pipe, __u8 request, void *buf = NULL; int timeout; + if (snd_usb_pipe_sanity_check(dev, pipe)) + return -EINVAL; + if (size > 0) { buf = kmemdup(data, size, GFP_KERNEL); if (!buf) diff --git a/sound/usb/helper.h b/sound/usb/helper.h index d338bd0e0ca60..6afb70156ec4f 100644 --- a/sound/usb/helper.h +++ b/sound/usb/helper.h @@ -7,6 +7,7 @@ unsigned int snd_usb_combine_bytes(unsigned char *bytes, int size); void *snd_usb_find_desc(void *descstart, int desclen, void *after, u8 dtype); void *snd_usb_find_csint_desc(void *descstart, int desclen, void *after, u8 dsubtype); +int snd_usb_pipe_sanity_check(struct usb_device *dev, unsigned int pipe); int snd_usb_ctl_msg(struct usb_device *dev, unsigned int pipe, __u8 request, __u8 requesttype, __u16 value, __u16 index, void *data, __u16 size); diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index cf5cff10c08e8..78858918cbc10 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -828,11 +828,13 @@ static int snd_usb_novation_boot_quirk(struct usb_device *dev) static int snd_usb_accessmusic_boot_quirk(struct usb_device *dev) { int err, actual_length; - /* "midi send" enable */ static const u8 seq[] = { 0x4e, 0x73, 0x52, 0x01 }; + void *buf; - void *buf = kmemdup(seq, ARRAY_SIZE(seq), GFP_KERNEL); + if (snd_usb_pipe_sanity_check(dev, usb_sndintpipe(dev, 0x05))) + return -EINVAL; + buf = kmemdup(seq, ARRAY_SIZE(seq), GFP_KERNEL); if (!buf) return -ENOMEM; err = usb_interrupt_msg(dev, usb_sndintpipe(dev, 0x05), buf, @@ -857,7 +859,11 @@ static int snd_usb_accessmusic_boot_quirk(struct usb_device *dev) static int snd_usb_nativeinstruments_boot_quirk(struct usb_device *dev) { - int ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), + int ret; + + if (snd_usb_pipe_sanity_check(dev, usb_sndctrlpipe(dev, 0))) + return -EINVAL; + ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), 0xaf, USB_TYPE_VENDOR | USB_RECIP_DEVICE, 1, 0, NULL, 0, 1000); @@ -964,6 +970,8 @@ static int snd_usb_axefx3_boot_quirk(struct usb_device *dev) dev_dbg(&dev->dev, "Waiting for Axe-Fx III to boot up...\n"); 
+ if (snd_usb_pipe_sanity_check(dev, usb_sndctrlpipe(dev, 0))) + return -EINVAL; /* If the Axe-Fx III has not fully booted, it will timeout when trying * to enable the audio streaming interface. A more generous timeout is * used here to detect when the Axe-Fx III has finished booting as the @@ -996,6 +1004,8 @@ static int snd_usb_motu_microbookii_communicate(struct usb_device *dev, u8 *buf, { int err, actual_length; + if (snd_usb_pipe_sanity_check(dev, usb_sndintpipe(dev, 0x01))) + return -EINVAL; err = usb_interrupt_msg(dev, usb_sndintpipe(dev, 0x01), buf, *length, &actual_length, 1000); if (err < 0) @@ -1006,6 +1016,8 @@ static int snd_usb_motu_microbookii_communicate(struct usb_device *dev, u8 *buf, memset(buf, 0, buf_size); + if (snd_usb_pipe_sanity_check(dev, usb_rcvintpipe(dev, 0x82))) + return -EINV
[PATCH 5.2 37/56] tipc: fix unitilized skb list crash
From: Jon Maloy [ Upstream commit 2948a1fcd77a8bb11604387e3fc52f0ebf5729e9 ] Our test suite sometimes provokes the following crash: Description of problem:
[ 1092.597234] BUG: unable to handle kernel NULL pointer dereference at 00000000000000e8
[ 1092.605072] PGD 0 P4D 0
[ 1092.607620] Oops: 0000 [#1] SMP PTI
[ 1092.68] CPU: 37 PID: 0 Comm: swapper/37 Kdump: loaded Not tainted 4.18.0-122.el8.x86_64 #1
[ 1092.619724] Hardware name: Dell Inc. PowerEdge R740/08D89F, BIOS 1.3.7 02/08/2018
[ 1092.627215] RIP: 0010:tipc_mcast_filter_msg+0x93/0x2d0 [tipc]
[ 1092.632955] Code: 0f 84 aa 01 00 00 89 cf 4d 01 ca 4c 8b 26 c1 ef 19 83 e7 0f 83 ff 0c 4d 0f 45 d1 41 8b 6a 10 0f cd 4c 39 e6 0f 84 81 01 00 00 <4d> 8b 9c 24 e8 00 00 00 45 8b 13 41 0f ca 44 89 d7 c1 ef 13 83 e7
[...]
[ 1092.730556] Call Trace:
[ 1092.733010] <IRQ>
[ 1092.735034] tipc_sk_filter_rcv+0x7ca/0xb80 [tipc]
[ 1092.739828] ? __kmalloc_node_track_caller+0x1cb/0x290
[ 1092.744974] ? dev_hard_start_xmit+0xa5/0x210
[ 1092.749332] tipc_sk_rcv+0x389/0x640 [tipc]
[ 1092.753519] tipc_sk_mcast_rcv+0x23c/0x3a0 [tipc]
[ 1092.758224] tipc_rcv+0x57a/0xf20 [tipc]
[ 1092.762154] ? ktime_get_real_ts64+0x40/0xe0
[ 1092.766432] ? tpacket_rcv+0x50/0x9f0
[ 1092.770098] tipc_l2_rcv_msg+0x4a/0x70 [tipc]
[ 1092.774452] __netif_receive_skb_core+0xb62/0xbd0
[ 1092.779164] ? enqueue_entity+0xf6/0x630
[ 1092.783084] ? kmem_cache_alloc+0x158/0x1c0
[ 1092.787272] ? __build_skb+0x25/0xd0
[ 1092.790849] netif_receive_skb_internal+0x42/0xf0
[ 1092.795557] napi_gro_receive+0xba/0xe0
[ 1092.799417] mlx5e_handle_rx_cqe+0x83/0xd0 [mlx5_core]
[ 1092.804564] mlx5e_poll_rx_cq+0xd5/0x920 [mlx5_core]
[ 1092.809536] mlx5e_napi_poll+0xb2/0xce0 [mlx5_core]
[ 1092.814415] ? __wake_up_common_lock+0x89/0xc0
[ 1092.818861] net_rx_action+0x149/0x3b0
[ 1092.822616] __do_softirq+0xe3/0x30a
[ 1092.826193] irq_exit+0x100/0x110
[ 1092.829512] do_IRQ+0x85/0xd0
[ 1092.832483] common_interrupt+0xf/0xf
[ 1092.836147] </IRQ>
[ 1092.838255] RIP: 0010:cpuidle_enter_state+0xb7/0x2a0
[ 1092.843221] Code: e8 3e 79 a5 ff 80 7c 24 03 00 74 17 9c 58 0f 1f 44 00 00 f6 c4 02 0f 85 d7 01 00 00 31 ff e8 a0 6b ab ff fb 66 0f 1f 44 00 00 <48> b8 ff ff ff ff f3 01 00 00 4c 29 f3 ba ff ff ff 7f 48 39 c3 7f
[...]
[ 1092.905196] ? cpuidle_enter_state+0x92/0x2a0
[ 1092.909555] do_idle+0x236/0x280
[ 1092.912785] cpu_startup_entry+0x6f/0x80
[ 1092.916715] start_secondary+0x1a7/0x200
[ 1092.920642] secondary_startup_64+0xb7/0xc0
[...]
The reason is that the skb list tipc_socket::mc_method.deferredq is only initialized for connectionless sockets, while nothing stops arriving multicast messages from being filtered by connection oriented sockets, with subsequent access to the said list. We fix this by initializing the list unconditionally at socket creation. This eliminates the crash, while the message is still dropped further down in tipc_sk_filter_rcv() as it should be. Reported-by: Li Shuang Signed-off-by: Jon Maloy Reviewed-by: Xin Long Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/socket.c |3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -485,9 +485,8 @@ static int tipc_sk_create(struct net *ne tsk_set_unreturnable(tsk, true); if (sock->type == SOCK_DGRAM) tsk_set_unreliable(tsk, true); - __skb_queue_head_init(&tsk->mc_method.deferredq); } - + __skb_queue_head_init(&tsk->mc_method.deferredq); trace_tipc_sk_create(sk, NULL, TIPC_DUMP_NONE, " "); re
[PATCH 5.2 30/56] net sched: update vlan action for batched events operations
From: Roman Mashak [ Upstream commit b35475c5491a14c8ce7a5046ef7bcda8a860581a ] Add get_fill_size() routine used to calculate the action size when building a batch of events. Fixes: c7e2b9689 ("sched: introduce vlan action") Signed-off-by: Roman Mashak Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_vlan.c |9 + 1 file changed, 9 insertions(+) --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -306,6 +306,14 @@ static int tcf_vlan_search(struct net *n return tcf_idr_search(tn, a, index); } +static size_t tcf_vlan_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_vlan)) + + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_ID */ + + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_PROTOCOL */ + + nla_total_size(sizeof(u8)); /* TCA_VLAN_PUSH_VLAN_PRIORITY */ +} + static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .id = TCA_ID_VLAN, @@ -315,6 +323,7 @@ static struct tc_action_ops act_vlan_ops .init = tcf_vlan_init, .cleanup= tcf_vlan_cleanup, .walk = tcf_vlan_walker, + .get_fill_size = tcf_vlan_get_fill_size, .lookup = tcf_vlan_search, .size = sizeof(struct tcf_vlan), };
[PATCH 5.2 32/56] net/smc: do not schedule tx_work in SMC_CLOSED state
From: Ursula Braun [ Upstream commit f9cedf1a9b1cdcfb0c52edb391d01771e43994a4 ] The setsockopt options TCP_NODELAY and TCP_CORK may schedule the tx worker. Make sure the socket is not yet moved into SMC_CLOSED state (for instance by a shutdown SHUT_RDWR call). Reported-by: syzbot+92209502e7aab127c...@syzkaller.appspotmail.com Reported-by: syzbot+b972214bb803a343f...@syzkaller.appspotmail.com Fixes: 01d2f7e2cdd31 ("net/smc: sockopts TCP_NODELAY and TCP_CORK") Signed-off-by: Ursula Braun Signed-off-by: Karsten Graul Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/smc/af_smc.c |8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1741,14 +1741,18 @@ static int smc_setsockopt(struct socket } break; case TCP_NODELAY: - if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { if (val && !smc->use_fallback) mod_delayed_work(system_wq, &smc->conn.tx_work, 0); } break; case TCP_CORK: - if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (sk->sk_state != SMC_INIT && + sk->sk_state != SMC_LISTEN && + sk->sk_state != SMC_CLOSED) { if (!val && !smc->use_fallback) mod_delayed_work(system_wq, &smc->conn.tx_work, 0);
[PATCH 5.2 07/56] HID: Add quirk for HP X1200 PIXART OEM mouse
From: Sebastian Parschauer commit 49869d2ea9eecc105a10724c1abf035151a3c4e2 upstream. The PixArt OEM mice are known for disconnecting every minute in runlevel 1 or 3 if they are not always polled. So add quirk ALWAYS_POLL for this one as well. Jonathan Teh (@jonathan-teh) reported and tested the quirk. Reference: https://github.com/sriemer/fix-linux-mouse/issues/15 Signed-off-by: Sebastian Parschauer CC: sta...@vger.kernel.org Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-ids.h|1 + drivers/hid/hid-quirks.c |1 + 2 files changed, 2 insertions(+) --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -568,6 +568,7 @@ #define USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A 0x0b4a #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE 0x134a #define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A0x094a +#define USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_06410x0641 #define USB_VENDOR_ID_HUION0x256c #define USB_DEVICE_ID_HUION_TABLET 0x006e --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -91,6 +91,7 @@ static const struct hid_device_id hid_qu { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_LOGITECH_OEM_USB_OPTICAL_MOUSE_0B4A), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_094A), HID_QUIRK_ALWAYS_POLL }, + { HID_USB_DEVICE(USB_VENDOR_ID_HP, USB_PRODUCT_ID_HP_PIXART_OEM_USB_OPTICAL_MOUSE_0641), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_IDEACOM, USB_DEVICE_ID_IDEACOM_IDC6680), HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_INNOMEDIA, USB_DEVICE_ID_INNEX_GENESIS_ATARI), HID_QUIRK_MULTI_INPUT }, { HID_USB_DEVICE(USB_VENDOR_ID_KYE, USB_DEVICE_ID_KYE_EASYPEN_M610X), HID_QUIRK_MULTI_INPUT },
[PATCH 5.2 28/56] net: qualcomm: rmnet: Fix incorrect UL checksum offload logic
From: Subash Abhinov Kasiviswanathan [ Upstream commit a7cf3d24ee6081930feb4c830a7f6f16ebe31c49 ] The udp_ip4_ind bit is set only for IPv4 UDP non-fragmented packets so that the hardware can flip the checksum to 0xFFFF if the computed checksum is 0 per RFC768. However, this bit had to be set for IPv6 UDP non-fragmented packets as well per hardware requirements. Otherwise, IPv6 UDP packets with computed checksum as 0 were transmitted by hardware and were dropped in the network. In addition to setting this bit for IPv6 UDP, the field is also appropriately renamed to udp_ind as part of this change. Fixes: 5eb5f8608ef1 ("net: qualcomm: rmnet: Add support for TX checksum offload") Cc: Sean Tranchetti Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h |2 +- drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 13 + 2 files changed, 10 insertions(+), 5 deletions(-) --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h @@ -51,7 +51,7 @@ struct rmnet_map_dl_csum_trailer { struct rmnet_map_ul_csum_header { __be16 csum_start_offset; u16 csum_insert_offset:14; - u16 udp_ip4_ind:1; + u16 udp_ind:1; u16 csum_enabled:1; } __aligned(1); --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -206,9 +206,9 @@ rmnet_map_ipv4_ul_csum_header(void *iphd ul_header->csum_insert_offset = skb->csum_offset; ul_header->csum_enabled = 1; if (ip4h->protocol == IPPROTO_UDP) - ul_header->udp_ip4_ind = 1; + ul_header->udp_ind = 1; else - ul_header->udp_ip4_ind = 0; + ul_header->udp_ind = 0; /* Changing remaining fields to network order */ hdr++; @@ -239,6 +239,7 @@ rmnet_map_ipv6_ul_csum_header(void *ip6h struct rmnet_map_ul_csum_header *ul_header, struct sk_buff *skb) { + struct ipv6hdr *ip6h = (struct ipv6hdr *)ip6hdr; __be16 *hdr = (__be16 *)ul_header, offset; offset = htons((__force u16)(skb_transport_header(skb) - @@ -246,7 +247,11 @@ rmnet_map_ipv6_ul_csum_header(void *ip6h ul_header->csum_start_offset = offset; ul_header->csum_insert_offset = skb->csum_offset; ul_header->csum_enabled = 1; - ul_header->udp_ip4_ind = 0; + + if (ip6h->nexthdr == IPPROTO_UDP) + ul_header->udp_ind = 1; + else + ul_header->udp_ind = 0; /* Changing remaining fields to network order */ hdr++; @@ -419,7 +424,7 @@ sw_csum: ul_header->csum_start_offset = 0; ul_header->csum_insert_offset = 0; ul_header->csum_enabled = 0; - ul_header->udp_ip4_ind = 0; + ul_header->udp_ind = 0; priv->stats.csum_sw++; }
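The RFC 768 rule that the udp_ind bit asks the hardware to apply: a UDP checksum that computes to zero must go on the wire as all ones, because an on-wire zero means "no checksum was generated" (and IPv6 forbids that entirely). In software the kernel does the same fold, roughly like this (a sketch, not rmnet code):

        #include <linux/skbuff.h>
        #include <linux/udp.h>

        static void udp_fold_zero_csum(struct udphdr *uh)
        {
                /* A computed checksum of 0 is transmitted as 0xffff
                 * per RFC 768. */
                if (uh->check == 0)
                        uh->check = CSUM_MANGLED_0;     /* 0xffff */
        }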
[PATCH 5.2 29/56] net: sched: Fix a possible null-pointer dereference in dequeue_func()
From: Jia-Ju Bai [ Upstream commit 051c7b39be4a91f6b7d8c4548444e4b850f1f56c ] In dequeue_func(), there is an if statement on line 74 to check whether skb is NULL: if (skb) When skb is NULL, it is used on line 77: prefetch(&skb->end); Thus, a possible null-pointer dereference may occur. To fix this bug, skb->end is used when skb is not NULL. This bug is found by a static analysis tool STCheck written by us. Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Signed-off-by: Jia-Ju Bai Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_codel.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -71,10 +71,10 @@ static struct sk_buff *dequeue_func(stru struct Qdisc *sch = ctx; struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); - if (skb) + if (skb) { sch->qstats.backlog -= qdisc_pkt_len(skb); - - prefetch(&skb->end); /* we'll need skb_shinfo() */ + prefetch(&skb->end); /* we'll need skb_shinfo() */ + } return skb; }
[PATCH 5.2 36/56] tipc: compat: allow tipc commands without arguments
From: Taras Kondratiuk [ Upstream commit 4da5f0018eef4c0de31675b670c80e82e13e99d1 ] Commit 2753ca5d9009 ("tipc: fix uninit-value in tipc_nl_compat_doit") broke older tipc tools that use compat interface (e.g. tipc-config from tipcutils package): % tipc-config -p operation not supported The commit started to reject TIPC netlink compat messages that do not have attributes. It is too restrictive because some such messages are valid (they don't need any arguments): % grep 'tx none' include/uapi/linux/tipc_config.h #define TIPC_CMD_NOOP 0x0000 /* tx none, rx none */ #define TIPC_CMD_GET_MEDIA_NAMES 0x0002 /* tx none, rx media_name(s) */ #define TIPC_CMD_GET_BEARER_NAMES 0x0003 /* tx none, rx bearer_name(s) */ #define TIPC_CMD_SHOW_PORTS 0x0006 /* tx none, rx ultra_string */ #define TIPC_CMD_GET_REMOTE_MNG 0x4003 /* tx none, rx unsigned */ #define TIPC_CMD_GET_MAX_PORTS 0x4004 /* tx none, rx unsigned */ #define TIPC_CMD_GET_NETID 0x400B /* tx none, rx unsigned */ #define TIPC_CMD_NOT_NET_ADMIN 0xC001 /* tx none, rx none */ This patch relaxes the original fix and rejects messages without arguments only if such arguments are expected by a command (reg_type is non zero). Fixes: 2753ca5d9009 ("tipc: fix uninit-value in tipc_nl_compat_doit") Cc: sta...@vger.kernel.org Signed-off-by: Taras Kondratiuk Acked-by: Ying Xue Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/netlink_compat.c | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -55,6 +55,7 @@ struct tipc_nl_compat_msg { int rep_type; int rep_size; int req_type; + int req_size; struct net *net; struct sk_buff *rep; struct tlv_desc *req; @@ -257,7 +258,8 @@ static int tipc_nl_compat_dumpit(struct int err; struct sk_buff *arg; - if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type)) + if (msg->req_type && (!msg->req_size || + !TLV_CHECK_TYPE(msg->req, msg->req_type))) return -EINVAL; msg->rep = tipc_tlv_alloc(msg->rep_size); @@ -354,7 +356,8 @@ static int tipc_nl_compat_doit(struct ti { int err; - if (msg->req_type && !TLV_CHECK_TYPE(msg->req, msg->req_type)) + if (msg->req_type && (!msg->req_size || + !TLV_CHECK_TYPE(msg->req, msg->req_type))) return -EINVAL; err = __tipc_nl_compat_doit(cmd, msg); @@ -1288,8 +1291,8 @@ static int tipc_nl_compat_recv(struct sk goto send; } - len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN); - if (!len || !TLV_OK(msg.req, len)) { + msg.req_size = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN); + if (msg.req_size && !TLV_OK(msg.req, msg.req_size)) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); err = -EOPNOTSUPP; goto send;
[PATCH 5.2 27/56] net: phy: mscc: initialize stats array
From: Andreas Schwab [ Upstream commit f972037e71246c5e0916eef835174d58ffc517e4 ] The memory allocated for the stats array may contain arbitrary data. Fixes: e4f9ba642f0b ("net: phy: mscc: add support for VSC8514 PHY.") Fixes: 00d70d8e0e78 ("net: phy: mscc: add support for VSC8574 PHY") Fixes: a5afc1678044 ("net: phy: mscc: add support for VSC8584 PHY") Fixes: f76178dc5218 ("net: phy: mscc: add ethtool statistics counters") Signed-off-by: Andreas Schwab Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/mscc.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) --- a/drivers/net/phy/mscc.c +++ b/drivers/net/phy/mscc.c @@ -2226,8 +2226,8 @@ static int vsc8514_probe(struct phy_devi vsc8531->supp_led_modes = VSC85XX_SUPP_LED_MODES; vsc8531->hw_stats = vsc85xx_hw_stats; vsc8531->nstats = ARRAY_SIZE(vsc85xx_hw_stats); - vsc8531->stats = devm_kmalloc_array(&phydev->mdio.dev, vsc8531->nstats, - sizeof(u64), GFP_KERNEL); + vsc8531->stats = devm_kcalloc(&phydev->mdio.dev, vsc8531->nstats, + sizeof(u64), GFP_KERNEL); if (!vsc8531->stats) return -ENOMEM; @@ -2251,8 +2251,8 @@ static int vsc8574_probe(struct phy_devi vsc8531->supp_led_modes = VSC8584_SUPP_LED_MODES; vsc8531->hw_stats = vsc8584_hw_stats; vsc8531->nstats = ARRAY_SIZE(vsc8584_hw_stats); - vsc8531->stats = devm_kmalloc_array(&phydev->mdio.dev, vsc8531->nstats, - sizeof(u64), GFP_KERNEL); + vsc8531->stats = devm_kcalloc(&phydev->mdio.dev, vsc8531->nstats, + sizeof(u64), GFP_KERNEL); if (!vsc8531->stats) return -ENOMEM; @@ -2281,8 +2281,8 @@ static int vsc8584_probe(struct phy_devi vsc8531->supp_led_modes = VSC8584_SUPP_LED_MODES; vsc8531->hw_stats = vsc8584_hw_stats; vsc8531->nstats = ARRAY_SIZE(vsc8584_hw_stats); - vsc8531->stats = devm_kmalloc_array(&phydev->mdio.dev, vsc8531->nstats, - sizeof(u64), GFP_KERNEL); + vsc8531->stats = devm_kcalloc(&phydev->mdio.dev, vsc8531->nstats, + sizeof(u64), GFP_KERNEL); if (!vsc8531->stats) return -ENOMEM; @@ -2311,8 +2311,8 @@ static int vsc85xx_probe(struct phy_devi vsc8531->supp_led_modes = VSC85XX_SUPP_LED_MODES; vsc8531->hw_stats = vsc85xx_hw_stats; vsc8531->nstats = ARRAY_SIZE(vsc85xx_hw_stats); - vsc8531->stats = devm_kmalloc_array(&phydev->mdio.dev, vsc8531->nstats, - sizeof(u64), GFP_KERNEL); + vsc8531->stats = devm_kcalloc(&phydev->mdio.dev, vsc8531->nstats, + sizeof(u64), GFP_KERNEL); if (!vsc8531->stats) return -ENOMEM;
[PATCH 5.2 22/56] net/mlx5e: always initialize frag->last_in_page
From: Qian Cai [ Upstream commit 60d60c8fbd8d1acf25b041ecd72ae4fa16e9405b ] The commit 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue memory scheme") introduced the undefined behaviour below, because "frag->last_in_page" is only initialized in mlx5e_init_frags_partition() when, if (next_frag.offset + frag_info[f].frag_stride > PAGE_SIZE) or after bailing out of the loop, for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) As a result, some "frag" entries could have an uninitialized value of "last_in_page". Later, get_frag() obtains those "frag" and checks "frag->last_in_page" in mlx5e_put_rx_frag() and triggers the error during boot. Fix it by always initializing "frag->last_in_page" to "false" in mlx5e_init_frags_partition(). UBSAN: Undefined behaviour in drivers/net/ethernet/mellanox/mlx5/core/en_rx.c:325:12 load of value 170 is not a valid value for type 'bool' (aka '_Bool') Call trace: dump_backtrace+0x0/0x264 show_stack+0x20/0x2c dump_stack+0xb0/0x104 __ubsan_handle_load_invalid_value+0x104/0x128 mlx5e_handle_rx_cqe+0x8e8/0x12cc [mlx5_core] mlx5e_poll_rx_cq+0xca8/0x1a94 [mlx5_core] mlx5e_napi_poll+0x17c/0xa30 [mlx5_core] net_rx_action+0x248/0x940 __do_softirq+0x350/0x7b8 irq_exit+0x200/0x26c __handle_domain_irq+0xc8/0x128 gic_handle_irq+0x138/0x228 el1_irq+0xb8/0x140 arch_cpu_idle+0x1a4/0x348 do_idle+0x114/0x1b0 cpu_startup_entry+0x24/0x28 rest_init+0x1ac/0x1dc arch_call_rest_init+0x10/0x18 start_kernel+0x4d4/0x57c Fixes: 069d11465a80 ("net/mlx5e: RX, Enhance legacy Receive Queue memory scheme") Signed-off-by: Qian Cai Reviewed-by: Tariq Toukan Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c |5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -340,12 +340,11 @@ static inline u64 mlx5e_get_mpwqe_offset static void mlx5e_init_frags_partition(struct mlx5e_rq *rq) { - struct mlx5e_wqe_frag_info next_frag, *prev; + struct mlx5e_wqe_frag_info next_frag = {}; + struct mlx5e_wqe_frag_info *prev = NULL; int i; next_frag.di = &rq->wqe.di[0]; - next_frag.offset = 0; - prev = NULL; for (i = 0; i < mlx5_wq_cyc_get_size(&rq->wqe.wq); i++) { struct mlx5e_rq_frag_info *frag_info = &rq->wqe.info.arr[0];
[PATCH 5.2 21/56] net: fix ifindex collision during namespace removal
From: Jiri Pirko [ Upstream commit 55b40dbf0e76b4bfb9d8b3a16a0208640a9a45df ] Commit aca51397d014 ("netns: Fix arbitrary net_device-s corruptions on net_ns stop.") introduced a possibility to hit a BUG in case a device is returning back to init_net and the two following conditions are met: 1) dev->ifindex value is used in a name of another "dev%d" device in init_net. 2) dev->name is used by another device in init_net. Under real life circumstances this is hard to get. Therefore this has been present happily for over 10 years. To reproduce:
$ ip a
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
3: enp0s2: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
$ ip netns add ns1
$ ip -n ns1 link add dummy1ns1 type dummy
$ ip -n ns1 link add dummy2ns1 type dummy
$ ip link set enp0s2 netns ns1
$ ip -n ns1 link set enp0s2 name dummy0
[ 100.858894] virtio_net virtio0 dummy0: renamed from enp0s2
$ ip link add dev4 type dummy
$ ip -n ns1 a
1: lo: mtu 65536 qdisc noop state DOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: dummy1ns1: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 16:63:4c:38:3e:ff brd ff:ff:ff:ff:ff:ff
3: dummy2ns1: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether aa:9e:86:dd:6b:5d brd ff:ff:ff:ff:ff:ff
4: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 52:54:00:12:34:56 brd ff:ff:ff:ff:ff:ff
$ ip a
1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
       valid_lft forever preferred_lft forever
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: dummy0: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 86:89:3f:86:61:29 brd ff:ff:ff:ff:ff:ff
4: dev4: mtu 1500 qdisc noop state DOWN group default qlen 1000
    link/ether 5a:e1:4a:b6:ec:f8 brd ff:ff:ff:ff:ff:ff
$ ip netns del ns1
[ 158.717795] default_device_exit: failed to move dummy0 to init_net: -17
[ 158.719316] ------------[ cut here ]------------
[ 158.720591] kernel BUG at net/core/dev.c:9824!
[ 158.722260] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 158.723728] CPU: 0 PID: 56 Comm: kworker/u2:1 Not tainted 5.3.0-rc1+ #18
[ 158.725422] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-2.fc30 04/01/2014
[ 158.727508] Workqueue: netns cleanup_net
[ 158.728915] RIP: 0010:default_device_exit.cold+0x1d/0x1f
[ 158.730683] Code: 84 e8 18 c9 3e fe 0f 0b e9 70 90 ff ff e8 36 e4 52 fe 89 d9 4c 89 e2 48 c7 c6 80 d6 25 84 48 c7 c7 20 c0 25 84 e8 f4 c8 3e
[...]
[ 158.762758] Call Trace:
[ 158.763882] ? dev_change_net_namespace+0xbb0/0xbb0
[ 158.766148] ? devlink_nl_cmd_set_doit+0x520/0x520
[ 158.768034] ? dev_change_net_namespace+0xbb0/0xbb0
[ 158.769870] ops_exit_list.isra.0+0xa8/0x150
[ 158.771544] cleanup_net+0x446/0x8f0
[ 158.772945] ? unregister_pernet_operations+0x4a0/0x4a0
[ 158.775294] process_one_work+0xa1a/0x1740
[ 158.776896] ? pwq_dec_nr_in_flight+0x310/0x310
[ 158.779143] ? do_raw_spin_lock+0x11b/0x280
[ 158.780848] worker_thread+0x9e/0x1060
[ 158.782500] ? process_one_work+0x1740/0x1740
[ 158.784454] kthread+0x31b/0x420
[ 158.786082] ? __kthread_create_on_node+0x3f0/0x3f0
[ 158.788286] ret_from_fork+0x3a/0x50
[ 158.789871] ---[ end trace defd6c657c71f936 ]---
[...]
[PATCH 5.2 15/56] mlxsw: spectrum: Fix error path in mlxsw_sp_module_init()
From: Jiri Pirko [ Upstream commit 28fe79000e9b0a6f99959869947f1ca305f14599 ] In case sp2 pci driver registration fails, fix the error path to start with sp1 pci driver unregistration. Fixes: c3ab435466d5 ("mlxsw: spectrum: Extend to support Spectrum-2 ASIC") Signed-off-by: Jiri Pirko Signed-off-by: Ido Schimmel Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/mellanox/mlxsw/spectrum.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -5989,7 +5989,7 @@ static int __init mlxsw_sp_module_init(v return 0; err_sp2_pci_driver_register: - mlxsw_pci_driver_unregister(&mlxsw_sp2_pci_driver); + mlxsw_pci_driver_unregister(&mlxsw_sp1_pci_driver); err_sp1_pci_driver_register: mlxsw_core_driver_unregister(&mlxsw_sp2_driver); err_sp2_core_driver_register:
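The shape of the bug being fixed, reduced to a sketch (illustrative names; register_driver()/unregister_driver() are stand-ins, not real APIs): when the second registration fails, only the first driver is registered, so the matching error label must unregister the first one, not the second.

        static int __init example_module_init(void)
        {
                int err;

                err = register_driver(&sp1_driver);     /* hypothetical */
                if (err)
                        return err;

                err = register_driver(&sp2_driver);     /* hypothetical */
                if (err)
                        goto err_sp2_register;

                return 0;

        err_sp2_register:
                /* sp2 was never registered; undo sp1, the step before it */
                unregister_driver(&sp1_driver);
                return err;
        }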
[PATCH 5.2 11/56] ife: error out when nla attributes are empty
From: Cong Wang [ Upstream commit c8ec4632c6ac9cda0e8c3d51aa41eeab66585bd5 ] act_ife at least requires TCA_IFE_PARMS, so we have to bail out when there is no attribute passed in. Reported-by: syzbot+fbb5b288c9cb6a2ee...@syzkaller.appspotmail.com Fixes: ef6980b6becb ("introduce IFE action") Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_ife.c |5 + 1 file changed, 5 insertions(+) --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -481,6 +481,11 @@ static int tcf_ife_init(struct net *net, int ret = 0; int err; + if (!nla) { + NL_SET_ERR_MSG_MOD(extack, "IFE requires attributes to be passed"); + return -EINVAL; + } + err = nla_parse_nested_deprecated(tb, TCA_IFE_MAX, nla, ife_policy, NULL); if (err < 0)
[PATCH] coresight: Serialize enabling/disabling a link device.
When tracing etm data of multiple threads on multiple cpus through perf interface, some link devices are shared between paths of different cpus. It creates race conditions when different cpus want to enable/disable the same link device at the same time. Example 1: Two cpus want to enable different ports of a coresight funnel, thus calling the funnel enable operation at the same time. But the funnel enable operation isn't reentrant. Example 2: For an enabled coresight dynamic replicator with refcnt=1, one cpu wants to disable it, while another cpu wants to enable it. Ideally we still have an enabled replicator with refcnt=1 at the end. But in reality the result is uncertain. Since coresight devices claim themselves when enabled for self-hosted usage, the race conditions above usually make the link devices not usable after many cycles. To fix the race conditions, this patch adds a spinlock to serialize enabling/disabling a link device. Signed-off-by: Yabin Cui --- drivers/hwtracing/coresight/coresight.c | 8 include/linux/coresight.h | 3 +++ 2 files changed, 11 insertions(+) diff --git a/drivers/hwtracing/coresight/coresight.c b/drivers/hwtracing/coresight/coresight.c index 55db77f6410b..90f97f4f99b2 100644 --- a/drivers/hwtracing/coresight/coresight.c +++ b/drivers/hwtracing/coresight/coresight.c @@ -256,6 +256,7 @@ static int coresight_enable_link(struct coresight_device *csdev, int ret; int link_subtype; int refport, inport, outport; + unsigned long flags; if (!parent || !child) return -EINVAL; @@ -274,15 +275,18 @@ static int coresight_enable_link(struct coresight_device *csdev, if (refport < 0) return refport; + spin_lock_irqsave(&csdev->spinlock, flags); if (atomic_inc_return(&csdev->refcnt[refport]) == 1) { if (link_ops(csdev)->enable) { ret = link_ops(csdev)->enable(csdev, inport, outport); if (ret) { atomic_dec(&csdev->refcnt[refport]); + spin_unlock_irqrestore(&csdev->spinlock, flags); return ret; } } } + spin_unlock_irqrestore(&csdev->spinlock, flags); csdev->enable = true; @@ -296,6 +300,7 @@ static void coresight_disable_link(struct coresight_device *csdev, int i, nr_conns; int link_subtype; int refport, inport, outport; + unsigned long flags; if (!parent || !child) return; @@ -315,10 +320,12 @@ static void coresight_disable_link(struct coresight_device *csdev, nr_conns = 1; } + spin_lock_irqsave(&csdev->spinlock, flags); if (atomic_dec_return(&csdev->refcnt[refport]) == 0) { if (link_ops(csdev)->disable) link_ops(csdev)->disable(csdev, inport, outport); } + spin_unlock_irqrestore(&csdev->spinlock, flags); for (i = 0; i < nr_conns; i++) if (atomic_read(&csdev->refcnt[i]) != 0) @@ -1225,6 +1232,7 @@ struct coresight_device *coresight_register(struct coresight_desc *desc) csdev->subtype = desc->subtype; csdev->ops = desc->ops; csdev->orphan = false; + spin_lock_init(&csdev->spinlock); csdev->dev.type = &coresight_dev_type[desc->type]; csdev->dev.groups = desc->groups; diff --git a/include/linux/coresight.h b/include/linux/coresight.h index a2b68823717b..dd28d9ab841d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -9,6 +9,7 @@ #include #include #include +#include <linux/spinlock.h> /* Peripheral id registers (0xFD0-0xFEC) */ #define CORESIGHT_PERIPHIDR4 0xfd0 @@ -153,6 +154,7 @@ struct coresight_connection { * activated but not yet enabled. Enabling for a _sink_ * appens when a source has been selected for that it. * @ea:Device attribute for sink representation under PMU directory. * @spinlock: Serialize enabling/disabling this device.
*/ struct coresight_device { struct coresight_platform_data *pdata; @@ -166,6 +168,7 @@ struct coresight_device { /* sink specific fields */ bool activated; /* true only if a sink is part of a path */ struct dev_ext_attribute *ea; + spinlock_t spinlock; }; /* -- 2.22.0.770.g0f2c4a37fd-goog
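One interleaving the new spinlock rules out, spelled out from "Example 2" in the description (an enabled link at refcnt == 1, CPU0 disabling while CPU1 enables):

        /*
         *   CPU0: atomic_dec_return() -> 0    (decides to call ->disable())
         *   CPU1: atomic_inc_return() -> 1    (decides to call ->enable())
         *   CPU1: link_ops(csdev)->enable(csdev, inport, outport)
         *   CPU0: link_ops(csdev)->disable(csdev, inport, outport)
         *
         * Result: refcnt == 1 but the hardware is disabled. Holding the
         * spinlock across both the refcount transition and the
         * enable/disable call makes the pair atomic, so this ordering
         * can no longer occur.
         */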
Re: [bonding][patch] Regarding a bonding lacp issue
Felix wrote: >Dear Maintainers, > >Recently I hit a packet drop issue in the bonding driver on Linux 4.9. Please >see details below. Please take a look to see if my understanding is >correct. Many thanks. > >What is the problem? >The bonding driver starts to send packets even if the Partner(Switch)'s >Collecting bit is not enabled yet. Partner would drop all packets until >its Collecting bit is enabled. > >What is the root cause? >According to the LACP spec, the Actor needs to check the Partner's Sync and >Collecting bits before enabling its Distributing bit and Distributing >function. Please see the picture below. The diagram you reference is found in 802.1AX-2014 figure 6-21, which shows the state diagram for an independent control implementation, i.e., collecting and distributing are managed independently. However, Linux bonding implements coupled control, which is shown in figure 6-22. Here, there is no Partner.Collecting requirement on the state transition from ATTACHED to COLLECTING_DISTRIBUTING. To quote 802.1AX-2014 6.4.15: As independent control is not possible, the coupled control state machine does not wait for the Partner to signal that collection has started before enabling both collection and distribution. Now, that said, I agree that what you're seeing is likely explained by this behavior, and your fix should resolve the immediate problem (that bonding sends packets before the peer has enabled COLLECTING). However, your fix does put bonding out of compliance with the standard, as it does not really implement COLLECTING and DISTRIBUTING as discrete states. In particular, if the peer in your case were to later clear Partner.Collecting, bonding will not react to this as a figure 6-21 independent control implementation would (which isn't a change from current behavior, but currently this isn't expected). So, in my opinion a patch like this should have a comment attached noting that we are deliberately not in compliance with the standard in this specific situation. The proper fix is to implement the separate states of figure 6-21. Lastly, are you able to test and generate a patch against current upstream, instead of 4.9? -J >How to fix? >Please see the following diff. And the patch is attached. > >--- ../origin/linux-4.9.188/drivers/net/bonding/bond_3ad.c 2019-08-07 >00:29:42.0 +0800 >+++ drivers/net/bonding/bond_3ad.c 2019-08-08 23:13:29.015640197 +0800 >@@ -937,6 +937,7 @@ > */ >if ((port->sm_vars & AD_PORT_SELECTED) && >(port->partner_oper.port_state & AD_STATE_SYNCHRONIZATION) && >+ (port->partner_oper.port_state & AD_STATE_COLLECTING) && >!__check_agg_selection_timer(port)) { > if (port->aggregator->is_active) > port->sm_mux_state = > >-- >Thanks, >Felix --- -Jay Vosburgh, jay.vosbu...@canonical.com
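Concretely, the comment Jay asks for could sit right on the new condition, something like this (a sketch against the quoted 4.9 diff, not a tested patch):

        if ((port->sm_vars & AD_PORT_SELECTED) &&
            (port->partner_oper.port_state & AD_STATE_SYNCHRONIZATION) &&
            /* Deliberately stricter than the coupled-control machine of
             * 802.1AX-2014 figure 6-22: also wait for Partner.Collecting,
             * since the partner drops our frames until it collects. We
             * still do not react if the partner later clears this bit,
             * as the independent-control machine of figure 6-21 would.
             */
            (port->partner_oper.port_state & AD_STATE_COLLECTING) &&
            !__check_agg_selection_timer(port)) {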
Re: [PATCH 1/3] mm/mlock.c: convert put_page() to put_user_page*()
On 8/8/19 4:09 AM, Vlastimil Babka wrote: > On 8/8/19 8:21 AM, Michal Hocko wrote: >> On Wed 07-08-19 16:32:08, John Hubbard wrote: >>> On 8/7/19 4:01 AM, Michal Hocko wrote: On Mon 05-08-19 15:20:17, john.hubb...@gmail.com wrote: > From: John Hubbard >>> Actually, I think follow_page_mask() gets all the pages, right? And the >>> get_page() in __munlock_pagevec_fill() is there to allow a >>> pagevec_release() >>> later. >> >> Maybe I am misreading the code (looking at Linus tree) but >> munlock_vma_pages_range >> calls follow_page for the start address and then if not THP tries to >> fill up the pagevec with few more pages (up to end), do the shortcut >> via manual pte walk as an optimization and use generic get_page there. > Yes, I see it finally, thanks. :) > That's true. However, I'm not sure munlocking is where the > put_user_page() machinery is intended to be used anyway? These are > short-term pins for struct page manipulation, not e.g. dirtying of page > contents. Reading commit fc1d8e7cca2d I don't think this case falls > within the reasoning there. Perhaps not all GUP users should be > converted to the planned separate GUP tracking, and instead we should > have a GUP/follow_page_mask() variant that keeps using get_page/put_page? > Interesting. So far, the approach has been to get all the gup callers to release via put_user_page(), but if we add in Jan's and Ira's vaddr_pin_pages() wrapper, then maybe we could leave some sites unconverted. However, in order to do so, we would have to change things so that we have one set of APIs (gup) that do *not* increment a pin count, and another set (vaddr_pin_pages) that do. Is that where we want to go...? I have a tracking patch that only deals with gup/pup. I could post as an RFC, but I think it might just muddy the waters at this point, anyway it's this one: https://github.com/johnhubbard/linux/commit/a0fb73ce0a39c74f0d1fb6bd9d866f660f762eae thanks, -- John Hubbard NVIDIA
Re: [PATCH net] net: phy: rtl8211f: do a double read to get real time link status
On 08.08.2019 08:21, Yonglong Liu wrote: > > > On 2019/8/8 14:11, Heiner Kallweit wrote: >> On 08.08.2019 03:15, Yonglong Liu wrote: >>> >>> >>> On 2019/8/8 0:47, Heiner Kallweit wrote: On 07.08.2019 15:16, Yonglong Liu wrote: > [ 27.232781] hns3 0000:bd:00.3 eth7: net open > [ 27.237303] 8021q: adding VLAN 0 to HW filter on device eth7 > [ 27.242972] IPv6: ADDRCONF(NETDEV_CHANGE): eth7: link becomes ready > [ 27.29] hns3 0000:bd:00.3: invalid speed (-1) > [ 27.253904] hns3 0000:bd:00.3 eth7: failed to adjust link. > [ 27.259379] RTL8211F Gigabit Ethernet mii-0000:bd:00.3:07: PHY state > change UP -> RUNNING > [ 27.924903] hns3 0000:bd:00.3 eth7: link up > [ 28.280479] RTL8211F Gigabit Ethernet mii-0000:bd:00.3:07: PHY state > change RUNNING -> NOLINK > [ 29.208452] hns3 0000:bd:00.3 eth7: link down > [ 32.376745] RTL8211F Gigabit Ethernet mii-0000:bd:00.3:07: PHY state > change NOLINK -> RUNNING > [ 33.208448] hns3 0000:bd:00.3 eth7: link up > [ 35.253821] hns3 0000:bd:00.3 eth7: net stop > [ 35.258270] hns3 0000:bd:00.3 eth7: link down > > When using rtl8211f in polling mode, we may get an invalid speed, > because of reading a fake link up and autoneg complete status > immediately after starting autoneg: > > ifconfig-1176 [007] 27.232763: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x00 val:0x1040 > kworker/u257:1-670 [015] 27.232805: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x04 val:0x01e1 > kworker/u257:1-670 [015] 27.232815: mdio_access: > mii-0000:bd:00.3 write phy:0x07 reg:0x04 val:0x05e1 > kworker/u257:1-670 [015] 27.232869: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x01 val:0x79ad > kworker/u257:1-670 [015] 27.232904: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x09 val:0x0200 > kworker/u257:1-670 [015] 27.232940: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x00 val:0x1040 > kworker/u257:1-670 [015] 27.232949: mdio_access: > mii-0000:bd:00.3 write phy:0x07 reg:0x00 val:0x1240 > kworker/u257:1-670 [015] 27.233003: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x01 val:0x79ad > kworker/u257:1-670 [015] 27.233039: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x0a val:0x3002 > kworker/u257:1-670 [015] 27.233074: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x09 val:0x0200 > kworker/u257:1-670 [015] 27.233110: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x05 val:0x > kworker/u257:1-670 [000] 28.280475: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x01 val:0x7989 > kworker/u257:1-670 [000] 29.304471: mdio_access: > mii-0000:bd:00.3 read phy:0x07 reg:0x01 val:0x7989 > > According to the datasheet of rtl8211f, to get the real time > link status, we need to read MII_BMSR twice. > > This patch adds a read_status hook for rtl8211f, and does a fake > phy_read before genphy_read_status(), so that we can get the real link > status in genphy_read_status(). > > Signed-off-by: Yonglong Liu > --- > drivers/net/phy/realtek.c | 13 + > 1 file changed, 13 insertions(+) > Is this an accidental resubmit? Because we discussed this in https://marc.info/?t=15641350993&r=1&w=2 and a fix has been applied already. Heiner . >>> >>> In https://marc.info/?t=15641350993&r=1&w=2 , the invalid speed >>> recurrence rate is almost 100%, and I had tested the solution about >>> 5 times and it worked. But yesterday it happened again suddenly, and then >>> I found that the recurrence rate had dropped to 10%. This time we got 0x79ad >>> after autoneg started, which is not the 0x798d from the last discussion. >>> >>> >>> >> OK, I'll have a look. >> However the approach is wrong. The double read is related to the latching
>> This is done by all PHYs and is not specific to the RTL8211F.
>> Also it's not related to the problem. I assume any sufficient delay would
>> do instead of the read.
>>
>
> So you will send a new patch to fix this problem? I am waiting for it,
> and can do a full test this time.
>

Can you try the following? This delay should give the PHY enough time to
clear both bits before the following read is done.

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index ef7aa738e..32f327a44 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -568,6 +568,11 @@ int phy_start_aneg(struct phy_device *phydev)
 	if (err < 0)
 		goto out_unlock;
 
+	/* The PHY may not yet have cleared aneg-completed and link-up bit
+	 * w/o this delay when the following read is done.
+	 */
+	usleep_range(1000, 2000);
+
Re: [PATCH] ASoC: soc-core: remove error due to probe deferral
On Thu, Aug 08, 2019 at 03:16:53PM +0200, Stefan Agner wrote:
> On 2019-08-08 15:14, Takashi Iwai wrote:
> > Mark Brown wrote:
> > I guess we can use dev_printk() with the conditional level choice.
>
> How about using dev_info always? We get a dev_err message from
> soc_init_dai_link in error cases...
>
> 	ret = soc_init_dai_link(card, dai_link);
> 	if (ret && ret != -EPROBE_DEFER) {
> 		dev_info(card->dev, "ASoC: failed to init link %s: %d\n",
> 			 dai_link->name, ret);
> 	}

Well, if there's adequate error reporting in init_dai_link() it's a bit
different - we can just remove the print entirely regardless of what the
return code is. The point is to ensure that we don't just silently fail.

Unfortunately there are no prints in the probe deferral case there, so they
need adding. That'll actually improve things though, since we can make it
print the name of the thing it's missing, which will be useful to people
trying to figure out what's going on (we used to do that but it got lost in
reshufflings).
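For illustration, a minimal sketch of the conditional-level variant Takashi
suggested, reusing the snippet quoted above; probe deferral is demoted to
info level while real failures remain errors:

	ret = soc_init_dai_link(card, dai_link);
	if (ret) {
		dev_printk(ret == -EPROBE_DEFER ? KERN_INFO : KERN_ERR,
			   card->dev, "ASoC: failed to init link %s: %d\n",
			   dai_link->name, ret);
		return ret;
	}

This keeps a breadcrumb in the log for deferrals without tagging them as
errors.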
[PATCH] coresight: tmc-etr: Remove perf_data check.
When tracing etm data of multiple threads on multiple cpus through the perf
interface, each cpu has a unique etr_perf_buffer while sharing the same etr
device. There is no guarantee that the last cpu to start etm tracing is also
the last one to stop. So the perf_data check is no longer valid.

Signed-off-by: Yabin Cui
---
 drivers/hwtracing/coresight/coresight-tmc-etr.c | 9 -
 drivers/hwtracing/coresight/coresight-tmc.h | 2 --
 2 files changed, 11 deletions(-)

diff --git a/drivers/hwtracing/coresight/coresight-tmc-etr.c b/drivers/hwtracing/coresight/coresight-tmc-etr.c
index 17006705287a..0418440e0141 100644
--- a/drivers/hwtracing/coresight/coresight-tmc-etr.c
+++ b/drivers/hwtracing/coresight/coresight-tmc-etr.c
@@ -1484,20 +1484,12 @@ tmc_update_etr_buffer(struct coresight_device *csdev,
 		goto out;
 	}
 
-	if (WARN_ON(drvdata->perf_data != etr_perf)) {
-		lost = true;
-		spin_unlock_irqrestore(&drvdata->spinlock, flags);
-		goto out;
-	}
-
 	CS_UNLOCK(drvdata->base);
 	tmc_flush_and_stop(drvdata);
 	tmc_sync_etr_buf(drvdata);
 	CS_LOCK(drvdata->base);
 
-	/* Reset perf specific data */
-	drvdata->perf_data = NULL;
 	spin_unlock_irqrestore(&drvdata->spinlock, flags);
 
 	size = etr_buf->len;
@@ -1556,7 +1548,6 @@ static int tmc_enable_etr_sink_perf(struct coresight_device *csdev, void *data)
 	}
 
 	etr_perf->head = PERF_IDX2OFF(handle->head, etr_perf);
-	drvdata->perf_data = etr_perf;
 
 	/*
 	 * No HW configuration is needed if the sink is already in
diff --git a/drivers/hwtracing/coresight/coresight-tmc.h b/drivers/hwtracing/coresight/coresight-tmc.h
index 1ed50411cc3c..3881a9ee565a 100644
--- a/drivers/hwtracing/coresight/coresight-tmc.h
+++ b/drivers/hwtracing/coresight/coresight-tmc.h
@@ -178,7 +178,6 @@ struct etr_buf {
  *	device configuration register (DEVID)
  * @idr:	Holds etr_bufs allocated for this ETR.
  * @idr_mutex:	Access serialisation for idr.
- * @perf_data:	PERF buffer for ETR.
  * @sysfs_data:	SYSFS buffer for ETR.
  */
 struct tmc_drvdata {
@@ -202,7 +201,6 @@ struct tmc_drvdata {
 	struct idr		idr;
 	struct mutex		idr_mutex;
 	struct etr_buf		*sysfs_buf;
-	void			*perf_data;
 };
 
 struct etr_buf_operations {
-- 
2.22.0.770.g0f2c4a37fd-goog
Re: [PATCH v2 13/15] net: phy: adin: configure downshift on config_init
On 08.08.2019 14:30, Alexandru Ardelean wrote:
> Down-speed auto-negotiation may not always be enabled, in which case the
> PHY won't down-shift to 100 or 10 during auto-negotiation.
>
> This change enables downshift and configures the number of retries to the
> default of 8 (the maximum supported value).
>
> The change has been adapted from the Marvell PHY driver.
>
Instead of a fixed downshift setting (like in the Marvell driver) you may
consider implementing the ethtool phy-tunable ETHTOOL_PHY_DOWNSHIFT. See the
Aquantia PHY driver for an example. Then the user can configure whether they
want downshift and, if so, after how many retries.

> Signed-off-by: Alexandru Ardelean
> ---
> drivers/net/phy/adin.c | 39 +++
> 1 file changed, 39 insertions(+)
[...]

Heiner
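For reference, a sketch of the ETHTOOL_PHY_DOWNSHIFT route suggested above,
modeled on the Aquantia driver; adin_get_downshift()/adin_set_downshift()
are hypothetical helpers that would read/program the vendor downshift
register:

static int adin_get_tunable(struct phy_device *phydev,
			    struct ethtool_tunable *tuna, void *data)
{
	switch (tuna->id) {
	case ETHTOOL_PHY_DOWNSHIFT:
		return adin_get_downshift(phydev, data);
	default:
		return -EOPNOTSUPP;
	}
}

static int adin_set_tunable(struct phy_device *phydev,
			    struct ethtool_tunable *tuna, const void *data)
{
	switch (tuna->id) {
	case ETHTOOL_PHY_DOWNSHIFT:
		return adin_set_downshift(phydev, *(const u8 *)data);
	default:
		return -EOPNOTSUPP;
	}
}

With .get_tunable/.set_tunable wired up in the phy_driver, the retry count
becomes runtime-configurable, e.g. via
"ethtool --set-phy-tunable <dev> downshift on count 4".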
Re: [PATCH v2 02/15] net: phy: adin: hook genphy_read_abilities() to get_features
On 08.08.2019 17:24, Andrew Lunn wrote:
> On Thu, Aug 08, 2019 at 03:30:13PM +0300, Alexandru Ardelean wrote:
>> The ADIN PHYs can operate with Clause 45, however they are not typical for
>> how phylib considers Clause 45 PHYs.
>>
>> If the `features` field & the `get_features` hook are unspecified, and the
>> device wants to operate via Clause 45, it would also try to read features
>> via the `genphy_c45_pma_read_abilities()`, which will try to read PMA regs
>> that are unsupported.
>>
>> Hooking the `genphy_read_abilities()` function to the `get_features` hook
>> will ensure that this does not happen and the PHY features are read
>> correctly regardless of Clause 22 or Clause 45 operation.
>
> I think we need to stop and think about a PHY which supports both C22
> and C45.
>
> How does bus enumeration work? Is it discovered twice? I've always
> considered phydev->is_c45 means everything is c45, not that some
> registers can be accessed via c45. But the driver is mixing c22 and
> c45. Does the driver actually require c45? Are there features which are
> only accessible via C45? What does C45 actually bring us for this
> device?
>
genphy_c45_pma_read_abilities() is only called if phydev->is_c45 is set.
And this flag means that the PHY complies with Clause 45 incl. all the
standard devices like PMA. In the case here only some vendor-specific
registers can be accessed via Clause 45 and therefore is_c45 shouldn't be
set. As a consequence this patch isn't needed.

> Andrew
>
Heiner
Re: [PATCH net] net: phy: rtl8211f: do a double read to get real time link status
> @@ -568,6 +568,11 @@ int phy_start_aneg(struct phy_device *phydev)
> 	if (err < 0)
> 		goto out_unlock;
>
> +	/* The PHY may not yet have cleared aneg-completed and link-up bit
> +	 * w/o this delay when the following read is done.
> +	 */
> +	usleep_range(1000, 2000);
> +

Hi Heiner

Does 802.3 C22 say anything about this? If this PHY is broken with respect
to the standard, I would prefer the workaround to be in the PHY specific
driver code, not generic core code.

Andrew
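For reference, a minimal sketch of the PHY-specific variant under discussion
(essentially Yonglong's original read_status hook, whose diff isn't quoted in
this thread); the function name and placement in drivers/net/phy/realtek.c
are illustrative:

static int rtl8211f_read_status(struct phy_device *phydev)
{
	int ret;

	/* BMSR latches link-down events, and right after an aneg restart
	 * the first read may still report stale link-up/aneg-complete
	 * bits. Do one throwaway read so genphy_read_status() operates
	 * on a current value.
	 */
	ret = phy_read(phydev, MII_BMSR);
	if (ret < 0)
		return ret;

	return genphy_read_status(phydev);
}

The hook would then be wired up via .read_status = rtl8211f_read_status in
the RTL8211F phy_driver entry.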
[RFC PATCH] hugetlbfs: Add hugetlb_cgroup reservation limits
Problem:

Currently tasks attempting to allocate more hugetlb memory than is available
get a failure at mmap/shmget time. This is thanks to Hugetlbfs Reservations
[1]. However, if a task attempts to allocate more hugetlb memory than its
hugetlb_cgroup limit allows, the kernel will allow the mmap/shmget call, but
will SIGBUS the task when it attempts to fault the memory in.

We have developers interested in using hugetlb_cgroups, and they have
expressed dissatisfaction regarding this behavior. We'd like to improve this
behavior such that tasks violating the hugetlb_cgroup limits get an error at
mmap/shmget time, rather than getting SIGBUS'd when they try to fault the
excess memory in.

The underlying problem is that today's hugetlb_cgroup accounting happens at
hugetlb memory *fault* time, rather than at *reservation* time. Thus,
enforcing the hugetlb_cgroup limit only happens at fault time, and the
offending task gets SIGBUS'd.

Proposed Solution:

A new page counter named hugetlb.xMB.reservation_[limit|usage]_in_bytes.
This counter has slightly different semantics than
hugetlb.xMB.[limit|usage]_in_bytes:

- While usage_in_bytes tracks all *faulted* hugetlb memory,
  reservation_usage_in_bytes tracks all *reserved* hugetlb memory.

- If a task attempts to reserve more memory than limit_in_bytes allows, the
  kernel will allow it to do so. But if a task attempts to reserve more
  memory than reservation_limit_in_bytes, the kernel will fail this
  reservation.

This proposal is implemented in this patch, with tests to verify
functionality and show the usage.

Alternatives considered:

1. A new cgroup, instead of only a new page_counter attached to the existing
   hugetlb_cgroup. Adding a new cgroup seemed like a lot of code duplication
   with hugetlb_cgroup. Keeping hugetlb related page counters under
   hugetlb_cgroup seemed cleaner as well.

2. Instead of adding a new counter, we considered adding a sysctl that
   modifies the behavior of hugetlb.xMB.[limit|usage]_in_bytes, to do
   accounting at reservation time rather than fault time. Adding a new
   page_counter seems better as userspace could, if it wants, choose to
   enforce different cgroups differently: one via limit_in_bytes, and
   another via reservation_limit_in_bytes. This could be very useful if
   you're transitioning how hugetlb memory is partitioned on your system one
   cgroup at a time, for example. Also, someone may find usage for both
   limit_in_bytes and reservation_limit_in_bytes concurrently, and this
   approach gives them the option to do so.

Caveats:

1. This support is implemented for cgroups-v1. I have not tried
   hugetlb_cgroups with cgroups v2, and AFAICT it's not supported yet. This
   is largely because we use cgroups-v1 for now. If required, I can add
   hugetlb_cgroup support to cgroups v2 in this patch or a follow up.

2. The most complicated bit of this patch, I believe, is: where to store the
   pointer to the hugetlb_cgroup to uncharge at unreservation time? Normally
   the cgroup pointers hang off the struct page. But, with hugetlb_cgroup
   reservations, one task can reserve a specific page and another task may
   fault it in (I believe), so storing the pointer in struct page is not
   appropriate. The proposed approach here is to store the pointer in the
   resv_map. See patch for details.
[1]: https://www.kernel.org/doc/html/latest/vm/hugetlbfs_reserv.html Signed-off-by: Mina Almasry --- include/linux/hugetlb.h | 10 +- include/linux/hugetlb_cgroup.h| 19 +- mm/hugetlb.c | 256 -- mm/hugetlb_cgroup.c | 153 +- tools/testing/selftests/vm/.gitignore | 1 + tools/testing/selftests/vm/Makefile | 4 + .../selftests/vm/charge_reserved_hugetlb.sh | 438 ++ .../selftests/vm/write_hugetlb_memory.sh | 22 + .../testing/selftests/vm/write_to_hugetlbfs.c | 252 ++ 9 files changed, 1087 insertions(+), 68 deletions(-) create mode 100755 tools/testing/selftests/vm/charge_reserved_hugetlb.sh create mode 100644 tools/testing/selftests/vm/write_hugetlb_memory.sh create mode 100644 tools/testing/selftests/vm/write_to_hugetlbfs.c diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index edfca42783192..90b3c928d16c1 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -46,6 +46,14 @@ struct resv_map { long adds_in_progress; struct list_head region_cache; long region_cache_count; + #ifdef CONFIG_CGROUP_HUGETLB + /* +* On private mappings, the counter to uncharge reservations is stored +* here. If these fields are 0, then the mapping is shared. +*/ + struct page_counter *reservation_counter; + unsigned long pages_per_hpage; +#endif }; extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); @@ -340,7 +348,7 @@ struct hstate { unsigned int surplus
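To make the proposed behavior change concrete, a minimal userspace sketch
(the hugetlbfs mount point and file name are made up): under today's
hugetlb.2MB.limit_in_bytes the charge happens at fault time and the memset()
gets SIGBUS'd, whereas under the proposed reservation_limit_in_bytes the
mmap() itself fails up front:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 512UL * 2 * 1024 * 1024;	/* 512 x 2MB hugepages */
	int fd = open("/mnt/hugetlbfs/testfile", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0)
		return 1;

	/* the hugetlb reservation is made (and, per this proposal,
	 * charged to the cgroup) here */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");	/* fails here with reservation limits */
		return 1;
	}

	memset(p, 0, len);	/* SIGBUS here with fault-time limits only */
	munmap(p, len);
	close(fd);
	return 0;
}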
[ANNOUNCE] 4.14.137-rt64
Hello RT Folks! I'm pleased to announce the 4.14.137-rt64 stable release. This release is just an update to the new stable 4.14.137 version and no RT specific changes have been made. You can get this release via the git tree at: git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git branch: v4.14-rt Head SHA1: b86042812cec9871bba4a0da843cccdc77682ee3 Or to build 4.14.137-rt64 directly, the following patches should be applied: https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.14.tar.xz https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.14.137.xz https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/patch-4.14.137-rt64.patch.xz Enjoy! Tom
Re: [PATCH v2 2/6] thermal: amlogic: Add thermal driver to support G12 SoCs
Hi Kevin, On Thu, Aug 8, 2019 at 4:59 AM Kevin Hilman wrote: > > Martin Blumenstingl writes: > > > Hi Guillaume, > > > > On Mon, Aug 5, 2019 at 2:48 PM guillaume La Roque > > wrote: > >> > >> Hi Martin, > >> > >> again thanks for your review. > > you're welcome - thank you for working on the driver :-) > > > > [...] > >> > The IP block has more functionality, which may be added to this driver > >> > in the future: > >> > - reading up to 16 stored temperature samples > >> > >> it's not working, you can verify it if you check the regmap define in the > >> driver. in fact temp is only write in one register, it's confirmed by > >> amlogic. > > I missed that - so please skip this part > > > > [...] > >> >> +config AMLOGIC_THERMAL > >> > we typically use "MESON" in the Kconfig symbols: > >> > $ grep -c AMLOGIC .config > >> > 1 > >> > $ grep -c MESON .config > >> > 33 > >> > > >> > I also wonder if we should add G12 or G12A so we don't conflict with > >> > upcoming thermal sensors with a different design (assuming that this > >> > will be a thing). > >> > for example we already have three different USB2 PHY drivers > >> > > >> > [...] > >> > >> i check with Neil and for new family it's better to use Amlogic instead of > >> meson. > > can you please share the considerations behind this decision? > > if new drivers should use AMLOGIC_* Kconfig symbols instead of MESON_* > > then we all should know about it > > > >> i don't add G12 because we already know it's same sensors for SM1 SoC > >> family [0]. > > my idea behind this was to avoid conflicts in the future > > in case of the thermal driver we may be fine with using a generic name > > assuming that Amlogic will not switch to a new IP block in the next > > years > > I'm not saying you have to change the name - I'm bringing this up so > > you can decide for yourself based on examples from the past > > > > here are a few examples: > > - when Kevin upstreamed the MMC driver for GX he decided to use > > MMC_MESON_GX for the Kconfig symbol name. it turns out that this is > > smart because there are at least two other MMC controller IPs on the > > 32-bit SoCs. due to him including GX in the name the drivers are easy > > to differentiate (MMC_MESON_MX_SDIO and MMC_MESON_MX_SDHC being the > > other ones, while the latter is not upstream yet) > > - when Carlo upstreamed the eFuse driver he decided to use MESON_EFUSE > > for the Kconfig symbol name. I found out much later that the 32-bit > > SoCs use a different IP (or at least direct register access instead of > > going through Secure Monitor). the driver for the 32-bit SoCs now uses > > MESON_MX_EFUSE. if you don't know which driver applies where then it's > > easy to mix up MESON_EFUSE and MESON_MX_EFUSE > > - when Jerome upstreamed the ALSA driver for AXG (which is also used > > on G12A and G12B) he decided to use the SND_MESON_AXG_* prefix for the > > Kconfig symbol names. in my opinion this was a good choice because GXM > > and everything earlier (including the 32-bit SoCs) use a different > > audio IP block. we won't have a Kconfig symbol name clash when a > > driver for the "older" SoCs is upstreamed > > - (there are more examples, Meson8b USB PHY driver, Meson8b DWMAC > > glue, ... - just like there's many examples where the IP block is > > mostly compatible with older generations: SAR ADC, RNG, SPI, ...) > > While these are all good examples, you can see it can go both ways, so > there's really no way know up front what is the "right" way. We only > know after the fact. 
> Unfortunately, we simply have no visibility into
> future chips and where IP blocks may be shared or not (there are other
> examples where vendors add a new version of an IP *and* keep the old
> version. ;)
>
> Even having worked inside a (different) SoC vendor and having some
> knowledge about what IPs are shared, it's difficult to get this right.

Right. The fact that the IP block in SM1 will be backwards compatible (or
even the same) means that it has a longer life-span than some of the USB
PHY IP, so I'm fine either way.

Martin
Re: [PATCH] clk: fix devm_platform_ioremap_resource.cocci warnings
Quoting Julia Lawall (2019-08-08 09:10:53) > From: kbuild test robot > > drivers/clk/bcm/clk-bcm63xx-gate.c:174:1-9: WARNING: Use > devm_platform_ioremap_resource for hw -> regs > > Use devm_platform_ioremap_resource helper which wraps > platform_get_resource() and devm_ioremap_resource() together. > > Generated by: scripts/coccinelle/api/devm_platform_ioremap_resource.cocci > > Fixes: 1c099779c1e2 ("clk: add BCM63XX gated clock controller driver") Is it fixing anything? As far as I can tell it's reducing lines of code with another function. > CC: Jonas Gorski > Signed-off-by: kbuild test robot > Signed-off-by: Julia Lawall
Re: [PATCH] clk: fix devm_platform_ioremap_resource.cocci warnings
On Thu, 8 Aug 2019, Stephen Boyd wrote: > Quoting Julia Lawall (2019-08-08 09:10:53) > > From: kbuild test robot > > > > drivers/clk/bcm/clk-bcm63xx-gate.c:174:1-9: WARNING: Use > > devm_platform_ioremap_resource for hw -> regs > > > > Use devm_platform_ioremap_resource helper which wraps > > platform_get_resource() and devm_ioremap_resource() together. > > > > Generated by: scripts/coccinelle/api/devm_platform_ioremap_resource.cocci > > > > Fixes: 1c099779c1e2 ("clk: add BCM63XX gated clock controller driver") > > Is it fixing anything? As far as I can tell it's reducing lines of code > with another function. No, it doesn't fix anything. julia > > > CC: Jonas Gorski > > Signed-off-by: kbuild test robot > > Signed-off-by: Julia Lawall > >
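For readers unfamiliar with the helper, a sketch of the transformation the
semantic patch performs, using the hw->regs assignment from
clk-bcm63xx-gate.c as the example:

	/* before: two calls plus a local struct resource pointer */
	struct resource *r;

	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	hw->regs = devm_ioremap_resource(&pdev->dev, r);

	/* after: one call with the same devm lifetime and the same
	 * error-pointer semantics on failure */
	hw->regs = devm_platform_ioremap_resource(pdev, 0);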
Re: [PATCH v2 1/4] dt-bindings: soundwire: add slave bindings
On Thu, Aug 08, 2019 at 05:48:56PM +0100, Srinivas Kandagatla wrote:
> On 08/08/2019 16:58, Pierre-Louis Bossart wrote:
> > > +- sdw-instance-id: Should be ('Instance ID') from SoundWire
> > > +		       Enumeration Address. Instance ID is for the cases
> > > +		       where multiple Devices of the same type or Class
> > > +		       are attached to the bus.
> > so it is actually required if you have a single Slave device? Or is it
> > only required when you have more than 1 device of the same type?
> This is mandatory for any slave device!

If it's mandatory the wording is a bit unclear. How about something like:

    Should be ('Instance ID') from the SoundWire Enumeration Address.
    This must always be provided. If multiple devices with the same type
    or class are attached to the bus, each instance must have a distinct
    value.
[PATCH v4 0/2] sched: Improve load balancing on AMD EPYC
This is another version of the AMD EPYC load balancing patch. The difference with this one is that now it fixes the following ia64 build error, reported by 0day: mm/page_alloc.o: In function `get_page_from_freelist': page_alloc.c:(.text+0x7850): undefined reference to `node_reclaim_distance' page_alloc.c:(.text+0x7931): undefined reference to `node_reclaim_distance' Matt Fleming (2): ia64: Make NUMA select SMP sched/topology: Improve load balancing on AMD EPYC arch/ia64/Kconfig | 1 + arch/x86/kernel/cpu/amd.c | 5 + include/linux/topology.h | 14 ++ kernel/sched/topology.c | 3 ++- mm/khugepaged.c | 2 +- mm/page_alloc.c | 2 +- 6 files changed, 24 insertions(+), 3 deletions(-) -- 2.13.7
[PATCH RT 02/19] kthread: add a global worker thread.
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 0532e87d9d44795221aa921ba7024bde689cc894 ] Add kthread_schedule_work() which uses a global kthread for all its jobs. Split the cgroup include to avoid recussive includes from interrupt.h. Fixup everything that fails to build (and did not include all header). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: include/linux/blk-cgroup.h include/linux/kthread.h kernel/kthread.c --- drivers/block/loop.c | 2 +- drivers/spi/spi-rockchip.c | 1 + include/linux/blk-cgroup.h | 1 + include/linux/kthread-cgroup.h | 17 + include/linux/kthread.h| 8 init/main.c| 1 + kernel/kthread.c | 13 + 7 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 include/linux/kthread-cgroup.h diff --git a/drivers/block/loop.c b/drivers/block/loop.c index bd447de4a5b8..2a07dfc9b3ae 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -70,7 +70,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index fdcf3076681b..b56619418cea 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -22,6 +22,7 @@ #include #include #include +#include #define DRIVER_NAME "rockchip-spi" diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 8bbc3716507a..a9454ad4de06 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -20,6 +20,7 @@ #include #include #include +#include /* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ #define BLKG_STAT_CPU_BATCH(INT_MAX / 2) diff --git a/include/linux/kthread-cgroup.h b/include/linux/kthread-cgroup.h new file mode 100644 index ..53d34bca9d72 --- /dev/null +++ b/include/linux/kthread-cgroup.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_KTHREAD_CGROUP_H +#define _LINUX_KTHREAD_CGROUP_H +#include +#include + +#ifdef CONFIG_BLK_CGROUP +void kthread_associate_blkcg(struct cgroup_subsys_state *css); +struct cgroup_subsys_state *kthread_blkcg(void); +#else +static inline void kthread_associate_blkcg(struct cgroup_subsys_state *css) { } +static inline struct cgroup_subsys_state *kthread_blkcg(void) +{ + return NULL; +} +#endif +#endif diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 4e663f407bd7..59b85b01fb8b 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -199,4 +199,12 @@ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *work); void kthread_destroy_worker(struct kthread_worker *worker); +extern struct kthread_worker kthread_global_worker; +void kthread_init_global_worker(void); + +static inline bool kthread_schedule_work(struct kthread_work *work) +{ + return kthread_queue_work(&kthread_global_worker, work); +} + #endif /* _LINUX_KTHREAD_H */ diff --git a/init/main.c b/init/main.c index f32aebb5ce54..18c1297b2889 100644 --- a/init/main.c +++ b/init/main.c @@ -1059,6 +1059,7 @@ static noinline void __init kernel_init_freeable(void) smp_prepare_cpus(setup_max_cpus); workqueue_init(); + kthread_init_global_worker(); init_mm_internals(); diff --git a/kernel/kthread.c b/kernel/kthread.c index 430fd79cd3fe..44498522e5d5 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1161,3 +1161,16 @@ void kthread_destroy_worker(struct kthread_worker *worker) kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); + +DEFINE_KTHREAD_WORKER(kthread_global_worker); 
+EXPORT_SYMBOL(kthread_global_worker); + +__init void kthread_init_global_worker(void) +{ + kthread_global_worker.task = kthread_create(kthread_worker_fn, + &kthread_global_worker, + "kswork"); + if (WARN_ON(IS_ERR(kthread_global_worker.task))) + return; + wake_up_process(kthread_global_worker.task); +} -- 2.14.1
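A minimal usage sketch of the helper added above, with a made-up work item;
the callback runs in the global "kswork" worker created in
kthread_init_global_worker():

static void my_work_fn(struct kthread_work *work)
{
	/* runs in the global worker thread, preemptible context */
}

static DEFINE_KTHREAD_WORK(my_work, my_work_fn);

	/* queue from a driver path instead of a dedicated worker */
	kthread_schedule_work(&my_work);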
[PATCH RT 04/19] genirq: Handle missing work_struct in irq_set_affinity_notifier()
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit bbc4d2a7d6ff54ba923640d9a42c7bef7185fe98 ] The backported stable commit 59c39840f5abf ("genirq: Prevent use-after-free and work list corruption") added cancel_work_sync() on a work_struct element which is not available in RT. Replace cancel_work_sync() with kthread_cancel_work_sync() on RT. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: kernel/irq/manage.c --- kernel/irq/manage.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3d5b33fe874b..071691963f7b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -352,7 +352,9 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) raw_spin_unlock_irqrestore(&desc->lock, flags); if (old_notify) { -#ifndef CONFIG_PREEMPT_RT_BASE +#ifdef CONFIG_PREEMPT_RT_BASE + kthread_cancel_work_sync(¬ify->work); +#else cancel_work_sync(&old_notify->work); #endif kref_put(&old_notify->kref, old_notify->release); -- 2.14.1
[PATCH v4 2/2] sched/topology: Improve load balancing on AMD EPYC
SD_BALANCE_{FORK,EXEC} and SD_WAKE_AFFINE are stripped in sd_init() for any
sched domains with a NUMA distance greater than 2 hops (RECLAIM_DISTANCE).
The idea being that it's expensive to balance across domains that far apart.

However, as is rather unfortunately explained in commit 32e45ff43eaf ("mm:
increase RECLAIM_DISTANCE to 30") the value for RECLAIM_DISTANCE is based on
node distance tables from 2011-era hardware.

Current AMD EPYC machines have the following NUMA node distances:

node distances:
node   0   1   2   3   4   5   6   7
  0:  10  16  16  16  32  32  32  32
  1:  16  10  16  16  32  32  32  32
  2:  16  16  10  16  32  32  32  32
  3:  16  16  16  10  32  32  32  32
  4:  32  32  32  32  10  16  16  16
  5:  32  32  32  32  16  10  16  16
  6:  32  32  32  32  16  16  10  16
  7:  32  32  32  32  16  16  16  10

where 2 hops is 32.

The result is that the scheduler fails to load balance properly across NUMA
nodes on different sockets -- 2 hops apart.

For example, pinning 16 busy threads to NUMA nodes 0 (CPUs 0-7) and 4
(CPUs 32-39) like so,

  $ numactl -C 0-7,32-39 ./spinner 16

causes all threads to fork and remain on node 0 until the active balancer
kicks in after a few seconds and forcibly moves some threads to node 4.

Override node_reclaim_distance for AMD Zen.

Signed-off-by: Matt Fleming
Signed-off-by: Peter Zijlstra (Intel)
Acked-by: Mel Gorman
Cc: suravee.suthikulpa...@amd.com
Cc: Borislav Petkov
Cc: thomas.lenda...@amd.com
---
 arch/x86/kernel/cpu/amd.c | 5 +
 include/linux/topology.h | 14 ++
 kernel/sched/topology.c | 3 ++-
 mm/khugepaged.c | 2 +-
 mm/page_alloc.c | 2 +-
 5 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 8d4e50428b68..ceeb8afc7cf3 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -824,6 +825,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
 {
 	set_cpu_cap(c, X86_FEATURE_ZEN);
 
+#ifdef CONFIG_NUMA
+	node_reclaim_distance = 32;
+#endif
+
 	/*
 	 * Fix erratum 1076: CPB feature bit not being set in CPUID.
 	 * Always set it, except when running under a hypervisor.
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 47a3e3c08036..579522ec446c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -59,6 +59,20 @@ int arch_update_cpu_topology(void);
  */
 #define RECLAIM_DISTANCE 30
 #endif
+
+/*
+ * The following tunable allows platforms to override the default node
+ * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
+ * sufficiently fast that the default value actually hurts
+ * performance.
+ *
+ * AMD EPYC machines use this because even though the 2-hop distance
+ * is 32 (3.2x slower than a local memory access) performance actually
+ * *improves* if allowed to reclaim memory and load balance tasks
+ * between NUMA nodes 2-hops apart.
+ */
+extern int __read_mostly node_reclaim_distance;
+
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8f83e8e3ea9a..b5667a273bf6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1284,6 +1284,7 @@ static int sched_domains_curr_level;
 int			sched_max_numa_distance;
 static int		*sched_domains_numa_distance;
 static struct cpumask	***sched_domains_numa_masks;
+int __read_mostly	node_reclaim_distance = RECLAIM_DISTANCE;
 #endif
 
 /*
@@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->flags &= ~SD_PREFER_SIBLING;
 		sd->flags |= SD_SERIALIZE;
 
-		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+		if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
 			sd->flags &= ~(SD_BALANCE_EXEC |
 				       SD_BALANCE_FORK |
 				       SD_WAKE_AFFINE);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b23215..ccede2425c3f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid)
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		if (!khugepaged_node_load[i])
 			continue;
-		if (node_distance(nid, i) > RECLAIM_DISTANCE)
+		if (node_distance(nid, i) > node_reclaim_distance)
 			return true;
 	}
 	return false;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 272c6de1bf4e..0d54cd2c43a4 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3522,7 +3522,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 
 static bool zone_allows_recla
[PATCH 1/2] ia64: Make NUMA select SMP
While it does make sense to allow CONFIG_NUMA and !CONFIG_SMP in theory, it doesn't make much sense in practice. Follow other architectures and make CONFIG_NUMA select CONFIG_SMP. The motivation for this patch is to allow a new NUMA variable to be initialised in kernel/sched/topology.c. Signed-off-by: Matt Fleming Cc: Tony Luck Cc: Rik van Riel Cc: Peter Zijlstra --- arch/ia64/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 7468d8e50467..997baba02b70 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -389,6 +389,7 @@ config NUMA depends on !IA64_HP_SIM && !FLATMEM default y if IA64_SGI_SN2 select ACPI_NUMA if ACPI + select SMP help Say Y to compile the kernel to support NUMA (Non-Uniform Memory Access). This option is for configuring high-end multiprocessor -- 2.13.7
[PATCH RT 01/19] kthread: Use __RAW_SPIN_LOCK_UNLOCK to initialize kthread_worker lock
From: Tom Zanussi v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- commit 2a9060beefcf (kthread: convert worker lock to raw spinlock) forgot to update KTHREAD_WORKER_INIT() to use __RAW_SPIN_LOCK_UNLOCKED() instead of just __SPIN_LOCK_UNLOCKED() when it converted the lock to raw. Change it so that e.g. DEFINE_KTHREAD_WORKER() users don't error out. Signed-off-by: Tom Zanussi --- include/linux/kthread.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 4e0449df82c3..4e663f407bd7 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -105,7 +105,7 @@ struct kthread_delayed_work { }; #define KTHREAD_WORKER_INIT(worker){ \ - .lock = __SPIN_LOCK_UNLOCKED((worker).lock),\ + .lock = __RAW_SPIN_LOCK_UNLOCKED((worker).lock),\ .work_list = LIST_HEAD_INIT((worker).work_list),\ .delayed_work_list = LIST_HEAD_INIT((worker).delayed_work_list),\ } -- 2.14.1
[PATCH RT 16/19] futex: Make the futex_hash_bucket lock raw
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit f646521aadedab78801c9befe193e2e8a0c99298 ] Since commit 1a1fb985f2e2b ("futex: Handle early deadlock return correctly") we can deadlock while we attempt to acquire the HB lock if we fail to acquire the lock. The RT waiter (for the futex lock) is still enqueued and acquiring the HB lock may build up a lock chain which leads to a deadlock if the owner of the lock futex-lock holds the HB lock. Make the hash bucket lock raw so it does not participate in the lockchain. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: kernel/futex.c --- kernel/futex.c | 89 +- 1 file changed, 45 insertions(+), 44 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index ec90130cd809..0548070cda89 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -240,7 +240,7 @@ struct futex_q { struct plist_node list; struct task_struct *task; - spinlock_t *lock_ptr; + raw_spinlock_t *lock_ptr; union futex_key key; struct futex_pi_state *pi_state; struct rt_mutex_waiter *rt_waiter; @@ -261,7 +261,7 @@ static const struct futex_q futex_q_init = { */ struct futex_hash_bucket { atomic_t waiters; - spinlock_t lock; + raw_spinlock_t lock; struct plist_head chain; } cacheline_aligned_in_smp; @@ -926,7 +926,7 @@ void exit_pi_state_list(struct task_struct *curr) } raw_spin_unlock_irq(&curr->pi_lock); - spin_lock(&hb->lock); + raw_spin_lock(&hb->lock); raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); raw_spin_lock(&curr->pi_lock); /* @@ -936,7 +936,7 @@ void exit_pi_state_list(struct task_struct *curr) if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); + raw_spin_unlock(&hb->lock); put_pi_state(pi_state); continue; } @@ -948,7 +948,7 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_unlock(&curr->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - spin_unlock(&hb->lock); + raw_spin_unlock(&hb->lock); rt_mutex_futex_unlock(&pi_state->pi_mutex); put_pi_state(pi_state); @@ -1442,7 +1442,7 @@ static void __unqueue_futex(struct futex_q *q) { struct futex_hash_bucket *hb; - if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr)) + if (WARN_ON_SMP(!q->lock_ptr || !raw_spin_is_locked(q->lock_ptr)) || WARN_ON(plist_node_empty(&q->list))) return; @@ -1570,21 +1570,21 @@ static inline void double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { if (hb1 <= hb2) { - spin_lock(&hb1->lock); + raw_spin_lock(&hb1->lock); if (hb1 < hb2) - spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING); } else { /* hb1 > hb2 */ - spin_lock(&hb2->lock); - spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); + raw_spin_lock(&hb2->lock); + raw_spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING); } } static inline void double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) { - spin_unlock(&hb1->lock); + raw_spin_unlock(&hb1->lock); if (hb1 != hb2) - spin_unlock(&hb2->lock); + raw_spin_unlock(&hb2->lock); } /* @@ -1612,7 +1612,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) if (!hb_waiters_pending(hb)) goto out_put_key; - spin_lock(&hb->lock); + raw_spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (match_futex (&this->key, &key)) { @@ -1631,7 +1631,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, 
int nr_wake, u32 bitset) } } - spin_unlock(&hb->lock); + raw_spin_unlock(&hb->lock); wake_up_q(&wake_q); out_put_key: put_futex_key(&key); @@ -2236,7 +2236,8 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies smp_mb(); (A) */ + raw_spin_lock(&hb->lock); + return hb; } @@ -2244,7 +2245,7 @@ static inline void queue_unlock(struct futex_hash_bucket *hb) __rel
[PATCH RT 17/19] futex: Delay deallocation of pi_state
From: Thomas Gleixner v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit d7c7cf8cb68b7df17e6e50be1f25f35d83e686c7 ] On -RT we can't invoke kfree() in a non-preemptible context. Defer the deallocation of pi_state to preemptible context. Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/futex.c | 55 --- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0548070cda89..5f1cfa2f02b6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -822,13 +822,13 @@ static void get_pi_state(struct futex_pi_state *pi_state) * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. */ -static void put_pi_state(struct futex_pi_state *pi_state) +static struct futex_pi_state *__put_pi_state(struct futex_pi_state *pi_state) { if (!pi_state) - return; + return NULL; if (!atomic_dec_and_test(&pi_state->refcount)) - return; + return NULL; /* * If pi_state->owner is NULL, the owner is most probably dying @@ -848,9 +848,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); } - if (current->pi_state_cache) { - kfree(pi_state); - } else { + if (!current->pi_state_cache) { /* * pi_state->list is already empty. * clear pi_state->owner. @@ -859,6 +857,30 @@ static void put_pi_state(struct futex_pi_state *pi_state) pi_state->owner = NULL; atomic_set(&pi_state->refcount, 1); current->pi_state_cache = pi_state; + pi_state = NULL; + } + return pi_state; +} + +static void put_pi_state(struct futex_pi_state *pi_state) +{ + kfree(__put_pi_state(pi_state)); +} + +static void put_pi_state_atomic(struct futex_pi_state *pi_state, + struct list_head *to_free) +{ + if (__put_pi_state(pi_state)) + list_add(&pi_state->list, to_free); +} + +static void free_pi_state_list(struct list_head *to_free) +{ + struct futex_pi_state *p, *next; + + list_for_each_entry_safe(p, next, to_free, list) { + list_del(&p->list); + kfree(p); } } @@ -893,6 +915,7 @@ void exit_pi_state_list(struct task_struct *curr) struct futex_pi_state *pi_state; struct futex_hash_bucket *hb; union futex_key key = FUTEX_KEY_INIT; + LIST_HEAD(to_free); if (!futex_cmpxchg_enabled) return; @@ -937,7 +960,7 @@ void exit_pi_state_list(struct task_struct *curr) /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); raw_spin_unlock(&hb->lock); - put_pi_state(pi_state); + put_pi_state_atomic(pi_state, &to_free); continue; } @@ -956,6 +979,8 @@ void exit_pi_state_list(struct task_struct *curr) raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); + + free_pi_state_list(&to_free); } #endif @@ -1938,6 +1963,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + LIST_HEAD(to_free); if (nr_wake < 0 || nr_requeue < 0) return -EINVAL; @@ -2175,7 +2201,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, * object. */ this->pi_state = NULL; - put_pi_state(pi_state); + put_pi_state_atomic(pi_state, &to_free); /* * We stop queueing more waiters and let user * space deal with the mess. @@ -2192,7 +2218,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We * need to drop it here again. 
*/ - put_pi_state(pi_state); + put_pi_state_atomic(pi_state, &to_free); out_unlock: double_unlock_hb(hb1, hb2); @@ -2213,6 +2239,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, out_put_key1: put_futex_key(&key1); out: + free_pi_state_list(&to_free); return ret ? ret : task_count; } @@ -2350,13 +2377,16 @@ static int unqueue_me(struct futex_q *q) static void unqueue_me_pi(struct futex_q *q)
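The change follows a generic RT pattern worth spelling out; a sketch with a
made-up object type: collect objects on a local list while the raw lock is
held, then kfree() them once the context is preemptible again:

struct obj {
	struct list_head list;
	/* ... payload ... */
};

static void drop_obj(struct obj *o, raw_spinlock_t *lock)
{
	LIST_HEAD(to_free);
	struct obj *p, *next;

	raw_spin_lock(lock);
	/* kfree() may take sleeping locks on RT, so only defer here */
	list_add(&o->list, &to_free);
	raw_spin_unlock(lock);

	list_for_each_entry_safe(p, next, &to_free, list) {
		list_del(&p->list);
		kfree(p);
	}
}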
[PATCH RT 15/19] Revert "futex: workaround migrate_disable/enable in different context"
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit a71221d81cc4873891ae44f3aa02df596079b786 ] Drop the RT fixup, the futex code will be changed to avoid the need for the workaround. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: kernel/futex.c --- kernel/futex.c | 19 --- 1 file changed, 19 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index cb7e212fba0f..ec90130cd809 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2893,14 +2893,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, * before __rt_mutex_start_proxy_lock() is done. */ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); - /* -* the migrate_disable() here disables migration in the in_atomic() fast -* path which is enabled again in the following spin_unlock(). We have -* one migrate_disable() pending in the slow-path which is reversed -* after the raw_spin_unlock_irq() where we leave the atomic context. -*/ - migrate_disable(); - spin_unlock(q.lock_ptr); /* * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter @@ -2909,7 +2901,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, */ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); - migrate_enable(); if (ret) { if (ret == 1) @@ -3058,21 +3049,11 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) * rt_waiter. Also see the WARN in wake_futex_pi(). */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); - /* -* Magic trickery for now to make the RT migrate disable -* logic happy. The following spin_unlock() happens with -* interrupts disabled so the internal migrate_enable() -* won't undo the migrate_disable() which was issued when -* locking hb->lock. -*/ - migrate_disable(); spin_unlock(&hb->lock); /* drops pi_state->pi_mutex.wait_lock */ ret = wake_futex_pi(uaddr, uval, pi_state); - migrate_enable(); - put_pi_state(pi_state); /* -- 2.14.1
[PATCH RT 07/19] locking/lockdep: Don't complain about incorrect name for no validate class
From: Sebastian Andrzej Siewior

v4.14.137-rt65-rc1 stable review patch.
If anyone has any objections, please let me know.

---

[ Upstream commit 978315462d3ea3cf6cfacd34c563ec1eb02a3aa5 ]

It is possible to ignore the validation for a certain lock by using
lockdep_set_novalidate_class() on it. Each invocation will assign a new
name to the class it created for __lockdep_no_validate__. That means that
once lockdep_set_novalidate_class() has been used on two locks then
class->name won't match lock->name for the first lock, triggering the
warning.

So ignore a changed, non-matching ->name pointer for the special
__lockdep_no_validate__ class.

Signed-off-by: Sebastian Andrzej Siewior
Cc: Linus Torvalds
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Cc: Will Deacon
Link: http://lkml.kernel.org/r/20190517212234.32611-1-bige...@linutronix.de
Signed-off-by: Ingo Molnar
Signed-off-by: Tom Zanussi
---
 kernel/locking/lockdep.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e576d234f3ea..f194de27123d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -719,7 +719,8 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 			 * Huh! same key, different name? Did someone trample
 			 * on some memory? We're most confused.
 			 */
-			WARN_ON_ONCE(class->name != lock->name);
+			WARN_ON_ONCE(class->name != lock->name &&
+				     lock->key != &__lockdep_no_validate__);
 
 			return class;
 		}
 	}
-- 
2.14.1
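For illustration, a sketch of the (hypothetical) usage pattern that trips
the check being relaxed: lockdep_set_novalidate_class() stringifies the lock
argument as the class name, so two call sites hand the shared
__lockdep_no_validate__ key two different names:

static spinlock_t lock_a;
static spinlock_t lock_b;

static void init_locks(void)
{
	spin_lock_init(&lock_a);
	spin_lock_init(&lock_b);
	lockdep_set_novalidate_class(&lock_a);	/* class name "&lock_a" */
	lockdep_set_novalidate_class(&lock_b);	/* same key, name "&lock_b" */
}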
[PATCH RT 13/19] Revert "futex: Fix bug on when a requeued RT task times out"
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit f1a170cb3289a48df26cae3c60d77608f7a988bb ] Drop the RT fixup, the futex code will be changed to avoid the need for the workaround. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/locking/rtmutex.c| 31 +-- kernel/locking/rtmutex_common.h | 1 - 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1177f2815040..62914dde3f1c 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -144,8 +144,7 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) { - return waiter && waiter != PI_WAKEUP_INPROGRESS && - waiter != PI_REQUEUE_INPROGRESS; + return waiter && waiter != PI_WAKEUP_INPROGRESS; } /* @@ -2358,34 +2357,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (try_to_take_rt_mutex(lock, task, NULL)) return 1; -#ifdef CONFIG_PREEMPT_RT_FULL - /* -* In PREEMPT_RT there's an added race. -* If the task, that we are about to requeue, times out, -* it can set the PI_WAKEUP_INPROGRESS. This tells the requeue -* to skip this task. But right after the task sets -* its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then -* block on the spin_lock(&hb->lock), which in RT is an rtmutex. -* This will replace the PI_WAKEUP_INPROGRESS with the actual -* lock that it blocks on. We *must not* place this task -* on this proxy lock in that case. -* -* To prevent this race, we first take the task's pi_lock -* and check if it has updated its pi_blocked_on. If it has, -* we assume that it woke up and we return -EAGAIN. -* Otherwise, we set the task's pi_blocked_on to -* PI_REQUEUE_INPROGRESS, so that if the task is waking up -* it will know that we are in the process of requeuing it. -*/ - raw_spin_lock(&task->pi_lock); - if (task->pi_blocked_on) { - raw_spin_unlock(&task->pi_lock); - return -EAGAIN; - } - task->pi_blocked_on = PI_REQUEUE_INPROGRESS; - raw_spin_unlock(&task->pi_lock); -#endif - /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, RT_MUTEX_FULL_CHAINWALK); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 2a157c78e18c..53ca0242101a 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -132,7 +132,6 @@ enum rtmutex_chainwalk { * PI-futex support (proxy locking functions, etc.): */ #define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) -#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, -- 2.14.1
[PATCH RT 09/19] rcu: Don't allow to change rcu_normal_after_boot on RT
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit c6c058c10577815a2491ce661876cff00a4c3b15 ] On RT rcu_normal_after_boot is enabled by default. Don't allow to disable it on RT because the "expedited rcu" would introduce latency spikes. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/rcu/update.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2006a09680aa..307592810f6b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -67,7 +67,9 @@ module_param(rcu_expedited, int, 0); extern int rcu_normal; /* from sysctl */ module_param(rcu_normal, int, 0); static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT_FULL); +#ifndef CONFIG_PREEMPT_RT_FULL module_param(rcu_normal_after_boot, int, 0); +#endif #endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -- 2.14.1
[PATCH RT 19/19] Linux 4.14.137-rt65-rc1
From: Tom Zanussi v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- Signed-off-by: Tom Zanussi --- localversion-rt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/localversion-rt b/localversion-rt index 10474042df49..03188f3e7d8d 100644 --- a/localversion-rt +++ b/localversion-rt @@ -1 +1 @@ --rt64 +-rt65-rc1 -- 2.14.1
[PATCH RT 12/19] Revert "futex: Ensure lock/unlock symetry versus pi_lock and hash bucket lock"
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 6a773b70cf105b46298ed3b44e77c102ce31d9ec ] Drop the RT fixup, the futex code will be changed to avoid the need for the workaround. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/futex.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index ad0abb0e339f..07b148ad703a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -936,9 +936,7 @@ void exit_pi_state_list(struct task_struct *curr) if (head->next != next) { /* retain curr->pi_lock for the loop invariant */ raw_spin_unlock(&pi_state->pi_mutex.wait_lock); - raw_spin_unlock_irq(&curr->pi_lock); spin_unlock(&hb->lock); - raw_spin_lock_irq(&curr->pi_lock); put_pi_state(pi_state); continue; } -- 2.14.1
[PATCH RT 11/19] sched/core: Drop a preempt_disable_rt() statement
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 761126efdcbe3fa3e99c9079fa0ad6eca2f251f2 ] The caller holds a lock which already disables preemption. Drop the preempt_disable_rt() statement in get_nohz_timer_target(). Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: kernel/sched/core.c --- kernel/sched/core.c | 9 ++--- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7d2cc0715114..17da1c1aba56 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -583,14 +583,11 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(void) { - int i, cpu; + int i, cpu = smp_processor_id(); struct sched_domain *sd; - preempt_disable_rt(); - cpu = smp_processor_id(); - if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) - goto preempt_en_rt; + return cpu; rcu_read_lock(); for_each_domain(cpu, sd) { @@ -609,8 +606,6 @@ int get_nohz_timer_target(void) cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); -preempt_en_rt: - preempt_enable_rt(); return cpu; } -- 2.14.1
[PATCH RT 14/19] Revert "rtmutex: Handle the various new futex race conditions"
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 9e0265c21af4d6388d47dcd5ce20f76ec3a2e468 ] Drop the RT fixup, the futex code will be changed to avoid the need for the workaround. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- kernel/futex.c | 77 - kernel/locking/rtmutex.c| 36 --- kernel/locking/rtmutex_common.h | 2 -- 3 files changed, 21 insertions(+), 94 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 07b148ad703a..cb7e212fba0f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2165,16 +2165,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; - } else if (ret == -EAGAIN) { - /* -* Waiter was woken by timeout or -* signal and has set pi_blocked_on to -* PI_WAKEUP_INPROGRESS before we -* tried to enqueue it on the rtmutex. -*/ - this->pi_state = NULL; - put_pi_state(pi_state); - continue; } else if (ret) { /* * rt_mutex_start_proxy_lock() detected a @@ -3253,7 +3243,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, struct hrtimer_sleeper timeout, *to = NULL; struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb, *hb2; + struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; @@ -3311,55 +3301,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); - /* -* On RT we must avoid races with requeue and trying to block -* on two mutexes (hb->lock and uaddr2's rtmutex) by -* serializing access to pi_blocked_on with pi_lock. -*/ - raw_spin_lock_irq(¤t->pi_lock); - if (current->pi_blocked_on) { - /* -* We have been requeued or are in the process of -* being requeued. -*/ - raw_spin_unlock_irq(¤t->pi_lock); - } else { - /* -* Setting pi_blocked_on to PI_WAKEUP_INPROGRESS -* prevents a concurrent requeue from moving us to the -* uaddr2 rtmutex. After that we can safely acquire -* (and possibly block on) hb->lock. -*/ - current->pi_blocked_on = PI_WAKEUP_INPROGRESS; - raw_spin_unlock_irq(¤t->pi_lock); - - spin_lock(&hb->lock); - - /* -* Clean up pi_blocked_on. We might leak it otherwise -* when we succeeded with the hb->lock in the fast -* path. -*/ - raw_spin_lock_irq(¤t->pi_lock); - current->pi_blocked_on = NULL; - raw_spin_unlock_irq(¤t->pi_lock); - - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out_put_keys; - } + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; /* -* In order to be here, we have either been requeued, are in -* the process of being requeued, or requeue successfully -* acquired uaddr2 on our behalf. If pi_blocked_on was -* non-null above, we may be racing with a requeue. Do not -* rely on q->lock_ptr to be hb2->lock until after blocking on -* hb->lock or hb2->lock. The futex_requeue dropped our key1 -* reference and incremented our key2 reference count. +* In order for us to be here, we know our q.key == key2, and since +* we took the hb->lock above, we also know that futex_requeue() has +* completed and we no longer have to concern ourselves with a wakeup +* race with the atomic proxy lock acquisition by the requeue code. 
The +* futex_requeue dropped our key1 reference and incremented our key2 +* reference count. */ - hb2 = hash_futex(&key2); /* Check if the requeue code acquired the second futex for us. *
[PATCH RT 18/19] mm/zswap: Do not disable preemption in zswap_frontswap_store()
From: "Luis Claudio R. Goncalves" v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 4e4cf4be79635e67144632d9135286381acbc95a ] Zswap causes "BUG: scheduling while atomic" by blocking on a rt_spin_lock() with preemption disabled. The preemption is disabled by get_cpu_var() in zswap_frontswap_store() to protect the access of the zswap_dstmem percpu variable. Use get_locked_var() to protect the percpu zswap_dstmem variable, making the code preemptive. As get_cpu_ptr() also disables preemption, replace it by this_cpu_ptr() and remove the counterpart put_cpu_ptr(). Steps to Reproduce: 1. # grubby --args "zswap.enabled=1" --update-kernel DEFAULT 2. # reboot 3. Calculate the amount o memory to be used by the test: ---> grep MemAvailable /proc/meminfo ---> Add 25% ~ 50% to that value 4. # stress --vm 1 --vm-bytes ${MemAvailable+25%} --timeout 240s Usually, in less than 5 minutes the backtrace listed below appears, followed by a kernel panic: | BUG: scheduling while atomic: kswapd1/181/0x0002 | | Preemption disabled at: | [] zswap_frontswap_store+0x21a/0x6e1 | | Kernel panic - not syncing: scheduling while atomic | CPU: 14 PID: 181 Comm: kswapd1 Kdump: loaded Not tainted 5.0.14-rt9 #1 | Hardware name: AMD Pence/Pence, BIOS WPN2321X_Weekly_12_03_21 03/19/2012 | Call Trace: | panic+0x106/0x2a7 | __schedule_bug.cold+0x3f/0x51 | __schedule+0x5cb/0x6f0 | schedule+0x43/0xd0 | rt_spin_lock_slowlock_locked+0x114/0x2b0 | rt_spin_lock_slowlock+0x51/0x80 | zbud_alloc+0x1da/0x2d0 | zswap_frontswap_store+0x31a/0x6e1 | __frontswap_store+0xab/0x130 | swap_writepage+0x39/0x70 | pageout.isra.0+0xe3/0x320 | shrink_page_list+0xa8e/0xd10 | shrink_inactive_list+0x251/0x840 | shrink_node_memcg+0x213/0x770 | shrink_node+0xd9/0x450 | balance_pgdat+0x2d5/0x510 | kswapd+0x218/0x470 | kthread+0xfb/0x130 | ret_from_fork+0x27/0x50 Cc: stable...@vger.kernel.org Reported-by: Ping Fang Signed-off-by: Luis Claudio R. 
Goncalves Reviewed-by: Daniel Bristot de Oliveira Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: mm/zswap.c --- mm/zswap.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index ebb0bc88c5f7..a2b4e14f851c 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -953,6 +954,8 @@ static int zswap_shrink(void) return ret; } +/* protect zswap_dstmem from concurrency */ +static DEFINE_LOCAL_IRQ_LOCK(zswap_dstmem_lock); /* * frontswap hooks **/ @@ -1016,12 +1019,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* compress */ - dst = get_cpu_var(zswap_dstmem); - tfm = *get_cpu_ptr(entry->pool->tfm); + dst = get_locked_var(zswap_dstmem_lock, zswap_dstmem); + tfm = *this_cpu_ptr(entry->pool->tfm); src = kmap_atomic(page); ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen); kunmap_atomic(src); - put_cpu_ptr(entry->pool->tfm); if (ret) { ret = -EINVAL; goto put_dstmem; @@ -1045,7 +1047,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); - put_cpu_var(zswap_dstmem); + put_locked_var(zswap_dstmem_lock, zswap_dstmem); /* populate entry */ entry->offset = offset; @@ -1072,7 +1074,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, return 0; put_dstmem: - put_cpu_var(zswap_dstmem); + put_locked_var(zswap_dstmem_lock, zswap_dstmem); zswap_pool_put(entry->pool); freepage: zswap_entry_cache_free(entry); -- 2.14.1
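A condensed sketch of the local-lock idiom the patch applies, with made-up
names and the per-CPU buffer allocation elided; on non-RT kernels the local
lock collapses to the usual preemption protection, while on RT it is a
sleeping lock, so the section stays preemptible:

static DEFINE_PER_CPU(u8 *, my_dstmem);
static DEFINE_LOCAL_IRQ_LOCK(my_dstmem_lock);

static void compress_into_percpu_buf(void)
{
	u8 *dst = get_locked_var(my_dstmem_lock, my_dstmem);

	/* exclusive use of this CPU's buffer without preempt_disable() */

	put_locked_var(my_dstmem_lock, my_dstmem);
}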
Re: [PATCH 01/26] drm/dp_mst: Move link address dumping into a function
On Wed, Jul 17, 2019 at 09:42:24PM -0400, Lyude Paul wrote: > Since we're about to be calling this from multiple places. Also it makes > things easier to read! > > Cc: Juston Li > Cc: Imre Deak > Cc: Ville Syrjälä > Cc: Harry Wentland > Signed-off-by: Lyude Paul Reviewed-by: Daniel Vetter > --- > drivers/gpu/drm/drm_dp_mst_topology.c | 35 ++- > 1 file changed, 23 insertions(+), 12 deletions(-) > > diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c > b/drivers/gpu/drm/drm_dp_mst_topology.c > index 0984b9a34d55..998081b9b205 100644 > --- a/drivers/gpu/drm/drm_dp_mst_topology.c > +++ b/drivers/gpu/drm/drm_dp_mst_topology.c > @@ -2013,6 +2013,28 @@ static void drm_dp_queue_down_tx(struct > drm_dp_mst_topology_mgr *mgr, > mutex_unlock(&mgr->qlock); > } > > +static void > +drm_dp_dump_link_address(struct drm_dp_link_address_ack_reply *reply) > +{ > + struct drm_dp_link_addr_reply_port *port_reply; > + int i; > + > + for (i = 0; i < reply->nports; i++) { > + port_reply = &reply->ports[i]; > + DRM_DEBUG_KMS("port %d: input %d, pdt: %d, pn: %d, dpcd_rev: > %02x, mcs: %d, ddps: %d, ldps %d, sdp %d/%d\n", > + i, > + port_reply->input_port, > + port_reply->peer_device_type, > + port_reply->port_number, > + port_reply->dpcd_revision, > + port_reply->mcs, > + port_reply->ddps, > + port_reply->legacy_device_plug_status, > + port_reply->num_sdp_streams, > + port_reply->num_sdp_stream_sinks); > + } > +} > + > static void drm_dp_send_link_address(struct drm_dp_mst_topology_mgr *mgr, >struct drm_dp_mst_branch *mstb) > { > @@ -2038,18 +2060,7 @@ static void drm_dp_send_link_address(struct > drm_dp_mst_topology_mgr *mgr, > DRM_DEBUG_KMS("link address nak received\n"); > } else { > DRM_DEBUG_KMS("link address reply: %d\n", > txmsg->reply.u.link_addr.nports); > - for (i = 0; i < txmsg->reply.u.link_addr.nports; i++) { > - DRM_DEBUG_KMS("port %d: input %d, pdt: %d, pn: > %d, dpcd_rev: %02x, mcs: %d, ddps: %d, ldps %d, sdp %d/%d\n", i, > - > txmsg->reply.u.link_addr.ports[i].input_port, > - > txmsg->reply.u.link_addr.ports[i].peer_device_type, > - > txmsg->reply.u.link_addr.ports[i].port_number, > - > txmsg->reply.u.link_addr.ports[i].dpcd_revision, > -txmsg->reply.u.link_addr.ports[i].mcs, > -txmsg->reply.u.link_addr.ports[i].ddps, > - > txmsg->reply.u.link_addr.ports[i].legacy_device_plug_status, > - > txmsg->reply.u.link_addr.ports[i].num_sdp_streams, > - > txmsg->reply.u.link_addr.ports[i].num_sdp_stream_sinks); > - } > + drm_dp_dump_link_address(&txmsg->reply.u.link_addr); > > drm_dp_check_mstb_guid(mstb, > txmsg->reply.u.link_addr.guid); > > -- > 2.21.0 > -- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
[PATCH RT 06/19] sched/completion: Fix a lockup in wait_for_completion()
From: Corey Minyard

v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know.

---

[ Upstream commit f0837746a7e258abb35e65defc432ca66786347f ]

Consider following race:

 T0                    T1                       T2
 wait_for_completion()
  do_wait_for_common()
   __prepare_to_swait()
    schedule()
                       complete()
                        x->done++ (0 -> 1)
                        raw_spin_lock_irqsave()
                         swake_up_locked()      wait_for_completion()
                          wake_up_process(T0)
                          list_del_init()
                        raw_spin_unlock_irqrestore()
                                                raw_spin_lock_irq(&x->wait.lock)
 raw_spin_lock_irq(&x->wait.lock)               x->done != UINT_MAX, 1 -> 0
                                                raw_spin_unlock_irq(&x->wait.lock)
                                                return 1
 while (!x->done && timeout),
 continue loop, not enqueued
 on &x->wait

Basically, the problem is that the original wait queues used in completions did not remove the item from the queue in the wakeup function, but swake_up_locked() does. Fix it by adding the thread to the wait queue inside the do loop. The design of swait detects if it is already in the list and doesn't do the list add again.

Cc: stable...@vger.kernel.org
Fixes: a04ff6b4ec4ee7e ("completion: Use simple wait queues")
Signed-off-by: Corey Minyard
Acked-by: Steven Rostedt (VMware)
[bigeasy: shorten commit message ]
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Tom Zanussi
---
 kernel/sched/completion.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0fe2982e46a0..ac6d5efcd6ff 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -80,12 +80,12 @@ do_wait_for_common(struct completion *x,
 	if (!x->done) {
 		DECLARE_SWAITQUEUE(wait);
 
-		__prepare_to_swait(&x->wait, &wait);
 		do {
 			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
+			__prepare_to_swait(&x->wait, &wait);
 			__set_current_state(state);
 			raw_spin_unlock_irq(&x->wait.lock);
 			timeout = action(timeout);
--
2.14.1
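In other words, after the fix the waiter re-queues itself on every iteration, which is harmless when it is still queued and essential after swake_up_locked() has dequeued it. A simplified sketch of the resulting do_wait_for_common() loop (timeout plumbing and the final finish step elided):

	do {
		if (signal_pending_state(state, current)) {
			timeout = -ERESTARTSYS;
			break;
		}
		/* re-add ourselves; a no-op if we are still on the list */
		__prepare_to_swait(&x->wait, &wait);
		__set_current_state(state);
		raw_spin_unlock_irq(&x->wait.lock);
		timeout = action(timeout);
		raw_spin_lock_irq(&x->wait.lock);
	} while (!x->done && timeout);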
Re: [PATCH] rtc: bd70528: fix link error
Hello All, On Thu, 2019-08-08 at 10:29 +0800, Yuehaibing wrote: > On 2019/7/9 13:25, Vaittinen, Matti wrote: > > Hello Arnd, > > > > (Added Randy who also sent a patch to fix this) > > > > On Mon, 2019-07-08 at 18:10 +0200, Arnd Bergmann wrote: > > > On Mon, Jul 8, 2019 at 3:24 PM Vaittinen, Matti > > > wrote: > > > > > > > On Mon, 2019-07-08 at 14:41 +0200, Arnd Bergmann wrote: > > > > > With CONFIG_BD70528_WATCHDOG=m, a built-in rtc driver cannot > > > > > call > > > > > into the low-level functions that are part of the watchdog > > > > > module: > > > > > > > > > > drivers/rtc/rtc-bd70528.o: In function `bd70528_set_time': > > > > > rtc-bd70528.c:(.text+0x22c): undefined reference to > > > > > `bd70528_wdt_lock' > > > > > rtc-bd70528.c:(.text+0x2a8): undefined reference to > > > > > `bd70528_wdt_unlock' > > > > > drivers/rtc/rtc-bd70528.o: In function > > > > > `bd70528_set_rtc_based_timers': > > > > > rtc-bd70528.c:(.text+0x50c): undefined reference to > > > > > `bd70528_wdt_set' > > > > > > > > > > Add a Kconfig dependency on this driver, but still allow > > > > > compile- > > > > > testing > > > > > without it. > > > > > > > > > > Fixes: 32a4a4ebf768 ("rtc: bd70528: Initial support for ROHM > > > > > bd70528 > > > > > RTC") > > > > > Signed-off-by: Arnd Bergmann > > > > > --- > > > > > drivers/rtc/Kconfig | 2 ++ > > > > > 1 file changed, 2 insertions(+) > > > > > > > > > > diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig > > > > > index 3bfc04a86529..7b071cc74422 100644 > > > > > --- a/drivers/rtc/Kconfig > > > > > +++ b/drivers/rtc/Kconfig > > > > > @@ -498,8 +498,10 @@ config RTC_DRV_M41T80_WDT > > > > > help > > > > > If you say Y here you will get support for the > > > > > watchdog timer in the ST M41T60 and M41T80 RTC chips > > > > > series. > > > > > + > > > > > config RTC_DRV_BD70528 > > > > > tristate "ROHM BD70528 PMIC RTC" > > > > > + depends on BD70528_WATCHDOG || (COMPILE_TEST && > > > > > !BD70528_WATCHDOG) > > > > > > > > I am not fan of this. There may well be use-cases where it is > > > > desirable > > > > to leave the watchdog out but still compile in the RTC. This is > > > > why > > > > we > > > > have static inline stubs in the header for cases where WDG is > > > > not > > > > compiled in. (RTC does not need to stop WDG if WDG driver is > > > > not > > > > included) > > > > > > > > Adding dependency from RTC to MFD for BD70528 should be done - > > > > this > > > > will avoid most of the issues (And there has been few patches > > > > sent > > > > for > > > > this already). But that's still not complete solution because > > > > configuring RTC and MFD to be built in-kernel and WDG as a > > > > module > > > > will > > > > cause errors again. > > > > > > > > Is there a way to force WDG in-kernel if RTC is in-kernel? (Or > > > > disallow configuring RTC in-kernel if WDG is a module - while > > > > still > > > > allow RTC to be built without WDG? > > > > > > We could make this > > > > > > depends on BD70528_WATCHDOG || !BD70528_WATCHDOG > > > > > > which would allow building with or without watchdog, even when > > > not > > > compile-testing, but still disallow the combination of > > > . > > > > Thanks for teaching me Arnd! That is clever :) We need something > > like > > > > depends on MFD_ROHM_BD70528 && (BD70528_WATCHDOG || > > !BD70528_WATCHDOG) > > > > (I'm not sure if parenthesis are Ok and respected in Kconfig). 
I would never have thought of BD70528_WATCHDOG || !BD70528_WATCHDOG - it looks awkward at first sight but indeed - depends on BD70528_WATCHDOG disallows BD70528_WATCHDOG=m with RTC_DRV_BD70528=y while !BD70528_WATCHDOG allows BD70528_WATCHDOG=n. Brilliant and exactly what we need :) Thanks a bunch!

> > Hello Vaittinen,
> >
> > the issue still exists in linux-next 20190807, any plan?

Sorry folks. I thought Arnd would send a new patch - I didn't want to steal his work ;) I will be back to my normal life next week so I will send a patch on Monday if the issue is still open!

Br, Matti
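Putting the thread's two pieces together, the resulting entry would presumably read as follows (a sketch; Kconfig does accept the parentheses Matti was unsure about):

	config RTC_DRV_BD70528
		tristate "ROHM BD70528 PMIC RTC"
		depends on MFD_ROHM_BD70528 && (BD70528_WATCHDOG || !BD70528_WATCHDOG)

The seemingly tautological second term forbids exactly one combination, RTC_DRV_BD70528=y with BD70528_WATCHDOG=m, while still permitting BD70528_WATCHDOG=n.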
[PATCH RT 10/19] pci/switchtec: fix stream_open.cocci warnings
From: kbuild test robot v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 9462c69e29307adc95c289f50839d5d683973891 ] drivers/pci/switch/switchtec.c:395:1-17: ERROR: switchtec_fops: .read() can deadlock .write(); change nonseekable_open -> stream_open to fix. Generated by: scripts/coccinelle/api/stream_open.cocci Cc: Kirill Smelkov Cc: Julia Lawall Fixes: 8a29a3bae2a2 ("pci/switchtec: Don't use completion's wait queue") Cc: stable...@vger.kernel.org # where it applies to Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1904131849350.2536@hadrien Signed-off-by: kbuild test robot Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- drivers/pci/switch/switchtec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/switch/switchtec.c b/drivers/pci/switch/switchtec.c index 69875a196ad8..2b6641c9e868 100644 --- a/drivers/pci/switch/switchtec.c +++ b/drivers/pci/switch/switchtec.c @@ -625,7 +625,7 @@ static int switchtec_dev_open(struct inode *inode, struct file *filp) return PTR_ERR(stuser); filp->private_data = stuser; - nonseekable_open(inode, filp); + stream_open(inode, filp); dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser); -- 2.14.1
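For context, the conversion the coccinelle rule asks for is mechanical; a minimal sketch of the pattern in an open handler (illustrative, not from the switchtec driver):

	static int example_open(struct inode *inode, struct file *filp)
	{
		/*
		 * Like nonseekable_open(), but additionally marks the file
		 * as stream-like (FMODE_STREAM), so read() and write() do
		 * not serialize on f_pos and cannot deadlock each other.
		 */
		return stream_open(inode, filp);
	}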
[PATCH RT 00/19] Linux v4.14.137-rt65-rc1
From: Tom Zanussi Dear RT Folks, This is the RT stable review cycle of patch 4.14.137-rt65-rc1. Please scream at me if I messed something up. Please test the patches too. The -rc release will be uploaded to kernel.org and will be deleted when the final release is out. This is just a review release (or release candidate). The pre-releases will not be pushed to the git repository, only the final release is. If all goes well, this patch will be converted to the next main release on 2019-08-15. To build 4.14.137-rt65-rc1 directly, the following patches should be applied: https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.14.tar.xz https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.14.137.xz https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/patch-4.14.137-rt65-rc1.patch.xz You can also build from 4.14.137-rt64 by applying the incremental patch: https://www.kernel.org/pub/linux/kernel/projects/rt/4.14/incr/patch-4.14.137-rt64-rt65-rc1.patch.xz Enjoy, -- Tom Corey Minyard (1): sched/completion: Fix a lockup in wait_for_completion() Luis Claudio R. Goncalves (1): mm/zswap: Do not disable preemption in zswap_frontswap_store() Sebastian Andrzej Siewior (13): kthread: add a global worker thread. genirq: Do not invoke the affinity callback via a workqueue on RT genirq: Handle missing work_struct in irq_set_affinity_notifier() locking/rwsem: Rename rwsem_rt.h to rwsem-rt.h locking/lockdep: Don't complain about incorrect name for no validate class arm: imx6: cpuidle: Use raw_spinlock_t rcu: Don't allow to change rcu_normal_after_boot on RT sched/core: Drop a preempt_disable_rt() statement Revert "futex: Ensure lock/unlock symetry versus pi_lock and hash bucket lock" Revert "futex: Fix bug on when a requeued RT task times out" Revert "rtmutex: Handle the various new futex race conditions" Revert "futex: workaround migrate_disable/enable in different context" futex: Make the futex_hash_bucket lock raw Thomas Gleixner (1): futex: Delay deallocation of pi_state Tom Zanussi (2): kthread: Use __RAW_SPIN_LOCK_UNLOCK to initialize kthread_worker lock Linux 4.14.137-rt65-rc1 kbuild test robot (1): pci/switchtec: fix stream_open.cocci warnings arch/arm/mach-imx/cpuidle-imx6q.c| 10 +- drivers/block/loop.c | 2 +- drivers/pci/switch/switchtec.c | 2 +- drivers/spi/spi-rockchip.c | 1 + include/linux/blk-cgroup.h | 1 + include/linux/interrupt.h| 5 +- include/linux/kthread-cgroup.h | 17 +++ include/linux/kthread.h | 10 +- include/linux/{rwsem_rt.h => rwsem-rt.h} | 0 include/linux/rwsem.h| 2 +- init/main.c | 1 + kernel/futex.c | 232 +-- kernel/irq/manage.c | 23 +-- kernel/kthread.c | 13 ++ kernel/locking/lockdep.c | 3 +- kernel/locking/rtmutex.c | 65 + kernel/locking/rtmutex_common.h | 3 - kernel/rcu/update.c | 2 + kernel/sched/completion.c| 2 +- kernel/sched/core.c | 9 +- localversion-rt | 2 +- mm/zswap.c | 12 +- 22 files changed, 179 insertions(+), 238 deletions(-) create mode 100644 include/linux/kthread-cgroup.h rename include/linux/{rwsem_rt.h => rwsem-rt.h} (100%) -- 2.14.1
[PATCH RT 08/19] arm: imx6: cpuidle: Use raw_spinlock_t
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 40d0332ec8312e9c090f0a5414d9c90e12b13611 ] The idle call back is invoked with disabled interrupts and requires raw_spinlock_t locks to work. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- arch/arm/mach-imx/cpuidle-imx6q.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm/mach-imx/cpuidle-imx6q.c b/arch/arm/mach-imx/cpuidle-imx6q.c index 326e870d7123..d9ac80aa1eb0 100644 --- a/arch/arm/mach-imx/cpuidle-imx6q.c +++ b/arch/arm/mach-imx/cpuidle-imx6q.c @@ -17,22 +17,22 @@ #include "hardware.h" static int num_idle_cpus = 0; -static DEFINE_SPINLOCK(cpuidle_lock); +static DEFINE_RAW_SPINLOCK(cpuidle_lock); static int imx6q_enter_wait(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - spin_lock(&cpuidle_lock); + raw_spin_lock(&cpuidle_lock); if (++num_idle_cpus == num_online_cpus()) imx6_set_lpm(WAIT_UNCLOCKED); - spin_unlock(&cpuidle_lock); + raw_spin_unlock(&cpuidle_lock); cpu_do_idle(); - spin_lock(&cpuidle_lock); + raw_spin_lock(&cpuidle_lock); if (num_idle_cpus-- == num_online_cpus()) imx6_set_lpm(WAIT_CLOCKED); - spin_unlock(&cpuidle_lock); + raw_spin_unlock(&cpuidle_lock); return index; } -- 2.14.1
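Background for the change, as a sketch with illustrative names: on PREEMPT_RT a spinlock_t becomes a sleeping rt_mutex-based lock, which must not be taken where interrupts are disabled, as in the idle callback; raw_spinlock_t keeps the classic spinning behavior on all configurations:

	static DEFINE_SPINLOCK(may_sleep_on_rt);   /* rt_mutex under PREEMPT_RT */
	static DEFINE_RAW_SPINLOCK(always_spins);  /* real spinlock everywhere */

	static void idle_callback(void)
	{
		/* IRQs are off here; only the raw lock is legal on -rt */
		raw_spin_lock(&always_spins);
		/* keep the critical section short and bounded */
		raw_spin_unlock(&always_spins);
	}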
[PATCH RT 05/19] locking/rwsem: Rename rwsem_rt.h to rwsem-rt.h
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit fc7a6bdcce83ce162c32d991f0ac8e56ea260f5b ] Rename rwsem_rt.h to rwsem-rt.h to remain consistent with rwsem-rt.c. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi --- include/linux/{rwsem_rt.h => rwsem-rt.h} | 0 include/linux/rwsem.h| 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename include/linux/{rwsem_rt.h => rwsem-rt.h} (100%) diff --git a/include/linux/rwsem_rt.h b/include/linux/rwsem-rt.h similarity index 100% rename from include/linux/rwsem_rt.h rename to include/linux/rwsem-rt.h diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 513df11a364e..ac0857d60e04 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -21,7 +21,7 @@ #endif #ifdef CONFIG_PREEMPT_RT_FULL -#include +#include #else /* PREEMPT_RT_FULL */ struct rw_semaphore; -- 2.14.1
[PATCH RT 03/19] genirq: Do not invoke the affinity callback via a workqueue on RT
From: Sebastian Andrzej Siewior v4.14.137-rt65-rc1 stable review patch. If anyone has any objections, please let me know. --- [ Upstream commit 2122adbe011cdc0eb62ad62494e181005b23c76a ] Joe Korty reported, that __irq_set_affinity_locked() schedules a workqueue while holding a rawlock which results in a might_sleep() warning. This patch uses swork_queue() instead. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Tom Zanussi Conflicts: include/linux/interrupt.h kernel/irq/manage.c --- include/linux/interrupt.h | 5 ++--- kernel/irq/manage.c | 19 --- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 0f25fa19b2d8..233e3c027f53 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -230,7 +230,6 @@ extern void resume_device_irqs(void); * struct irq_affinity_notify - context for notification of IRQ affinity changes * @irq: Interrupt to which notification applies * @kref: Reference count, for internal use - * @swork: Swork item, for internal use * @work: Work item, for internal use * @notify:Function to be called on change. This will be * called in process context. @@ -243,7 +242,7 @@ struct irq_affinity_notify { unsigned int irq; struct kref kref; #ifdef CONFIG_PREEMPT_RT_BASE - struct swork_event swork; + struct kthread_work work; #else struct work_struct work; #endif diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index f9415590661c..3d5b33fe874b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -228,7 +228,7 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, kref_get(&desc->affinity_notify->kref); #ifdef CONFIG_PREEMPT_RT_BASE - swork_queue(&desc->affinity_notify->swork); + kthread_schedule_work(&desc->affinity_notify->work); #else schedule_work(&desc->affinity_notify->work); #endif @@ -293,21 +293,11 @@ static void _irq_affinity_notify(struct irq_affinity_notify *notify) } #ifdef CONFIG_PREEMPT_RT_BASE -static void init_helper_thread(void) -{ - static int init_sworker_once; - - if (init_sworker_once) - return; - if (WARN_ON(swork_get())) - return; - init_sworker_once = 1; -} -static void irq_affinity_notify(struct swork_event *swork) +static void irq_affinity_notify(struct kthread_work *work) { struct irq_affinity_notify *notify = - container_of(swork, struct irq_affinity_notify, swork); + container_of(work, struct irq_affinity_notify, work); _irq_affinity_notify(notify); } @@ -350,8 +340,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) notify->irq = irq; kref_init(¬ify->kref); #ifdef CONFIG_PREEMPT_RT_BASE - INIT_SWORK(¬ify->swork, irq_affinity_notify); - init_helper_thread(); + kthread_init_work(¬ify->work, irq_affinity_notify); #else INIT_WORK(¬ify->work, irq_affinity_notify); #endif -- 2.14.1
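A condensed sketch of the kthread_work pattern the patch moves to (names are illustrative; kthread_schedule_work() is the -rt tree's helper from the "kthread: add a global worker thread" patch queued in this same series):

	static void my_affinity_notify(struct kthread_work *work)
	{
		/* runs in the worker kthread: preemptible task context */
	}

	static struct kthread_work my_work;

	static void setup(void)
	{
		kthread_init_work(&my_work, my_affinity_notify);
	}

	static void hot_path(void)	/* may hold a raw spinlock */
	{
		/* only queues the work and wakes the worker; does not sleep */
		kthread_schedule_work(&my_work);
	}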
Re: [PATCH v2] pci: Kconfig: select PCI_MSI_IRQ_DOMAIN by default on RISC-V
Hi Paul, Wesley, On Thu, Jul 25, 2019 at 02:28:07PM -0700, Paul Walmsley wrote: > From: Wesley Terpstra > > This is part of adding support for RISC-V systems with PCIe host > controllers that support message-signaled interrupts. > > Signed-off-by: Wesley Terpstra > [paul.walms...@sifive.com: wrote patch description; split this > patch from the arch/riscv patch] > Signed-off-by: Paul Walmsley > --- > drivers/pci/Kconfig | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig > index 2ab92409210a..beb3408a0272 100644 > --- a/drivers/pci/Kconfig > +++ b/drivers/pci/Kconfig > @@ -52,7 +52,7 @@ config PCI_MSI > If you don't know what to do here, say Y. > > config PCI_MSI_IRQ_DOMAIN > - def_bool ARC || ARM || ARM64 || X86 > + def_bool ARC || ARM || ARM64 || X86 || RISCV The other arches listed here either supply their own include/asm/msi.h or generate it: $ ls arch/*/include/asm/msi.h arch/x86/include/asm/msi.h $ grep msi.h arch/*/include/asm/Kbuild arch/arc/include/asm/Kbuild:generic-y += msi.h arch/arm64/include/asm/Kbuild:generic-y += msi.h arch/arm/include/asm/Kbuild:generic-y += msi.h arch/mips/include/asm/Kbuild:generic-y += msi.h arch/powerpc/include/asm/Kbuild:generic-y += msi.h arch/sparc/include/asm/Kbuild:generic-y += msi.h For example, see f8430eae9f1b ("PCI/MSI: Enable PCI_MSI_IRQ_DOMAIN support for ARC") be091d468a0a ("arm64: PCI/MSI: Use asm-generic/msi.h") 0ab089c2548c ("ARM: Add msi.h to Kbuild") I didn't look into the details of msi.h generation, but I assume RISC-V needs to do something similar? If so, I think that should be part of this patch to avoid issues. If CONFIG_GENERIC_MSI_IRQ_DOMAIN is defined, include/linux/msi.h #includes and I don't see where that would come from. > depends on PCI_MSI > select GENERIC_MSI_IRQ_DOMAIN Bjorn
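If RISC-V follows the precedent Bjorn quotes for arm, arm64, mips and friends, the missing piece would presumably be a one-line Kbuild addition (untested sketch):

	# arch/riscv/include/asm/Kbuild
	generic-y += msi.h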
Re: [PATCH 1/2] genirq: introduce update_irq_devid()
On Thu, 8 Aug 2019, Ben Luo wrote: > +int update_irq_devid(unsigned int irq, void *dev_id, void *new_dev_id) > +{ > + struct irq_desc *desc = irq_to_desc(irq); > + struct irqaction *action, **action_ptr; > + unsigned long flags; > + > + WARN(in_interrupt(), > + "Trying to update IRQ %d from IRQ context!\n", irq); This is broken. The function needs to return on that condition. Actually it cannot even be called from non-preemptible code. What's worse is that if the interrupt in question is handled concurrently, then it will either see the old or the new dev_id and because the interrupt handler loop runs with desc->lock dropped even more crap can happen because dev_id can be subject to load and store tearing. Staring at that, I see that there is the same issue in setup_irq() and free_irq(). It's actually worse there. I'll have a look. > + /* > + * There can be multiple actions per IRQ descriptor, find the right > + * one based on the dev_id: > + */ > + action_ptr = &desc->action; > + for (;;) { > + action = *action_ptr; > + > + if (!action) { > + WARN(1, "Trying to update already-free IRQ %d\n", irq); That's wrong in two aspects: 1) The warn should be outside of the locked region. 2) Just having the irq number is not useful for debugging either when the interrupt is shared. > + raw_spin_unlock_irqrestore(&desc->lock, flags); > + chip_bus_sync_unlock(desc); > + return -ENXIO; > + } > + > + if (action->dev_id == dev_id) { > + action->dev_id = new_dev_id; > + break; > + } > + action_ptr = &action->next; > + } > + > + raw_spin_unlock_irqrestore(&desc->lock, flags); > + chip_bus_sync_unlock(desc); > + > + /* > + * Make sure it's not being used on another CPU: > + * There is a risk of UAF for old *dev_id, if it is > + * freed in a short time after this func returns > + */ > + synchronize_irq(irq); > + > + return 0; > +} > +EXPORT_SYMBOL(update_irq_devid); EXPORT_SYMBOL_GPL() please. Thanks, tglx
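Concretely, making the first warning bail out instead of falling through could look like this (a sketch; WARN() returns the value of its condition, and the errno choice here is illustrative):

	if (WARN(in_interrupt(),
		 "Trying to update IRQ %d from IRQ context!\n", irq))
		return -EBUSY;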
Re: [PATCH net] net: phy: rtl8211f: do a double read to get real time link status
On 08.08.2019 21:40, Andrew Lunn wrote:
>> @@ -568,6 +568,11 @@ int phy_start_aneg(struct phy_device *phydev)
>>  	if (err < 0)
>>  		goto out_unlock;
>>
>> +	/* The PHY may not yet have cleared aneg-completed and link-up bit
>> +	 * w/o this delay when the following read is done.
>> +	 */
>> +	usleep_range(1000, 2000);
>> +
>
> Hi Heiner
>
> Does 802.3 C22 say anything about this?
>
C22 says: "The Auto-Negotiation process shall be restarted by setting bit 0.9 to a logic one. This bit is self-clearing, and a PHY shall return a value of one in bit 0.9 until the Auto-Negotiation process has been initiated." Maybe we should read bit 0.9 in genphy_update_link() after having read BMSR and report aneg-complete and link-up as false (regardless of their current value) if 0.9 is set.

> If this PHY is broken with respect to the standard, I would prefer the
> workaround is in the PHY specific driver code, not generic core code.
>
Based on the C22 statement above the PHY may not be broken, and the typical time between two MDIO accesses is sufficient for the PHY to clear the bits. I think of MDIO bus access functions in network chips that have a 10us-20us delay after each MDIO access. On HNS3 this may not be the case.

> Andrew
>
Heiner
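Heiner's suggestion, sketched against genphy_update_link() (illustrative, not a tested patch; MII_BMCR and BMCR_ANRESTART are the mii.h names for register 0 and bit 0.9, and the phydev fields assumed here follow this era's phylib):

	int bmcr = phy_read(phydev, MII_BMCR);

	if (bmcr < 0)
		return bmcr;

	/*
	 * Bit 0.9 is self-clearing but stays set until autoneg has
	 * actually been initiated; until then, treat the BMSR status
	 * bits as stale.
	 */
	if (bmcr & BMCR_ANRESTART) {
		phydev->link = 0;
		phydev->autoneg_complete = 0;
		return 0;
	}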
Re: i2c: imx: support slave mode for imx I2C driver
On Thu, Aug 08, 2019 at 11:53:43AM +0800, Biwen Li wrote: > The patch supports slave mode for imx I2C driver > > Signed-off-by: Biwen Li Wow, this is much simpler than the other approach flying around: http://patchwork.ozlabs.org/patch/1124048/ Can this one be master and slave on the same bus, too? CCing the author of the other patch. > --- > drivers/i2c/busses/i2c-imx.c | 199 --- > 1 file changed, 185 insertions(+), 14 deletions(-) > > diff --git a/drivers/i2c/busses/i2c-imx.c b/drivers/i2c/busses/i2c-imx.c > index b1b8b938d7f4..f7583a9fa56f 100644 > --- a/drivers/i2c/busses/i2c-imx.c > +++ b/drivers/i2c/busses/i2c-imx.c > @@ -202,6 +202,9 @@ struct imx_i2c_struct { > struct pinctrl_state *pinctrl_pins_gpio; > > struct imx_i2c_dma *dma; > +#if IS_ENABLED(CONFIG_I2C_SLAVE) > + struct i2c_client *slave; > +#endif /* CONFIG_I2C_SLAVE */ > }; > > static const struct imx_i2c_hwdata imx1_i2c_hwdata = { > @@ -583,23 +586,40 @@ static void i2c_imx_stop(struct imx_i2c_struct *i2c_imx) > imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); > } > > -static irqreturn_t i2c_imx_isr(int irq, void *dev_id) > +/* Clear interrupt flag bit */ > +static void i2c_imx_clr_if_bit(struct imx_i2c_struct *i2c_imx) > { > - struct imx_i2c_struct *i2c_imx = dev_id; > - unsigned int temp; > + unsigned int status; > > - temp = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > - if (temp & I2SR_IIF) { > - /* save status register */ > - i2c_imx->i2csr = temp; > - temp &= ~I2SR_IIF; > - temp |= (i2c_imx->hwdata->i2sr_clr_opcode & I2SR_IIF); > - imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2SR); > - wake_up(&i2c_imx->queue); > - return IRQ_HANDLED; > - } > + status = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > + status &= ~I2SR_IIF; > + status |= (i2c_imx->hwdata->i2sr_clr_opcode & I2SR_IIF); > + imx_i2c_write_reg(status, i2c_imx, IMX_I2C_I2SR); > +} > + > +/* Clear arbitration lost bit */ > +static void i2c_imx_clr_al_bit(struct imx_i2c_struct *i2c_imx) > +{ > + unsigned int status; > + > + status = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > + status &= ~I2SR_IAL; > + imx_i2c_write_reg(status, i2c_imx, IMX_I2C_I2SR); > +} > > - return IRQ_NONE; > +static irqreturn_t i2c_imx_master_isr(struct imx_i2c_struct *i2c_imx) > +{ > + unsigned int status; > + > + dev_dbg(&i2c_imx->adapter.dev, "<%s>: master interrupt\n", __func__); > + > + /* Save status register */ > + status = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > + i2c_imx->i2csr = status | I2SR_IIF; > + > + wake_up(&i2c_imx->queue); > + > + return IRQ_HANDLED; > } > > static int i2c_imx_dma_write(struct imx_i2c_struct *i2c_imx, > @@ -1043,11 +1063,162 @@ static u32 i2c_imx_func(struct i2c_adapter *adapter) > | I2C_FUNC_SMBUS_READ_BLOCK_DATA; > } > > +#if IS_ENABLED(CONFIG_I2C_SLAVE) > +static void i2c_imx_slave_init(struct imx_i2c_struct *i2c_imx) > +{ > + unsigned int temp; > + > + dev_dbg(&i2c_imx->adapter.dev, "<%s>\n", __func__); > + > + /* Set slave addr. 
*/ > + imx_i2c_write_reg((i2c_imx->slave->addr << 1), i2c_imx, IMX_I2C_IADR); > + > + /* Disable i2c module */ > + temp = i2c_imx->hwdata->i2cr_ien_opcode > + ^ I2CR_IEN; > + imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); > + > + /* Reset status register */ > + imx_i2c_write_reg(i2c_imx->hwdata->i2sr_clr_opcode, i2c_imx, > + IMX_I2C_I2SR); > + > + /* Enable module and enable interrupt from i2c module */ > + temp = i2c_imx->hwdata->i2cr_ien_opcode > + | I2CR_IIEN; > + imx_i2c_write_reg(temp, i2c_imx, IMX_I2C_I2CR); > + > + /* Wait controller to be stable */ > + usleep_range(50, 150); > +} > + > +static irqreturn_t i2c_imx_slave_isr(struct imx_i2c_struct *i2c_imx) > +{ > + unsigned int status, ctl; > + u8 value; > + > + if (!i2c_imx->slave) { > + dev_err(&i2c_imx->adapter.dev, "cannot deal with slave irq, i2c_imx->slave is null"); > + return IRQ_NONE; > + } > + > + status = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2SR); > + ctl = imx_i2c_read_reg(i2c_imx, IMX_I2C_I2CR); > + if (status & I2SR_IAL) { /* Arbitration lost */ > + i2c_imx_clr_al_bit(i2c_imx); > + } else if (status & I2SR_IAAS) { /* Addressed as a slave */ > + if (status & I2SR_SRW) { /* Master wants to read from us */ > + dev_dbg(&i2c_imx->adapter.dev, "read requested"); > + i2c_slave_event(i2c_imx->slave, I2C_SLAVE_READ_REQUESTED, &value); > + > + /* Slave transmit */ > + ctl |= I2CR_MTX; > + imx_i2c_write_reg(ctl, i2c_imx, IMX_I2C_I2CR); > + > + /* Send data */ > + imx_i2c
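For reference, the i2c_slave_event() calls above follow the generic slave framework contract; a backend's event handler has roughly this shape (a sketch of the framework usage, not part of the patch):

	static int my_slave_cb(struct i2c_client *client,
			       enum i2c_slave_event event, u8 *val)
	{
		switch (event) {
		case I2C_SLAVE_WRITE_REQUESTED:	/* master will write to us */
			break;
		case I2C_SLAVE_WRITE_RECEIVED:	/* *val holds one received byte */
			break;
		case I2C_SLAVE_READ_REQUESTED:	/* place first byte to send in *val */
		case I2C_SLAVE_READ_PROCESSED:	/* place next byte to send in *val */
			*val = 0xff;		/* illustrative payload */
			break;
		case I2C_SLAVE_STOP:		/* stop condition seen on the bus */
			break;
		}
		return 0;
	}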
Re: [PATCH v2 2/2] nvme-pci: Allow PCI bus-level PM to be used if ASPM is disabled
On Thu, Aug 08, 2019 at 01:39:54PM -0500, Bjorn Helgaas wrote:
> On Thu, Aug 08, 2019 at 04:47:45PM +0200, Rafael J. Wysocki wrote:
> > On Thu, Aug 8, 2019 at 3:43 PM Bjorn Helgaas wrote:
> >
> > > IIUC the NVMe device will go to the desired package idle state if
> > > the link is in L0s or L1, but not if the link is in L0. I don't
> > > understand that connection; AFAIK that would be something outside
> > > the scope of the PCIe spec.
> >
> > Yes, it is outside of the PCIe spec.
> >
> > No, this is not about the NVMe device, it is about the Intel SoC
> > (System-on-a-Chip) the platform is based on.
>
> Ah. So this problem could occur with any device, not just NVMe? If
> so, how do you address that? Obviously you don't want to patch all
> drivers this way.

We discovered this when using an NVMe protocol specific power setting, so that part is driver specific. We just have to ensure device generic dependencies are met in order to achieve our power target. So in that sense, I think you would need to patch all drivers if they're also using protocol specific settings incorrectly. Granted, the NVMe specification doesn't detail what PCIe settings may prevent NVMe power management from hitting the objective, but I think ASPM enabled makes sense.
Re: [PATCH] arch/x86/kernel/cpu/umwait.c - remove unused variable
Valdis, On Thu, 8 Aug 2019, Valdis Klētnieks wrote: I really appreciate your work, but can you please refrain from using file names as prefixes? git log $FILE gives you usually a pretty good hint what the proper prefix is: bd9a0c97e53c ("x86/umwait: Add sysfs interface to control umwait maximum time") ff4b353f2ef9 ("x86/umwait: Add sysfs interface to control umwait C0.2 state") bd688c69b7e6 ("x86/umwait: Initialize umwait control values") See? > We get a warning when building with W=1: Please avoid 'We/I' in changelogs. > CC arch/x86/kernel/cpu/umwait.o > arch/x86/kernel/cpu/umwait.c: In function 'umwait_init': > arch/x86/kernel/cpu/umwait.c:183:6: warning: variable 'ret' set but not used > [-Wunused-but-set-variable] > 183 | int ret; > | ^~~ > > And indeed, we don't do anything with it, so clean it up. Well, the question is whether removing the variable is the right thing to do. > Signed-off-by: Valdis Kletnieks > > diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c > index 6a204e7336c1..3d1d3952774a 100644 > --- a/arch/x86/kernel/cpu/umwait.c > +++ b/arch/x86/kernel/cpu/umwait.c > @@ -180,12 +180,11 @@ static struct attribute_group umwait_attr_group = { > static int __init umwait_init(void) > { > struct device *dev; > - int ret; > > if (!boot_cpu_has(X86_FEATURE_WAITPKG)) > return -ENODEV; > > - ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", > + cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", > umwait_cpu_online, NULL); If that fails then umwait is broken. So instead of removing it, this should actually check the return code and act accordingly. Fenghua? > register_syscore_ops(&umwait_syscore_ops); Thanks, tglx
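Acting on the return code, as suggested, would look something like this (a sketch; note that cpuhp_setup_state() with CPUHP_AP_ONLINE_DYN returns a positive dynamic state number on success, so only negative values are errors):

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online",
				umwait_cpu_online, NULL);
	if (ret < 0)
		return ret;

	register_syscore_ops(&umwait_syscore_ops);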
[ANNOUNCE] 4.4.188-rt185
Hello RT Folks!

I'm pleased to announce the 4.4.188-rt185 stable release. This release is just an update to the new stable 4.4.188 version and no RT specific changes have been made.

The known issue from last time is now resolved. The missing patch for -rt is now also part of stable: 1ab1512366d4 ("mm, vmstat: make quiet_vmstat lighter"). There was a patch missing, fece2f828ffe ("vmstat: Remove BUG_ON from vmstat_update"). With this patch the NVIDIA boards (at least Tegra K1) should work again.

You can get this release via the git tree at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git

  branch: v4.4-rt
  Head SHA1: bc22d8bc8f5566ba4fe13115fb11d843d140f37c

Or to build 4.4.188-rt185 directly, the following patches should be applied:

  https://www.kernel.org/pub/linux/kernel/v4.x/linux-4.4.tar.xz
  https://www.kernel.org/pub/linux/kernel/v4.x/patch-4.4.188.xz
  https://www.kernel.org/pub/linux/kernel/projects/rt/4.4/patch-4.4.188-rt185.patch.xz

Enjoy!
Daniel
RE: [PATCH v2 2/2] nvme-pci: Allow PCI bus-level PM to be used if ASPM is disabled
> This is more meaningful to you than to most people because "applying
> the standard PCI PM" doesn't tell us what that means in terms of the
> device. Presumably it has something to do with a D-state transition?
> I *assume* a suspend might involve the D0 -> D3hot transition you
> mention below?
>
> > The reason for doing that was a (reportedly) widespread failure to
> > take the PCIe link down during D0 -> D3hot transitions of NVMe
> > devices,
>
> I don't know any of the details, but "failure to take the link down
> during D0 -> D3hot transitions" is phrased as though it might be a
> hardware erratum. If this *is* related to an NVMe erratum, that would
> explain why you only need to patch the nvme driver, and it would be
> useful to mention that in the commit log, since otherwise it sounds
> like something that might be needed in other drivers, too.

NVMe is special in this case in that there is other logic being put in place to set the drive's power state explicitly. I would also mention that this alternate flow is quicker for S0ix resume, since NVMe doesn't go through its shutdown routine. The unanimous feedback from vendors was to avoid NVMe shutdown and to instead use SetFeatures to go into the deepest power state over S0ix.

> According to PCIe r5.0 sec 5.3.2, the only legal link states for D3hot
> are L1, L2/L3 Ready. So if you put a device in D3hot and its link
> stays in L0, that sounds like a defect. Is that what happens?
>
> Obviously I'm still confused. I think it would help if you could
> describe the problem in terms of the specific PCIe states involved
> (D0, D3hot, L0, L1, L2, L3, etc) because then the spec would help
> explain what's happening.

Before that commit, the flow for NVMe S0ix was:
 * Delete IO SQ/CQ
 * Shutdown NVMe controller
 * Save PCI registers
 * Go into D3hot
 * Read PMCSR

A functioning drive had the link at L1.2 and NVMe power state at PS4 at this point. Resuming looked like this:
 * Restore PCI registers
 * Enable NVMe controller
 * Configure NVMe controller (IO queues, features, etc).

After that commit the flow for NVMe S0ix is:
 * Use NVMe SetFeatures to put drive into low power mode (PS3 or PS4)
 * Save PCI config registers
 * ASPM is used to bring link into L1.2

The resume flow is:
 * Restore PCI registers

"Non-functioning" drives consumed too much power with the old flow. The root cause varied from manufacturer to manufacturer. The two I know off hand: one instance is that when the PM status register is read after the device is in L1.2 from D3, it causes the link to go to L0 and then stay there. Another instance I heard of is that the drive isn't able to service a D3hot request when NVMe was already shut down.
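For the curious, the "SetFeatures to go into the deepest power state" step maps to the NVMe Power Management feature (Feature ID 0x02); with the nvme core's helper it would look roughly like this (a sketch, not the actual patch):

	/* ask the controller to enter power state 'ps' (e.g. PS3/PS4) */
	ret = nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps,
				NULL, 0, NULL);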
Re: [PATCH] mm/oom: Add killed process selection information
[please do not top-post]

On Thu 08-08-19 12:21:30, Edward Chron wrote:
> It is helpful to the admin that looks at the kill message and records this
> information. OOMs can come in bunches.
> Knowing how much resource the oom selected process was using at the time of
> the OOM event is very useful, these fields document key process and system
> memory/swap values and can be quite helpful.

I do agree and we already print that information. rss, with a breakdown into anonymous, file-backed and shmem, is usually a large part of the OOM victim's footprint. It is not complete information because there might be a lot of memory hidden behind other resources (open files etc.). We do not print that information because it is not considered in the oom selection. It is also not guaranteed to be freed upon the task exit.

> Also can't you disable printing the oom eligible task list? For systems
> with very large numbers of oom eligible processes that would seem to be
> very desirable.

Yes that is indeed the case. But how does the oom_score and oom_score_adj alone, without comparing it to other eligible tasks, help in isolation?

[...]

> I'm not sure that change would be supported upstream but again in our
> experience we've found it helpful, since you asked.

Could you be more specific about how that information is useful except for recording it? I am all for giving useful information in the OOM report but I would like to hear a sound justification for each additional piece of information. E.g. this helped us to understand why the task has been selected - this is usually the dump_tasks portion of the report because it gives a picture of what the OOM killer sees when choosing whom to kill. Then we have the summary to give us an estimation of how much memory will get freed when the victim dies - rss is a very rough estimation. But is a portion of the overall memory or oom_score{_adj} important to print as well? Those are relative values. Say you get memory-usage:10%, oom_score:42 and oom_score_adj:0. What are you going to tell from that information?
--
Michal Hocko
SUSE Labs
Re: BUG: soft lockup in tcp_delack_timer
On Thu, 8 Aug 2019, syzbot wrote: Cc+ Eric, net-dev > Hello, > > syzbot found the following crash on: > > HEAD commit:0d8b3265 Add linux-next specific files for 20190729 > git tree: linux-next > console output: https://syzkaller.appspot.com/x/log.txt?x=1101fdc860 > kernel config: https://syzkaller.appspot.com/x/.config?x=ae96f3b8a7e885f7 > dashboard link: https://syzkaller.appspot.com/bug?extid=2d55fb97f42947bbcddd > compiler: gcc (GCC) 9.0.0 20181231 (experimental) > > Unfortunately, I don't have any reproducer for this crash yet. > > IMPORTANT: if you fix the bug, please add the following tag to the commit: > Reported-by: syzbot+2d55fb97f42947bbc...@syzkaller.appspotmail.com > > net_ratelimit: 2 callbacks suppressed > TCP: request_sock_TCPv6: Possible SYN flooding on port 20002. Sending cookies. > Check SNMP counters. > watchdog: BUG: soft lockup - CPU#0 stuck for 122s! [swapper/0:0] > Modules linked in: > irq event stamp: 92022 > hardirqs last enabled at (92021): [] > tick_nohz_idle_exit+0x181/0x2e0 kernel/time/tick-sched.c:1180 > hardirqs last disabled at (92022): [] > __schedule+0x1dd/0x15b0 kernel/sched/core.c:3862 > softirqs last enabled at (90810): [] > __do_softirq+0x6cd/0x98c kernel/softirq.c:319 > softirqs last disabled at (90703): [] invoke_softirq > kernel/softirq.c:373 [inline] > softirqs last disabled at (90703): [] irq_exit+0x19b/0x1e0 > kernel/softirq.c:413 > CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.3.0-rc2-next-20190729 #54 > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google > 01/01/2011 > RIP: 0010:cpu_relax arch/x86/include/asm/processor.h:656 [inline] > RIP: 0010:virt_spin_lock arch/x86/include/asm/qspinlock.h:84 [inline] > RIP: 0010:native_queued_spin_lock_slowpath+0x132/0x9f0 > kernel/locking/qspinlock.c:325 > Code: 00 00 00 48 8b 45 d0 65 48 33 04 25 28 00 00 00 0f 85 37 07 00 00 48 81 > c4 98 00 00 00 5b 41 5c 41 5d 41 5e 41 5f 5d c3 f3 90 73 ff ff ff 8b 45 > 98 4c 8d 65 d8 3d 00 01 00 00 0f 84 e5 00 00 > RSP: 0018:8880ae809b48 EFLAGS: 0202 ORIG_RAX: ff13 > RAX: RBX: 8880621cd088 RCX: 8158f117 > RDX: RSI: 0004 RDI: 8880621cd088 > RBP: 8880ae809c08 R08: 11100c439a11 R09: ed100c439a12 > R10: ed100c439a11 R11: 8880621cd08b R12: 0001 > R13: 0003 R14: ed100c439a11 R15: 0001 > FS: () GS:8880ae80() knlGS: > CS: 0010 DS: ES: CR0: 80050033 > CR2: 01541e88 CR3: 68089000 CR4: 001406f0 > Call Trace: > > pv_queued_spin_lock_slowpath arch/x86/include/asm/paravirt.h:642 [inline] > queued_spin_lock_slowpath arch/x86/include/asm/qspinlock.h:50 [inline] > queued_spin_lock include/asm-generic/qspinlock.h:81 [inline] > do_raw_spin_lock+0x20e/0x2e0 kernel/locking/spinlock_debug.c:113 > __raw_spin_lock include/linux/spinlock_api_smp.h:143 [inline] > _raw_spin_lock+0x37/0x40 kernel/locking/spinlock.c:151 > spin_lock include/linux/spinlock.h:338 [inline] > tcp_delack_timer+0x2b/0x2a0 net/ipv4/tcp_timer.c:318 > call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1322 > expire_timers kernel/time/timer.c:1366 [inline] > __run_timers kernel/time/timer.c:1685 [inline] > __run_timers kernel/time/timer.c:1653 [inline] > run_timer_softirq+0x697/0x17a0 kernel/time/timer.c:1698 > __do_softirq+0x262/0x98c kernel/softirq.c:292 > invoke_softirq kernel/softirq.c:373 [inline] > irq_exit+0x19b/0x1e0 kernel/softirq.c:413 > exiting_irq arch/x86/include/asm/apic.h:536 [inline] > smp_apic_timer_interrupt+0x1a3/0x610 arch/x86/kernel/apic/apic.c:1095 > apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:828 > > RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61 
> Code: c8 75 6e fa eb 8a 90 90 90 90 90 90 e9 07 00 00 00 0f 00 2d c4 b2 49 00 > f4 c3 66 90 e9 07 00 00 00 0f 00 2d b4 b2 49 00 fb f4 90 55 48 89 e5 41 > 57 41 56 41 55 41 54 53 e8 8e 56 21 fa e8 29 > RSP: 0018:88c07ce8 EFLAGS: 0282 ORIG_RAX: ff13 > RAX: 111a5e87 RBX: 88c7a1c0 RCX: 1134bca6 > RDX: dc00 RSI: 81779dee RDI: 873e794c > RBP: 88c07d18 R08: 88c7a1c0 R09: fbfff118f439 > R10: fbfff118f438 R11: 88c7a1c7 R12: dc00 > R13: 89a5b340 R14: R15: > arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:571 > default_idle_call+0x84/0xb0 kernel/sched/idle.c:94 > cpuidle_idle_call kernel/sched/idle.c:154 [inline] > do_idle+0x413/0x760 kernel/sched/idle.c:263 > cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:354 > rest_init+0x245/0x37b init/main.c:451 > arch_call_rest_init+0xe/0x1b > start_kernel+0x912/0x951 init/main.c:785 > x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:472 > x86_64_start_kernel+0x77/0x7b arch/x86/kernel/head64.c:453 > secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:
Re: [PATCH V2 01/10] x86/CPU: Expose if cache is inclusive of lower level caches
Hi Borislav, On 8/8/2019 1:13 AM, Borislav Petkov wrote: > On Thu, Aug 08, 2019 at 10:08:41AM +0200, Borislav Petkov wrote: >> Ok, tglx and I talked it over a bit on IRC: so your 1/10 patch is pretty >> close - just leave out the generic struct cacheinfo bits and put the >> cache inclusivity property in a static variable there. > > ... and by "there" I mean arch/x86/kernel/cpu/cacheinfo.c which contains > all cache properties etc on x86 and is the proper place to put stuff > like that. With the goal of following these guidelines exactly I came up with the below that is an incremental diff on top of what this review started out as. Some changes to highlight that may be of concern: * In your previous email you do mention that this will be a "single bit of information". Please note that I did not specifically use an actual bit to capture this information but an unsigned int (I am very aware that you also commented on this initially). If you do mean that this should be stored as an actual bit, could you please help me by elaborating how you would like to see this implemented? * Please note that I moved the initialization to init_intel_cacheinfo() to be specific to Intel. I did so because from what I understand there are some AMD platforms for which this information cannot be determined and I thought it simpler to make it specific to Intel with the new single static variable. * Please note that while this is a single global static variable it will be set over and over for each CPU on the system. diff --git a/arch/x86/include/asm/cacheinfo.h b/arch/x86/include/asm/cacheinfo.h index 86b63c7feab7..97be5141bb4b 100644 --- a/arch/x86/include/asm/cacheinfo.h +++ b/arch/x86/include/asm/cacheinfo.h @@ -5,4 +5,6 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c, int cpu, u8 node_id); +unsigned int cacheinfo_intel_l3_inclusive(void); + #endif /* _ASM_X86_CACHEINFO_H */ diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index 733874f84f41..247b6a9b5c88 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -187,6 +187,7 @@ struct _cpuid4_info_regs { }; static unsigned short num_cache_leaves; +static unsigned l3_inclusive; /* AMD doesn't have CPUID4. Emulate it here to report the same information to the user. This makes some assumptions about the machine: @@ -745,6 +746,11 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c) num_cache_leaves = find_num_cache_leaves(c); } +unsigned int cacheinfo_intel_l3_inclusive(void) +{ + return l3_inclusive; +} + void init_intel_cacheinfo(struct cpuinfo_x86 *c) { /* Cache sizes */ @@ -795,6 +801,7 @@ void init_intel_cacheinfo(struct cpuinfo_x86 *c) num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); l3_id = c->apicid & ~((1 << index_msb) - 1); + l3_inclusive = this_leaf.edx.split.inclusive; break; default: break; @@ -1010,13 +1017,6 @@ static void ci_leaf_init(struct cacheinfo *this_leaf, this_leaf->physical_line_partition = base->ebx.split.physical_line_partition + 1; - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD && -boot_cpu_has(X86_FEATURE_TOPOEXT)) || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON || - boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - this_leaf->attributes |= CACHE_INCLUSIVE_SET; - this_leaf->inclusive = base->edx.split.inclusive; - } this_leaf->priv = base->nb; } What do you think? Reinette
Re: [PATCH] HID: apple: Fix stuck function keys when using FN
Hi Benjamin, On Mon, 8 Jul 2019 at 22:35, João Moreno wrote: > > Hi Benjamin, > > No worries, also pretty busy over here. Didn't mean to press. > > On Mon, 1 Jul 2019 at 10:32, Benjamin Tissoires > wrote: > > > > Hi João, > > > > On Sun, Jun 30, 2019 at 10:15 PM João Moreno wrote: > > > > > > Hi Jiri & Benjamin, > > > > > > Let me know if you need something else to get this patch moving forward. > > > This > > > fixes an issue I hit daily, it would be great to get it fixed. > > > > Sorry for the delay, I am very busy with internal corporate stuff, and > > I tried setting up a new CI system at home, and instead of spending a > > couple of ours, I am down to 2 weeks of hard work, without possibility > > to switch to the new right now :( > > Anyway. > > > > > > > > Thanks. > > > > > > On Mon, 10 Jun 2019 at 23:31, Joao Moreno wrote: > > > > > > > > This fixes an issue in which key down events for function keys would be > > > > repeatedly emitted even after the user has raised the physical key. For > > > > example, the driver fails to emit the F5 key up event when going through > > > > the following steps: > > > > - fnmode=1: hold FN, hold F5, release FN, release F5 > > > > - fnmode=2: hold F5, hold FN, release F5, release FN > > > > Ouch :/ > > > > Right?! > > > > > > > > > The repeated F5 key down events can be easily verified using xev. > > > > > > > > Signed-off-by: Joao Moreno > > > > --- > > > > drivers/hid/hid-apple.c | 21 +++-- > > > > 1 file changed, 11 insertions(+), 10 deletions(-) > > > > > > > > diff --git a/drivers/hid/hid-apple.c b/drivers/hid/hid-apple.c > > > > index 1cb41992aaa1..81867a6fa047 100644 > > > > --- a/drivers/hid/hid-apple.c > > > > +++ b/drivers/hid/hid-apple.c > > > > @@ -205,20 +205,21 @@ static int hidinput_apple_event(struct hid_device > > > > *hid, struct input_dev *input, > > > > trans = apple_find_translation (table, usage->code); > > > > > > > > if (trans) { > > > > - if (test_bit(usage->code, asc->pressed_fn)) > > > > - do_translate = 1; > > > > - else if (trans->flags & APPLE_FLAG_FKEY) > > > > - do_translate = (fnmode == 2 && > > > > asc->fn_on) || > > > > - (fnmode == 1 && !asc->fn_on); > > > > + int fn_on = value ? asc->fn_on : > > > > + test_bit(usage->code, asc->pressed_fn); > > > > + > > > > + if (!value) > > > > + clear_bit(usage->code, asc->pressed_fn); > > > > + else if (asc->fn_on) > > > > + set_bit(usage->code, asc->pressed_fn); > > > > I have the feeling that this is not the correct fix here. > > > > I might be wrong, but the following sequence might also mess up the > > driver state, depending on how the reports are emitted: > > - hold FN, hold F4, hold F5, release F4, release FN, release F5 > > > > I believe this should be fine. Following the code: > > - hold FN, sets asc->fn_on to true > - hold F4, in the trans block fn_on will be true and we'll set the F4 > bit in the bitmap > - hold F5, in the trans block fn_on will be true and we'll set the F5 bit > - release F4, in the trans block fn_on will be true (because of the bitmap) > and > we'll clear the F4 bit > - release FN, asc->fn_on will be false, but it doesn't matter since... > - release F5, in the trans block we'll look into the bitmap (instead > of asc->fn_on), > so fn_on will be true and we'll clear the F5 bit > > I tested it in practice using my changes: > > Interestingly the Apple keyboard doesn't seem to emit an even for F5 when F4 > is > pressed, seems like a hardware limitation. But F6 does work. 
So, when I > execute > these events in that order, everything works as it should: xev reports > the following: > > KeyPress F4 > KeyPress F6 > KeyRelease F4 > KeyRelease F6 > > > The reason is that the driver only considers you have one key pressed > > with the modifier, and as the code changed its state based on the last > > value. > > > > I believe the bitmap takes care of storing the FN state per key press. The > trick I did was to check on the global `asc->fn_on` state only when a key > is pressed, but check on the bitmap instead when it's released. > > Let me know what you think. Am I missing something here? > > Cheers, > João. > > > IMO a better fix would: > > > > - keep the existing `trans` mapping lookout > > - whenever a `trans` mapping gets found: > > * get both translated and non-translated currently reported values > > (`test_bit(keycode, input_dev->key)`) > > * if one of them is set to true, then consider the keycode to be the > > one of the key (no matter fn_on) > > -> deal with `value` with the corrected keycode > > * if the key was not pressed: > > -> chose the keycode based on `fn_on` and `fnmode` states > > and re
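The press/release bookkeeping being debated is small enough to model outside the kernel; a toy userspace rendition of the patch's logic (illustrative only) that walks the fnmode=2 failure sequence:

	#include <stdbool.h>
	#include <stdio.h>

	static bool fn_on;            /* models asc->fn_on */
	static bool pressed_fn[256];  /* models the asc->pressed_fn bitmap */

	/* returns whether this event should use the FN translation */
	static bool handle(int code, bool down)
	{
		bool fn = down ? fn_on : pressed_fn[code];

		if (!down)
			pressed_fn[code] = false;	/* clear_bit() */
		else if (fn_on)
			pressed_fn[code] = true;	/* set_bit() */

		return fn;
	}

	int main(void)
	{
		/* fnmode=2 case: hold F5, hold FN, release F5, release FN */
		printf("F5 down translated: %d\n", handle(63, true));  /* 0 */
		fn_on = true;                                  /* FN pressed */
		printf("F5 up   translated: %d\n", handle(63, false)); /* 0 */
		fn_on = false;                                 /* FN released */
		return 0;
	}

The key-up consults the per-key bitmap rather than the current FN state, so it reports the same translation as the key-down and no key is left stuck.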
Re: [PATCH V2 09/10] x86/resctrl: Pseudo-lock portions of multiple resources
Hi Borislav, On 8/8/2019 1:44 AM, Borislav Petkov wrote: > On Wed, Aug 07, 2019 at 12:23:29PM -0700, Reinette Chatre wrote: >> I do not fully understand this proposal. All those goto labels take care >> of the the different failures that can be encountered during the >> initialization of the pseudo-lock region. Each initialization failure is >> associated with a goto where it jumps to the cleanup path. The >> initialization starts with the constraining of the c-states >> (initializing plr->pm_reqs), but if I move that I think it will not >> reduce the goto labels, just change the order because of the other >> initialization done (plr->size, plr->line_size, plr->cpu). > > Here's one possible way to do it, pasting the whole function here as it > is easier to read it this way than an incremental diff ontop. > > You basically cache all attributes in local variables and assign them to > the plr struct only on success, at the end. This way, no goto labels and > the C-states constraining, i.e., the most expensive operation, happens > last, only after all the other simpler checks have succeeded. And you > don't have to call pseudo_lock_cstates_relax() prematurely, when one of > those easier checks fail. > > Makes sense? It does. This looks much better. Thank you very much. > > Btw, I've marked the cpu_online() check with "CPU hotplug > lock?!?" question because I don't see you holding that lock with > get_online_cpus()/put_online_cpus(). There is a locking order dependency between cpu_hotplug_lock and rdtgroup_mutex (cpu_hotplug_lock before rdtgroup_mutex) that has to be maintained. To do so in this flow you will find cpus_read_lock() in rdtgroup_schemata_write(), so quite a distance from where it is needed. Perhaps I should add a comment at the location where the lock is required to document where the lock is obtained? > static int pseudo_lock_l2_l3_portions_valid(struct pseudo_lock_region *plr, > struct pseudo_lock_portion *l2_p, > struct pseudo_lock_portion *l3_p) > { > unsigned int l2_size, l3_size, size, line_size, cpu; > struct rdt_domain *l2_d, *l3_d; > > l2_d = rdt_find_domain(l2_p->r, l2_p->d_id, NULL); > if (IS_ERR_OR_NULL(l2_d)) { > rdt_last_cmd_puts("Cannot locate L2 cache domain\n"); > return -1; > } > > l3_d = rdt_find_domain(l3_p->r, l3_p->d_id, NULL); > if (IS_ERR_OR_NULL(l3_d)) { > rdt_last_cmd_puts("Cannot locate L3 cache domain\n"); > return -1; > } > > if (!cpumask_subset(&l2_d->cpu_mask, &l3_d->cpu_mask)) { > rdt_last_cmd_puts("L2 and L3 caches need to be in same > hierarchy\n"); > return -1; > } > > l2_size = rdtgroup_cbm_to_size(l2_p->r, l2_d, l2_p->cbm); > l3_size = rdtgroup_cbm_to_size(l3_p->r, l3_d, l3_p->cbm); > > if (l2_size > l3_size) { > rdt_last_cmd_puts("L3 cache portion has to be same size or > larger than L2 cache portion\n"); > return -1; > } > > size = l2_size; > > l2_size = get_cache_line_size(cpumask_first(&l2_d->cpu_mask), > l2_p->r->cache_level); > l3_size = get_cache_line_size(cpumask_first(&l3_d->cpu_mask), > l3_p->r->cache_level); > if (l2_size != l3_size) { > rdt_last_cmd_puts("L2 and L3 caches have different coherency > cache line sizes\n"); > return -1; > } > > line_size = l2_size; > > cpu = cpumask_first(&l2_d->cpu_mask); > > /* >* CPU hotplug lock?!? 
>*/ > if (!cpu_online(cpu)) { > rdt_last_cmd_printf("CPU %u associated with cache not > online\n", cpu); > return -1; > } > > if (!get_cache_inclusive(cpu, l3_p->r->cache_level)) { > rdt_last_cmd_puts("L3 cache not inclusive\n"); > return -1; > } > > /* >* All checks passed, constrain C-states: >*/ > if (pseudo_lock_cstates_constrain(plr, &l2_d->cpu_mask)) { > rdt_last_cmd_puts("Cannot limit C-states\n"); > pseudo_lock_cstates_relax(plr); > return -1; > } > > plr->line_size = line_size; > plr->size = size; > plr->cpu= cpu; > > return 0; > } > Thank you very much Reinette
Re: [PATCH RFC v1 1/2] rcu/tree: Add basic support for kfree_rcu batching
On Thu, Aug 08, 2019 at 11:11:12AM -0700, Paul E. McKenney wrote: > On Thu, Aug 08, 2019 at 07:26:10PM +0900, Byungchul Park wrote: > > On Wed, Aug 07, 2019 at 05:45:04AM -0400, Joel Fernandes wrote: > > > On Tue, Aug 06, 2019 at 04:56:31PM -0700, Paul E. McKenney wrote: > > > > [snip] > > > > > > On Tue, Aug 06, 2019 at 05:20:40PM -0400, Joel Fernandes (Google) wrote: > > > > Of course, I am hoping that a later patch uses an array of pointers > > > > built > > > > at kfree_rcu() time, similar to Rao's patch (with or without kfree_bulk) > > > > in order to reduce per-object cache-miss overhead. This would make it > > > > easier for callback invocation to keep up with multi-CPU kfree_rcu() > > > > floods. > > > > > > I think Byungchul tried an experiment with array of pointers and wasn't > > > immediately able to see a benefit. Perhaps his patch needs a bit more > > > polish > > > or another test-case needed to show benefit due to cache-misses, and the > > > perf > > > tool could be used to show if cache misses were reduced. For this initial > > > pass, we decided to keep it without the array optimization. > > > > I'm still seeing no improvement with kfree_bulk(). > > > > I've been thinking I could see improvement with kfree_bulk() because: > > > >1. As you guys said, the number of cache misses will be reduced. > >2. We can save (N - 1) irq-disable instructions while N kfrees. > >3. As Joel said, saving/restoring CPU status that kfree() does inside > > is not required. > > > > But even with the following patch applied, the result was same as just > > batching test. We might need to get kmalloc objects from random > > addresses to maximize the result when using kfree_bulk() and this is > > even closer to real practical world too. > > > > And the second and third reasons doesn't seem to work as much as I > > expected. > > > > Do you have any idea? Or what do you think about it? > > I would not expect kfree_batch() to help all that much unless the > pre-grace-period kfree_rcu() code segregated the objects on a per-slab > basis. You mean kfree_bulk() instead of kfree_batch() right? I agree with you, would be nice to do per-slab optimization in the future. Also, I am thinking that whenever we do per-slab optimization, then the kmem_cache_free_bulk() can be optimized further. If all pointers are on the same slab, then we can just do virt_to_cache on the first pointer and avoid repeated virt_to_cache() calls. That might also give a benefit -- but I could be missing something. Right now kmem_cache_free_bulk() just looks like a kmem_cache_free() in a loop except the small benefit of not disabling/enabling IRQs across each __cache_free, and the reduced cache miss benefit of using the array. thanks, - Joel [snip]
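For reference, the bulk interface under discussion has the signature kfree_bulk(size_t nr, void **p); a batching layer's flush step might look roughly like this (a sketch; a real kfree_rcu() batch must of course only be flushed after a grace period has elapsed for every pointer in it):

	static void flush_kfree_batch(void **objs, size_t nr)
	{
		/*
		 * One call frees all nr objects: a single IRQ-disable/enable
		 * pair instead of one per object, plus the cache-locality
		 * wins discussed above when objects share slabs.
		 */
		kfree_bulk(nr, objs);
	}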
Re: [PATCH] drm/i915: Remove redundant user_access_end() from __copy_from_user() error path
On Tue, Aug 6, 2019 at 5:59 AM Josh Poimboeuf wrote: > > On Mon, Aug 05, 2019 at 09:29:53PM +0200, Sedat Dilek wrote: > > On Wed, Jul 31, 2019 at 2:25 PM Sedat Dilek wrote: > > > > > > On Fri, Jul 26, 2019 at 9:30 PM Chris Wilson > > > wrote: > > > > > > > > Quoting Thomas Gleixner (2019-07-26 20:18:32) > > > > > On Fri, 26 Jul 2019, Chris Wilson wrote: > > > > > > Quoting Thomas Gleixner (2019-07-25 22:55:45) > > > > > > > On Thu, 25 Jul 2019, Josh Poimboeuf wrote: > > > > > > > > > > > > > > > Objtool reports: > > > > > > > > > > > > > > > > drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: > > > > > > > > objtool: .altinstr_replacement+0x36: redundant UACCESS disable > > > > > > > > > > > > > > > > __copy_from_user() already does both STAC and CLAC, so the > > > > > > > > user_access_end() in its error path adds an extra unnecessary > > > > > > > > CLAC. > > > > > > > > > > > > > > > > Fixes: 0b2c8f8b6b0c ("i915: fix missing user_access_end() in > > > > > > > > page fault exception case") > > > > > > > > Reported-by: Thomas Gleixner > > > > > > > > Reported-by: Sedat Dilek > > > > > > > > Acked-by: Peter Zijlstra (Intel) > > > > > > > > Tested-by: Nick Desaulniers > > > > > > > > Tested-by: Sedat Dilek > > > > > > > > Link: https://github.com/ClangBuiltLinux/linux/issues/617 > > > > > > > > Signed-off-by: Josh Poimboeuf > > > > > > > > > > > > > > Reviewed-by: Thomas Gleixner > > > > > > > > > > > > Which tree do you plan to apply it to? I can put in drm-intel, and > > > > > > with > > > > > > the fixes tag it will percolate through to 5.3 and beyond, but if > > > > > > you > > > > > > want to apply it directly to squash the build warnings, feel free. > > > > > > > > > > It would be nice to get it into 5.3. I can route it linuxwards if you > > > > > give > > > > > an Acked-by, but I'm happy to hand it to you :) > > > > > > > > Acked-by: Chris Wilson > > > > > > Thomas did you take this through tip tree after Chris' ACK? > > > > > > > Hi, > > > > Gentle ping... > > Thomas and Chris: Will someone of you pick this up? > > As "objtool: Improve UACCESS coverage" [1] went trough tip tree I > > highly appreciate to do so with this one. > > I think Thomas has gone on holiday, so hopefully Chris can pick it up > after all. tglx just picked up 2 other patches of mine, bumping just in case he's not picking up patches while on vacation. ;) -- Thanks, ~Nick Desaulniers
[tip:perf/urgent] perf bench numa: Fix cpu0 binding
Commit-ID: 6bbfe4e602691b90ac866712bd4c43c51e546a60 Gitweb: https://git.kernel.org/tip/6bbfe4e602691b90ac866712bd4c43c51e546a60 Author: Jiri Olsa AuthorDate: Thu, 1 Aug 2019 16:26:42 +0200 Committer: Arnaldo Carvalho de Melo CommitDate: Thu, 1 Aug 2019 11:34:13 -0300 perf bench numa: Fix cpu0 binding Michael reported an issue with perf bench numa failing with binding to cpu0 with '-0' option. # perf bench numa mem -p 3 -t 1 -P 512 -s 100 -zZcm0 --thp 1 -M 1 -ddd # Running 'numa/mem' benchmark: # Running main, "perf bench numa numa-mem -p 3 -t 1 -P 512 -s 100 -zZcm0 --thp 1 -M 1 -ddd" binding to node 0, mask: 0001 => -1 perf: bench/numa.c:356: bind_to_memnode: Assertion `!(ret)' failed. Aborted (core dumped) This happens when the cpu0 is not part of node0, which is the benchmark assumption and we can see that's not the case for some powerpc servers. Using correct node for cpu0 binding. Reported-by: Michael Petlan Signed-off-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Satheesh Rajendran Link: http://lkml.kernel.org/r/20190801142642.28004-1-jo...@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/bench/numa.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index a640ca7aaada..513cb2f2fa32 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -379,8 +379,10 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags, /* Allocate and initialize all memory on CPU#0: */ if (init_cpu0) { - orig_mask = bind_to_node(0); - bind_to_memnode(0); + int node = numa_node_of_cpu(0); + + orig_mask = bind_to_node(node); + bind_to_memnode(node); } bytes = bytes0 + HPSIZE;
[tip:perf/urgent] perf annotate: Fix printing of unaugmented disassembled instructions from BPF
Commit-ID: 85127775a65fc58e69af0c44513937d471ccbe7b Gitweb: https://git.kernel.org/tip/85127775a65fc58e69af0c44513937d471ccbe7b Author: Arnaldo Carvalho de Melo AuthorDate: Tue, 6 Aug 2019 11:24:09 -0300 Committer: Arnaldo Carvalho de Melo CommitDate: Thu, 8 Aug 2019 15:40:56 -0300 perf annotate: Fix printing of unaugmented disassembled instructions from BPF The code to disassemble BPF programs uses binutil's disassembling routines, and those use in turn fprintf to print to a memstream FILE, adding a newline at the end of each line, which ends up confusing the TUI routines called from: annotate_browser__write() annotate_line__write() annotate_browser__printf() ui_browser__vprintf() SLsmg_vprintf() The SLsmg_vprintf() function in the slang library gets confused with the terminating newline, so make the disasm_line__parse() function that parses the lines produced by the BPF specific disassembler (that uses binutil's libopcodes) and the lines produced by the objdump based disassembler used for everything else (and that doesn't adds this terminating newline) trim the end of the line in addition of the beginning. This way when disasm_line->ops.raw, i.e. for instructions without a special scnprintf() method, we'll not have that \n getting in the way of filling the screen right after the instruction with spaces to avoid leaving what was on the screen before and thus garbling the annotation screen, breaking scrolling, etc. Cc: Adrian Hunter Cc: Jiri Olsa Cc: Namhyung Kim Cc: Song Liu Fixes: 6987561c9e86 ("perf annotate: Enable annotation of BPF programs") Link: https://lkml.kernel.org/n/tip-unbr5a5efakobfr6rhxq9...@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/annotate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index ac9ad2330f93..163536720149 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -1122,7 +1122,7 @@ static int disasm_line__parse(char *line, const char **namep, char **rawp) goto out; (*rawp)[0] = tmp; - *rawp = skip_spaces(*rawp); + *rawp = strim(*rawp); return 0;
Re: [PATCH 2/2 v2] tracing: Document the stack trace algorithm in the comments
On Wed, Aug 07, 2019 at 01:28:28PM -0400, Steven Rostedt wrote: > From: "Steven Rostedt (VMware)" > > As the max stack tracer algorithm is not that easy to understand from the > code, add comments that explain the algorithm and mentions how > ARCH_RET_ADDR_AFTER_LOCAL_VARS affects it. > > Link: http://lkml.kernel.org/r/20190806123455.487ac...@gandalf.local.home > Acked-by: Joel Fernandes (Google) thanks!! - Joel > Suggested-by: Joel Fernandes > Signed-off-by: Steven Rostedt (VMware) > --- > kernel/trace/trace_stack.c | 98 ++ > 1 file changed, 98 insertions(+) > > diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c > index 40e4a88eea8f..f94a2fc567de 100644 > --- a/kernel/trace/trace_stack.c > +++ b/kernel/trace/trace_stack.c > @@ -53,6 +53,104 @@ static void print_max_stack(void) > } > } > > +/* > + * The stack tracer looks for a maximum stack at each call from a function. > It > + * registers a callback from ftrace, and in that callback it examines the > stack > + * size. It determines the stack size from the variable passed in, which is > the > + * address of a local variable in the stack_trace_call() callback function. > + * The stack size is calculated by the address of the local variable to the > top > + * of the current stack. If that size is smaller than the currently saved max > + * stack size, nothing more is done. > + * > + * If the size of the stack is greater than the maximum recorded size, then > the > + * following algorithm takes place. > + * > + * For architectures (like x86) that store the function's return address > before > + * saving the function's local variables, the stack will look something like > + * this: > + * > + * [ top of stack ] > + *0: sys call entry frame > + * 10: return addr to entry code > + * 11: start of sys_foo frame > + * 20: return addr to sys_foo > + * 21: start of kernel_func_bar frame > + * 30: return addr to kernel_func_bar > + * 31: [ do trace stack here ] > + * > + * The save_stack_trace() is called returning all the functions it finds in > the > + * current stack. Which would be (from the bottom of the stack to the top): > + * > + * return addr to kernel_func_bar > + * return addr to sys_foo > + * return addr to entry code > + * > + * Now to figure out how much each of these functions' local variable size > is, > + * a search of the stack is made to find these values. When a match is made, > it > + * is added to the stack_dump_trace[] array. The offset into the stack is > saved > + * in the stack_trace_index[] array. The above example would show: > + * > + *stack_dump_trace[]| stack_trace_index[] > + *--+ --- > + * return addr to kernel_func_bar | 30 > + * return addr to sys_foo | 20 > + * return addr to entry| 10 > + * > + * The print_max_stack() function above, uses these values to print the size > of > + * each function's portion of the stack. > + * > + * for (i = 0; i < nr_entries; i++) { > + * size = i == nr_entries - 1 ? 
stack_trace_index[i] : > + *stack_trace_index[i] - stack_trace_index[i+1] > + * print "%d %d %d %s\n", i, stack_trace_index[i], size, > stack_dump_trace[i]); > + * } > + * > + * The above shows > + * > + * depth size location > + * - > + * 030 10 kernel_func_bar > + * 120 10 sys_foo > + * 210 10 entry code > + * > + * Now for architectures that might save the return address after the > functions > + * local variables (saving the link register before calling nested > functions), > + * this will cause the stack to look a little different: > + * > + * [ top of stack ] > + * 0: sys call entry frame > + * 10: start of sys_foo_frame > + * 19: return addr to entry code << lr saved before calling kernel_func_bar > + * 20: start of kernel_func_bar frame > + * 29: return addr to sys_foo_frame << lr saved before calling next function > + * 30: [ do trace stack here ] > + * > + * Although the functions returned by save_stack_trace() may be the same, the > + * placement in the stack will be different. Using the same algorithm as > above > + * would yield: > + * > + *stack_dump_trace[]| stack_trace_index[] > + *--+ --- > + * return addr to kernel_func_bar | 30 > + * return addr to sys_foo | 29 > + * return addr to entry| 19 > + * > + * Where the mapping is off by one: > + * > + * kernel_func_bar stack frame size is 29 - 19 not 30 - 29! > + * > + * To fix this, if the architecture sets ARCH_RET_ADDR_AFTER_LOCAL_VARS the > + * values in stack_trace_index[] are shifted by one to and the number of > + * stack trace entries is decremented by one. > + * > + *stack_dump_trace[]| stack_trace_index[] > + *