Hi, On Mon, Jun 25, 2018 at 9:51 AM, Benjamin Tissoires <benjamin.tissoi...@redhat.com> wrote: > If a device gets removed right after having registered a power_supply node, > we might enter in a deadlock between the remove call (that has a lock on > the parent device) and the deferred register work. > > Allow the deferred register work to exit without taking the lock when > we are in the remove state. > > Stack trace on a Ubuntu 16.04: > > [16072.109121] INFO: task kworker/u16:2:1180 blocked for more than 120 > seconds. > [16072.109127] Not tainted 4.13.0-41-generic #46~16.04.1-Ubuntu > [16072.109129] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables > this message. > [16072.109132] kworker/u16:2 D 0 1180 2 0x80000000 > [16072.109142] Workqueue: events_power_efficient > power_supply_deferred_register_work > [16072.109144] Call Trace: > [16072.109152] __schedule+0x3d6/0x8b0 > [16072.109155] schedule+0x36/0x80 > [16072.109158] schedule_preempt_disabled+0xe/0x10 > [16072.109161] __mutex_lock.isra.2+0x2ab/0x4e0 > [16072.109166] __mutex_lock_slowpath+0x13/0x20 > [16072.109168] ? __mutex_lock_slowpath+0x13/0x20 > [16072.109171] mutex_lock+0x2f/0x40 > [16072.109174] power_supply_deferred_register_work+0x2b/0x50 > [16072.109179] process_one_work+0x15b/0x410 > [16072.109182] worker_thread+0x4b/0x460 > [16072.109186] kthread+0x10c/0x140 > [16072.109189] ? process_one_work+0x410/0x410 > [16072.109191] ? kthread_create_on_node+0x70/0x70 > [16072.109194] ret_from_fork+0x35/0x40 > [16072.109199] INFO: task test:2257 blocked for more than 120 seconds. > [16072.109202] Not tainted 4.13.0-41-generic #46~16.04.1-Ubuntu > [16072.109204] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables > this message. > [16072.109206] test D 0 2257 2256 0x00000004 > [16072.109208] Call Trace: > [16072.109211] __schedule+0x3d6/0x8b0 > [16072.109215] schedule+0x36/0x80 > [16072.109218] schedule_timeout+0x1f3/0x360 > [16072.109221] ? check_preempt_curr+0x5a/0xa0 > [16072.109224] ? 
ttwu_do_wakeup+0x1e/0x150 > [16072.109227] wait_for_completion+0xb4/0x140 > [16072.109230] ? wait_for_completion+0xb4/0x140 > [16072.109233] ? wake_up_q+0x70/0x70 > [16072.109236] flush_work+0x129/0x1e0 > [16072.109240] ? worker_detach_from_pool+0xb0/0xb0 > [16072.109243] __cancel_work_timer+0x10f/0x190 > [16072.109247] ? device_del+0x264/0x310 > [16072.109250] ? __wake_up+0x44/0x50 > [16072.109253] cancel_delayed_work_sync+0x13/0x20 > [16072.109257] power_supply_unregister+0x37/0xb0 > [16072.109260] devm_power_supply_release+0x11/0x20 > [16072.109263] release_nodes+0x110/0x200 > [16072.109266] devres_release_group+0x7c/0xb0 > [16072.109274] wacom_remove+0xc2/0x110 [wacom] > [16072.109279] hid_device_remove+0x6e/0xd0 [hid] > [16072.109284] device_release_driver_internal+0x158/0x210 > [16072.109288] device_release_driver+0x12/0x20 > [16072.109291] bus_remove_device+0xec/0x160 > [16072.109293] device_del+0x1de/0x310 > [16072.109298] hid_destroy_device+0x27/0x60 [hid] > [16072.109303] usbhid_disconnect+0x51/0x70 [usbhid] > [16072.109308] usb_unbind_interface+0x77/0x270 > [16072.109311] device_release_driver_internal+0x158/0x210 > [16072.109315] device_release_driver+0x12/0x20 > [16072.109318] usb_driver_release_interface+0x77/0x80 > [16072.109321] proc_ioctl+0x20f/0x250 > [16072.109325] usbdev_do_ioctl+0x57f/0x1140 > [16072.109327] ? __wake_up+0x44/0x50 > [16072.109331] usbdev_ioctl+0xe/0x20 > [16072.109336] do_vfs_ioctl+0xa4/0x600 > [16072.109339] ? 
vfs_write+0x15a/0x1b0 > [16072.109343] SyS_ioctl+0x79/0x90 > [16072.109347] entry_SYSCALL_64_fastpath+0x24/0xab > [16072.109349] RIP: 0033:0x7f20da807f47 > [16072.109351] RSP: 002b:00007ffc422ae398 EFLAGS: 00000246 ORIG_RAX: > 0000000000000010 > [16072.109353] RAX: ffffffffffffffda RBX: 00000000010b8560 RCX: > 00007f20da807f47 > [16072.109355] RDX: 00007ffc422ae3a0 RSI: 00000000c0105512 RDI: > 0000000000000009 > [16072.109356] RBP: 0000000000000000 R08: 00007ffc422ae3e0 R09: > 0000000000000010 > [16072.109357] R10: 00000000000000a6 R11: 0000000000000246 R12: > 0000000000000000 > [16072.109359] R13: 00000000010b8560 R14: 00007ffc422ae2e0 R15: > 0000000000000000 > > Reported-and-tested-by: Richard Hughes <rhug...@redhat.com> > Tested-by: Aaron Skomra <aaron.sko...@wacom.com> > Signed-off-by: Benjamin Tissoires <benjamin.tissoi...@redhat.com>
This has also been reported on the kernel bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=200397 Is there any chance this could get reviewed soon-ish? Cheers, Benjamin > --- > drivers/power/supply/power_supply_core.c | 11 +++++++++-- > include/linux/power_supply.h | 1 + > 2 files changed, 10 insertions(+), 2 deletions(-) > > diff --git a/drivers/power/supply/power_supply_core.c > b/drivers/power/supply/power_supply_core.c > index feac7b066e6c..20a8c2a12199 100644 > --- a/drivers/power/supply/power_supply_core.c > +++ b/drivers/power/supply/power_supply_core.c > @@ -14,6 +14,7 @@ > #include <linux/types.h> > #include <linux/init.h> > #include <linux/slab.h> > +#include <linux/delay.h> > #include <linux/device.h> > #include <linux/notifier.h> > #include <linux/err.h> > @@ -139,8 +140,13 @@ static void power_supply_deferred_register_work(struct > work_struct *work) > struct power_supply *psy = container_of(work, struct power_supply, > deferred_register_work.work); > > - if (psy->dev.parent) > - mutex_lock(&psy->dev.parent->mutex); > + if (psy->dev.parent) { > + while (!mutex_trylock(&psy->dev.parent->mutex)) { > + if (psy->removing) > + return; > + msleep(10); > + } > + } > > power_supply_changed(psy); > > @@ -1071,6 +1077,7 @@ EXPORT_SYMBOL_GPL(devm_power_supply_register_no_ws); > void power_supply_unregister(struct power_supply *psy) > { > WARN_ON(atomic_dec_return(&psy->use_cnt)); > + psy->removing = true; > cancel_work_sync(&psy->changed_work); > cancel_delayed_work_sync(&psy->deferred_register_work); > sysfs_remove_link(&psy->dev.kobj, "powers"); > diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h > index f0139b460a72..0a587526a916 100644 > --- a/include/linux/power_supply.h > +++ b/include/linux/power_supply.h > @@ -251,6 +251,7 @@ struct power_supply { > spinlock_t changed_lock; > bool changed; > bool initialized; > + bool removing; > atomic_t use_cnt; > #ifdef CONFIG_THERMAL > struct thermal_zone_device *tzd; > -- > 2.14.3 >