Re: [RFC] prctl: Deprecate non PR_SET_MM_MAP operations

2018-04-04 Thread Cyrill Gorcunov
On Wed, Apr 04, 2018 at 07:35:41AM +0200, Michal Hocko wrote:
> > > Or we can simply drop it off because PR_SET_MM_MAP covers all needs,
> > > and I would rather prefer to do that asap.
> > 
> > Thanks for making it deprecated. I'd prefer just drop it off if nobody
> objects. The change will get soaked in linux-next for a while, we will know
> > if it breaks compatibility (it sounds very unlikely).
> 
> Yeah, let's just drop it and have the patch in linux-next (via mmotm)
> for 2 release cycles and see whether somebody complains.
> 
> You can add
> Acked-by: Michal Hocko 
> for such a patch.

Sure. Thanks!


Re: [PATCH v3 6/6] spi: sun6i: add DMA transfers support

2018-04-04 Thread Maxime Ripard
On Tue, Apr 03, 2018 at 06:44:49PM +0300, Sergey Suloev wrote:
> DMA transfers are now available for sun6i and sun8i SoCs.
> The DMA mode is used automatically as soon as requested
> transfer length is more than FIFO length.
> 
> Changes in v3:
> 1) Debug log enhancements.
> 
> Signed-off-by: Sergey Suloev 
> ---
>  drivers/spi/spi-sun6i.c | 331 
> ++--
>  1 file changed, 294 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/spi/spi-sun6i.c b/drivers/spi/spi-sun6i.c
> index 2fa9d6e..7f41871 100644
> --- a/drivers/spi/spi-sun6i.c
> +++ b/drivers/spi/spi-sun6i.c
> @@ -14,6 +14,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -55,17 +57,20 @@
>  
>  #define SUN6I_FIFO_CTL_REG   0x18
>  #define SUN6I_FIFO_CTL_RF_RDY_TRIG_LEVEL_MASK0xff
> -#define SUN6I_FIFO_CTL_RF_RDY_TRIG_LEVEL_BITS0
> +#define SUN6I_FIFO_CTL_RF_RDY_TRIG_LEVEL_POS 0

Why are you changing that name

> +#define SUN6I_FIFO_CTL_RF_DRQ_EN BIT(8)
>  #define SUN6I_FIFO_CTL_RF_RSTBIT(15)
>  #define SUN6I_FIFO_CTL_TF_ERQ_TRIG_LEVEL_MASK0xff
> -#define SUN6I_FIFO_CTL_TF_ERQ_TRIG_LEVEL_BITS16
> +#define SUN6I_FIFO_CTL_TF_ERQ_TRIG_LEVEL_POS 16

Ditto.

> +#define SUN6I_FIFO_CTL_TF_DRQ_EN BIT(24)
>  #define SUN6I_FIFO_CTL_TF_RSTBIT(31)
> +#define SUN6I_FIFO_CTL_DMA_DEDICATE  BIT(9)|BIT(25)
>  
>  #define SUN6I_FIFO_STA_REG   0x1c
>  #define SUN6I_FIFO_STA_RF_CNT_MASK   0x7f
> -#define SUN6I_FIFO_STA_RF_CNT_BITS   0
> +#define SUN6I_FIFO_STA_RF_CNT_POS0

Ditto.

>  #define SUN6I_FIFO_STA_TF_CNT_MASK   0x7f
> -#define SUN6I_FIFO_STA_TF_CNT_BITS   16
> +#define SUN6I_FIFO_STA_TF_CNT_POS16

Ditto.

>  
>  #define SUN6I_CLK_CTL_REG0x24
>  #define SUN6I_CLK_CTL_CDR2_MASK  0xff
> @@ -135,7 +140,7 @@ static inline u32 sun6i_spi_get_tx_fifo_count(struct 
> sun6i_spi *sspi)
>  {
>   u32 reg = sun6i_spi_read(sspi, SUN6I_FIFO_STA_REG);
>  
> - reg >>= SUN6I_FIFO_STA_TF_CNT_BITS;
> + reg >>= SUN6I_FIFO_STA_TF_CNT_POS;
>  
>   return reg & SUN6I_FIFO_STA_TF_CNT_MASK;
>  }
> @@ -148,7 +153,7 @@ static inline void sun6i_spi_drain_fifo(struct sun6i_spi 
> *sspi, int len)
>   /* See how much data is available */
>   reg = sun6i_spi_read(sspi, SUN6I_FIFO_STA_REG);
>   reg &= SUN6I_FIFO_STA_RF_CNT_MASK;
> - cnt = reg >> SUN6I_FIFO_STA_RF_CNT_BITS;
> + cnt = reg >> SUN6I_FIFO_STA_RF_CNT_POS;
>  
>   if (len > cnt)
>   len = cnt;
> @@ -177,6 +182,15 @@ static inline void sun6i_spi_fill_fifo(struct sun6i_spi 
> *sspi, int len)
>   }
>  }
>  
> +static bool sun6i_spi_can_dma(struct spi_master *master,
> +   struct spi_device *spi,
> +   struct spi_transfer *tfr)
> +{
> + struct sun6i_spi *sspi = spi_master_get_devdata(master);
> +
> + return tfr->len > sspi->fifo_depth;
> +}
> +
>  static void sun6i_spi_set_cs(struct spi_device *spi, bool enable)
>  {
>   struct sun6i_spi *sspi = spi_master_get_devdata(spi->master);
> @@ -208,6 +222,9 @@ static size_t sun6i_spi_max_transfer_size(struct 
> spi_device *spi)
>   struct spi_master *master = spi->master;
>   struct sun6i_spi *sspi = spi_master_get_devdata(master);
>  
> + if (master->can_dma)
> + return SUN6I_MAX_XFER_SIZE;
> +
>   return sspi->fifo_depth;
>  }
>  
> @@ -268,16 +285,187 @@ static int sun6i_spi_wait_for_transfer(struct 
> spi_device *spi,
>   return 0;
>  }
>  
> +static void sun6i_spi_dma_callback(void *param)
> +{
> + struct spi_master *master = param;
> +
> + dev_dbg(&master->dev, "DMA transfer complete\n");
> + spi_finalize_current_transfer(master);
> +}
> +
> +static int sun6i_spi_dmap_prep_tx(struct spi_master *master,
> +   struct spi_transfer *tfr,
> +   dma_cookie_t *cookie)
> +{
> + struct dma_async_tx_descriptor *chan_desc = NULL;
> +
> + chan_desc = dmaengine_prep_slave_sg(master->dma_tx,
> + tfr->tx_sg.sgl, tfr->tx_sg.nents,
> + DMA_TO_DEVICE,
> + DMA_PREP_INTERRUPT | DMA_CTRL_ACK);
> + if (!chan_desc) {
> + dev_err(&master->dev,
> + "Couldn't prepare TX DMA slave\n");
> + return -EIO;
> + }
> +
> + chan_desc->callback = sun6i_spi_dma_callback;
> + chan_desc->callback_param = master;
> +
> + *cookie = dmaengine_submit(chan_desc);
> + dma_async_issue_pending(master->dma_tx);
> +
> + return 0;
> +}
> +
> +static int sun6i_spi_dmap_prep_rx(struct spi_master *master,
> +   struct spi_transfer *tfr,
> +   dma_cookie_t *co

Re: [PATCH v3 5/6] spi: sun6i: introduce register set/unset helpers

2018-04-04 Thread Maxime Ripard
On Tue, Apr 03, 2018 at 06:44:48PM +0300, Sergey Suloev wrote:
> Two helper functions were added in order to set/unset
> specified flags in registers.
> 
> Signed-off-by: Sergey Suloev 

Again, I'm not really sure what the benefit of that is. Your diffstat
is pretty much identical, so it's basically just moving code around.

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com


signature.asc
Description: PGP signature


Re: [PATCH v2] perf stat: enable 1ms interval for printing event counters values

2018-04-04 Thread Jiri Olsa
On Tue, Apr 03, 2018 at 09:18:33PM +0300, Alexey Budankov wrote:
> 
> Currently print count interval for performance counters values is 
> limited by 10ms so reading the values at frequencies higher than 100Hz 
> is restricted by the tool.
> 
> This change makes perf stat -I possible on frequencies up to 1KHz and, 
> to some extent, makes perf stat -I to be on-par with perf record 
> sampling profiling.
> 
> When running perf stat -I for monitoring e.g. PCIe uncore counters and 
> at the same time profiling some I/O workload by perf record e.g. for 
> cpu-cycles and context switches, it is then possible to observe 
> consolidated CPU/OS/IO(Uncore) performance picture for that workload.
> 
> Tool overhead warning printed when specifying -v option can be missed 
> due to screen scrolling in case you have output to the console 
> so message is moved into help available by running perf stat -h.
> 
> Signed-off-by: Alexey Budankov 
> ---
> 
> Changes in v2:
>  - updated minimum value to 1ms at perf-stat.txt manual

Acked-by: Jiri Olsa 

thanks,
jirka


Re: [PATCH v2 1/6] spi: core: handle timeout error from transfer_one()

2018-04-04 Thread Maxime Ripard
On Tue, Apr 03, 2018 at 07:24:11PM +0300, Sergey Suloev wrote:
> On 04/03/2018 07:18 PM, Mark Brown wrote:
> > On Tue, Apr 03, 2018 at 07:00:55PM +0300, Sergey Suloev wrote:
> > > On 04/03/2018 06:52 PM, Mark Brown wrote:
> > > > On Tue, Apr 03, 2018 at 06:29:00PM +0300, Sergey Suloev wrote:
> > > > > As long as sun4i/sun6i SPI drivers have overriden the default
> > > > > "wait for completion" procedure then we need to properly
> > > > > handle -ETIMEDOUT error from transfer_one().
> > > > Why is this connected to those drivers specifically?
> > > These 2 drivers have their own "waiting" code and not using the code from
> > > SPI core.
> > Does this not apply to any other driver - why is this something we only
> > have to do when these drivers do it?  That's what's setting off alarm
> > bells.
> 
> sun4i/sun6i drivers have let's say "smart" waiting while SPI core uses a
> fixed interval to wait.
> 
> I can't say for every SPI driver in kernel, that's outside of my area of
> expertise.

I'm not sure what's specific about the sun4i / sun6i case here. Your
patch doesn't have anything to do with the delay before the timeout,
but the fact that we return -ETIMEDOUT in the first place.

And I'm pretty sure that papering over an error returned by a driver
is not the right thing to do.

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com


signature.asc
Description: PGP signature


Re: [PATCH v2 2/6] spi: sun4i: restrict transfer length in PIO-mode

2018-04-04 Thread Maxime Ripard
On Tue, Apr 03, 2018 at 06:29:01PM +0300, Sergey Suloev wrote:
> There is no need to handle the 3/4 FIFO empty interrupt
> as the maximum supported transfer length in PIO mode
> is 64 bytes.
> As long as a problem was reported previously with filling FIFO
> on A10s we want to stick with 63 bytes depth.
> 
> Changes in v2:
> 1) Restored processing of 3/4 FIFO full interrupt.
> 
> Signed-off-by: Sergey Suloev 
> ---
>  drivers/spi/spi-sun4i.c | 37 ++---
>  1 file changed, 10 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c
> index 4141003..08fd007 100644
> --- a/drivers/spi/spi-sun4i.c
> +++ b/drivers/spi/spi-sun4i.c
> @@ -22,7 +22,12 @@
>  
>  #include 
>  
> -#define SUN4I_FIFO_DEPTH 64
> +/*
> + * FIFO length is 64 bytes
> + * But filling the FIFO fully might cause a timeout
> + * on some devices, for example on spi2 on A10s
> + */
> +#define SUN4I_FIFO_DEPTH 63

The FIFO depth is 64 bytes, so the code should remain the same at
least from that regard.

>  #define SUN4I_RXDATA_REG 0x00
>  
> @@ -202,7 +207,7 @@ static void sun4i_spi_set_cs(struct spi_device *spi, bool 
> enable)
>  
>  static size_t sun4i_spi_max_transfer_size(struct spi_device *spi)
>  {
> - return SUN4I_FIFO_DEPTH - 1;
> + return SUN4I_FIFO_DEPTH;
>  }
>  
>  static int sun4i_spi_transfer_one(struct spi_master *master,
> @@ -216,11 +221,8 @@ static int sun4i_spi_transfer_one(struct spi_master 
> *master,
>   int ret = 0;
>   u32 reg;
>  
> - /* We don't support transfer larger than the FIFO */
> - if (tfr->len > SUN4I_MAX_XFER_SIZE)
> - return -EMSGSIZE;
> -
> - if (tfr->tx_buf && tfr->len >= SUN4I_MAX_XFER_SIZE)
> + /* We don't support transfers larger than FIFO depth */
> + if (tfr->len > SUN4I_FIFO_DEPTH)
>   return -EMSGSIZE;

This essentially reverts 196737912da5, why?

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com


signature.asc
Description: PGP signature


Re: [PATCH v2 3/6] spi: sun4i: coding style/readability improvements

2018-04-04 Thread Maxime Ripard
On Tue, Apr 03, 2018 at 06:29:02PM +0300, Sergey Suloev wrote:
> Minor changes to fulfill the coding style and
> improve the readability.

Again, this is a very subjective statement. Overall, I'm not convinced
by the idea of moving code around and shuffling variables assignments
just for the sake of it.

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com


signature.asc
Description: PGP signature


Re: [PATCH v2 4/6] spi: sun4i: use completion provided by SPI core driver

2018-04-04 Thread Maxime Ripard
On Tue, Apr 03, 2018 at 06:29:03PM +0300, Sergey Suloev wrote:
> As long as the completion already provided by the SPI core
> then there is no need to waste extra-memory on this.
> Also a waiting function was added to avoid code duplication.

This should also be split in two parts.

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com


signature.asc
Description: PGP signature


Re: [PATCH v2 5/6] spi: sun4i: introduce register set/unset helpers

2018-04-04 Thread Maxime Ripard
On Tue, Apr 03, 2018 at 06:29:04PM +0300, Sergey Suloev wrote:
> Two helper functions were added in order to set/unset
> specified flags in registers.

Again, the diffstat is pretty neutral, so what is the benefit?

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com


signature.asc
Description: PGP signature


Re: KASAN: use-after-free Read in radix_tree_next_chunk

2018-04-04 Thread Pavel Machek
Come on? Two copies of one mail, to lkml with thousands of
subscribers?


> Hello,
> 
> syzbot hit the following crash on upstream commit
> 9dd2326890d89a5179967c947dab2bab34d7ddee (Fri Mar 30 17:29:47 2018 +)
> Merge tag 'ceph-for-4.16-rc8' of git://github.com/ceph/ceph-client
> syzbot dashboard link:
> https://syzkaller.appspot.com/bug?extid=040b31ac96753fd7eb46
> 
> So far this crash happened 50 times on upstream.
> C reproducer: https://syzkaller.appspot.com/x/repro.c?id=6534913729757184
> syzkaller reproducer:
> https://syzkaller.appspot.com/x/repro.syz?id=6221692233842688
> Raw console output:
> https://syzkaller.appspot.com/x/log.txt?id=5252920484298752
> Kernel config:
> https://syzkaller.appspot.com/x/.config?id=-2760467897697295172
> compiler: gcc (GCC) 7.1.1 20170620
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+040b31ac96753fd7e...@syzkaller.appspotmail.com
> It will help syzbot understand when the bug is fixed. See footer for
> details.
> If you forward the report, please keep this part and the footer.
> 
> IPVS: ftp: loaded support on port[0] = 21
> XFS (loop5): Invalid device [./file0], error=-15
> XFS (loop7): nobarrier option is deprecated, ignoring.
> XFS (loop7): Invalid device [./file0], error=-15
> ==
> BUG: KASAN: use-after-free in radix_tree_next_chunk+0xde1/0xdf0
> lib/radix-tree.c:1733
> Read of size 4 at addr 8801b1669250 by task syzkaller476032/4458
> 
> CPU: 1 PID: 4458 Comm: syzkaller476032 Not tainted 4.16.0-rc7+ #7
> IPVS: ftp: loaded support on port[0] = 21
> Hardware name: Google Google Compute Engine/Google Compute Engine,
> BIOS Google 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:17 [inline]
>  dump_stack+0x194/0x24d lib/dump_stack.c:53
>  print_address_description+0x73/0x250 mm/kasan/report.c:256
>  kasan_report_error mm/kasan/report.c:354 [inline]
>  kasan_report+0x23c/0x360 mm/kasan/report.c:412
>  __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:432
>  radix_tree_next_chunk+0xde1/0xdf0 lib/radix-tree.c:1733
>  radix_tree_gang_lookup_tag+0x36e/0x5e0 lib/radix-tree.c:1918
> IPVS: ftp: loaded support on port[0] = 21
>  xfs_perag_get_tag+0x109/0x6c0 fs/xfs/libxfs/xfs_sb.c:88
> IPVS: ftp: loaded support on port[0] = 21
>  xfs_reclaim_inodes_count+0x82/0xb0 fs/xfs/xfs_icache.c:1362
>  xfs_fs_nr_cached_objects+0x37/0x50 fs/xfs/xfs_super.c:1778
>  super_cache_count+0x96/0x280 fs/super.c:131
>  do_shrink_slab mm/vmscan.c:310 [inline]
>  shrink_slab.part.46+0x30c/0xe80 mm/vmscan.c:475
>  shrink_slab+0x9d/0xb0 mm/vmscan.c:442
>  shrink_node+0x51e/0xf70 mm/vmscan.c:2556
>  shrink_zones mm/vmscan.c:2728 [inline]
>  do_try_to_free_pages+0x383/0x1020 mm/vmscan.c:2790
>  try_to_free_mem_cgroup_pages+0x44d/0xb40 mm/vmscan.c:3079
> IPVS: ftp: loaded support on port[0] = 21
>  reclaim_high.constprop.64+0x1e2/0x330 mm/memcontrol.c:1862
> IPVS: ftp: loaded support on port[0] = 21
>  mem_cgroup_handle_over_high+0x8d/0x130 mm/memcontrol.c:1887
>  tracehook_notify_resume include/linux/tracehook.h:193 [inline]
>  exit_to_usermode_loop+0x242/0x2f0 arch/x86/entry/common.c:166
>  prepare_exit_to_usermode arch/x86/entry/common.c:196 [inline]
>  syscall_return_slowpath arch/x86/entry/common.c:265 [inline]
>  do_syscall_64+0x6ec/0x940 arch/x86/entry/common.c:292
>  entry_SYSCALL_64_after_hwframe+0x42/0xb7
> RIP: 0033:0x440fda
> RSP: 002b:7ffd309f1880 EFLAGS: 0246 ORIG_RAX: 0038
> RAX: 0003 RBX:  RCX: 00440fda
> RDX:  RSI:  RDI: 01200011
> RBP: 7ffd309f18a0 R08: 0001 R09: 020eb880
> R10: 020ebb50 R11: 0246 R12: 0001
> R13: 7ffd309f18d0 R14:  R15: 7ffd309f19e8
> 
> Allocated by task 4469:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:552
>  kmem_cache_alloc_trace+0x136/0x740 mm/slab.c:3608
>  kmalloc include/linux/slab.h:512 [inline]
>  kzalloc include/linux/slab.h:701 [inline]
>  xfs_fs_fill_super+0xd1/0x1220 fs/xfs/xfs_super.c:1579
>  mount_bdev+0x2b7/0x370 fs/super.c:1119
>  xfs_fs_mount+0x34/0x40 fs/xfs/xfs_super.c:1770
>  mount_fs+0x66/0x2d0 fs/super.c:1222
>  vfs_kern_mount.part.26+0xc6/0x4a0 fs/namespace.c:1037
>  vfs_kern_mount fs/namespace.c:2509 [inline]
>  do_new_mount fs/namespace.c:2512 [inline]
>  do_mount+0xea4/0x2bb0 fs/namespace.c:2842
>  SYSC_mount fs/namespace.c:3058 [inline]
>  SyS_mount+0xab/0x120 fs/namespace.c:3035
>  do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287
>  entry_SYSCALL_64_after_hwframe+0x42/0xb7
> 
> Freed by task 4469:
>  save_stack+0x43/0xd0 mm/kasan/kasan.c:447
>  set_track mm/kasan/kasan.c:459 [inline]
>  __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:520
>  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:527
>  __cache_free mm/slab.c:3486 [inline]
>  kfree+0

[PATCH RT 5/7] crypto: limit more FPU-enabled sections

2018-04-04 Thread Daniel Wagner
From: Sebastian Andrzej Siewior 

Those crypto drivers use SSE/AVX/… for their crypto work and in order to
do so in kernel they need to enable the "FPU" in kernel mode which
disables preemption.
There are two problems with the way they are used:
- the while loop which processes X bytes may create latency spikes and
  should be avoided or limited.
- the cipher-walk-next part may allocate/free memory and may use
  kmap_atomic().

The whole kernel_fpu_begin()/end() processing isn't probably that cheap.
It most likely makes sense to process as much of those as possible in one
go. The new *_fpu_sched_rt() schedules only if a RT task is pending.

Probably we should measure the performance of those ciphers in pure SW
mode and with this optimisations to see if it makes sense to keep them
for RT.

This kernel_fpu_resched() makes the code more preemptible which might hurt
performance.

Cc: stable...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
---
 arch/x86/crypto/camellia_aesni_avx2_glue.c | 20 
 arch/x86/crypto/camellia_aesni_avx_glue.c  | 19 +++
 arch/x86/crypto/cast6_avx_glue.c   | 24 +++-
 arch/x86/crypto/chacha20_glue.c|  8 +---
 arch/x86/crypto/serpent_avx2_glue.c| 19 +++
 arch/x86/crypto/serpent_avx_glue.c | 23 +++
 arch/x86/crypto/serpent_sse2_glue.c| 23 +++
 arch/x86/crypto/twofish_avx_glue.c | 27 +--
 arch/x86/include/asm/fpu/api.h |  1 +
 arch/x86/kernel/fpu/core.c | 12 
 10 files changed, 158 insertions(+), 18 deletions(-)

diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c 
b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index d84456924563..c54536d9932c 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -206,6 +206,20 @@ struct crypt_priv {
bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+   bool fpu_enabled = ctx->fpu_enabled;
+
+   if (!fpu_enabled)
+   return;
+   camellia_fpu_end(fpu_enabled);
+   ctx->fpu_enabled = false;
+}
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
const unsigned int bsize = CAMELLIA_BLOCK_SIZE;
@@ -221,16 +235,19 @@ static void encrypt_callback(void *priv, u8 *srcdst, 
unsigned int nbytes)
}
 
if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+   kernel_fpu_resched();
camellia_ecb_enc_16way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
}
 
while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+   kernel_fpu_resched();
camellia_enc_blk_2way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
}
+   camellia_fpu_end_rt(ctx);
 
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
camellia_enc_blk(ctx->ctx, srcdst, srcdst);
@@ -251,16 +268,19 @@ static void decrypt_callback(void *priv, u8 *srcdst, 
unsigned int nbytes)
}
 
if (nbytes >= CAMELLIA_AESNI_PARALLEL_BLOCKS * bsize) {
+   kernel_fpu_resched();
camellia_ecb_dec_16way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_AESNI_PARALLEL_BLOCKS;
}
 
while (nbytes >= CAMELLIA_PARALLEL_BLOCKS * bsize) {
+   kernel_fpu_resched();
camellia_dec_blk_2way(ctx->ctx, srcdst, srcdst);
srcdst += bsize * CAMELLIA_PARALLEL_BLOCKS;
nbytes -= bsize * CAMELLIA_PARALLEL_BLOCKS;
}
+   camellia_fpu_end_rt(ctx);
 
for (i = 0; i < nbytes / bsize; i++, srcdst += bsize)
camellia_dec_blk(ctx->ctx, srcdst, srcdst);
diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c 
b/arch/x86/crypto/camellia_aesni_avx_glue.c
index 93d8f295784e..a1666a58eee5 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -210,6 +210,21 @@ struct crypt_priv {
bool fpu_enabled;
 };
 
+#ifdef CONFIG_PREEMPT_RT_FULL
+static void camellia_fpu_end_rt(struct crypt_priv *ctx)
+{
+   bool fpu_enabled = ctx->fpu_enabled;
+
+   if (!fpu_enabled)
+   return;
+   camellia_fpu_end(fpu_enabled);
+   ctx->fpu_enabled = false;
+}
+
+#else
+static void camellia_fpu_end_rt(struct crypt_priv *ctx) { }
+#endif
+
 static void encrypt_callback(void *priv, u8 *srcdst, unsigned int nbytes)
 {
const unsigned int bsize = CAMEL

[PATCH RT 6/7] Revert "memcontrol: Prevent scheduling while atomic in cgroup code"

2018-04-04 Thread Daniel Wagner
From: "Steven Rostedt (VMware)" 

The commit "memcontrol: Prevent scheduling while atomic in cgroup code"
fixed this issue:

   refill_stock()
  get_cpu_var()
  drain_stock()
 res_counter_uncharge()
res_counter_uncharge_until()
   spin_lock() <== boom

But commit 3e32cb2e0a12b ("mm: memcontrol: lockless page counters") replaced
the calls to res_counter_uncharge() in drain_stock() to the lockless
function page_counter_uncharge(). There is no more spin lock there and no
more reason to have that local lock.

Cc: 
Reported-by: Haiyang HY1 Tan 
Signed-off-by: Steven Rostedt (VMware) 
[bigeasy: That upstream commit appeared in v3.19 and the patch in
  question in v3.18.7-rt2 and v3.18 seems still to be maintained. So I
  guess that v3.18 would need the locallocks that we are about to remove
  here. I am not sure if any earlier versions have the patch
  backported.
  The stable tag here is because Haiyang reported (and debugged) a crash
  in 4.4-RT with this patch applied (which has get_cpu_light() instead
  the locallocks it gained in v4.9-RT).
  
https://lkml.kernel.org/r/05aa4ec5c6ec1d48be2cdcff3ae0b8a637f78...@cnmailex04.lenovo.com
]
Signed-off-by: Sebastian Andrzej Siewior 
---
 mm/memcontrol.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 493b4986d5dc..56f67a15937b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1925,17 +1925,14 @@ static void drain_local_stock(struct work_struct *dummy)
  */
 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
-   struct memcg_stock_pcp *stock;
-   int cpu = get_cpu_light();
-
-   stock = &per_cpu(memcg_stock, cpu);
+   struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
 
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
-   put_cpu_light();
+   put_cpu_var(memcg_stock);
 }
 
 /*
-- 
2.14.3


[PATCH RT 2/7] timer: Invoke timer_start_debug() where it makes sense

2018-04-04 Thread Daniel Wagner
From: Thomas Gleixner 

The timer start debug function is called before the proper timer base is
set. As a consequence the trace data contains the stale CPU and flags
values.

Call the debug function after setting the new base and flags.

Fixes: 500462a9de65 ("timers: Switch to a non-cascading wheel")
Signed-off-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Cc: r...@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior 
---
 kernel/time/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a8246d79cb5a..6b322aea1c46 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -838,8 +838,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
if (!ret && pending_only)
goto out_unlock;
 
-   debug_activate(timer, expires);
-
new_base = get_target_base(base, pinned);
 
if (base != new_base) {
@@ -854,6 +852,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
base = switch_timer_base(timer, base, new_base);
}
 
+   debug_activate(timer, expires);
+
timer->expires = expires;
internal_add_timer(base, timer);
 
-- 
2.14.3


[PATCH RT 7/7] Linux 4.4.126-rt141-rc1

2018-04-04 Thread Daniel Wagner
Signed-off-by: Daniel Wagner 
---
 localversion-rt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/localversion-rt b/localversion-rt
index 09a4d4e4f010..564532c76548 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt141
+-rt141-rc1
-- 
2.14.3


Re: [PATCH 2/2] kfree_rcu() should use kfree_bulk() interface

2018-04-04 Thread Rao Shoaib



On 04/03/2018 07:23 PM, Matthew Wilcox wrote:

On Tue, Apr 03, 2018 at 05:55:55PM -0700, Rao Shoaib wrote:

On 04/03/2018 01:58 PM, Matthew Wilcox wrote:

I think you might be better off with an IDR.  The IDR can always
contain one entry, so there's no need for this 'rbf_list_head' or
__rcu_bulk_schedule_list.  The IDR contains its first 64 entries in
an array (if that array can be allocated), so it's compatible with the
kfree_bulk() interface.


I have just familiarized myself with what IDR is by reading your article. If
I am incorrect please correct me.

The list and head you have pointed are only used  if the container can not
be allocated. That could happen with IDR as well. Note that the containers
are allocated at boot time and are re-used.

No, it can't happen with the IDR.  The IDR can always contain one entry
without allocating anything.  If you fail to allocate the second entry,
just free the first entry.


IDR seems to have some overhead, such as I have to specifically add the
pointer and free the ID, plus radix tree maintenance.

... what?  Adding a pointer is simply idr_alloc(), and you get back an
integer telling you which index it has.  Your data structure has its
own set of overhead.
The only overhead is a pointer that points to the head and an int to 
keep count. If I use idr, I would have to allocate an struct idr which 
is much larger. idr_alloc()/idr_destroy() operations are much more 
costly than updating two pointers. As the pointers are stored in 
slots/nodes corresponding to the id, I would  have to retrieve the 
pointers by calling idr_remove() to pass them to be freed, the 
slots/nodes would constantly be allocated and freed.


IDR is a very useful interface for allocating/managing ID's but I really 
do not see the justification for using it over here, perhaps you can 
elaborate more on the benefits and also on how I can just pass the array 
to be freed.


Shoaib



[PATCH RT 4/7] arm*: disable NEON in kernel mode

2018-04-04 Thread Daniel Wagner
From: Sebastian Andrzej Siewior 

NEON in kernel mode is used by the crypto algorithms and raid6 code.
While the raid6 code looks okay, the crypto algorithms do not: NEON
is enabled on first invocation and may allocate/free/map memory before
the NEON mode is disabled again.
This needs to be changed until it can be enabled.
On ARM NEON in kernel mode can be simply disabled. on ARM64 it needs to
stay on due to possible EFI callbacks so here I disable each algorithm.

Cc: stable...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
---
 arch/arm/Kconfig  |  2 +-
 arch/arm64/crypto/Kconfig | 14 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 79c4603e9453..5e054a0c4b25 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2119,7 +2119,7 @@ config NEON
 
 config KERNEL_MODE_NEON
bool "Support for NEON in kernel mode"
-   depends on NEON && AEABI
+   depends on NEON && AEABI && !PREEMPT_RT_BASE
help
  Say Y to include support for NEON in kernel mode.
 
diff --git a/arch/arm64/crypto/Kconfig b/arch/arm64/crypto/Kconfig
index 2cf32e9887e1..cd71b3432720 100644
--- a/arch/arm64/crypto/Kconfig
+++ b/arch/arm64/crypto/Kconfig
@@ -10,41 +10,41 @@ if ARM64_CRYPTO
 
 config CRYPTO_SHA1_ARM64_CE
tristate "SHA-1 digest algorithm (ARMv8 Crypto Extensions)"
-   depends on ARM64 && KERNEL_MODE_NEON
+   depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
 
 config CRYPTO_SHA2_ARM64_CE
tristate "SHA-224/SHA-256 digest algorithm (ARMv8 Crypto Extensions)"
-   depends on ARM64 && KERNEL_MODE_NEON
+   depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
 
 config CRYPTO_GHASH_ARM64_CE
tristate "GHASH (for GCM chaining mode) using ARMv8 Crypto Extensions"
-   depends on ARM64 && KERNEL_MODE_NEON
+   depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_HASH
 
 config CRYPTO_AES_ARM64_CE
tristate "AES core cipher using ARMv8 Crypto Extensions"
-   depends on ARM64 && KERNEL_MODE_NEON
+   depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_ALGAPI
 
 config CRYPTO_AES_ARM64_CE_CCM
tristate "AES in CCM mode using ARMv8 Crypto Extensions"
-   depends on ARM64 && KERNEL_MODE_NEON
+   depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_ALGAPI
select CRYPTO_AES_ARM64_CE
select CRYPTO_AEAD
 
 config CRYPTO_AES_ARM64_CE_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using ARMv8 Crypto Extensions"
-   depends on ARM64 && KERNEL_MODE_NEON
+   depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_BLKCIPHER
select CRYPTO_AES_ARM64_CE
select CRYPTO_ABLK_HELPER
 
 config CRYPTO_AES_ARM64_NEON_BLK
tristate "AES in ECB/CBC/CTR/XTS modes using NEON instructions"
-   depends on ARM64 && KERNEL_MODE_NEON
+   depends on ARM64 && KERNEL_MODE_NEON && !PREEMPT_RT_BASE
select CRYPTO_BLKCIPHER
select CRYPTO_AES
select CRYPTO_ABLK_HELPER
-- 
2.14.3


[PATCH RT 0/7] 4.4.126-rt141-rc1

2018-04-04 Thread Daniel Wagner
Dear RT Folks,

This is the RT stable review cycle of patch 4.4.126-rt141-rc1.

The series contains a few patches which never made it to the v4.4-rt
tree. I identified the patches using Steven's workflow. Next thing I
try to do is to use Julia's script to identify missing
patches. In theory nothing should be. 

Please scream at me if I messed something up. Please test the patches too.

The pre-releases will not be pushed to the git repository, only the
final release is.

If all goes well, this patch will be converted to the next main release
on 16.04.2018.

Enjoy,
-- Daniel

ps: Bear with me, I didn't have time to get all the magic tooling
working for releasing a -rc series as Steven has done in the past.
That's why this mail looks slightly different from what you are used to.

Changes from 4.4.126-rt141:

Daniel Wagner (1):
  Linux 4.4.126-rt141-rc1

Sebastian Andrzej Siewior (4):
  locking: add types.h
  mm/slub: close possible memory-leak in kmem_cache_alloc_bulk()
  arm*: disable NEON in kernel mode
  crypto: limit more FPU-enabled sections

Steven Rostedt (VMware) (1):
  Revert "memcontrol: Prevent scheduling while atomic in cgroup code"

Thomas Gleixner (1):
  timer: Invoke timer_start_debug() where it makes sense

 arch/arm/Kconfig   |  2 +-
 arch/arm64/crypto/Kconfig  | 14 +++---
 arch/x86/crypto/camellia_aesni_avx2_glue.c | 20 
 arch/x86/crypto/camellia_aesni_avx_glue.c  | 19 +++
 arch/x86/crypto/cast6_avx_glue.c   | 24 +++-
 arch/x86/crypto/chacha20_glue.c|  8 +---
 arch/x86/crypto/serpent_avx2_glue.c| 19 +++
 arch/x86/crypto/serpent_avx_glue.c | 23 +++
 arch/x86/crypto/serpent_sse2_glue.c| 23 +++
 arch/x86/crypto/twofish_avx_glue.c | 27 +--
 arch/x86/include/asm/fpu/api.h |  1 +
 arch/x86/kernel/fpu/core.c | 12 
 include/linux/spinlock_types_raw.h |  2 ++
 kernel/time/timer.c|  4 ++--
 localversion-rt|  2 +-
 mm/memcontrol.c|  7 ++-
 mm/slub.c  |  1 +
 17 files changed, 174 insertions(+), 34 deletions(-)

-- 
2.14.3


[PATCH RT 1/7] locking: add types.h

2018-04-04 Thread Daniel Wagner
From: Sebastian Andrzej Siewior 

During the stable update the arm architecture did not compile anymore
due to missing definition of u16/32.

Cc: stable...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
---
 include/linux/spinlock_types_raw.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/spinlock_types_raw.h 
b/include/linux/spinlock_types_raw.h
index edffc4d53fc9..03235b475b77 100644
--- a/include/linux/spinlock_types_raw.h
+++ b/include/linux/spinlock_types_raw.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_SPINLOCK_TYPES_RAW_H
 #define __LINUX_SPINLOCK_TYPES_RAW_H
 
+#include 
+
 #if defined(CONFIG_SMP)
 # include 
 #else
-- 
2.14.3


[PATCH RT 3/7] mm/slub: close possible memory-leak in kmem_cache_alloc_bulk()

2018-04-04 Thread Daniel Wagner
From: Sebastian Andrzej Siewior 

Under certain circumstances we could leak elements which were moved to
the local "to_free" list. The damage is limited since I can't find
any users here.

Cc: stable...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
---
 mm/slub.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/slub.c b/mm/slub.c
index b183c5271607..b3626ab324fe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3026,6 +3026,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t 
flags, size_t size,
return i;
 error:
local_irq_enable();
+   free_delayed(&to_free);
slab_post_alloc_hook(s, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
-- 
2.14.3


Re: [RFC PATCH 3/4] acpi: apei: Do not panic() in NMI because of GHES messages

2018-04-04 Thread James Morse
Hi Alexandru,

On 03/04/18 18:08, Alexandru Gagniuc wrote:
> BIOSes like to send NMIs for a number of silly reasons often deemed
> to be "fatal". For example pin bounce during a PCIE hotplug/unplug
> might cause the link to go down and retrain, with fatal PCI errors
> being generated while the link is retraining.

Sounds fun!


> Instead of panic()ing in NMI context, pass fatal errors down to IRQ
> context to see if they can be resolved.

How do we know we will survive this trip?

On arm64 systems it may not be possible to return to the context we took the NMI
notification from: we may bounce back into firmware with the same "world is on
fire" error. Firmware can rightly assume the OS has made no attempt to handle
the error. Your 'not re-arming the error' example makes this worrying.


> With these change, PCIe error are handled by AER. Other far less
> common errors, such as machine check exceptions, still cause a panic()
> in their respective handlers.

I agree AER is always going to be different. Could we take a look at the CPER
records while still in_nmi() to decide whether linux knows better than firmware?

For non-standard or processor-errors I think we should always panic() if they're
marked as fatal.
For memory-errors we could split memory_failure() up to have
{NMI,IRQ,process}-context helpers, all we need to know at NMI-time is whether
the affected memory is kernel memory.

For the PCI*/AER errors we should be able to try and handle it ... if we can get
back to process/IRQ context:
What happens if a PCIe driver has interrupts masked and is doing something to
cause this error? We can take the NMI and setup the irq-work, but it may never
run as we return to interrupts-masked context.


> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 2c998125b1d5..7243a99ea57e 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -955,7 +962,7 @@ static void __process_error(struct ghes *ghes)
>  static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
>  {
>   struct ghes *ghes;
> - int sev, ret = NMI_DONE;
> + int ret = NMI_DONE;
>  
>   if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
>   return ret;
> @@ -968,13 +975,6 @@ static int ghes_notify_nmi(unsigned int cmd, struct 
> pt_regs *regs)
>   ret = NMI_HANDLED;
>   }
>  
> - sev = ghes_severity(ghes->estatus->error_severity);
> - if (sev >= GHES_SEV_PANIC) {
> - oops_begin();
> - ghes_print_queued_estatus();
> - __ghes_panic(ghes);
> - }
> -
>   if (!(ghes->flags & GHES_TO_CLEAR))
>   continue;

For Processor-errors I think this is the wrong thing to do, but we should be
able to poke around in the CPER records and find out what we are dealing with.


Thanks,

James


Build failure due commit 9217e566bdee

2018-04-04 Thread Jarkko Nikula

Hi

Today's head and linux-next doesn't compile due commit 9217e566bdee 
("of_net: Implement of_get_nvmem_mac_address helper"):


drivers/of/of_net.o: In function `of_get_nvmem_mac_address':
.../drivers/of/of_net.c:100: undefined reference to `of_nvmem_cell_get'
.../drivers/of/of_net.c:104: undefined reference to `nvmem_cell_read'
.../drivers/of/of_net.c:106: undefined reference to `nvmem_cell_put'
Makefile:1033: recipe for target 'vmlinux' failed
make: *** [vmlinux] Error 1

Builds if I set CONFIG_NVMEM=y from CONFIG_NVMEM=m or revert above commit.

--
Jarkko


Re: [PATCH v3] staging: vt6655: check for memory allocation failures

2018-04-04 Thread Ji-Hun Kim
On Tue, Apr 03, 2018 at 01:40:52PM +0300, Dan Carpenter wrote:
> > desc->rd_info = kzalloc(sizeof(*desc->rd_info), GFP_KERNEL);
> > -
> > +   if (!desc->rd_info) {
> > +   ret = -ENOMEM;
> > +   goto error;
> > +   }
> > if (!device_alloc_rx_buf(priv, desc))
> > dev_err(&priv->pcid->dev, "can not alloc rx bufs\n");
> >  
> 
> We need to handle the case where device_alloc_rx_buf() fails as well...

Hi Dan, thanks for your comments! I will write a patch v5 including this fix.

> Some years back, I wrote a post about error handling that might be
> helpful:
> https://plus.google.com/106378716002406849458/posts/dnanfhQ4mHQ

This post is very helpful to me, Thank you.

> You are using "one err" and "do nothing" style error handling which are
> described in the post.

Sorry, I didn't fully understand the "one err" and "do nothing" style error
handling of this code. The reason for using goto instead of returning
directly was to free the previously allocated "rd_info"s in the for loop.
Please see below comment.


> > @@ -550,20 +554,29 @@ static void device_init_rd0_ring(struct vnt_private 
> > *priv)
> > if (i > 0)
> > priv->aRD0Ring[i-1].next_desc = cpu_to_le32(priv->rd0_pool_dma);
> > priv->pCurrRD[0] = &priv->aRD0Ring[0];
> > +
> > +   return 0;
> > +error:
> > +   device_free_rd0_ring(priv);
> > +   return ret;
> >  }
> 
> Of course, Jia-Ju Bai is correct to say that this is a layering
> violation.  Each function should only clean up after its self.
> 
> Also, this is a very typical "one err" style bug which I explain about
> in my g+ post.  The rule that applies here is that you should only free
> things which have been allocated.  Since we only partially allocated the
> rd0 ring, device_free_rd0_ring() will crash when we do:
> 
>   dma_unmap_single(&priv->pcid->dev, rd_info->skb_dma,
>priv->rx_buf_sz, DMA_FROM_DEVICE);
> 
> "rd_info" is NULL so rd_info->skb_dma is a NULL dereference.

For dealing with this problem, I added below code on patch v3. I think it
would not cause NULL dereference issues, because it will not enter
dma_unmap_single() if "rd_info" is NULL.

+   if (rd_info) {
+   dma_unmap_single(&priv->pcid->dev, rd_info->skb_dma,
+   priv->rx_buf_sz, DMA_FROM_DEVICE);
+   dev_kfree_skb(rd_info->skb);
+   kfree(desc->rd_info);
+   }

I would appreciate for your opinions what would be better way for freeing
allocated "rd_info"s in the loop previously.

Best regards,
Ji-Hun


Re: v4.16+ seeing many unaligned access in dequeue_task_fair() on IA64

2018-04-04 Thread Peter Zijlstra
On Wed, Apr 04, 2018 at 12:04:00AM +, Luck, Tony wrote:
> > bisect says:
> >
> > d519329f72a6 ("sched/fair: Update util_est only on util_avg updates")
> > 
> > Reverting just this commit makes the problem go away.
> 
> The unaligned read and write seem to come from:
> 
> struct util_est ue = READ_ONCE(p->se.avg.util_est);
> WRITE_ONCE(p->se.avg.util_est, ue);
> 
> which is puzzling as they were around before. Also the "avg"
> field is tagged with an attribute to make it cache aligned
> and there don't look to be holes in the structure that would
> make util_est not be 8-byte aligned ... though it does consist
> of two 4-byte fields, so legal for it to be 4-byte aligned.

Right, I remember being careful with that. Which again brings me to the
RANDSTRUCT thing, which will mess that up.

Does the below cure things? It makes absolutely no difference for my
x86_64-defconfig build, but it puts more explicit alignment constraints
on things.


diff --git a/include/linux/sched.h b/include/linux/sched.h
index f228c6033832..b3d697f3b573 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -300,7 +300,7 @@ struct util_est {
unsigned intenqueued;
unsigned intewma;
 #define UTIL_EST_WEIGHT_SHIFT  2
-};
+} __attribute__((__aligned__(sizeof(u64;
 
 /*
  * The load_avg/util_avg accumulates an infinite geometric series
@@ -364,7 +364,7 @@ struct sched_avg {
unsigned long   runnable_load_avg;
unsigned long   util_avg;
struct util_est util_est;
-};
+} cacheline_aligned;
 
 struct sched_statistics {
 #ifdef CONFIG_SCHEDSTATS
@@ -435,7 +435,7 @@ struct sched_entity {
 * Put into separate cache line so it does not
 * collide with read-mostly values above.
 */
-   struct sched_avgavg cacheline_aligned_in_smp;
+   struct sched_avgavg;
 #endif
 };
 


Re: [PATCH 2/2] kfree_rcu() should use kfree_bulk() interface

2018-04-04 Thread Rao Shoaib



On 04/02/2018 10:20 AM, Christopher Lameter wrote:

On Sun, 1 Apr 2018, rao.sho...@oracle.com wrote:


kfree_rcu() should use the new kfree_bulk() interface for freeing
rcu structures as it is more efficient.

It would be even better if this approach could also use

kmem_cache_free_bulk()

or

kfree_bulk()
Sorry I do not understand your comment. The patch is using kfree_bulk() 
which is an inline function.


Shoaib


Re: Multiple generic PHY instances for DWC3 USB IP

2018-04-04 Thread Masahiro Yamada
2018-04-04 15:04 GMT+09:00 Felipe Balbi :
>
> Hi,
>
> Masahiro Yamada  writes:
>> 2018-04-04 14:36 GMT+09:00 Felipe Balbi :
>>>
>>> Hi,
>>>
>>> Masahiro Yamada  writes:
 Currently, DWC3 core IP (drivers/usb/dwc3/core.c)
 can take only one PHY phandle for each of SS, HS.
 (phy-names DT property is "usb2-phy" and "usb3-phy" for each)
>>>
>>> We never had any other requirements :-)
>>>
 The DWC3 core IP is provided by Synopsys,
 but some SoC-dependent parts (a.k.a glue-layer)
 are implemented by SoC venders.

 The number of connected PHY instances are SoC-dependent.

 If you look at generic drivers such as
   drivers/usb/host/ehci-platform.c
 the driver can handle arbitrary number of PHY instances.

 However, as mentioned above, DWC3 core allows only one PHY phandle
 for each SS/HS.
 This can result in a strange DT structure.

 For example, Socionext PXs3 SoC is integrated with 2 instances of DWC3.

 The instance 0 of DWC3 is connected with 2 super-speed PHYs.
>>>
>>> why 2 super-speed phys? Is this a two-port host-only implementation?
>>
>>
>> Socionext SoCs only support the host-mode.
>>
>>
>> The instance 0 has 2 ports.
>> In our integration, 1 SS PHY is needed for each port.
>> That's why it needs 2 SS PHYs.
>>
>> Each DWC3 instance is connected with
>> multiple HS PHYs and multiple SS PHYs,
>> depending on the number of ports.
>
> in that case, you shouldn't need dwc3 at all. A Host-only dwc3 is xHCI
> compliant. If you really don't have the gadget block, there's no need
> for you to use dwc3. Just use xhci-plat directly.

Sorry, I was misunderstanding.

Some of our SoCs support gadget,
so we need to use the dwc3 driver.


 Is this OK?
>>>
>>> I don't know, I need a bit more details about your integration :-)
>>
>>
>> I can send a patch.
>>
>> My concern is the following commit.
>> I do not know which parts are using this lookups.
>
> Samsung SoCs, probably ;-)
>
> Anyway, if your IP really is host-only, then you don't need dwc3 for
> anything. Just go for xHCI directly. If xHCI needs to be extended when
> it comes to PHY, then you can discuss with Mathias Nyman :-)
>
> --
> balbi



-- 
Best Regards
Masahiro Yamada


Re: [PATCH] mmc: sdhci-cadence: send tune request twice to work around errata

2018-04-04 Thread Ulf Hansson
On 27 March 2018 at 11:29, Masahiro Yamada
 wrote:
> Cadence sent out an errata report to their customers of this IP.
> This errata is not so severe, but the tune request should be sent
> twice to avoid the potential issue.
>
> Quote from the report:
>
> Problem Summary
> ---
> The IP6116 SD/eMMC PHY design has a timing issue on receive data path.
> This issue may lead to an incorrect values of read/write pointers of
> the synchronization FIFO. Such a situation can happen at the SDR104
> and HS200 tuning procedure when the PHY is requested to change a phase
> of sampling clock when moving to the next tuning iteration.
>
> Workarounds
> ---
> The following are valid workarounds to resolve the issue:
>
> 1. In eMMC mode, software sends tune request twice instead of once at
>each iteration. This means that the clock phase is not changed on
>the second request so there is no potential for clock instability.
> 2. In SD mode, software must not use the hardware tuning and instead
>perform an almost identical procedure to eMMC, using the HRS34 Tune
>Force register.
>
> Signed-off-by: Masahiro Yamada 

Do you want this to be tagged for stable as well? And perhaps you
could add a fixes tag?

Kind regards
Uffe

> ---
>
>  drivers/mmc/host/sdhci-cadence.c | 22 ++
>  1 file changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/mmc/host/sdhci-cadence.c 
> b/drivers/mmc/host/sdhci-cadence.c
> index 0f589e2..bc30d16 100644
> --- a/drivers/mmc/host/sdhci-cadence.c
> +++ b/drivers/mmc/host/sdhci-cadence.c
> @@ -253,6 +253,7 @@ static int sdhci_cdns_set_tune_val(struct sdhci_host 
> *host, unsigned int val)
> struct sdhci_cdns_priv *priv = sdhci_cdns_priv(host);
> void __iomem *reg = priv->hrs_addr + SDHCI_CDNS_HRS06;
> u32 tmp;
> +   int i, ret;
>
> if (WARN_ON(!FIELD_FIT(SDHCI_CDNS_HRS06_TUNE, val)))
> return -EINVAL;
> @@ -260,11 +261,24 @@ static int sdhci_cdns_set_tune_val(struct sdhci_host 
> *host, unsigned int val)
> tmp = readl(reg);
> tmp &= ~SDHCI_CDNS_HRS06_TUNE;
> tmp |= FIELD_PREP(SDHCI_CDNS_HRS06_TUNE, val);
> -   tmp |= SDHCI_CDNS_HRS06_TUNE_UP;
> -   writel(tmp, reg);
>
> -   return readl_poll_timeout(reg, tmp, !(tmp & SDHCI_CDNS_HRS06_TUNE_UP),
> - 0, 1);
> +   /*
> +* Workaround for IP errata:
> +* The IP6116 SD/eMMC PHY design has a timing issue on receive data
> +* path. Send tune request twice.
> +*/
> +   for (i = 0; i < 2; i++) {
> +   tmp |= SDHCI_CDNS_HRS06_TUNE_UP;
> +   writel(tmp, reg);
> +
> +   ret = readl_poll_timeout(reg, tmp,
> +!(tmp & SDHCI_CDNS_HRS06_TUNE_UP),
> +0, 1);
> +
> +   return ret;
> +   }
> +
> +   return 0;
>  }
>
>  static int sdhci_cdns_execute_tuning(struct mmc_host *mmc, u32 opcode)
> --
> 2.7.4
>


Re: [PATCH RT 1/3] alarmtimer: Prevent live lock in alarm_cancel()

2018-04-04 Thread Sebastian Andrzej Siewior
On 2018-04-03 20:32:36 [+0200], Daniel Wagner wrote:
> Hi Sebastian,
> 
> On 03/28/2018 12:07 PM, Sebastian Andrzej Siewior wrote:
> > If alarm_try_to_cancel() requires a retry, then depending on the
> > priority setting the retry loop might prevent timer callback completion
> > on RT. Prevent that by waiting for completion on RT, no change for a
> > non RT kernel.
> > 
> > Cc: stable...@vger.kernel.org
> 
> How relevant is this serie for trees before eae1c4ae275f ("posix-timers:
> Make use of cancel/arm callbacks")? Patch #2 seems to depend on that change
> which was added in v4.12.

so #1 looks relevant since that alarmtimer was introduced. #2 since the
change you mentioned because after that patch alarmtimer would be
handled wrongly - before you should do the schedule_timeout() which
should be fine.  And #3 since 3.0+ (since the RCU-readsection was added)
but it is not easy to trigger (you would have to delete the timer while
a callback is waiting for it).

> Thanks,
> Daniel

Sebastian


[PATCH 1/1] drm/i915: Do not use kfree() to free kmem_cache_alloc() return value

2018-04-04 Thread Xidong Wang
In eb_lookup_vmas(), the return value of kmem_cache_alloc() is freed
with kfree(). I think the expected paired function is kmem_cache_free().

Signed-off-by: Xidong Wang 
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 8c170db..0414228 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -728,7 +728,7 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
 
err = radix_tree_insert(handles_vma, handle, vma);
if (unlikely(err)) {
-   kfree(lut);
+   kmem_cache_free(eb->i915->luts, lut);
goto err_obj;
}
 
-- 
2.7.4




Re: [PATCH 11/11] x86/pti: leave kernel text global for !PCID

2018-04-04 Thread kbuild test robot
Hi Dave,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on tip/auto-latest]
[also build test WARNING on next-20180403]
[cannot apply to tip/x86/core v4.16]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Dave-Hansen/Use-global-pages-with-PTI/20180404-135611
reproduce:
# apt-get install sparse
make ARCH=x86_64 allmodconfig
make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> arch/x86/mm/pti.c:286:1: sparse: symbol 'pti_clone_pmds' was not declared. 
>> Should it be static?
   arch/x86/mm/pti.c:439:6: sparse: symbol 'pti_set_kernel_image_nonglobal' was 
not declared. Should it be static?

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


[RFC PATCH] x86/pti: pti_clone_pmds can be static

2018-04-04 Thread kbuild test robot

Fixes: a7e2701bf2b2 ("x86/pti: leave kernel text global for !PCID")
Signed-off-by: Fengguang Wu 
---
 pti.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 3ee9ceb..057c8ff 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -282,7 +282,7 @@ static void __init pti_setup_vsyscall(void)
 static void __init pti_setup_vsyscall(void) { }
 #endif
 
-void
+static void
 pti_clone_pmds(unsigned long start, unsigned long end, pmdval_t clear)
 {
unsigned long addr;


Re: [PATCH v5 08/13] ARM: sunxi: Add initialization of CNTVOFF

2018-04-04 Thread Maxime Ripard
On Tue, Apr 03, 2018 at 10:06:28PM +0200, Mylène Josserand wrote:
> Hello,
> 
> Thank you for the review.
> 
> On Tue, 3 Apr 2018 11:12:18 +0200
> Maxime Ripard  wrote:
> 
> > On Tue, Apr 03, 2018 at 08:18:31AM +0200, Mylène Josserand wrote:
> > > Add the initialization of CNTVOFF for sun8i-a83t.
> > > 
> > > For boot CPU, Create a new machine that handles this
> > > function's call in an "init_early" callback.
> > > For secondary CPUs, add this function into secondary_startup
> > > assembly entry.
> > > 
> > > Signed-off-by: Mylène Josserand 
> > > ---
> > >  arch/arm/mach-sunxi/headsmp.S |  1 +
> > >  arch/arm/mach-sunxi/sunxi.c   | 18 +-
> > >  2 files changed, 18 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/arch/arm/mach-sunxi/headsmp.S b/arch/arm/mach-sunxi/headsmp.S
> > > index 79890fbe5613..b586b7cf803a 100644
> > > --- a/arch/arm/mach-sunxi/headsmp.S
> > > +++ b/arch/arm/mach-sunxi/headsmp.S
> > > @@ -71,6 +71,7 @@ ENDPROC(sunxi_mc_smp_cluster_cache_enable)
> > >  
> > >  ENTRY(sunxi_mc_smp_secondary_startup)
> > >   bl  sunxi_mc_smp_cluster_cache_enable
> > > + bl  smp_init_cntvoff
> > >   b   secondary_startup
> > >  ENDPROC(sunxi_mc_smp_secondary_startup)
> > >  
> > > diff --git a/arch/arm/mach-sunxi/sunxi.c b/arch/arm/mach-sunxi/sunxi.c
> > > index 5e9602ce1573..090784108c0a 100644
> > > --- a/arch/arm/mach-sunxi/sunxi.c
> > > +++ b/arch/arm/mach-sunxi/sunxi.c
> > > @@ -16,6 +16,7 @@
> > >  #include 
> > >  
> > >  #include 
> > > +#include 
> > >  
> > >  static const char * const sunxi_board_dt_compat[] = {
> > >   "allwinner,sun4i-a10",
> > > @@ -62,7 +63,6 @@ MACHINE_END
> > >  static const char * const sun8i_board_dt_compat[] = {
> > >   "allwinner,sun8i-a23",
> > >   "allwinner,sun8i-a33",
> > > - "allwinner,sun8i-a83t",
> > >   "allwinner,sun8i-h2-plus",
> > >   "allwinner,sun8i-h3",
> > >   "allwinner,sun8i-r40",
> > > @@ -75,6 +75,22 @@ DT_MACHINE_START(SUN8I_DT, "Allwinner sun8i Family")
> > >   .dt_compat  = sun8i_board_dt_compat,
> > >  MACHINE_END
> > >  
> > > +void __init sun8i_cntvoff_init(void)
> > > +{
> > > + smp_init_cntvoff();  
> > 
> > Can't this be moved to the SMP setup code?
> 
> I tried to put it in the first lines of "sunxi_mc_smp_init" function
> but it did not work. I tried to find some callbacks to have an
> early "init" and I only found the "init_early"'s one. There is probably
> another way to handle that so do not hesitate to tell me any ideas.

It's hard to say without more context about why it doesn't work. Have
you checked the order between early_initcall, the timer initialization
function and init_early?

> > > +}
> > > +
> > > +static const char * const sun8i_cntvoff_board_dt_compat[] = {
> > > + "allwinner,sun8i-a83t",
> > > + NULL,
> > > +};
> > > +
> > > +DT_MACHINE_START(SUN8I_CNTVOFF_DT, "Allwinner sun8i boards needing 
> > > cntvoff")  
> > 
> > All of the SoCs need CNTVOFF, so that doesn't really make sense. Why
> > not just calling it for what it is: an A83t?
> 
> Sure, I will update it.

Looking back at that code, I guess you want to change also the
smp_init_cntvoff function. It's not really related to SMP either.

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com


signature.asc
Description: PGP signature


[PATCH] ARM: decompressor: restore r1 and r2 just before jumping to the kernel

2018-04-04 Thread Łukasz Stelmach
Hypervisor setup before __enter_kernel destroys the value stored in
r1. The value needs to be restored just before the jump.

Signed-off-by: Łukasz Stelmach 
---
 arch/arm/boot/compressed/head.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index 182bf6add0b9..517e0e18f0b8 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
@@ -561,8 +561,6 @@ not_relocated:  mov r0, #0
bl  decompress_kernel
bl  cache_clean_flush
bl  cache_off
-   mov r1, r7  @ restore architecture number
-   mov r2, r8  @ restore atags pointer
 
 #ifdef CONFIG_ARM_VIRT_EXT
mrs r0, spsr@ Get saved CPU boot mode
@@ -1365,6 +1363,8 @@ __hyp_reentry_vectors:
 
 __enter_kernel:
mov r0, #0  @ must be 0
+   mov r1, r7  @ restore architecture number
+   mov r2, r8  @ restore atags pointer
  ARM(  mov pc, r4  )   @ call kernel
  M_CLASS(  add r4, r4, #1  )   @ enter in Thumb mode for M 
class
  THUMB(bx  r4  )   @ entry point is always 
ARM for A/R classes
-- 
2.11.0



Re: [PATCH v2] ARM: dts: tpc: Device tree description of the iMX6Q TPC board

2018-04-04 Thread Lukasz Majewski
Hi Fabio,

Thanks for the feedback,

> Hi Lukasz,
> 
> On Tue, Apr 3, 2018 at 1:59 PM, Lukasz Majewski  wrote:
> 
> > diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt
> > b/Documentation/devicetree/bindings/vendor-prefixes.txt index
> > ae850d6c0ad3..8ff7eadc8bef 100644 ---
> > a/Documentation/devicetree/bindings/vendor-prefixes.txt +++
> > b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -181,6
> > +181,7 @@ karoKa-Ro electronics GmbH keithkoep  Keith &
> > Koep GmbH keymileKeymile GmbH
> >  khadas Khadas
> > +kiebackpeterKieback & Peter GmbH  
> 
> This should be a separate patch.

Ok.

> 
> >  kinetic Kinetic Technologies
> >  kingnovel  Kingnovel Technology Co., Ltd.
> >  kosagi Sutajio Ko-Usagi PTE Ltd.
> > diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
> > index ade7a38543dc..c148c4cf28f2 100644
> > --- a/arch/arm/boot/dts/Makefile
> > +++ b/arch/arm/boot/dts/Makefile
> > @@ -459,6 +459,7 @@ dtb-$(CONFIG_SOC_IMX6Q) += \
> > imx6q-icore-ofcap10.dtb \
> > imx6q-icore-ofcap12.dtb \
> > imx6q-icore-rqs.dtb \
> > +   imx6q-kp-tpc.dtb \
> > imx6q-marsboard.dtb \
> > imx6q-mccmon6.dtb \
> > imx6q-nitrogen6x.dtb \
> > diff --git a/arch/arm/boot/dts/imx6q-kp-tpc.dts
> > b/arch/arm/boot/dts/imx6q-kp-tpc.dts new file mode 100644
> > index ..b5646040b516
> > --- /dev/null
> > +++ b/arch/arm/boot/dts/imx6q-kp-tpc.dts
> > @@ -0,0 +1,23 @@
> > +/*
> > + * Copyright 2018
> > + * Lukasz Majewski, DENX Software Engineering, lu...@denx.de
> > + *
> > + * SPDX-License-Identifier: (GPL-2.0+ OR MIT)  
> 
> This line should be the first one and start with //

Ok.

> 
> > + */
> > +
> > +/dts-v1/;
> > +
> > +#include "imx6q-kp.dtsi"
> > +
> > +/ {
> > +   model = "Freescale i.MX6 Qwuad K+P TPC Board";
> > +   compatible = "kiebackpeter,imx6q-tpc", "fsl,imx6q";
> > +
> > +   memory: memory@1000 {  
> 
> Only memory@1000 is enough.

Ok.

> 
> > --- /dev/null
> > +++ b/arch/arm/boot/dts/imx6q-kp.dtsi
> > @@ -0,0 +1,460 @@
> > +/*
> > + * Copyright 2018
> > + * Lukasz Majewski, DENX Software Engineering, lu...@denx.de
> > + *
> > + * SPDX-License-Identifier: (GPL-2.0+ OR MIT)  
> 
> Same here.
> 
> > +   beeper {
> > +   compatible = "pwm-beeper";
> > +   pwms = <&pwm2 0 50>; //2kHz  
> 
> No // style comments, please.
> 
> > +   lcd_panel: lcd-panel {
> > +   compatible = "auo,g070vvn01";  
> 
> I don't see this compatible string in linux-next.

I've sent support for this display yesterday.

[PATCH] display: panel: Add AUO g070vvn01 display support (800x480)


> 
> > +&i2c1 {
> > +   clock-frequency = <40>;
> > +   pinctrl-names = "default";
> > +   pinctrl-0 = <&pinctrl_i2c1>;
> > +   status = "okay";
> > +
> > +   goodix_ts@5d {
> > +   compatible = "goodix,gt911";
> > +   reg = <0x5d>;
> > +   pinctrl-names = "default";
> > +   pinctrl-0 = <&pinctrl_ts>;
> > +   interrupt-parent = <&gpio1>;
> > +   interrupts = <9 IRQ_TYPE_EDGE_FALLING>;
> > +   irq-gpios = <&gpio1 9 GPIO_ACTIVE_HIGH>;
> > +   reset-gpios = <&gpio5 2 GPIO_ACTIVE_HIGH>;
> > +   };
> > +
> > +   rx8025@32 {
> > +   compatible = "dallas,rx8025";  
> 
> I don't see this compatible string in linux-next.

Ach... it should be "dallas,ds1307" as it supports also this chip. I
will fix it in v2.


Best regards,

Lukasz Majewski

--

DENX Software Engineering GmbH,  Managing Director: Wolfgang Denk
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: w...@denx.de


pgpaxjWbpJ7iN.pgp
Description: OpenPGP digital signature


ARM compile failure in Re: linux-next: Tree for Apr 4

2018-04-04 Thread Pavel Machek
Hi!

> Please do not add any v4.18 destined stuff to your linux-next included
> trees until after v4.17-rc1 has been released.
> 
> Changes since 20180403:
> 
> The vfs tree still had its build failure for which I reverted a commit.
> 
> Non-merge commits (relative to Linus' tree): 8505
>  8493 files changed, 392779 insertions(+), 275941 deletions(-)
> 
> 

When trying to build kernel for N900, I get:

  CC  lib/timerqueue.o
CC  lib/vsprintf.o
lib/string.c: In function 'strstr':
lib/string.c:478:8: error: inlining failed in call to
always_inline 'strlen': function not inlinable
lib/string.c:903:5: error: called from here
lib/string.c:478:8: error: inlining failed in call to
always_inline 'strlen': function not inlinable
lib/string.c:906:5: error: called from here
lib/string.c:855:15: error: inlining failed in call to
always_inline 'memcmp': function not inlinable
...
pavel@duo:/data/l/linux-next-n900$ git branch -l
* (detached from next-20180403)

I'm using

eval ` eldk-switch.sh -r 5.4 armv7a`

for cross-compilation.

Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


Re: [RFC][PATCH] tracing, printk: Force no hashing when trace_printk() is used

2018-04-04 Thread Peter Zijlstra
On Tue, Apr 03, 2018 at 05:06:12PM -0400, Steven Rostedt wrote:
> If you are concerned about attack surface, I could make it a bit more
> difficult to tweak by malicious software. What about the patch below?
> It would be much more difficult to modify this knob from an attack
> vector.

Not if you build using clang, because that doesn't support asm-goto and
thus falls back to a simple runtime variable, which is exactly what Kees
didn't want.


x32 suspend failure in Re: linux-next: Tree for Apr 4

2018-04-04 Thread Pavel Machek
Hi!

> Please do not add any v4.18 destined stuff to your linux-next included
> trees until after v4.17-rc1 has been released.

On thinkpad x60, suspend does not suspend at all with this -next
version. Previous versions suspended/resumed fine but broke networking.

Any ideas? I guess bisecting on next would not be easy?

Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


[PATCH] doc: Add vendor prefix for Kieback & Peter GmbH

2018-04-04 Thread Lukasz Majewski
The 'kiebackpeter' entry has been added to vendor-prefixes.txt to indicate
products from Kieback & Peter GmbH.

Signed-off-by: Lukasz Majewski 
---
 Documentation/devicetree/bindings/vendor-prefixes.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt 
b/Documentation/devicetree/bindings/vendor-prefixes.txt
index ae850d6c0ad3..8ff7eadc8bef 100644
--- a/Documentation/devicetree/bindings/vendor-prefixes.txt
+++ b/Documentation/devicetree/bindings/vendor-prefixes.txt
@@ -181,6 +181,7 @@ karoKa-Ro electronics GmbH
 keithkoep  Keith & Koep GmbH
 keymileKeymile GmbH
 khadas Khadas
+kiebackpeterKieback & Peter GmbH
 kinetic Kinetic Technologies
 kingnovel  Kingnovel Technology Co., Ltd.
 kosagi Sutajio Ko-Usagi PTE Ltd.
-- 
2.11.0



Re: [PATCH 1/1] drm/i915: Do not use kfree() to free kmem_cache_alloc() return value

2018-04-04 Thread Chris Wilson
Quoting Xidong Wang (2018-04-04 08:37:54)
> In eb_lookup_vmas(), the return value of kmem_cache_alloc() is freed
> with kfree(). I think the expected paired function is kmem_cahce_free().
> 
> Signed-off-by: Xidong Wang 

That is indeed what it should be doing,

Fixes: d1b48c1e7184 ("drm/i915: Replace execbuf vma ht with an idr")
Cc: Chris Wilson 
Cc: Tvrtko Ursulin 
Cc:  # v4.14+
Reviewed-by: Chris Wilson 
-Chris


Re: x32 suspend failure in Re: linux-next: Tree for Apr 4

2018-04-04 Thread Rafael J. Wysocki
On Wed, Apr 4, 2018 at 9:50 AM, Pavel Machek  wrote:
> Hi!
>
>> Please do not add any v4.18 destined stuff to your linux-next included
>> trees until after v4.17-rc1 has been released.
>
> On thinkpad x60, suspend does not suspend at all with this -next
> version. Previous versions suspended/resumed fine but broke networking.
>
> Any ideas? I guess bisecting on next would not be easy?

Well, why would it be different from a bisect on any other git repo?


Re: [PATCH v9 00/24] Speculative page faults

2018-04-04 Thread Laurent Dufour
Hi Jerome,

Thanks for reviewing this series.

On 03/04/2018 22:37, Jerome Glisse wrote:
> On Tue, Mar 13, 2018 at 06:59:30PM +0100, Laurent Dufour wrote:
>> This is a port on kernel 4.16 of the work done by Peter Zijlstra to
>> handle page fault without holding the mm semaphore [1].
>>
>> The idea is to try to handle user space page faults without holding the
>> mmap_sem. This should allow better concurrency for massively threaded
>> process since the page fault handler will not wait for other threads memory
>> layout change to be done, assuming that this change is done in another part
>> of the process's memory space. This type page fault is named speculative
>> page fault. If the speculative page fault fails because of a concurrency is
>> detected or because underlying PMD or PTE tables are not yet allocating, it
>> is failing its processing and a classic page fault is then tried.
>>
>> The speculative page fault (SPF) has to look for the VMA matching the fault
>> address without holding the mmap_sem, this is done by introducing a rwlock
>> which protects the access to the mm_rb tree. Previously this was done using
>> SRCU but it was introducing a lot of scheduling to process the VMA's
>> freeing
>> operation which was hitting the performance by 20% as reported by Kemi Wang
>> [2].Using a rwlock to protect access to the mm_rb tree is limiting the
>> locking contention to these operations which are expected to be in a O(log
>> n)
>> order. In addition to ensure that the VMA is not freed in our back a
>> reference count is added and 2 services (get_vma() and put_vma()) are
>> introduced to handle the reference count. When a VMA is fetched from the RB
>> tree using get_vma() it must be later freed using put_vma(). Furthermore,
>> to allow the VMA to be used again by the classic page fault handler a
>> service is introduced can_reuse_spf_vma(). This service is expected to be
>> called with the mmap_sem hold. It checked that the VMA is still matching
>> the specified address and is releasing its reference count as the mmap_sem
>> is hold it is ensure that it will not be freed in our back. In general, the
>> VMA's reference count could be decremented when holding the mmap_sem but it
>> should not be increased as holding the mmap_sem is ensuring that the VMA is
>> stable. I can't see anymore the overhead I got while will-it-scale
>> benchmark anymore.
>>
>> The VMA's attributes checked during the speculative page fault processing
>> have to be protected against parallel changes. This is done by using a per
>> VMA sequence lock. This sequence lock allows the speculative page fault
>> handler to fast check for parallel changes in progress and to abort the
>> speculative page fault in that case.
>>
>> Once the VMA is found, the speculative page fault handler would check for
>> the VMA's attributes to verify that the page fault has to be handled
>> correctly or not. Thus the VMA is protected through a sequence lock which
>> allows fast detection of concurrent VMA changes. If such a change is
>> detected, the speculative page fault is aborted and a *classic* page fault
>> is tried.  VMA sequence lockings are added when VMA attributes which are
>> checked during the page fault are modified.
>>
>> When the PTE is fetched, the VMA is checked to see if it has been changed,
>> so once the page table is locked, the VMA is valid, so any other changes
>> leading to touching this PTE will need to lock the page table, so no
>> parallel change is possible at this time.
> 
> What would have been nice is some pseudo highlevel code before all the
> above detailed description. Something like:
>   speculative_fault(addr) {
> mm_lock_for_vma_snapshot()
> vma_snapshot = snapshot_vma_infos(addr)
> mm_unlock_for_vma_snapshot()
> ...
> if (!vma_can_speculatively_fault(vma_snapshot, addr))
> return;
> ...
> /* Do fault ie alloc memory, read from file ... */
> page = ...;
> 
> preempt_disable();
> if (vma_snapshot_still_valid(vma_snapshot, addr) &&
> vma_pte_map_lock(vma_snapshot, addr)) {
> if (pte_same(ptep, orig_pte)) {
> /* Setup new pte */
> page = NULL;
> }
> }
> preempt_enable();
> if (page)
> put(page)
>   }
> 
> I just find pseudo code easier for grasping the highlevel view of the
> expected code flow.

Fair enough, I agree that sounds easier this way, but one might argue that the
pseudo code is not more valid or accurate at one time :)

As always, the updated documentation is the code itself.

I'll try to put one inspired by yours in the next series's header.

>>
>> The locking of the PTE is done with interrupts disabled, this allows to
>> check for the PMD to ensure that there is not an ongoing collapsing
>> operation. Since khugepaged is firstly set the PMD to pmd_none and then is
>> waiting for the other CPU to have catch the IPI interrupt, if the pmd is
>> valid at the time the PTE is locked, we have the guaran

Re: Multiple generic PHY instances for DWC3 USB IP

2018-04-04 Thread Felipe Balbi

Hi,

Masahiro Yamada  writes:
>>> Each DWC3 instance is connected with
>>> multiple HS PHYs and multiple SS PHYs,
>>> depending on the number of ports.
>>
>> in that case, you shouldn't need dwc3 at all. A Host-only dwc3 is xHCI
>> compliant. If you really don't have the gadget block, there's no need
>> for you to use dwc3. Just use xhci-plat directly.
>
> Sorry, I was misunderstanding.
>
> Some of our SoCs support gadget,
> so we need to use the dwc3 driver.

fair enough. Now we need to figure out how to pass multiple PHYs to a
multi-port dwc3 instance.

Kishon, any ideas? How do you think DT should look like?

-- 
balbi


Re: [GIT PULL] arch: remove obsolete architecture ports

2018-04-04 Thread Pavel Machek
On Tue 2018-04-03 11:18:15, Geert Uytterhoeven wrote:
> On Mon, Apr 2, 2018 at 9:54 PM, Arnd Bergmann  wrote:
> > On Mon, Apr 2, 2018 at 8:57 PM, Linus Torvalds
> >  wrote:
> > Regarding a possible revert, that would indeed involve reverting
> > multiple patches for most architectures, plus parts of at least these
> > three:
> >
> >   Documentation: arch-support: remove obsolete architectures
> >   treewide: simplify Kconfig dependencies for removed archs
> >   ktest: remove obsolete architectures
> >
> > For those, I went the other way, and removed all architectures at
> > once to simplify my work and to avoid touching the same files up
> > to eight times with interdependent patches (which couldn't
> > be reverted without conflicts either).
> >
> > There are a couple of driver removal patches that got picked up
> > into subsystem trees instead of this tree, so a full revert would also
> > involve finding other drivers, but if you prefer to have the patches
> > completely split up by arch, I could rework the series that way.
> 
> In reality, a resurrection may not be implemented as a pure revert, but as
> the addition of a new architecture, implemented using modern features (DT,
> CCF, ...).

By insisting on new features instead of pure revert + incremental
updates, you pretty much make sure resurrection will not be possible
:-(.

Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


Re: [PATCH v4 5/8] mfd: cros_ec_dev: Register cros_ec_accel_legacy driver as a subdevice.

2018-04-04 Thread Enric Balletbo Serra
Hi Lee,

2018-03-28 13:03 GMT+02:00 Lee Jones :
> On Tue, 20 Mar 2018, Enric Balletbo i Serra wrote:
>
>> With this patch, the cros_ec_ctl driver will register the legacy
>> accelerometer driver (named cros_ec_accel_legacy) if it fails to
>> register sensors through the usual path cros_ec_sensors_register().
>> This legacy device is present on Chromebook devices with older EC
>> firmware only supporting deprecated EC commands (Glimmer based devices).
>>
>> Tested-by: Gwendal Grignou 
>> Signed-off-by: Enric Balletbo i Serra 
>> Reviewed-by: Gwendal Grignou 
>> Reviewed-by: Andy Shevchenko 
>> ---
>>
>> Changes in v4:
>> - [5/8] Nit: EC -> ECs (Lee Jones)
>> - [5/8] Statically define cros_ec_accel_legacy_cells (Lee Jones)
>>
>> Changes in v3:
>> - [5/8] Add the Reviewed-by Andy Shevchenko.
>>
>> Changes in v2:
>> - [5/8] Add the Reviewed-by Gwendal.
>>
>>  drivers/mfd/cros_ec_dev.c | 70 
>> +++
>>  1 file changed, 70 insertions(+)
>>
>> diff --git a/drivers/mfd/cros_ec_dev.c b/drivers/mfd/cros_ec_dev.c
>> index f60a53f11942..0d541d59d6f5 100644
>> --- a/drivers/mfd/cros_ec_dev.c
>> +++ b/drivers/mfd/cros_ec_dev.c
>> @@ -389,6 +389,73 @@ static void cros_ec_sensors_register(struct cros_ec_dev 
>> *ec)
>>   kfree(msg);
>>  }
>>
>> +static struct cros_ec_sensor_platform sensor_platforms[] = {
>> + {
>> + .sensor_num = 0,
>> + },
>> + {
>> + .sensor_num = 1,
>> + }
>> +};
>
> Also no need to be so many  lines.
>
> Each one of these entries can be placed on a single line.
>
> And there's no need for a comma if there is nothing to separate.
>
> { .sensor_num = 0 },
> { .sensor_num = 1 }
>
> Also, this seems like a pretty pointless struct.
>
> What is the sensor_num property used for?
>
> Why does it care what sensor number it is?
>

I thought that was used but after look again I didn't see where, so
seems that you have reason and this struct is pointless. I'll remove
this and the .id in the cells, we can always send a patch later
introducing this if I am missing something. I'll send another version.

Thanks,
  Enric

>> +static const struct mfd_cell cros_ec_accel_legacy_cells[] = {
>> + {
>> + .name = "cros-ec-accel-legacy",
>> + .id = 0,
>
> What are you using this for?
>
>> + .platform_data = &sensor_platforms[0],
>> + .pdata_size = sizeof(struct cros_ec_sensor_platform),
>> + },
>> + {
>> + .name = "cros-ec-accel-legacy",
>> + .id = 1,
>
> And this?
>
>> + .platform_data = &sensor_platforms[1],
>> + .pdata_size = sizeof(struct cros_ec_sensor_platform),
>> + }
>> +};
>> +
>> +static void cros_ec_accel_legacy_register(struct cros_ec_dev *ec)
>> +{
>> + struct cros_ec_device *ec_dev = ec->ec_dev;
>> + u8 status;
>> + int ret;
>> +
>> + /*
>> +  * ECs that need legacy support are the main EC, directly connected to
>> +  * the AP.
>> +  */
>> + if (ec->cmd_offset != 0)
>> + return;
>> +
>> + /*
>> +  * Check if EC supports direct memory reads and if EC has
>> +  * accelerometers.
>> +  */
>> + if (!ec_dev->cmd_readmem)
>> + return;
>> +
>> + ret = ec_dev->cmd_readmem(ec_dev, EC_MEMMAP_ACC_STATUS, 1, &status);
>> + if (ret < 0) {
>> + dev_warn(ec->dev, "EC does not support direct reads.\n");
>> + return;
>> + }
>> +
>> + /* Check if EC has accelerometers. */
>> + if (!(status & EC_MEMMAP_ACC_STATUS_PRESENCE_BIT)) {
>> + dev_info(ec->dev, "EC does not have accelerometers.\n");
>> + return;
>> + }
>> +
>> + /*
>> +  * Register 2 accelerometers
>> +  */
>> + ret = mfd_add_devices(ec->dev, PLATFORM_DEVID_AUTO,
>> +   cros_ec_accel_legacy_cells,
>> +   ARRAY_SIZE(cros_ec_accel_legacy_cells),
>> +   NULL, 0, NULL);
>> + if (ret)
>> + dev_err(ec_dev->dev, "failed to add EC sensors\n");
>> +}
>> +
>>  static const struct mfd_cell cros_ec_rtc_cells[] = {
>>   {
>>   .name = "cros-ec-rtc",
>> @@ -442,6 +509,9 @@ static int ec_device_probe(struct platform_device *pdev)
>>   /* check whether this EC is a sensor hub. */
>>   if (cros_ec_check_features(ec, EC_FEATURE_MOTION_SENSE))
>>   cros_ec_sensors_register(ec);
>> + else
>> + /* Workaroud for older EC firmware */
>> + cros_ec_accel_legacy_register(ec);
>>
>>   /* Check whether this EC instance has RTC host command support */
>>   if (cros_ec_check_features(ec, EC_FEATURE_RTC)) {
>
> --
> Lee Jones [李琼斯]
> Linaro Services Technical Lead
> Linaro.org │ Open source software for ARM SoCs
> Follow Linaro: Facebook | Twitter | Blog


Re: [GIT PULL] Kernel lockdown for secure boot

2018-04-04 Thread David Howells
Andy Lutomirski  wrote:

> As far as I can tell, what's really going on here is that there's a
> significant contingent here that wants to prevent Linux from
> chainloading something that isn't Linux.

You have completely the wrong end of the stick.  No one has said that or even
implied that.  You are alleging dishonesty on our part.

What we *have* said is that *if* we want to pass the secure boot state across
kexec, then we have to make sure that:

 (1) no one tampers with the intermediate kernel between boot and kexec
 otherwise the secure boot state is effectively invalidated, and

 (2) the image that gets kexec'ed is trusted.

Remember: you cannot know (2) if you don't have (1).

And if someone tampers with the aim of breaking, say, Windows, then someone,
e.g.  Microsoft, might blacklist the shim.

David


Re: [lustre-devel] [PATCH 11/17] staging: lustre: libcfs: discard cfs_time_shift().

2018-04-04 Thread Dilger, Andreas
On Apr 2, 2018, at 16:26, NeilBrown  wrote:
> On Mon, Apr 02 2018, Dilger, Andreas wrote:
>> On Mar 30, 2018, at 13:02, James Simmons  wrote:
 This function simply multiplies by HZ and adds jiffies.
 This is simple enough to be opencoded, and doing so
 makes the code easier to read.
 
 Same for cfs_time_shift_64()
>>> 
>>> Reviewed-by: James Simmons 
>> 
>> Hmm, I thought we were trying to get rid of direct HZ usage in modules,
>> because of tickless systems, and move to e.g. msecs_to_jiffies() or similar?
> 
> Are we?  I hadn't heard but I could easily have missed it.
> Documentation/scheduler/completion.txt does say
> 
>Timeouts are preferably calculated with
>msecs_to_jiffies() or usecs_to_jiffies().
> 
> but is isn't clear what they are preferred to.  Do you remember where
> you heard? or have a reference?

I thought the goal was to avoid hard-coding the HZ value so that kernels
could have variable clock rates in the future.

Cheers, Andreas

> $ git grep ' \* *HZ'  |wc
>   2244   15679  170016
> $ git grep msecs_to_jiffies | wc
>   3301   13151  276725
> 
> so msecs_to_jiffies is slightly more popular than "* HZ" (even if you add
> in "HZ *").  But that could just be a preference for using milliseconds
> over using seconds.
> 
> $ git grep msecs_to_jiffies   | grep -c '[0-9]000'
> 587
> 
> so there are only 587 places that msecs_to_jiffies is clearly used in
> place of multiplying by HZ.
> 
> If we were to pursue this, I would want to add secs_to_jiffies() to
> include/linux/jiffies.h and use that.
> 
> Thanks,
> NeilBrown
> ___
> lustre-devel mailing list
> lustre-de...@lists.lustre.org
> http://lists.lustre.org/listinfo.cgi/lustre-devel-lustre.org

Cheers, Andreas
--
Andreas Dilger
Lustre Principal Architect
Intel Corporation









KVM: Questions regarding page_track and memory slots

2018-04-04 Thread David Hildenbrand
Hi,

I can see that we allocate under x86 for each KVM memslot
"sizeof(unsigned short) * npages" for page_track.

So 1 byte for each 4096 bytes of memory slot size. This doesn't sound a
lot, but if we have very big memory slots (e.g. for NVDIMM), this can
quickly get out of hand. E.g. for 4TB, we would need 1GB. And this just
means somebody created a big memory slot, not even memory would have to
be populated in that slot.

I can see that the only user is right now kvmgt. My assumption is, that
only a fraction of all memory will be tracked.


1. Do we actually need to track on a per-page level, how many trackers
we have? Is it a valid use case that multiple users track the same page?
(e.g. can't we simply use a bitmap)

2. Is my assumption, that this is actually a sparse "bitmap" true?
Wouldn't something like a radix tree, that grows with the number of
tracked pages, be a better fit?


I am looking right now into creating big memory slots and only assigning
a portion at a time to the guest. So initially, most parts of the big
memory slot would not be accessible by the guest and would not have
backing pages. Of course, I want to minimize the involved size of the
memory slot. An alternative is growing a memory slot atomically. Of
course, the current state of page-track is also a problem for growing a
memory slot atomically (having to copy/grow the array).

Thanks or any insight.

-- 

Thanks,

David / dhildenb


Re: [PATCH v2 2/6] i2c: i2c-stm32f7: Add slave support

2018-04-04 Thread Pierre Yves MORDRET


On 04/03/2018 05:26 PM, Wolfram Sang wrote:
> On Mon, Mar 26, 2018 at 10:41:51AM +0200, Pierre Yves MORDRET wrote:
>>
>>
>> On 03/25/2018 08:16 PM, Wolfram Sang wrote:
>>> On Wed, Mar 21, 2018 at 05:48:56PM +0100, Pierre-Yves MORDRET wrote:
 This patch adds slave support for I2C controller embedded in STM32F7 SoC

 Signed-off-by: M'boumba Cedric Madianga 
 Signed-off-by: Pierre-Yves MORDRET 
>>>
>>> Looks OK from a first look. What kind of tests did you do?
>>
>> As mentioned for 10-bit, I'm using 2 I2C instances from the SoC.
>> Here are the tests I did:
>>  - Master/slave send in 7 and 10 bits
>>  - master/slave recv in 7 and 10 bits
>>  - E2PROM Read/Write
> 
> As far as I understand the code now, both instances can be master /
> slave simultanously on the same bus?
> 

Correct.


Re: [PATCH v3 6/7] arm: dts: sun8i: a33: Add the DSI-related nodes

2018-04-04 Thread Maxime Ripard
Hi,

On Thu, Mar 22, 2018 at 10:23:32AM +0800, Chen-Yu Tsai wrote:
> On Tue, Mar 6, 2018 at 9:56 PM, Maxime Ripard  
> wrote:
> > From: Maxime Ripard 
> >
> > The A33 has a MIPI-DSI block, along with its D-PHY. Let's add it in order
> > to use it in the relevant boards.
> >
> > Signed-off-by: Maxime Ripard 
> > ---
> >  arch/arm/boot/dts/sun8i-a33.dtsi | 44 +-
> >  1 file changed, 44 insertions(+)
> >
> > diff --git a/arch/arm/boot/dts/sun8i-a33.dtsi 
> > b/arch/arm/boot/dts/sun8i-a33.dtsi
> > index 50eb84fa246a..94cfa7b1bbfa 100644
> > --- a/arch/arm/boot/dts/sun8i-a33.dtsi
> > +++ b/arch/arm/boot/dts/sun8i-a33.dtsi
> > @@ -236,6 +236,11 @@
> > #address-cells = <1>;
> > #size-cells = <0>;
> > reg = <1>;
> > +
> > +   tcon0_out_dsi0: endpoint@1 {
> > +   reg = <1>;
> > +   remote-endpoint = 
> > <&dsi0_in_tcon0>;
> > +   };
> > };
> > };
> > };
> > @@ -280,6 +285,45 @@
> > #io-channel-cells = <0>;
> > };
> >
> > +   dsi0: dsi@1ca {
> 
> Nit: There's only one so you don't need the numbered suffix.

I'll fix it.

> Also, is "dsi" specific enough, or should we use "mipi-dsi"

If we were to be pedantic about it, that would even be MIPI-DSI2, but
I'm not sure it's worth it to be honest.

> > +   compatible = "allwinner,sun6i-a31-mipi-dsi";
> > +   reg = <0x01ca 0x1000>;
> > +   interrupts = ;
> > +   clocks = <&ccu CLK_BUS_MIPI_DSI>,
> > +<&ccu CLK_DSI_SCLK>;
> > +   clock-names = "bus", "mod";
> > +   resets = <&ccu RST_BUS_MIPI_DSI>;
> > +   phys = <&dphy0>;
> > +   phy-names = "dphy";
> > +   status = "disabled";
> > +
> > +   ports {
> > +   #address-cells = <1>;
> > +   #size-cells = <0>;
> > +
> > +   port@0 {
> > +   #address-cells = <1>;
> > +   #size-cells = <0>;
> > +   reg = <0>;
> > +
> > +   dsi0_in_tcon0: endpoint {
> > +   remote-endpoint = 
> > <&tcon0_out_dsi0>;
> > +   };
> > +   };
> > +   };
> > +   };
> > +
> > +   dphy0: d-phy@1ca1000 {
> 
> Same nit, and "dsi-phy" would be better.

D-PHY is one of the MIPI standards that can be used with DSI, but it
doesn't mean DSI-PHY. You also have C-PHY (that can be used with DSI
as well) and M-PHY (than can be used with UniPro and CSI3). So, no, it
really is a D-PHY controller :)

Maxime

-- 
Maxime Ripard, Bootlin (formerly Free Electrons)
Embedded Linux and Kernel engineering
https://bootlin.com


signature.asc
Description: PGP signature


Re: [PATCH 1/1] drm/i915:Do not use kfree() to free kmem_cache_alloc() return value

2018-04-04 Thread Jani Nikula
On Wed, 04 Apr 2018, Xidong Wang  wrote:
> In eb_lookup_vmas(), lut, the return value of kmem_cache_alloc(), is freed
> with kfree().I think the expected paired function is kmem_cache_free().

Agreed. But did you try to compile your patch before sending?

Fixes: d1b48c1e7184 ("drm/i915: Replace execbuf vma ht with an idr")
Cc: Chris Wilson 
Cc: Tvrtko Ursulin 
Cc:  # v4.14+

BR,
Jani.

>
> Signed-off-by: Xidong Wang 
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
> b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 8c170db..08fe476 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -728,7 +728,7 @@ static int eb_lookup_vmas(struct i915_execbuffer *eb)
>  
>   err = radix_tree_insert(handles_vma, handle, vma);
>   if (unlikely(err)) {
> - kfree(lut);
> + kmem_cache_free(lut);
>   goto err_obj;
>   }

-- 
Jani Nikula, Intel Open Source Technology Center


Re: [PATCH v2 3/6] i2c: i2c-stm32f7: Add initial SMBus protocols support

2018-04-04 Thread Pierre Yves MORDRET


On 04/03/2018 05:31 PM, Wolfram Sang wrote:
> 
 All SMBus protocols are implemented except SMBus-specific protocols.
>>>
>>> What does that mean?
>>
>> It misses SMBus Host Notification and SMBus Alert. They are almost ready but 
>> I'm
>> struggling to put them back to operational state after recent changes 
>> related to
>> SMBus Host Notification. A more "classic" interrupt base solution has been 
>> put
>> in place but I fail to use implement it in my side.
>> Another patch set is going to be delivered for these 2 commands.
> 
> This is totally fine to implement it incrementally. Please just update the
> commit message with the more detailed explanation above.
> 

Ok. I get.

>>> That is quite some complexity considering we have I2C_FUNC_SMBUS_EMUL. I
>>> don't mind, but you really want that?
>>>
>>
>> All SMBBus commands are implemented as such. I never try to emulation 
>> commands.
>> Should we use emulation SMBus commands or real commands... Don't know.
> 
> You won't see any difference on the wire. I don't know your HW. It might
> be that SMBus mode is more "automatic" and uses less interrupts. Or
> stuff like Alert or HostNotification only works in this mode. If you and
> the driver maintainers think it is worth the added complexity, I am
> fine, too.
> 
Ok. I see.
I am the maintainer. So yes I keep it as such ... with this complexity ;)


Re: [PATCH v2 char-misc 1/1] x86/hyperv: Add interrupt handler annotations

2018-04-04 Thread Greg KH
On Tue, Apr 03, 2018 at 01:59:08PM -0700, mhkelle...@gmail.com wrote:
> From: Michael Kelley 
> 
> Add standard interrupt handler annotations to
> hyperv_vector_handler().
> 
> Signed-off-by: Michael Kelley 
> ---
> Changes in v2:
> * Fixed From: line
> ---
>  arch/x86/kernel/cpu/mshyperv.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
> index 4488cf0..20f6849 100644
> --- a/arch/x86/kernel/cpu/mshyperv.c
> +++ b/arch/x86/kernel/cpu/mshyperv.c
> @@ -41,7 +41,7 @@ static void (*hv_stimer0_handler)(void);
>  static void (*hv_kexec_handler)(void);
>  static void (*hv_crash_handler)(struct pt_regs *regs);
>  
> -void hyperv_vector_handler(struct pt_regs *regs)
> +__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)

What bug does this solve?  What is wrong with the existing markings?
What does __visible and __irq_entry give us that we don't already have
and we need?

Are you really using LTO that requires this marking to prevent the code
from being removed?

thanks,

greg k-h


[PATCH 1/3] adp5061: New driver for ADP5061 I2C battery charger

2018-04-04 Thread Stefan Popa
This patch adds basic support for Analog Devices I2C programmable linear
battery charger.

With this driver, some parameters can be read and configured such as:
* trickle charge current level
* trickle charge voltage threshold
* weak charge threshold
* constant current
* constant charge voltage limit
* battery full
* input current limit
* charger status
* battery status
* termination current

Datasheet:
http://www.analog.com/media/en/technical-documentation/data-sheets/ADP5061.pdf

Signed-off-by: Stefan Popa 
---
 .../devicetree/bindings/power/supply/adp5061.txt   |  17 +
 MAINTAINERS|   8 +
 drivers/power/supply/Kconfig   |  11 +
 drivers/power/supply/Makefile  |   1 +
 drivers/power/supply/adp5061.c | 745 +
 5 files changed, 782 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/power/supply/adp5061.txt
 create mode 100644 drivers/power/supply/adp5061.c

diff --git a/Documentation/devicetree/bindings/power/supply/adp5061.txt 
b/Documentation/devicetree/bindings/power/supply/adp5061.txt
new file mode 100644
index 000..7447446
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/supply/adp5061.txt
@@ -0,0 +1,17 @@
+Analog Devices ADP5061 Programmable Linear Battery Charger Driver
+
+Required properties:
+  - compatible:should be "adi,adp5061"
+  - reg:   i2c address of the device
+
+The node for this driver must be a child node of a I2C controller, hence
+all mandatory properties described in
+Documentation/devicetree/bindings/i2c/i2c.txt
+must be specified.
+
+Example:
+
+   adp5061@14 {
+   compatible = "adi,adp5061";
+   reg = <0x14>;
+   };
diff --git a/MAINTAINERS b/MAINTAINERS
index 3bdc260..a9ca73b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -797,6 +797,14 @@ L: linux-me...@vger.kernel.org
 S: Maintained
 F: drivers/media/i2c/ad9389b*
 
+ANALOG DEVICES INC ADP5061 DRIVER
+M: Stefan Popa 
+L: linux...@vger.kernel.org
+W: http://ez.analog.com/community/linux-device-drivers
+S: Supported
+F: Documentation/devicetree/bindings/power/supply/adp5061.txt
+F: drivers/power/supply/adp5061.c
+
 ANALOG DEVICES INC ADV7180 DRIVER
 M: Lars-Peter Clausen 
 L: linux-me...@vger.kernel.org
diff --git a/drivers/power/supply/Kconfig b/drivers/power/supply/Kconfig
index 428b426..761cc52 100644
--- a/drivers/power/supply/Kconfig
+++ b/drivers/power/supply/Kconfig
@@ -75,6 +75,17 @@ config BATTERY_88PM860X
help
  Say Y here to enable battery monitor for Marvell 88PM860x chip.
 
+config CHARGER_ADP5061
+   tristate "ADP5061 battery charger driver"
+   depends on I2C
+   select REGMAP_I2C
+   help
+ Say Y here to enable support for the ADP5061 standalone battery
+ charger.
+
+ This driver can be build as a module. If so, the module will be
+ called adp5061.
+
 config BATTERY_ACT8945A
tristate "Active-semi ACT8945A charger driver"
depends on MFD_ACT8945A || COMPILE_TEST
diff --git a/drivers/power/supply/Makefile b/drivers/power/supply/Makefile
index e83aa84..71b1398 100644
--- a/drivers/power/supply/Makefile
+++ b/drivers/power/supply/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_WM8350_POWER)+= wm8350_power.o
 obj-$(CONFIG_TEST_POWER)   += test_power.o
 
 obj-$(CONFIG_BATTERY_88PM860X) += 88pm860x_battery.o
+obj-$(CONFIG_CHARGER_ADP5061)  += adp5061.o
 obj-$(CONFIG_BATTERY_ACT8945A) += act8945a_charger.o
 obj-$(CONFIG_BATTERY_AXP20X)   += axp20x_battery.o
 obj-$(CONFIG_CHARGER_AXP20X)   += axp20x_ac_power.o
diff --git a/drivers/power/supply/adp5061.c b/drivers/power/supply/adp5061.c
new file mode 100644
index 000..c00a02e
--- /dev/null
+++ b/drivers/power/supply/adp5061.c
@@ -0,0 +1,745 @@
+/*
+ * ADP5061 I2C Programmable Linear Battery Charger
+ *
+ * Copyright 2018 Analog Devices Inc.
+ *
+ * Licensed under the GPL-2.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* ADP5061 registers definition */
+#define ADP5061_ID 0x00
+#define ADP5061_REV0x01
+#define ADP5061_VINX_SET   0x02
+#define ADP5061_TERM_SET   0x03
+#define ADP5061_CHG_CURR   0x04
+#define ADP5061_VOLTAGE_TH 0x05
+#define ADP5061_TIMER_SET  0x06
+#define ADP5061_FUNC_SET_1 0x07
+#define ADP5061_FUNC_SET_2 0x08
+#define ADP5061_INT_EN 0x09
+#define ADP5061_INT_ACT0x0A
+#define ADP5061_CHG_STATUS_1   0x0B
+#define ADP5061_CHG_STATUS_2   0x0C
+#define ADP5061_FAULT  0x0D
+#define ADP5061_BATTERY_SHORT  0x10
+#define ADP5061_IEND   0x11
+
+/* ADP5061_VINX_SET */
+#define ADP5061_VINX_SET_ILIM_MSK  GENMASK(3, 0)
+#d

[PATCH 2/3] adp5061: Add support for battery charging enable

2018-04-04 Thread Stefan Popa
This patch adds the option to enable/disable battery charging. This
option is not configurable via the power_supply properties, therefore,
access via sysfs was provided to examine and modify this attribute on the
fly.

Signed-off-by: Stefan Popa 
---
 .../ABI/testing/sysfs-class-power-adp5061  | 10 
 MAINTAINERS|  1 +
 drivers/power/supply/adp5061.c | 62 ++
 3 files changed, 73 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-class-power-adp5061

diff --git a/Documentation/ABI/testing/sysfs-class-power-adp5061 
b/Documentation/ABI/testing/sysfs-class-power-adp5061
new file mode 100644
index 000..0d056aa
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-power-adp5061
@@ -0,0 +1,10 @@
+What: /sys/class/power_supply/adp5061/charging_enabled
+Description:
+   Enable/disable battery charging.
+
+   The ADP5061 charging function can be enabled by setting
+   this attribute to 1. See device datasheet for details.
+
+   Valid values:
+   - 1: enabled
+   - 0: disabled
diff --git a/MAINTAINERS b/MAINTAINERS
index a9ca73b..9271246 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -804,6 +804,7 @@ W:  http://ez.analog.com/community/linux-device-drivers
 S: Supported
 F: Documentation/devicetree/bindings/power/supply/adp5061.txt
 F: drivers/power/supply/adp5061.c
+F: Documentation/ABI/testing/sysfs-class-power-adp5061
 
 ANALOG DEVICES INC ADV7180 DRIVER
 M: Lars-Peter Clausen 
diff --git a/drivers/power/supply/adp5061.c b/drivers/power/supply/adp5061.c
index c00a02e..7cd2e67 100644
--- a/drivers/power/supply/adp5061.c
+++ b/drivers/power/supply/adp5061.c
@@ -79,6 +79,10 @@
 #define ADP5061_IEND_IEND_MSK  GENMASK(7, 5)
 #define ADP5061_IEND_IEND_MODE(x)  (((x) & 0x07) << 5)
 
+/* ADP5061_FUNC_SET_1 */
+#define ADP5061_FUNC_SET_1_EN_CHG_MSK  BIT(0)
+#define ADP5061_FUNC_SET_1_EN_CHG_MODE(x)  (((x) & 0x01) << 0)
+
 #define ADP5061_NO_BATTERY 0x01
 #define ADP5061_ICHG_MAX   1300 // mA
 
@@ -692,11 +696,63 @@ static const struct power_supply_desc adp5061_desc = {
.num_properties = ARRAY_SIZE(adp5061_props),
 };
 
+static int charging_enabled_show(struct device *dev,
+struct device_attribute *attr,
+char *buf)
+{
+   struct power_supply *psy = dev_get_drvdata(dev);
+   struct adp5061_state *st = power_supply_get_drvdata(psy);
+   unsigned int regval;
+   int ret;
+
+   ret = regmap_read(st->regmap, ADP5061_FUNC_SET_1, ®val);
+   if (ret < 0)
+   return ret;
+
+   regval &= ADP5061_FUNC_SET_1_EN_CHG_MSK;
+   return sprintf(buf, "%d\n", regval);
+}
+
+static int charging_enabled_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+   struct power_supply *psy = dev_get_drvdata(dev);
+   struct adp5061_state *st = power_supply_get_drvdata(psy);
+   u8 chg_en;
+   int ret;
+
+   ret = kstrtou8(buf, 0, &chg_en);
+   if (ret < 0)
+   return ret;
+
+   ret = regmap_update_bits(st->regmap, ADP5061_FUNC_SET_1,
+ADP5061_FUNC_SET_1_EN_CHG_MSK,
+ADP5061_FUNC_SET_1_EN_CHG_MODE(!!chg_en));
+
+   if (ret < 0)
+   return ret;
+
+   return count;
+}
+
+static DEVICE_ATTR_RW(charging_enabled);
+
+static struct attribute *adp5061_attributes[] = {
+   &dev_attr_charging_enabled.attr,
+   NULL
+};
+
+static const struct attribute_group adp5061_attr_group = {
+   .attrs = adp5061_attributes,
+};
+
 static int adp5061_probe(struct i2c_client *client,
 const struct i2c_device_id *id)
 {
struct power_supply_config psy_cfg = {};
struct adp5061_state *st;
+   int ret;
 
st = devm_kzalloc(&client->dev, sizeof(*st), GFP_KERNEL);
if (!st)
@@ -722,6 +778,12 @@ static int adp5061_probe(struct i2c_client *client,
return PTR_ERR(st->psy);
}
 
+   ret = sysfs_create_group(&st->psy->dev.kobj, &adp5061_attr_group);
+   if (ret < 0) {
+   dev_err(&client->dev, "failed to create sysfs group\n");
+   return ret;
+   }
+
return 0;
 }
 
-- 
2.7.4



[PATCH] media: dvb_frontend: fix wrong cast in compat_ioctl

2018-04-04 Thread Katsuhiro Suzuki
FE_GET_PROPERTY has always failed as following situations:
  - Use compatible ioctl
  - The array of 'struct dtv_property' has 2 or more items

This patch fixes wrong cast to a pointer 'struct dtv_property' from a
pointer of 2nd or after item of 'struct compat_dtv_property' array.

Signed-off-by: Katsuhiro Suzuki 
---
 drivers/media/dvb-core/dvb_frontend.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/media/dvb-core/dvb_frontend.c 
b/drivers/media/dvb-core/dvb_frontend.c
index 21a7d4b47e1a..e33414975065 100644
--- a/drivers/media/dvb-core/dvb_frontend.c
+++ b/drivers/media/dvb-core/dvb_frontend.c
@@ -2089,7 +2089,7 @@ static int dvb_frontend_handle_compat_ioctl(struct file 
*file, unsigned int cmd,
}
for (i = 0; i < tvps->num; i++) {
err = dtv_property_process_get(
-   fe, &getp, (struct dtv_property *)tvp + i, file);
+   fe, &getp, (struct dtv_property *)(tvp + i), file);
if (err < 0) {
kfree(tvp);
return err;
-- 
2.16.3



Re: [PATCH] usb: dwc3: of-simple: use managed and shared reset control

2018-04-04 Thread Philipp Zabel
On Wed, 2018-04-04 at 10:25 +0900, Masahiro Yamada wrote:
> 2018-04-03 19:35 GMT+09:00 Vivek Gautam :
> > 
> > 
> > On 4/3/2018 3:49 PM, Masahiro Yamada wrote:
> > > 
> > > 2018-04-03 17:46 GMT+09:00 Philipp Zabel :
> > > > 
> > > > On Tue, 2018-04-03 at 17:30 +0900, Masahiro Yamada wrote:
> > > > > 
> > > > > 2018-04-03 17:00 GMT+09:00 Philipp Zabel :
> > > > > > 
> > > > > > On Thu, 2018-03-29 at 15:07 +0900, Masahiro Yamada wrote:
> > > > > > > 
> > > > > > > This driver handles the reset control in a common manner; deassert
> > > > > > > resets before use, assert them after use.  There is no good reason
> > > > > > > why it should be exclusive.
> > > > > > 
> > > > > > Is this preemptive cleanup, or do you have hardware on the horizon 
> > > > > > that
> > > > > > shares these reset lines with other peripherals?
> > > > > 
> > > > > This patch is necessary for Socionext SoCs.
> > > > > 
> > > > > The same reset lines are shared between
> > > > > this dwc3-of_simple and other glue circuits.
> > > > 
> > > > Thanks, this is helpful information.
> > > > 
> > > > > > > Also, use devm_ for clean-up.
> > > > > > > 
> > > > > > > Signed-off-by: Masahiro Yamada 
> > > > > > > ---
> > > > > > > 
> > > > > > > CCing Philipp Zabel.
> > > > > > > I see his sob in commit 06c47e6286d5.
> > > > > > 
> > > > > > At the time I was concerned with the reset_control_array addition 
> > > > > > and
> > > > > > didn't look closely at the exclusive vs shared issue.
> > > > > > > 
> > > > > > >   drivers/usb/dwc3/dwc3-of-simple.c | 7 ++-
> > > > > > >   1 file changed, 2 insertions(+), 5 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/usb/dwc3/dwc3-of-simple.c
> > > > > > > b/drivers/usb/dwc3/dwc3-of-simple.c
> > > > > > > index e54c362..bd6ab65 100644
> > > > > > > --- a/drivers/usb/dwc3/dwc3-of-simple.c
> > > > > > > +++ b/drivers/usb/dwc3/dwc3-of-simple.c
> > > > > > > @@ -91,7 +91,7 @@ static int dwc3_of_simple_probe(struct
> > > > > > > platform_device *pdev)
> > > > > > >platform_set_drvdata(pdev, simple);
> > > > > > >simple->dev = dev;
> > > > > > > 
> > > > > > > - simple->resets =
> > > > > > > of_reset_control_array_get_optional_exclusive(np);
> > > > > > > + simple->resets =
> > > > > > > devm_reset_control_array_get_optional_shared(dev);
> > > > > > 
> > > > > >  From the usage in the driver, it does indeed look like _shared 
> > > > > > reset
> > > > > > usage is appropriate. I assume that the hardware has no need for the
> > > > > > reset to be asserted right before probe or after remove, it just
> > > > > > requires that the reset line is kept deasserted while the driver is
> > > > > > probed.
> > > > > > 
> > > > > > >if (IS_ERR(simple->resets)) {
> > > > > > >ret = PTR_ERR(simple->resets);
> > > > > > >dev_err(dev, "failed to get device resets, 
> > > > > > > err=%d\n",
> > > > > > > ret);
> > > > > > > @@ -100,7 +100,7 @@ static int dwc3_of_simple_probe(struct
> > > > > > > platform_device *pdev)
> > > > > > > 
> > > > > > >ret = reset_control_deassert(simple->resets);
> > > > > > >if (ret)
> > > > > > > - goto err_resetc_put;
> > > > > > > + return ret;
> > > > > > > 
> > > > > > >ret = dwc3_of_simple_clk_init(simple,
> > > > > > > of_count_phandle_with_args(np,
> > > > > > >"clocks",
> > > > > > > "#clock-cells"));
> > > > > > > @@ -126,8 +126,6 @@ static int dwc3_of_simple_probe(struct
> > > > > > > platform_device *pdev)
> > > > > > >   err_resetc_assert:
> > > > > > >reset_control_assert(simple->resets);
> > > > > > > 
> > > > > > > -err_resetc_put:
> > > > > > > - reset_control_put(simple->resets);
> > > > > > >return ret;
> > > > > > >   }
> > > > > > > 
> > > > > > > @@ -146,7 +144,6 @@ static int dwc3_of_simple_remove(struct
> > > > > > > platform_device *pdev)
> > > > > > >simple->num_clocks = 0;
> > > > > > > 
> > > > > > >reset_control_assert(simple->resets);
> > > > > > > - reset_control_put(simple->resets);
> > > > > > > 
> > > > > > >pm_runtime_put_sync(dev);
> > > > > > >pm_runtime_disable(dev);
> > > > > > 
> > > > > > Changing to devm_ changes the order here. Whether or not it could 
> > > > > > be a
> > > > > > problem to assert the reset only after pm_runtime_put (or 
> > > > > > potentially
> > > > > > never), I can't say. I assume this is a non-issue, but somebody who
> > > > > > knows the hardware better would have to decide.
> > > > > 
> > > > > 
> > > > > 
> > > > > I do not understand what you mean.
> > > > 
> > > > Sorry for the confusion, I have mixed up things here.
> > > > 
> > > > > Can you describe your concern in more details?
> > > > > 
> > > > > I am not touching reset_control_assert() here.
> > > > 
> > > > With the change to shared reset control, reset_control_assert
> > > > potentially does nothing, so it could be possible that
> > > > pm_

[PATCH 3/3] adp5061: Add support for charging voltage limit enable

2018-04-04 Thread Stefan Popa
This patch adds the option to activate/deactivate the charging voltage
limit. If activated, the charger prevents charging until the battery
voltage drops below the VCHG_VLIM threshold.

This option is not configurable via the power_supply properties,
therefore, access via sysfs was provided to examine and modify this
attribute on the fly.

Signed-off-by: Stefan Popa 
---
 .../ABI/testing/sysfs-class-power-adp5061  | 13 ++
 drivers/power/supply/adp5061.c | 46 ++
 2 files changed, 59 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-class-power-adp5061 
b/Documentation/ABI/testing/sysfs-class-power-adp5061
index 0d056aa..25064c1 100644
--- a/Documentation/ABI/testing/sysfs-class-power-adp5061
+++ b/Documentation/ABI/testing/sysfs-class-power-adp5061
@@ -8,3 +8,16 @@ Description:
Valid values:
- 1: enabled
- 0: disabled
+
+What: /sys/class/power_supply/adp5061/charging_vlim_enabled
+Description:
+   Enable/disable charging voltage limit
+
+   The ADP5061 charging voltage limit can be enabled by setting
+   this attribute to 1. When enabled, the charger prevents charging
   until the battery voltage drops below the VCHG_VLIM threshold.
+   See device datasheet for details.
+
+   Valid values:
+   - 1: enabled
+   - 0: disabled
diff --git a/drivers/power/supply/adp5061.c b/drivers/power/supply/adp5061.c
index 7cd2e67..5931e45 100644
--- a/drivers/power/supply/adp5061.c
+++ b/drivers/power/supply/adp5061.c
@@ -83,6 +83,10 @@
 #define ADP5061_FUNC_SET_1_EN_CHG_MSK  BIT(0)
 #define ADP5061_FUNC_SET_1_EN_CHG_MODE(x)  (((x) & 0x01) << 0)
 
+/* ADP5061_FUNC_SET_2 */
+#define ADP5061_FUNC_SET_2_EN_CHG_VLIM_MSK BIT(5)
+#define ADP5061_FUNC_SET_2_EN_CHG_VLIM_MODE(x) (((x) & 0x01) << 5)
+
 #define ADP5061_NO_BATTERY 0x01
 #define ADP5061_ICHG_MAX   1300 // mA
 
@@ -736,10 +740,52 @@ static int charging_enabled_store(struct device *dev,
return count;
 }
 
+static int charging_vlim_enabled_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+   struct power_supply *psy = dev_get_drvdata(dev);
+   struct adp5061_state *st = power_supply_get_drvdata(psy);
+   unsigned int regval;
+   int ret;
+
+   ret = regmap_read(st->regmap, ADP5061_FUNC_SET_2, ®val);
+   if (ret < 0)
+   return ret;
+
+   regval = (regval & ADP5061_FUNC_SET_2_EN_CHG_VLIM_MSK) >> 5;
+   return sprintf(buf, "%d\n", regval);
+}
+
+static int charging_vlim_enabled_store(struct device *dev,
+  struct device_attribute *attr,
+  const char *buf, size_t count)
+{
+   struct power_supply *psy = dev_get_drvdata(dev);
+   struct adp5061_state *st = power_supply_get_drvdata(psy);
+   u8 chg_vlim_en;
+   int ret;
+
+   ret = kstrtou8(buf, 0, &chg_vlim_en);
+   if (ret < 0)
+   return ret;
+
+   ret = regmap_update_bits(st->regmap, ADP5061_FUNC_SET_2,
+   ADP5061_FUNC_SET_2_EN_CHG_VLIM_MSK,
+   ADP5061_FUNC_SET_2_EN_CHG_VLIM_MODE(!!chg_vlim_en));
+
+   if (ret < 0)
+   return ret;
+
+   return count;
+}
+
 static DEVICE_ATTR_RW(charging_enabled);
+static DEVICE_ATTR_RW(charging_vlim_enabled);
 
 static struct attribute *adp5061_attributes[] = {
&dev_attr_charging_enabled.attr,
+   &dev_attr_charging_vlim_enabled.attr,
NULL
 };
 
-- 
2.7.4



Re: [PATCH v2 5/6] i2c: i2c-stm32f7: Add DMA support

2018-04-04 Thread Pierre Yves MORDRET


On 04/03/2018 05:39 PM, Wolfram Sang wrote:
> 
>> +#define STM32F7_I2C_DMA_LEN_MIN 0x1
> ...
> 
>> +if (i2c_dev->dma && f7_msg->count >= STM32F7_I2C_DMA_LEN_MIN) {
> 
> Are you using DMA for every message with a length >= 1? The setup of
> that might be more expensive than the DMA gain, if so.
> 
Well yes. I am in charge of DMA IPs as well. I2C is the only device that I had
to test DMA outside standard DMA Engine test. Quite convenient.
I believe to stress both I2C and DMA, this value was relevant.
Now I agree this value can be tuned a little bit. I might raise this threshold
in V3.


Re: linux-next: Signed-off-by missing for commit in the overlayfs tree

2018-04-04 Thread Miklos Szeredi
On Fri, Mar 30, 2018 at 3:08 AM, Stephen Rothwell  wrote:
> Hi Miklos,
>
> Commit
>
>   cbf293becfa4 ("ovl: cleanup setting OVL_INDEX")
>
> is missing a Signed-off-by from its author.

Hi Steven,

AFAIK for trivial work we are not required to get a signoff from the author.

Thanks,
Miklos


Re: [PATCH v9 09/24] mm: protect mremap() against SPF hanlder

2018-04-04 Thread Laurent Dufour


On 28/03/2018 23:21, David Rientjes wrote:
> On Wed, 28 Mar 2018, Laurent Dufour wrote:
> 
 @@ -326,7 +336,10 @@ static unsigned long move_vma(struct vm_area_struct 
 *vma,
mremap_userfaultfd_prep(new_vma, uf);
arch_remap(mm, old_addr, old_addr + old_len,
   new_addr, new_addr + new_len);
 +  if (vma != new_vma)
 +  vm_raw_write_end(vma);
}
 +  vm_raw_write_end(new_vma);
>>>
>>> Just do
>>>
>>> vm_raw_write_end(vma);
>>> vm_raw_write_end(new_vma);
>>>
>>> here.
>>
>> Are you sure ? we can have vma = new_vma done if (unlikely(err))
>>
> 
> Sorry, what I meant was do
> 
> if (vma != new_vma)
>   vm_raw_write_end(vma);
> vm_raw_write_end(new_vma);
> 
> after the conditional.  Having the locking unnecessarily embedded in the 
> conditional has been an issue in the past with other areas of core code, 
> unless you have a strong reason for it.

Unfortunately, I can't see how doing this in another way since vma = new_vma is
done in the error branch.
So releasing the VMAs outside of the conditional may lead to miss 'vma' if the
error branch is taken.

Here is the code snippet as a reminder:

new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
   &need_rmap_locks);
[...]
if (vma != new_vma)
vm_raw_write_begin(vma);
[...]
if (unlikely(err)) {
[...]
if (vma != new_vma)
vm_raw_write_end(vma);
vma = new_vma;  here we lost reference to vma
[...]
} else {
[...]
if (vma != new_vma)
vm_raw_write_end(vma);
}
vm_raw_write_end(new_vma);



Re: [PATCH V3 4/4] genirq/affinity: irq vector spread among online CPUs as far as possible

2018-04-04 Thread Thomas Gleixner
On Wed, 4 Apr 2018, Ming Lei wrote:
> On Tue, Apr 03, 2018 at 03:32:21PM +0200, Thomas Gleixner wrote:
> > On Thu, 8 Mar 2018, Ming Lei wrote:
> > > 1) before 84676c1f21 ("genirq/affinity: assign vectors to all possible 
> > > CPUs")
> > >   irq 39, cpu list 0
> > >   irq 40, cpu list 1
> > >   irq 41, cpu list 2
> > >   irq 42, cpu list 3
> > > 
> > > 2) after 84676c1f21 ("genirq/affinity: assign vectors to all possible 
> > > CPUs")
> > >   irq 39, cpu list 0-2
> > >   irq 40, cpu list 3-4,6
> > >   irq 41, cpu list 5
> > >   irq 42, cpu list 7
> > > 
> > > 3) after applying this patch against V4.15+:
> > >   irq 39, cpu list 0,4
> > >   irq 40, cpu list 1,6
> > >   irq 41, cpu list 2,5
> > >   irq 42, cpu list 3,7
> > 
> > That's more or less window dressing. If the device is already in use when
> > the offline CPUs get hot plugged, then the interrupts still stay on cpu 0-3
> > because the effective affinity of interrupts on X86 (and other
> > architectures) is always a single CPU.
> > 
> > So this only might move interrupts to the hotplugged CPUs when the device
> > is initialized after CPU hotplug and the actual vector allocation moves an
> > interrupt out to the higher numbered CPUs if they have less vectors
> > allocated than the lower numbered ones.
> 
> It works for blk-mq devices, such as NVMe.
> 
> Now NVMe driver creates num_possible_cpus() hw queues, and each
> hw queue is assigned one msix irq vector.
> 
> Storage is Client/Server model, that means the interrupt is only
> delivered to CPU after one IO request is submitted to hw queue and
> it is completed by this hw queue.
> 
> When CPUs is hotplugged, and there will be IO submitted from these
> CPUs, then finally IOs complete and irq events are generated from
> hw queues, and notify these submission CPU by IRQ finally.

I'm aware how that hw-queue stuff works. But that only works if the
spreading algorithm makes the interrupts affine to offline/not-present CPUs
when the block device is initialized.

In the example above:

> > >   irq 39, cpu list 0,4
> > >   irq 40, cpu list 1,6
> > >   irq 41, cpu list 2,5
> > >   irq 42, cpu list 3,7

and assumed that at driver init time only CPU 0-3 are online then the
hotplug of CPU 4-7 will not result in any interrupt delivered to CPU 4-7.

So the extra assignment to CPU 4-7 in the affinity mask has no effect
whatsoever and even if the spreading result is 'perfect' it just looks
perfect as it is not making any difference versus the original result:

> > >   irq 39, cpu list 0
> > >   irq 40, cpu list 1
> > >   irq 41, cpu list 2
> > >   irq 42, cpu list 3

Thanks,

tglx




Re: [RFC] drm/atomic+msm: add helper to implement legacy dirtyfb

2018-04-04 Thread Thomas Hellstrom

Hi,

On 04/04/2018 08:58 AM, Daniel Vetter wrote:

On Wed, Apr 4, 2018 at 12:42 AM, Rob Clark  wrote:

Add an atomic helper to implement dirtyfb support.  This is needed to
support DSI command-mode panels with x11 userspace (ie. when we can't
rely on pageflips to trigger a flush to the panel).

To signal to the driver that the async atomic update needs to
synchronize with fences, even though the fb didn't change, the
drm_atomic_state::dirty flag is added.

Signed-off-by: Rob Clark 
---
Background: there are a number of different folks working on getting
upstream kernel working on various different phones/tablets with qcom
SoC's.. many of them have command mode panels, so we kind of need a
way to support the legacy dirtyfb ioctl for x11 support.

I know there is work on a proper non-legacy atomic property for
userspace to communicate dirty-rect(s) to the kernel, so this can
be improved from triggering a full-frame flush once that is in
place.  But we kinda needa a stop-gap solution.

I had considered an in-driver solution for this, but things get a
bit tricky if userspace ands up combining dirtyfb ioctls with page-
flips, because we need to synchronize setting various CTL.FLUSH bits
with setting the CTL.START bit.  (ie. really all we need to do for
cmd mode panels is bang CTL.START, but is this ends up racing with
pageflips setting FLUSH bits, then bad things.)  The easiest soln
is to wrap this up as an atomic commit and rely on the worker to
serialize things.  Hence adding an atomic dirtyfb helper.

I guess at least the helper, with some small addition to translate
and pass-thru the dirty rect(s) is useful to the final atomic dirty-
rect property solution.  Depending on how far off that is, a stop-
gap solution could be useful.

Adding Noralf, who iirc already posted the full dirty helpers already somewhere.
-Daniel


I've asked Deepak to RFC the core changes suggested for the full dirty 
blob on dri-devel. It builds on DisplayLink's suggestion, with a simple 
helper to get to the desired coordinates.


One thing to perhaps discuss is how we would like to fit this with 
front-buffer rendering and the dirty ioctl. In the page-flip context, 
the dirty rects, like egl's swapbuffer_with_damage is a hint to restrict 
the damage region that can be fully ignored by the driver, new content 
is indicated by a new framebuffer.


We could do the same for frontbuffer rendering: Either set a dirty flag 
like you do here, or provide a content_age state member. Since we clear 
the dirty flag on state copies, I guess that would be sufficient. The 
blob rectangles would then become a hint to restrict the damage region.


Another approach would be to have the presence of dirty rects without 
framebuffer change to indicate frontbuffer rendering.


I think I like the first approach best, although it may be tempting for 
user-space apps to just set the dirty bit instead of providing the full 
damage region.


/Thomas



Re: [RESEND PATCH V4] pidns: introduce syscall translate_pid

2018-04-04 Thread Konstantin Khlebnikov

On 04.04.2018 00:51, Nagarathnam Muthusamy wrote:



On 04/03/2018 02:52 PM, Andrew Morton wrote:

On Tue, 3 Apr 2018 14:45:28 -0700 Nagarathnam Muthusamy 
 wrote:


This changelog doesn't explain what the value is to our users.  I
assume it is a performance optimization because "backward translation
requires scanning all tasks"?  If so, please show us real-world
examples of the performance benefit from this patch, and please go to
great lengths to explain to us why this optimisation is needed by our
users.

One of the usecase by Oracle database involves multiple levels of
nested pid namespaces and we require pid translation between the
levels. Discussions on the particular usecase, why any of the existing
methods was not usable happened in the following thread.

https://patchwork.kernel.org/patch/10276785/

At the end, it was agreed that this patch along with flocks will solve the
issue.

Nobody who reads this patch's changelog will know any of this.  Please
let's get all this information into the proper place.

Sure! Will resend the patch with updated change log.


I have v5 version of this proposal in work.

I've redesigned interface to be more convenient for cases where
strict race-protection isn't required and pid-ns could be referenced pid.

It has 5 arguments rather than 3 because types of references are
defined explicitly rather than magic like -1, >0, <0.
This more verbose but protects against errors like passing -1 from
failed previous syscall as argument.

kind of
translate_pid(pid, TRANSLATE_PID_FD_PIDNS, ns_fd, TRANSLATE_PID_CURRENT_PIDNS, 
0)

I'll send it today with including more detailed motivation for patch.


[PATCH v2 6/9] trace_uprobe: Support SDT markers having reference count (semaphore)

2018-04-04 Thread Ravi Bangoria
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. These markers are added by developer at
important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments which expands to couple of
instructions. In case the overhead is more, execution of it can be
omitted by runtime if() condition when no one is tracing on the marker:

if (reference_counter > 0) {
Execute marker instructions;
}

Default value of reference counter is 0. Tracer has to increment the
reference counter before tracing on a marker and decrement it when
done with the tracing.

Implement the reference counter logic in trace_uprobe, leaving core
uprobe infrastructure as is, except one new callback from uprobe_mmap()
to trace_uprobe.

trace_uprobe definition with reference counter will now be:

  :[(ref_ctr_offset)]

There are two different cases while enabling the marker,
 1. Trace existing process. In this case, find all suitable processes
and increment the reference counter in them.
 2. Enable trace before running target binary. In this case, all mmaps
will get notified to trace_uprobe and trace_uprobe will increment
the reference counter if corresponding uprobe is enabled.

At the time of disabling probes, decrement reference counter in all
existing target processes.

[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation

Note: 'reference counter' is called as 'semaphore' in original Dtrace
(or Systemtap, bcc and even in ELF) documentation and code. But the
term 'semaphore' is misleading in this context. This is just a counter
used to hold number of tracers tracing on a marker. This is not really
used for any synchronization. So we are referring it as 'reference
counter' in kernel / perf code.

Signed-off-by: Ravi Bangoria 
Signed-off-by: Fengguang Wu 
[Fengguang reported/fixed build failure in RFC patch]
---
 include/linux/uprobes.h |  10 +++
 kernel/events/uprobes.c |  16 +
 kernel/trace/trace_uprobe.c | 162 +++-
 3 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 7bd2760..2db3ed1 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -122,6 +122,8 @@ struct uprobe_map_info {
unsigned long vaddr;
 };
 
+extern void (*uprobe_mmap_callback)(struct vm_area_struct *vma);
+
 extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned 
long vaddr);
 extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, 
unsigned long vaddr);
 extern bool is_swbp_insn(uprobe_opcode_t *insn);
@@ -136,6 +138,8 @@ struct uprobe_map_info {
 extern void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, 
unsigned long end);
 extern void uprobe_start_dup_mmap(void);
 extern void uprobe_end_dup_mmap(void);
+extern void uprobe_down_write_dup_mmap(void);
+extern void uprobe_up_write_dup_mmap(void);
 extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm);
 extern void uprobe_free_utask(struct task_struct *t);
 extern void uprobe_copy_process(struct task_struct *t, unsigned long flags);
@@ -192,6 +196,12 @@ static inline void uprobe_start_dup_mmap(void)
 static inline void uprobe_end_dup_mmap(void)
 {
 }
+static inline void uprobe_down_write_dup_mmap(void)
+{
+}
+static inline void uprobe_up_write_dup_mmap(void)
+{
+}
 static inline void
 uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
 {
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 096d1e6..c691334 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1044,6 +1044,9 @@ static void build_probe_list(struct inode *inode,
spin_unlock(&uprobes_treelock);
 }
 
+/* Rightnow the only user of this is trace_uprobe. */
+void (*uprobe_mmap_callback)(struct vm_area_struct *vma);
+
 /*
  * Called from mmap_region/vma_adjust with mm->mmap_sem acquired.
  *
@@ -1056,6 +1059,9 @@ int uprobe_mmap(struct vm_area_struct *vma)
struct uprobe *uprobe, *u;
struct inode *inode;
 
+   if (uprobe_mmap_callback)
+   uprobe_mmap_callback(vma);
+
if (no_uprobe_events() || !valid_vma(vma, true))
return 0;
 
@@ -1247,6 +1253,16 @@ void uprobe_end_dup_mmap(void)
percpu_up_read(&dup_mmap_sem);
 }
 
+void uprobe_down_write_dup_mmap(void)
+{
+   percpu_down_write(&dup_mmap_sem);
+}
+
+void uprobe_up_write_dup_mmap(void)
+{
+   percpu_up_write(&dup_mmap_sem);
+}
+
 void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
 {
if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 2014f43..5582c2d 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -25,6 +25,8 @@
 #include 
 #include 
 #include 
+#in

[PATCH v2 4/9] Uprobe: Rename map_info to uprobe_map_info

2018-04-04 Thread Ravi Bangoria
map_info is very generic name, rename it to uprobe_map_info.
Renaming will help to export this structure outside of the
file.

Also rename free_map_info() to uprobe_free_map_info() and
build_map_info() to uprobe_build_map_info().

Signed-off-by: Ravi Bangoria 
Reviewed-by: Jérôme Glisse 
---
 kernel/events/uprobes.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d439c7..477dc42 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -695,28 +695,30 @@ static void delete_uprobe(struct uprobe *uprobe)
put_uprobe(uprobe);
 }
 
-struct map_info {
-   struct map_info *next;
+struct uprobe_map_info {
+   struct uprobe_map_info *next;
struct mm_struct *mm;
unsigned long vaddr;
 };
 
-static inline struct map_info *free_map_info(struct map_info *info)
+static inline struct uprobe_map_info *
+uprobe_free_map_info(struct uprobe_map_info *info)
 {
-   struct map_info *next = info->next;
+   struct uprobe_map_info *next = info->next;
mmput(info->mm);
kfree(info);
return next;
 }
 
-static struct map_info *
-build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
+static struct uprobe_map_info *
+uprobe_build_map_info(struct address_space *mapping, loff_t offset,
+ bool is_register)
 {
unsigned long pgoff = offset >> PAGE_SHIFT;
struct vm_area_struct *vma;
-   struct map_info *curr = NULL;
-   struct map_info *prev = NULL;
-   struct map_info *info;
+   struct uprobe_map_info *curr = NULL;
+   struct uprobe_map_info *prev = NULL;
+   struct uprobe_map_info *info;
int more = 0;
 
  again:
@@ -730,7 +732,7 @@ static inline struct map_info *free_map_info(struct 
map_info *info)
 * Needs GFP_NOWAIT to avoid i_mmap_rwsem recursion 
through
 * reclaim. This is optimistic, no harm done if it 
fails.
 */
-   prev = kmalloc(sizeof(struct map_info),
+   prev = kmalloc(sizeof(struct uprobe_map_info),
GFP_NOWAIT | __GFP_NOMEMALLOC | 
__GFP_NOWARN);
if (prev)
prev->next = NULL;
@@ -763,7 +765,7 @@ static inline struct map_info *free_map_info(struct 
map_info *info)
}
 
do {
-   info = kmalloc(sizeof(struct map_info), GFP_KERNEL);
+   info = kmalloc(sizeof(struct uprobe_map_info), GFP_KERNEL);
if (!info) {
curr = ERR_PTR(-ENOMEM);
goto out;
@@ -786,11 +788,11 @@ static inline struct map_info *free_map_info(struct 
map_info *info)
 register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
 {
bool is_register = !!new;
-   struct map_info *info;
+   struct uprobe_map_info *info;
int err = 0;
 
percpu_down_write(&dup_mmap_sem);
-   info = build_map_info(uprobe->inode->i_mapping,
+   info = uprobe_build_map_info(uprobe->inode->i_mapping,
uprobe->offset, is_register);
if (IS_ERR(info)) {
err = PTR_ERR(info);
@@ -828,7 +830,7 @@ static inline struct map_info *free_map_info(struct 
map_info *info)
  unlock:
up_write(&mm->mmap_sem);
  free:
-   info = free_map_info(info);
+   info = uprobe_free_map_info(info);
}
  out:
percpu_up_write(&dup_mmap_sem);
-- 
1.8.3.1



[PATCH v2 3/9] Uprobe: Move mmput() into free_map_info()

2018-04-04 Thread Ravi Bangoria
From: Oleg Nesterov 

build_map_info() has a side effect like one need to perform
mmput() when done with the mm. Add mmput() in free_map_info()
so that user does not have to call it explicitly.

Signed-off-by: Oleg Nesterov 
Signed-off-by: Ravi Bangoria 
---
 kernel/events/uprobes.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 535fd39..1d439c7 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -704,6 +704,7 @@ struct map_info {
 static inline struct map_info *free_map_info(struct map_info *info)
 {
struct map_info *next = info->next;
+   mmput(info->mm);
kfree(info);
return next;
 }
@@ -773,8 +774,11 @@ static inline struct map_info *free_map_info(struct 
map_info *info)
 
goto again;
  out:
-   while (prev)
-   prev = free_map_info(prev);
+   while (prev) {
+   info = prev;
+   prev = prev->next;
+   kfree(info);
+   }
return curr;
 }
 
@@ -824,7 +828,6 @@ static inline struct map_info *free_map_info(struct 
map_info *info)
  unlock:
up_write(&mm->mmap_sem);
  free:
-   mmput(mm);
info = free_map_info(info);
}
  out:
-- 
1.8.3.1



[PATCH v2 0/9] trace_uprobe: Support SDT markers having reference count (semaphore)

2018-04-04 Thread Ravi Bangoria
Userspace Statically Defined Tracepoints[1] are dtrace style markers
inside userspace applications. Applications like PostgreSQL, MySQL,
Pthread, Perl, Python, Java, Ruby, Node.js, libvirt, QEMU, glib etc
have these markers embedded in them. These markers are added by developer
at important places in the code. Each marker source expands to a single
nop instruction in the compiled code but there may be additional
overhead for computing the marker arguments, which expands to a couple of
instructions. If this overhead is significant, execution can be
omitted by a runtime if() condition when no one is tracing on the marker:

if (reference_counter > 0) {
Execute marker instructions;
}   

Default value of reference counter is 0. Tracer has to increment the 
reference counter before tracing on a marker and decrement it when
done with the tracing.

Currently, the perf tool has limited support for SDT markers, i.e. it
cannot trace markers surrounded by a reference counter. Also, it's
not easy to add reference counter logic in userspace tool like perf,
so basic idea for this patchset is to add reference counter logic in
the trace_uprobe infrastructure. Ex,[2]

  # cat tick.c
... 
for (i = 0; i < 100; i++) {
DTRACE_PROBE1(tick, loop1, i);
if (TICK_LOOP2_ENABLED()) {
DTRACE_PROBE1(tick, loop2, i); 
}
printf("hi: %d\n", i); 
sleep(1);
}   
... 

Here tick:loop1 is marker without reference counter where as tick:loop2
is surrounded by reference counter condition.

  # perf buildid-cache --add /tmp/tick
  # perf probe sdt_tick:loop1
  # perf probe sdt_tick:loop2

  # perf stat -e sdt_tick:loop1,sdt_tick:loop2 -- /tmp/tick
  hi: 0
  hi: 1
  hi: 2
  ^C
  Performance counter stats for '/tmp/tick':
 3  sdt_tick:loop1
 0  sdt_tick:loop2
 2.747086086 seconds time elapsed

Perf failed to record data for tick:loop2. Same experiment with this
patch series:

  # ./perf buildid-cache --add /tmp/tick
  # ./perf probe sdt_tick:loop2
  # ./perf stat -e sdt_tick:loop2 /tmp/tick
hi: 0
hi: 1
hi: 2
^C  
 Performance counter stats for '/tmp/tick':
 3  sdt_tick:loop2
   2.561851452 seconds time elapsed


Note:
 - 'reference counter' is called as 'semaphore' in original Dtrace
   (or Systemtap, bcc and even in ELF) documentation and code. But the 
   term 'semaphore' is misleading in this context. This is just a counter
   used to hold number of tracers tracing on a marker. This is not really
   used for any synchronization. So we are referring it as 'reference
   counter' in kernel / perf code.


v2 changes:
 - [PATCH v2 3/9] is new. build_map_info() has a side effect. One has
   to perform mmput() when done with the mm. Let free_map_info()
   take care of mmput() so that one does not need to worry about it.
 - [PATCH v2 6/9] sdt_update_ref_ctr(). No need to use memcpy().
   Reference counter can be directly updated using normal assignment.
 - [PATCH v2 6/9] Check valid vma is returned by sdt_find_vma() before
   incrementing / decrementing a reference counter.
 - [PATCH v2 6/9] Introduce utility functions for taking write lock on
   dup_mmap_sem. Use these functions in trace_uprobe to avoid a race with
   fork / dup_mmap().
 - [PATCH v2 6/9] Don't check presence of mm in tu->sml at decrement
   time. Purpose of maintaining the list is to ensure increment happen
   only once for each {trace_uprobe,mm} tuple.
 - [PATCH v2 7/9] v1 was not removing mm from tu->sml when process
   exits and tracing is still on. This leads to a problem if same
   address gets used by new mm. Use mmu_notifier to remove such mm
   from the list. This guarantees that all mm which has been added
   to tu->sml will be removed from list either when tracing ends or
   when process goes away.
 - [PATCH v2 7/9] Patch description was misleading. Change it. Add
   more generic python example.
 - [PATCH v2 7/9] Convert sml_rw_sem into mutex sml_lock.
 - [PATCH v2 7/9] Use builtin linked list in sdt_mm_list instead of
   defining its own pointer chain.
 - Change the order of last two patches.
 - [PATCH v2 9/9] Check availability of ref_ctr_offset support by
   trace_uprobe infrastructure before using it. This ensures newer
   perf tool will still work on older kernels which does not support
   trace_uprobe with reference counter.
 - Other changes as suggested by Masami, Oleg and Steve.

v1 can be found at:
  https://lkml.org/lkml/2018/3/13/432

[1] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
[2] https://github.com/iovisor/bcc/issues/327#issuecomment-200576506
[3] https://lkml.org/lkml/2017/12/6/976


Oleg Nesterov (1):
  Uprobe: Move mmput() into free_map_info()

Ravi Bangoria (8):
  Uprobe: Export vaddr <-> offset conversion functions
  mm: Prefix vma_ to vaddr_to_offset() and offset_to_vaddr()
  Uprobe: Rename map_info to uprobe_map_info
  Uprobe: Export uprobe_map_info along with
upro

[PATCH v2 8/9] trace_uprobe/sdt: Document about reference counter

2018-04-04 Thread Ravi Bangoria
The reference counter gates the invocation of the probe. If present,
the reference count defaults to 0. The kernel needs to increment
it before tracing the probe and decrement it when done. This
is identical to the semaphore in Userspace Statically Defined
Tracepoints (USDT).

Document usage of reference counter.

Signed-off-by: Ravi Bangoria 
---
 Documentation/trace/uprobetracer.txt | 16 +---
 kernel/trace/trace.c |  2 +-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/Documentation/trace/uprobetracer.txt 
b/Documentation/trace/uprobetracer.txt
index bf526a7c..cb6751d 100644
--- a/Documentation/trace/uprobetracer.txt
+++ b/Documentation/trace/uprobetracer.txt
@@ -19,15 +19,25 @@ user to calculate the offset of the probepoint in the 
object.
 
 Synopsis of uprobe_tracer
 -
-  p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a uprobe
-  r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return uprobe (uretprobe)
-  -:[GRP/]EVENT   : Clear uprobe or uretprobe event
+  p[:[GRP/]EVENT] PATH:OFFSET[(REF_CTR_OFFSET)] [FETCHARGS]
+  r[:[GRP/]EVENT] PATH:OFFSET[(REF_CTR_OFFSET)] [FETCHARGS]
+  -:[GRP/]EVENT
+
+  p : Set a uprobe
+  r : Set a return uprobe (uretprobe)
+  - : Clear uprobe or uretprobe event
 
   GRP   : Group name. If omitted, "uprobes" is the default value.
   EVENT : Event name. If omitted, the event name is generated based
   on PATH+OFFSET.
   PATH  : Path to an executable or a library.
   OFFSET: Offset where the probe is inserted.
+  REF_CTR_OFFSET: Reference counter offset. Optional field. Reference count
+ gate the invocation of probe. If present, by default
+ reference count is 0. Kernel needs to increment it before
+ tracing the probe and decrement it when done. This is
+ identical to semaphore in Userspace Statically Defined
+ Tracepoints (USDT).
 
   FETCHARGS : Arguments. Each probe can have up to 128 args.
%REG : Fetch register REG
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 300f4ea..d211937 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4604,7 +4604,7 @@ static int tracing_trace_options_open(struct inode 
*inode, struct file *file)
   "place (kretprobe): [:][+]|\n"
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
-   "\tplace: :\n"
+  "   place (uprobe): :[(ref_ctr_offset)]\n"
 #endif
"\t args: =fetcharg[:type]\n"
"\t fetcharg: %, @, @[+|-],\n"
-- 
1.8.3.1



[PATCH v2 7/9] trace_uprobe/sdt: Fix multiple update of same reference counter

2018-04-04 Thread Ravi Bangoria
When virtual memory map for binary/library is being prepared, there is
no direct one to one mapping between mmap() and virtual memory area. Ex,
when loader loads the library, it first calls mmap(size = total_size),
where total_size is addition of size of all elf sections that are going
to be mapped. Then it splits individual vmas with new mmap()/mprotect()
calls. Loader does this to ensure it gets continuous address range for
a library. load_elf_binary() also uses similar tricks while preparing
mappings of binary.

Ex for python library,

  # strace -o out python
mmap(NULL, 2738968, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 
0x7fff9246
mmap(0x7fff926a, 327680, PROT_READ|PROT_WRITE, 
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x23) = 0x7fff926a
mprotect(0x7fff926a, 65536, PROT_READ) = 0

Here, the first mmap() maps the whole library into one region. Second
mmap() and third mprotect() split out the whole region into smaller
vmas and sets appropriate protection flags.

Now, in this case, trace_uprobe_mmap_callback() update the reference
counter twice -- by second mmap() call and by third mprotect() call --
because both regions contain reference counter.

But during de-registration, the reference counter will get decremented only
once, leaving the reference counter > 0 even if no one is tracing on that
marker.

Example with python library before patch:

# readelf -n /lib64/libpython2.7.so.1.0 | grep -A1 function__entry
  Name: function__entry
  ... Semaphore: 0x002899d8

  Probe on a marker:
# echo "p:sdt_python/function__entry 
/usr/lib64/libpython2.7.so.1.0:0x16a4d4(0x2799d8)" > uprobe_events

  Start tracing:
# perf record -e sdt_python:function__entry -a

  Run python workload:
# python
# cat /proc/`pgrep python`/maps | grep libpython
  7fffadb0-7fffadd4 r-xp  08:05 403934  
/usr/lib64/libpython2.7.so.1.0
  7fffadd4-7fffadd5 r--p 0023 08:05 403934  
/usr/lib64/libpython2.7.so.1.0
  7fffadd5-7fffadd9 rw-p 0024 08:05 403934  
/usr/lib64/libpython2.7.so.1.0

  Reference counter value has been incremented twice:
# dd if=/proc/`pgrep python`/mem bs=1 count=1 skip=$(( 0x7fffadd899d8 )) 
2>/dev/null | xxd
  000: 02   .

  Kill perf:
#
  ^C[ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.322 MB perf.data (1273 samples) ]

  Reference counter is still 1 even when no one is tracing on it:
# dd if=/proc/`pgrep python`/mem bs=1 count=1 skip=$(( 0x7fffadd899d8 )) 
2>/dev/null | xxd
  000: 01   .

Ensure increment and decrement happens in sync by keeping list of mms
in trace_uprobe. Check presence of mm in the list before incrementing
the reference counter. I.e. for each {trace_uprobe,mm} tuple, reference
counter must be incremented only by one. Note that we don't check the
presence of mm in the list at decrement time.

We consider only two cases while incrementing the reference counter:
  1. Target binary is already running when we start tracing. In this
 case, find all mm which maps region of target binary containing
 reference counter. Loop over all mms and increment the counter
 if mm is not already present in the list.
  2. Tracer is already tracing before target binary starts execution.
 In this case, all mmap(vma) gets notified to trace_uprobe.
 Trace_uprobe will update reference counter if vma->vm_mm is not
 already present in the list.

  There is also a third case which we don't consider, a fork() case.
  When process with markers forks itself, we don't explicitly increment
  the reference counter in child process because it should be taken care
  by dup_mmap(). We also don't add the child mm in the list. This is
  fine because we don't check presence of mm in the list at decrement
  time.

After patch:

  Start perf record and then run python...
  Reference counter value has been incremented only once:
# dd if=/proc/`pgrep python`/mem bs=1 count=1 skip=$(( 0x7fff9cbf99d8 )) 
2>/dev/null | xxd
  000: 01   .

  Kill perf:
#
  ^C[ perf record: Woken up 1 times to write data ]
  [ perf record: Captured and wrote 0.364 MB perf.data (1427 samples) ]

  Reference counter is reset to 0:
# dd if=/proc/`pgrep python`/mem bs=1 count=1 skip=$(( 0x7fff9cbb99d8 )) 
2>/dev/null | xxd
  000: 00   .

Signed-off-by: Ravi Bangoria 
---
 kernel/trace/trace_uprobe.c | 105 ++--
 1 file changed, 102 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 5582c2d..c045174 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "trace_probe.h"
 
@@ -50,6 +51,11 

[PATCH v2 1/9] Uprobe: Export vaddr <-> offset conversion functions

2018-04-04 Thread Ravi Bangoria
These are generic functions which operates on file offset
and virtual address. Make these functions available outside
of uprobe code so that other can use it as well.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Jérôme Glisse 
---
 include/linux/mm.h  | 12 
 kernel/events/uprobes.c | 10 --
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ad06d42..95909f2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2274,6 +2274,18 @@ struct vm_unmapped_area_info {
return unmapped_area(info);
 }
 
+static inline unsigned long
+offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
+{
+   return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+}
+
+static inline loff_t
+vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
+{
+   return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
+}
+
 /* truncate.c */
 extern void truncate_inode_pages(struct address_space *, loff_t);
 extern void truncate_inode_pages_range(struct address_space *,
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ce6848e..bd6f230 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -130,16 +130,6 @@ static bool valid_vma(struct vm_area_struct *vma, bool 
is_register)
return vma->vm_file && (vma->vm_flags & flags) == VM_MAYEXEC;
 }
 
-static unsigned long offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
-{
-   return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
-}
-
-static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
-{
-   return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
-}
-
 /**
  * __replace_page - replace page in vma by new page.
  * based on replace_page in mm/ksm.c
-- 
1.8.3.1



Re: [PATCH] drm/imx: Remove last traces of struct imx_drm_crtc

2018-04-04 Thread Philipp Zabel
On Tue, 2018-04-03 at 20:56 -0300, Fabio Estevam wrote:
> On Tue, Mar 27, 2018 at 6:39 PM, Leonard Crestez
>  wrote:
> > When the definition of this struct was removed a forward declaration and an
> > unused struct member were still left around. Remove them because they serve
> > no purpose.
> > 
> > Fixes 44b460cfe554 ("drm: imx: remove struct imx_drm_crtc and 
> > imx_drm_crtc_helper_funcs")
> > 
> > Signed-off-by: Leonard Crestez 
> 
> Reviewed-by: Fabio Estevam 

Thank you, applied to imx-drm/next.

regards
Philipp


[PATCH v2 9/9] perf probe: Support SDT markers having reference counter (semaphore)

2018-04-04 Thread Ravi Bangoria
With this, perf buildid-cache will save SDT markers with reference
counter in probe cache. Perf probe will be able to probe markers
having reference counter. Ex,

  # readelf -n /tmp/tick | grep -A1 loop2
Name: loop2
... Semaphore: 0x10020036

  # ./perf buildid-cache --add /tmp/tick
  # ./perf probe sdt_tick:loop2
  # ./perf stat -e sdt_tick:loop2 /tmp/tick
hi: 0
hi: 1
hi: 2
^C
 Performance counter stats for '/tmp/tick':
 3  sdt_tick:loop2
   2.561851452 seconds time elapsed

Signed-off-by: Ravi Bangoria 
---
 tools/perf/util/probe-event.c | 18 ++---
 tools/perf/util/probe-event.h |  1 +
 tools/perf/util/probe-file.c  | 34 ++--
 tools/perf/util/probe-file.h  |  1 +
 tools/perf/util/symbol-elf.c  | 46 ---
 tools/perf/util/symbol.h  |  7 +++
 6 files changed, 86 insertions(+), 21 deletions(-)

diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c
index e1dbc98..b3a1330 100644
--- a/tools/perf/util/probe-event.c
+++ b/tools/perf/util/probe-event.c
@@ -1832,6 +1832,12 @@ int parse_probe_trace_command(const char *cmd, struct 
probe_trace_event *tev)
tp->offset = strtoul(fmt2_str, NULL, 10);
}
 
+   if (tev->uprobes) {
+   fmt2_str = strchr(p, '(');
+   if (fmt2_str)
+   tp->ref_ctr_offset = strtoul(fmt2_str + 1, NULL, 0);
+   }
+
tev->nargs = argc - 2;
tev->args = zalloc(sizeof(struct probe_trace_arg) * tev->nargs);
if (tev->args == NULL) {
@@ -2054,15 +2060,21 @@ char *synthesize_probe_trace_command(struct 
probe_trace_event *tev)
}
 
/* Use the tp->address for uprobes */
-   if (tev->uprobes)
+   if (tev->uprobes) {
err = strbuf_addf(&buf, "%s:0x%lx", tp->module, tp->address);
-   else if (!strncmp(tp->symbol, "0x", 2))
+   if (uprobe_ref_ctr_is_supported() &&
+   tp->ref_ctr_offset &&
+   err >= 0)
+   err = strbuf_addf(&buf, "(0x%lx)", tp->ref_ctr_offset);
+   } else if (!strncmp(tp->symbol, "0x", 2)) {
/* Absolute address. See try_to_find_absolute_address() */
err = strbuf_addf(&buf, "%s%s0x%lx", tp->module ?: "",
  tp->module ? ":" : "", tp->address);
-   else
+   } else {
err = strbuf_addf(&buf, "%s%s%s+%lu", tp->module ?: "",
tp->module ? ":" : "", tp->symbol, tp->offset);
+   }
+
if (err)
goto error;
 
diff --git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h
index 45b14f0..15a98c3 100644
--- a/tools/perf/util/probe-event.h
+++ b/tools/perf/util/probe-event.h
@@ -27,6 +27,7 @@ struct probe_trace_point {
char*symbol;/* Base symbol */
char*module;/* Module name */
unsigned long   offset; /* Offset from symbol */
+   unsigned long   ref_ctr_offset; /* SDT reference counter offset */
unsigned long   address;/* Actual address of the trace point */
boolretprobe;   /* Return probe flag */
 };
diff --git a/tools/perf/util/probe-file.c b/tools/perf/util/probe-file.c
index 4ae1123..ca0e524 100644
--- a/tools/perf/util/probe-file.c
+++ b/tools/perf/util/probe-file.c
@@ -697,8 +697,16 @@ int probe_cache__add_entry(struct probe_cache *pcache,
 #ifdef HAVE_GELF_GETNOTE_SUPPORT
 static unsigned long long sdt_note__get_addr(struct sdt_note *note)
 {
-   return note->bit32 ? (unsigned long long)note->addr.a32[0]
-: (unsigned long long)note->addr.a64[0];
+   return note->bit32 ?
+   (unsigned long long)note->addr.a32[SDT_NOTE_IDX_LOC] :
+   (unsigned long long)note->addr.a64[SDT_NOTE_IDX_LOC];
+}
+
+static unsigned long long sdt_note__get_ref_ctr_offset(struct sdt_note *note)
+{
+   return note->bit32 ?
+   (unsigned long long)note->addr.a32[SDT_NOTE_IDX_REFCTR] :
+   (unsigned long long)note->addr.a64[SDT_NOTE_IDX_REFCTR];
 }
 
 static const char * const type_to_suffix[] = {
@@ -776,14 +784,21 @@ static char *synthesize_sdt_probe_command(struct sdt_note 
*note,
 {
struct strbuf buf;
char *ret = NULL, **args;
-   int i, args_count;
+   int i, args_count, err;
+   unsigned long long ref_ctr_offset;
 
if (strbuf_init(&buf, 32) < 0)
return NULL;
 
-   if (strbuf_addf(&buf, "p:%s/%s %s:0x%llx",
-   sdtgrp, note->name, pathname,
-   sdt_note__get_addr(note)) < 0)
+   err = strbuf_addf(&buf, "p:%s/%s %s:0x%llx",
+   sdtgrp, note->name, pathname,
+   sdt_note__get_addr(note));
+
+   ref_ctr_offset = sdt_note__get_ref_ctr_offset(note);
+ 

[PATCH v2 2/9] mm: Prefix vma_ to vaddr_to_offset() and offset_to_vaddr()

2018-04-04 Thread Ravi Bangoria
Make function names more meaningful by adding vma_ prefix
to them.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Jérôme Glisse 
---
 include/linux/mm.h  |  4 ++--
 kernel/events/uprobes.c | 14 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 95909f2..d7ee526 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2275,13 +2275,13 @@ struct vm_unmapped_area_info {
 }
 
 static inline unsigned long
-offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
+vma_offset_to_vaddr(struct vm_area_struct *vma, loff_t offset)
 {
return vma->vm_start + offset - ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 }
 
 static inline loff_t
-vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
+vma_vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr)
 {
return ((loff_t)vma->vm_pgoff << PAGE_SHIFT) + (vaddr - vma->vm_start);
 }
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index bd6f230..535fd39 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -748,7 +748,7 @@ static inline struct map_info *free_map_info(struct 
map_info *info)
curr = info;
 
info->mm = vma->vm_mm;
-   info->vaddr = offset_to_vaddr(vma, offset);
+   info->vaddr = vma_offset_to_vaddr(vma, offset);
}
i_mmap_unlock_read(mapping);
 
@@ -807,7 +807,7 @@ static inline struct map_info *free_map_info(struct 
map_info *info)
goto unlock;
 
if (vma->vm_start > info->vaddr ||
-   vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
+   vma_vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
goto unlock;
 
if (is_register) {
@@ -977,7 +977,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct 
mm_struct *mm)
uprobe->offset >= offset + vma->vm_end - vma->vm_start)
continue;
 
-   vaddr = offset_to_vaddr(vma, uprobe->offset);
+   vaddr = vma_offset_to_vaddr(vma, uprobe->offset);
err |= remove_breakpoint(uprobe, mm, vaddr);
}
up_read(&mm->mmap_sem);
@@ -1023,7 +1023,7 @@ static void build_probe_list(struct inode *inode,
struct uprobe *u;
 
INIT_LIST_HEAD(head);
-   min = vaddr_to_offset(vma, start);
+   min = vma_vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
 
spin_lock(&uprobes_treelock);
@@ -1076,7 +1076,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
if (!fatal_signal_pending(current) &&
filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
-   unsigned long vaddr = offset_to_vaddr(vma, 
uprobe->offset);
+   unsigned long vaddr = vma_offset_to_vaddr(vma, 
uprobe->offset);
install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
}
put_uprobe(uprobe);
@@ -1095,7 +1095,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
 
inode = file_inode(vma->vm_file);
 
-   min = vaddr_to_offset(vma, start);
+   min = vma_vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
 
spin_lock(&uprobes_treelock);
@@ -1730,7 +1730,7 @@ static struct uprobe *find_active_uprobe(unsigned long 
bp_vaddr, int *is_swbp)
if (vma && vma->vm_start <= bp_vaddr) {
if (valid_vma(vma, false)) {
struct inode *inode = file_inode(vma->vm_file);
-   loff_t offset = vaddr_to_offset(vma, bp_vaddr);
+   loff_t offset = vma_vaddr_to_offset(vma, bp_vaddr);
 
uprobe = find_uprobe(inode, offset);
}
-- 
1.8.3.1



[PATCH v2 5/9] Uprobe: Export uprobe_map_info along with uprobe_{build/free}_map_info()

2018-04-04 Thread Ravi Bangoria
Given the file(inode) and offset, build_map_info() finds all
existing mm that map the portion of file containing offset.

Exporting these functions and data structure will help to use
them in other set of files.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Jérôme Glisse 
---
 include/linux/uprobes.h |  9 +
 kernel/events/uprobes.c | 14 +++---
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h
index 0a294e9..7bd2760 100644
--- a/include/linux/uprobes.h
+++ b/include/linux/uprobes.h
@@ -109,12 +109,19 @@ enum rp_check {
RP_CHECK_RET,
 };
 
+struct address_space;
 struct xol_area;
 
 struct uprobes_state {
struct xol_area *xol_area;
 };
 
+struct uprobe_map_info {
+   struct uprobe_map_info *next;
+   struct mm_struct *mm;
+   unsigned long vaddr;
+};
+
 extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned 
long vaddr);
 extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, 
unsigned long vaddr);
 extern bool is_swbp_insn(uprobe_opcode_t *insn);
@@ -149,6 +156,8 @@ struct uprobes_state {
 extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs);
 extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr,
 void *src, unsigned long len);
+extern struct uprobe_map_info *uprobe_free_map_info(struct uprobe_map_info 
*info);
+extern struct uprobe_map_info *uprobe_build_map_info(struct address_space 
*mapping, loff_t offset, bool is_register);
 #else /* !CONFIG_UPROBES */
 struct uprobes_state {
 };
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 477dc42..096d1e6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -695,14 +695,7 @@ static void delete_uprobe(struct uprobe *uprobe)
put_uprobe(uprobe);
 }
 
-struct uprobe_map_info {
-   struct uprobe_map_info *next;
-   struct mm_struct *mm;
-   unsigned long vaddr;
-};
-
-static inline struct uprobe_map_info *
-uprobe_free_map_info(struct uprobe_map_info *info)
+struct uprobe_map_info *uprobe_free_map_info(struct uprobe_map_info *info)
 {
struct uprobe_map_info *next = info->next;
mmput(info->mm);
@@ -710,9 +703,8 @@ struct uprobe_map_info {
return next;
 }
 
-static struct uprobe_map_info *
-uprobe_build_map_info(struct address_space *mapping, loff_t offset,
- bool is_register)
+struct uprobe_map_info *uprobe_build_map_info(struct address_space *mapping,
+ loff_t offset, bool is_register)
 {
unsigned long pgoff = offset >> PAGE_SHIFT;
struct vm_area_struct *vma;
-- 
1.8.3.1



Re: [PATCH] usbip: vhc_hcd: prevent module being removed while device are attached

2018-04-04 Thread Oliver Neukum
Am Dienstag, den 03.04.2018, 09:56 -0600 schrieb Shuah Khan:
> This is a virtual device associated to a real physical device on a different
> system. My concern is that if the module gets removed accidentally then it
> could disrupt access to the remote device. The remote nature of the device
> with several players involved makes this scenario a bit more complex than

Hi,

you would doubtlessly lose connection to that device. Yet you would
also lose connections if you down your network. You need to be root
to unload a module. You could overwrite your root filesystems or flash
your firmware. In general we cannot and don't try to protect root
from such accidents.

Regards
Oliver



Re: [PATCH] drivers: gpio: pca953x: add compatibility for pcal6524 and pcal9555a

2018-04-04 Thread H. Nikolaus Schaller
Hi,

> Am 30.03.2018 um 23:39 schrieb Andy Shevchenko :
> 
> On Thu, Mar 29, 2018 at 10:29 PM, H. Nikolaus Schaller
>  wrote:
> 
>> Another issue I have fixed is that the pcal6524 has 4 registers per
>> bank while the pcal9555a has only 2.
> 
> So, you mean it's still in non-working case in Linus' tree?

Only the pcal6524 has this problem (pcal9555a should be fine).
IMHO, it does not matter since nobody is currently using it by in-tree DTS.

> Please, send patches!

Will come for review today.

> 
>> This is encoded in the address
>> constants for the new "L"-registers and has to be taken care of in
>> pca953x_read_regs_24() and pca953x_write_regs_24().
>> 
>> But I still have another problem:
>> 
>> [4.808823] pca953x 4-0022: irq 186: unsupported type 8
>> [4.814303] genirq: Setting trigger mode 8 for irq 186 failed 
>> (pca953x_irq_set_type+0x0/0xa8)
>> 
>> This comes from 
>> https://elixir.bootlin.com/linux/v4.16-rc7/source/sound/soc/codecs/ts3a227e.c#L314
>> 
>> It appears that the pca953x driver/interrupt-controller can't handle
>> IRQF_TRIGGER_LOW, but that is hard coded into the ts3a227 driver.
>> 
>> Anyone with knowledge and help about this issue?
> 
> This is easy to fix (not read datasheet for better solution though):
> emulate it via FALLING (LOW) and RISING (HIGH).

I have tested it by some ugly but simple hack (will include the patch),
but we have to consider:

* the request of LEVEL_LOW vs. EDGE_FALLING is defined in the driver of the 
attached device
  i.e. changing it there might break a lot of systems where the 
interrupt-controller can handle
  LEVEL_LOW properly
* I could not find any code that allows to overwrite it by DT (by the second 
parameter of interrupts = )
* EDGE_FALLING as a replacement of LEVEL_LOW may lose interrupts if multiple 
interrupts are
  present and the connected chip does not provide a pulse for each one
* I am not sure if the tca/pca hardware can really handle LEVEL interrupts - so 
it might need some
  emulation (repeat the irq while the input is still active after the handler 
did run)

Anyways, the pcal6524 basically works in interrupt mode and code can always be 
improved.

There is another thing I came across: the pcal6524 has a built-in pull-up/down 
option
which seems to be enabled by default, which might make trouble if there are 
external
pull-downs in the circuits. This is different to the pin-compatible tca6424.

So I am tempted to turn off this pull-up/down by probe code to stay compatible.
The right thing would be to define some pinctrl but that is beyond my 
capabilities
and capacity for this chip.

BR and thanks,
Nikolaus



Re: [PATCH] HID: input: fix battery level reporting on BT mice

2018-04-04 Thread Jiri Kosina
On Tue, 3 Apr 2018, Martin van Es wrote:

> Hi Dimitry,
> 
> I reapplied the 3 commits I had to revert earlier and applied your patch. 
> Have 
> correct battery level reading on my BT mouse back!
> 
> /sys/class/power_supply/hid-00:1f:20:fd:cb:be-battery# grep "" *
> capacity:53
> grep: device: Is a directory
> model_name:Bluetooth Mouse M336/M337/M535
> online:1
> grep: power: Is a directory
> grep: powers: Is a directory
> present:1
> scope:Device
> status:Discharging
> grep: subsystem: Is a directory
> type:Battery
> uevent:POWER_SUPPLY_NAME=hid-00:1f:20:fd:cb:be-battery
> uevent:POWER_SUPPLY_PRESENT=1
> uevent:POWER_SUPPLY_ONLINE=1
> uevent:POWER_SUPPLY_CAPACITY=53
> uevent:POWER_SUPPLY_MODEL_NAME=Bluetooth Mouse M336/M337/M535
> uevent:POWER_SUPPLY_STATUS=Discharging
> uevent:POWER_SUPPLY_SCOPE=Device
> 
> rdesc file is attached
> 
> Thx for the effort!

Can I add your Tested-by: while applying the commit?

Thanks,

-- 
Jiri Kosina
SUSE Labs



Re: [GIT PULL] arch: remove obsolete architecture ports

2018-04-04 Thread Arnd Bergmann
On Wed, Apr 4, 2018 at 10:01 AM, Pavel Machek  wrote:
> On Tue 2018-04-03 11:18:15, Geert Uytterhoeven wrote:

>> In reality, a resurrection may not be implemented as a pure revert, but as
>> the addition of a new architecture, implemented using modern features (DT,
>> CCF, ...).
>
> By insisting on new features instead of pure revert + incremental
> updates, you pretty much make sure resurection will not be possible
> :-(.

It wasn't that anybody demanded it to be that way, but rather that the
maintainer chose to do it like that, as you can see from the first version
that got posted: https://marc.info/?l=linux-arch&m=142181640803103
This is the only reference point we have, since no other architecture
ever got removed and then put back.

Also, now that the other architectures are gone, a lot of changes can
be done more easily that will be incompatible with a pure revert, so
the more time passes, the harder it will get to do that.

Some of the architectures (e.g. tile or cris) have been kept up to
date, but others had already bitrotted to the point where they were
unlikely to work on any real hardware for many relases, but a revert
could still be used as a starting point in theory.

  Arnd


Re: [PATCH 2/2] kfree_rcu() should use kfree_bulk() interface

2018-04-04 Thread Rao Shoaib



On 04/04/2018 12:16 AM, Rao Shoaib wrote:



On 04/03/2018 07:23 PM, Matthew Wilcox wrote:

On Tue, Apr 03, 2018 at 05:55:55PM -0700, Rao Shoaib wrote:

On 04/03/2018 01:58 PM, Matthew Wilcox wrote:

I think you might be better off with an IDR.  The IDR can always
contain one entry, so there's no need for this 'rbf_list_head' or
__rcu_bulk_schedule_list.  The IDR contains its first 64 entries in
an array (if that array can be allocated), so it's compatible with the
kfree_bulk() interface.

I have just familiarized myself with what IDR is by reading your 
article. If

I am incorrect please correct me.

The list and head you have pointed are only used  if the container 
can not
be allocated. That could happen with IDR as well. Note that the 
containers

are allocated at boot time and are re-used.

No, it can't happen with the IDR.  The IDR can always contain one entry
without allocating anything.  If you fail to allocate the second entry,
just free the first entry.


IDR seems to have some overhead, such as I have to specifically add the
pointer and free the ID, plus radix tree maintenance.

... what?  Adding a pointer is simply idr_alloc(), and you get back an
integer telling you which index it has.  Your data structure has its
own set of overhead.
The only overhead is a pointer that points to the head and an int to 
keep count. If I use idr, I would have to allocate an struct idr which 
is much larger. idr_alloc()/idr_destroy() operations are much more 
costly than updating two pointers. As the pointers are stored in 
slots/nodes corresponding to the id, I would  have to retrieve the 
pointers by calling idr_remove() to pass them to be freed, the 
slots/nodes would constantly be allocated and freed.


IDR is a very useful interface for allocating/managing ID's but I 
really do not see the justification for using it over here, perhaps 
you can elaborate more on the benefits and also on how I can just pass 
the array to be freed.


Shoaib

I may have mis-understood your comment. You are probably suggesting that 
I use IDR instead of allocating following containers.


+   struct  rcu_bulk_free_container *rbf_container;
+   struct  rcu_bulk_free_container *rbf_cached_container;


IDR uses radix_tree_node which allocates following two arrays. since I 
do not need any ID's why not just use the radix_tree_node directly, but 
I do not need a radix tree either, so why not just use an array. That is 
what I am doing.


void __rcu  *slots[RADIX_TREE_MAP_SIZE];
unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; ==> Not 
needed


As far as allocation failure is concerned, the allocation has to be done 
at run time. If the allocation of a container can fail, so can the 
allocation of radix_tree_node as it also requires memory.


I really do not see any advantages of using IDR. The structure I have is 
much simpler and does exactly what I need.


Shoaib




Re: Multiple generic PHY instances for DWC3 USB IP

2018-04-04 Thread Arnd Bergmann
On Wed, Apr 4, 2018 at 10:00 AM, Felipe Balbi
 wrote:
>
> Hi,
>
> Masahiro Yamada  writes:
 Each DWC3 instance is connected with
 multiple HS PHYs and multiple SS PHYs,
 depending on the number of ports.
>>>
>>> in that case, you shouldn't need dwc3 at all. A Host-only dwc3 is xHCI
>>> compliant. If you really don't have the gadget block, there's no need
>>> for you to use dwc3. Just use xhci-plat directly.
>>
>> Sorry, I was misunderstanding.
>>
>> Some of our SoCs support gadget,
>> so we need to use the dwc3 driver.
>
> fair enough. Now we need to figure out how to pass multiple PHYs to a
> multi-port dwc3 instance.
>
> Kishon, any ideas? How do you think DT should look like?

See this series from Martin Blumenstingl:

https://www.spinics.net/lists/linux-usb/msg166281.html

  Arnd


Re: [RFC] drm/atomic+msm: add helper to implement legacy dirtyfb

2018-04-04 Thread Daniel Vetter
On Wed, Apr 04, 2018 at 10:22:21AM +0200, Thomas Hellstrom wrote:
> Hi,
> 
> On 04/04/2018 08:58 AM, Daniel Vetter wrote:
> > On Wed, Apr 4, 2018 at 12:42 AM, Rob Clark  wrote:
> > > Add an atomic helper to implement dirtyfb support.  This is needed to
> > > support DSI command-mode panels with x11 userspace (ie. when we can't
> > > rely on pageflips to trigger a flush to the panel).
> > > 
> > > To signal to the driver that the async atomic update needs to
> > > synchronize with fences, even though the fb didn't change, the
> > > drm_atomic_state::dirty flag is added.
> > > 
> > > Signed-off-by: Rob Clark 
> > > ---
> > > Background: there are a number of different folks working on getting
> > > upstream kernel working on various different phones/tablets with qcom
> > > SoC's.. many of them have command mode panels, so we kind of need a
> > > way to support the legacy dirtyfb ioctl for x11 support.
> > > 
> > > I know there is work on a proprer non-legacy atomic property for
> > > userspace to communicate dirty-rect(s) to the kernel, so this can
> > > be improved from triggering a full-frame flush once that is in
> > > place.  But we kinda need a stop-gap solution.
> > > 
> > > I had considered an in-driver solution for this, but things get a
> > > bit tricky if userspace ands up combining dirtyfb ioctls with page-
> > > flips, because we need to synchronize setting various CTL.FLUSH bits
> > > with setting the CTL.START bit.  (ie. really all we need to do for
> > > cmd mode panels is bang CTL.START, but is this ends up racing with
> > > pageflips setting FLUSH bits, then bad things.)  The easiest soln
> > > is to wrap this up as an atomic commit and rely on the worker to
> > > serialize things.  Hence adding an atomic dirtyfb helper.
> > > 
> > > I guess at least the helper, with some small addition to translate
> > > and pass-thru the dirty rect(s) is useful to the final atomic dirty-
> > > rect property solution.  Depending on how far off that is, a stop-
> > > gap solution could be useful.
> > Adding Noralf, who iirc already posted the full dirty helpers 
> > somewhere.
> > -Daniel
> 
> I've asked Deepak to RFC the core changes suggested for the full dirty blob
> on dri-devel. It builds on DisplayLink's suggestion, with a simple helper to
> get to the desired coordinates.
> 
> One thing to perhaps discuss is how we would like to fit this with
> front-buffer rendering and the dirty ioctl. In the page-flip context, the
> dirty rects, like egl's swapbuffer_with_damage is a hint to restrict the
> damage region that can be fully ignored by the driver, new content is
> indicated by a new framebuffer.
> 
> We could do the same for frontbuffer rendering: Either set a dirty flag like
> you do here, or provide a content_age state member. Since we clear the dirty
> flag on state copies, I guess that would be sufficient. The blob rectangles
> would then become a hint to restrict the damage region.

I'm not entirely following here - I thought for frontbuffer rendering the
dirty rects have always just been a hint, and that the driver was always
free to re-upload the entire buffer to the screen.

And through a helper like Rob's proposing here (and have floated around in
different versions already) we'd essentially map a frontbuffer dirtyfb
call to a fake flip with dirty rect. Manual upload drivers already need to
upload the entire screen if they get a flip, since some userspace uses
that to flush out frontbuffer rendering (instead of calling dirtyfb).

So from that pov the new dirty flag is kinda not necessary imo.

> Another approach would be to have the presence of dirty rects without
> framebuffer change to indicate frontbuffer rendering.
> 
> I think I like the first approach best, although it may be tempting for
> user-space apps to just set the dirty bit instead of providing the full
> damage region.

Or I'm not following you here, because I don't quite see the difference
between dirtyfb and a flip.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


Re: x32 suspend failure in Re: linux-next: Tree for Apr 4

2018-04-04 Thread Pavel Machek
On Wed 2018-04-04 09:58:17, Rafael J. Wysocki wrote:
> On Wed, Apr 4, 2018 at 9:50 AM, Pavel Machek  wrote:
> > Hi!
> >
> >> Please do not add any v4.18 destined stuff to your linux-next included
> >> trees until after v4.17-rc1 has been released.
> >
> > On thinkpad x60, suspend does not suspend at all with this -next
> > version. Previous versions suspended/resumed fine but broke networking.
> >
> > Any ideas? I guess bisecting on next would not be easy?
> 
> Well, why would it be different from a bisect on any other git repo?

Well, v4.16-rc4 is parent of v4.16-rc6, but next-20180304 is not
parent of next-20180307.

But you are right that if I do bisect between -linus and -next, it
should work.

Anyway, does s2ram work for you in -next? Are you testing 32bit?

Best regards,
Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


Re: [PATCH V2 6/7] thermal/drivers/cpu_cooling: Introduce the cpu idle cooling driver

2018-04-04 Thread Daniel Lezcano
On 26/02/2018 05:30, Viresh Kumar wrote:

[ ... ]

 +
 +  for_each_possible_cpu(cpu) {
 +  cpumask = topology_core_cpumask(cpu);
 +
 +  cct = per_cpu_ptr(&cpuidle_cooling_tsk, cpu);
 +
 +  /*
 +   * This condition makes the first cpu belonging to the
 +   * cluster to create a cooling device and allocates
 +   * the structure. Others CPUs belonging to the same
 +   * cluster will just increment the refcount on the
 +   * cooling device structure and initialize it.
 +   */
 +  if (cpu == cpumask_first(cpumask)) {
>>>
>>> Your function still have few assumptions of cpu numbering and it will
>>> break in few cases. What if the CPUs on a big Little system (4x4) are
>>> present in this order: B L L L L B B B  ??
>>>
>>> This configuration can happen if CPUs in DT are marked as: 0-3 LITTLE,
>>> 4-7 big and a big CPU is used by the boot loader to bring up Linux.
>>
>> Ok, how can I sort it out ?
> 
> I would do something like this:
> 
> cpumask_copy(possible, cpu_possible_mask);
> 
> while (!cpumask_empty(possible)) {
> first = cpumask_first(possible);
> cpumask = topology_core_cpumask(first);
> cpumask_andnot(possible, possible, cpumask);
> 
> allocate_cooling_dev(first); //This is most of this function 
> in your patch.
> 
> while (!cpumask_empty(cpumask)) {
> temp = cpumask_first(possible);
> //rest init "temp"
> cpumask_clear_cpu(temp, cpumask);
> }
> 
> //Everything done, register cooling device for cpumask.
> }


Mmh, that sounds very complex. May be it is simpler to count the number
of cluster and initialize the idle_cdev for each cluster and then go for
this loop with the cluster cpumask.




-- 
  Linaro.org │ Open source software for ARM SoCs

Follow Linaro:   Facebook |
 Twitter |
 Blog



Re: [PATCH] HID: input: fix battery level reporting on BT mice

2018-04-04 Thread Martin van Es
On Wednesday, April 4, 2018 10:33:16 AM CEST Jiri Kosina wrote:
> Can I add your Tested-by: while applying the commit?

That's ok.

Best regards,
Martin


[PATCH v9 02/10] sched: idle: Do not stop the tick upfront in the idle loop

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

Push the decision whether or not to stop the tick somewhat deeper
into the idle loop.

Stopping the tick upfront leads to unpleasant outcomes in case the
idle governor doesn't agree with the nohz code on the duration of the
upcoming idle period.  Specifically, if the tick has been stopped and
the idle governor predicts short idle, the situation is bad regardless
of whether or not the prediction is accurate.  If it is accurate, the
tick has been stopped unnecessarily which means excessive overhead.
If it is not accurate, the CPU is likely to spend too much time in
the (shallow, because short idle has been predicted) idle state
selected by the governor [1].

As the first step towards addressing this problem, change the code
to make the tick stopping decision inside of the loop in do_idle().
In particular, do not stop the tick in the cpu_idle_poll() code path.
Also don't do that in tick_nohz_irq_exit() which doesn't really have
enough information on whether or not to stop the tick.

Link: https://marc.info/?l=linux-pm&m=150116085925208&w=2 # [1]
Link: 
https://tu-dresden.de/zih/forschung/ressourcen/dateien/projekte/haec/powernightmares.pdf
Suggested-by: Frederic Weisbecker 
Signed-off-by: Rafael J. Wysocki 
Reviewed-by: Frederic Weisbecker 
---

v8 -> v9:
 * No changes in the patch.
 * Tag from Frederic.

---
 include/linux/tick.h |2 ++
 kernel/sched/idle.c  |9 ++---
 kernel/time/tick-sched.c |   26 ++
 3 files changed, 26 insertions(+), 11 deletions(-)

Index: linux-pm/kernel/sched/idle.c
===
--- linux-pm.orig/kernel/sched/idle.c
+++ linux-pm/kernel/sched/idle.c
@@ -221,13 +221,13 @@ static void do_idle(void)
 
__current_set_polling();
tick_nohz_idle_enter();
-   tick_nohz_idle_stop_tick_protected();
 
while (!need_resched()) {
check_pgt_cache();
rmb();
 
if (cpu_is_offline(cpu)) {
+   tick_nohz_idle_stop_tick_protected();
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
@@ -241,10 +241,13 @@ static void do_idle(void)
 * broadcast device expired for us, we don't want to go deep
 * idle as we know that the IPI is going to arrive right away.
 */
-   if (cpu_idle_force_poll || tick_check_broadcast_expired())
+   if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+   tick_nohz_idle_restart_tick();
cpu_idle_poll();
-   else
+   } else {
+   tick_nohz_idle_stop_tick();
cpuidle_idle_call();
+   }
arch_cpu_idle_exit();
}
 
Index: linux-pm/kernel/time/tick-sched.c
===
--- linux-pm.orig/kernel/time/tick-sched.c
+++ linux-pm/kernel/time/tick-sched.c
@@ -984,12 +984,10 @@ void tick_nohz_irq_exit(void)
 {
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
 
-   if (ts->inidle) {
+   if (ts->inidle)
tick_nohz_start_idle(ts);
-   __tick_nohz_idle_stop_tick(ts);
-   } else {
+   else
tick_nohz_full_update_tick(ts);
-   }
 }
 
 /**
@@ -1050,6 +1048,20 @@ static void tick_nohz_account_idle_ticks
 #endif
 }
 
+static void __tick_nohz_idle_restart_tick(struct tick_sched *ts, ktime_t now)
+{
+   tick_nohz_restart_sched_tick(ts, now);
+   tick_nohz_account_idle_ticks(ts);
+}
+
+void tick_nohz_idle_restart_tick(void)
+{
+   struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+   if (ts->tick_stopped)
+   __tick_nohz_idle_restart_tick(ts, ktime_get());
+}
+
 /**
  * tick_nohz_idle_exit - restart the idle tick from the idle task
  *
@@ -1074,10 +1086,8 @@ void tick_nohz_idle_exit(void)
if (ts->idle_active)
tick_nohz_stop_idle(ts, now);
 
-   if (ts->tick_stopped) {
-   tick_nohz_restart_sched_tick(ts, now);
-   tick_nohz_account_idle_ticks(ts);
-   }
+   if (ts->tick_stopped)
+   __tick_nohz_idle_restart_tick(ts, now);
 
local_irq_enable();
 }
Index: linux-pm/include/linux/tick.h
===
--- linux-pm.orig/include/linux/tick.h
+++ linux-pm/include/linux/tick.h
@@ -115,6 +115,7 @@ enum tick_dep_bits {
 extern bool tick_nohz_enabled;
 extern int tick_nohz_tick_stopped(void);
 extern void tick_nohz_idle_stop_tick(void);
+extern void tick_nohz_idle_restart_tick(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
@@ -135,6 +136,7 @@ static inline void tick_nohz_idle_stop_t
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stoppe

[PATCH v9 00/10] sched/cpuidle: Idle loop rework

2018-04-04 Thread Rafael J. Wysocki
Hi All,

Thanks a lot for the feedback so far!

For the motivation/summary, please refer to the BZ entry at

 https://bugzilla.kernel.org/show_bug.cgi?id=199227

created for collecting information related to this patch series.  Some v7.3
testing results from Len and Doug are in there already.

The testing so far shows significant idle power improvements, both in terms of
reducing the average idle power (about 10% on some systems) and in terms of
reducing the idle power noise (in the vast majority of cases, with this series
applied the idle power is mostly stable around the power floor of the system).
The average power is also reduced in some non-idle workloads and there are
some performance improvements in them.

It also is reported that the series generally addresses the problem it has been
motivated by (ie. the "powernightmares" issue).

This revision is mostly a re-send of the v8 with three patches changed as
follows.

> Patch 1 prepares the tick-sched code for the subsequent modifications and it
> doesn't change the code's functionality (at least not intentionally).
> 
> Patch 2 starts pushing the tick stopping decision deeper into the idle
> loop, but that is limited to do_idle() and tick_nohz_irq_exit().
> 
> Patch 3 makes cpuidle_idle_call() decide whether or not to stop the tick
> and sets the stage for the subsequent changes.
>
> Patch 4 is a new one just for the TICK_USEC definition changes.
>
> Patch 5 adds a bool pointer argument to cpuidle_select() and the ->select
> governor callback allowing them to return a "nohz" hint on whether or not to
> stop the tick to the caller.  It also adds code to decide what value to
> return as "nohz" to the menu governor and modifies its correction factor
> computations to take running tick into account if need be.
>
> Patch 6 (which is new) contains some changes that previously were included
> into the big reordering patch (patch [6/8] in the v7).  Essentially, it does
> more tick-sched code reorganization in preparation for the subsequent changes
> (and should not modify the functionality).

Patch 7 is a new version of its v8 counterpart.  It makes fewer changes to the
existing code and adds a special function for the handling of the use case it
is about.  It still makes some hrtimer code modifications allowing it to return
the time to the next event with one timer excluded (which needs to be done with
respect to the tick timer), though.

Patch 8 reorders the idle state selection with respect to the stopping of
the tick and causes the additional "nohz" hint from cpuidle_select() to be
used for deciding whether or not to stop the tick.  It is a rebased version
of its v8 counterpart.

Patch 9 causes the menu governor to refine the state selection in case the
tick is not going to be stopped and the already selected state does not fit
the interval before the next tick time.  It is a new version that avoids
using state 0 if it has been disabled (if state 0 has been disabled, the
governor only should use it when no states are enabled at all).

> Patch 10 Deals with the situation in which the tick was stopped previously,
> but the idle governor still predicts short idle (it has not changed).

This series is complementary to the poll_idle() patches discussed recently

https://patchwork.kernel.org/patch/10282237/
https://patchwork.kernel.org/patch/10311775/

that have been merged for v4.17 already.

There is a new git branch containing the current series at

 git://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git \
 idle-loop-v9

Thanks,
Rafael



[PATCH v9 01/10] time: tick-sched: Reorganize idle tick management code

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

Prepare the scheduler tick code for reworking the idle loop to
avoid stopping the tick in some cases.

The idea is to split the nohz idle entry call to decouple the idle
time stats accounting and preparatory work from the actual tick stop
code, in order to later be able to delay the tick stop once we reach
more power-knowledgeable callers.

Move away the tick_nohz_start_idle() invocation from
__tick_nohz_idle_enter(), rename the latter to
__tick_nohz_idle_stop_tick() and define tick_nohz_idle_stop_tick()
as a wrapper around it for calling it from the outside.

Make tick_nohz_idle_enter() only call tick_nohz_start_idle() instead
of calling the entire __tick_nohz_idle_enter(), add another wrapper
disabling and enabling interrupts around tick_nohz_idle_stop_tick()
and make the current callers of tick_nohz_idle_enter() call it too
to retain their current functionality.

Signed-off-by: Rafael J. Wysocki 
Reviewed-by: Frederic Weisbecker 
---

v8 -> v9:
 * No changes in the patch.
 * Tag from Frederic.

---
 arch/x86/xen/smp_pv.c|1 +
 include/linux/tick.h |   12 
 kernel/sched/idle.c  |1 +
 kernel/time/tick-sched.c |   46 +-
 4 files changed, 39 insertions(+), 21 deletions(-)

Index: linux-pm/include/linux/tick.h
===
--- linux-pm.orig/include/linux/tick.h
+++ linux-pm/include/linux/tick.h
@@ -114,6 +114,7 @@ enum tick_dep_bits {
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;
 extern int tick_nohz_tick_stopped(void);
+extern void tick_nohz_idle_stop_tick(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
@@ -122,9 +123,18 @@ extern unsigned long tick_nohz_get_idle_
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
 extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
+
+static inline void tick_nohz_idle_stop_tick_protected(void)
+{
+   local_irq_disable();
+   tick_nohz_idle_stop_tick();
+   local_irq_enable();
+}
+
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
+static inline void tick_nohz_idle_stop_tick(void) { }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 
@@ -134,6 +144,8 @@ static inline ktime_t tick_nohz_get_slee
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
+
+static inline void tick_nohz_idle_stop_tick_protected(void) { }
 #endif /* !CONFIG_NO_HZ_COMMON */
 
 #ifdef CONFIG_NO_HZ_FULL
Index: linux-pm/kernel/time/tick-sched.c
===
--- linux-pm.orig/kernel/time/tick-sched.c
+++ linux-pm/kernel/time/tick-sched.c
@@ -539,14 +539,11 @@ static void tick_nohz_stop_idle(struct t
sched_clock_idle_wakeup_event();
 }
 
-static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
+static void tick_nohz_start_idle(struct tick_sched *ts)
 {
-   ktime_t now = ktime_get();
-
-   ts->idle_entrytime = now;
+   ts->idle_entrytime = ktime_get();
ts->idle_active = 1;
sched_clock_idle_sleep_event();
-   return now;
 }
 
 /**
@@ -911,19 +908,21 @@ static bool can_stop_idle_tick(int cpu,
return true;
 }
 
-static void __tick_nohz_idle_enter(struct tick_sched *ts)
+static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
 {
-   ktime_t now, expires;
+   ktime_t expires;
int cpu = smp_processor_id();
 
-   now = tick_nohz_start_idle(ts);
-
if (can_stop_idle_tick(cpu, ts)) {
int was_stopped = ts->tick_stopped;
 
ts->idle_calls++;
 
-   expires = tick_nohz_stop_sched_tick(ts, now, cpu);
+   /*
+* The idle entry time should be a sufficient approximation of
+* the current time at this point.
+*/
+   expires = tick_nohz_stop_sched_tick(ts, ts->idle_entrytime, 
cpu);
if (expires > 0LL) {
ts->idle_sleeps++;
ts->idle_expires = expires;
@@ -937,16 +936,19 @@ static void __tick_nohz_idle_enter(struc
 }
 
 /**
- * tick_nohz_idle_enter - stop the idle tick from the idle task
+ * tick_nohz_idle_stop_tick - stop the idle tick from the idle task
  *
  * When the next event is more than a tick into the future, stop the idle tick
- * Called when we start the idle loop.
- *
- * The arch is responsible of calling:
+ */
+void tick_nohz_idle_stop_tick(void)
+{
+   __tick_nohz_idle_stop_tick(this_cpu_ptr(&tick_cpu_sched));
+}
+
+/**
+ * tick_nohz_idle_enter - prepare for entering idle on the current CPU
  *
- * - rcu_idle_enter() afte

[PATCH v9 09/10] cpuidle: menu: Refine idle state selection for running tick

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

If the tick isn't stopped, the target residency of the state selected
by the menu governor may be greater than the actual time to the next
tick and that means lost energy.

To avoid that, make tick_nohz_get_sleep_length() return the current
time to the next event (before stopping the tick) in addition to the
estimated one via an extra pointer argument and make menu_select()
use that value to refine the state selection when necessary.

Signed-off-by: Rafael J. Wysocki 
---

v8 -> v9:
 * Avoid using idle state 0 if it is disabled.
 * Rebase.

---
 drivers/cpuidle/governors/menu.c |   27 +--
 include/linux/tick.h |7 ---
 kernel/time/tick-sched.c |   12 ++--
 3 files changed, 35 insertions(+), 11 deletions(-)

Index: linux-pm/include/linux/tick.h
===
--- linux-pm.orig/include/linux/tick.h
+++ linux-pm/include/linux/tick.h
@@ -121,7 +121,7 @@ extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
 extern bool tick_nohz_idle_got_tick(void);
-extern ktime_t tick_nohz_get_sleep_length(void);
+extern ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next);
 extern unsigned long tick_nohz_get_idle_calls(void);
 extern unsigned long tick_nohz_get_idle_calls_cpu(int cpu);
 extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time);
@@ -144,9 +144,10 @@ static inline void tick_nohz_idle_enter(
 static inline void tick_nohz_idle_exit(void) { }
 static inline bool tick_nohz_idle_got_tick(void) { return false; }
 
-static inline ktime_t tick_nohz_get_sleep_length(void)
+static inline ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 {
-   return NSEC_PER_SEC / HZ;
+   *delta_next = NSEC_PER_SEC / HZ;
+   return *delta_next;
 }
 static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
Index: linux-pm/kernel/time/tick-sched.c
===
--- linux-pm.orig/kernel/time/tick-sched.c
+++ linux-pm/kernel/time/tick-sched.c
@@ -1047,10 +1047,11 @@ bool tick_nohz_idle_got_tick(void)
 
 /**
  * tick_nohz_get_sleep_length - return the expected length of the current sleep
+ * @delta_next: duration until the next event if the tick cannot be stopped
  *
  * Called from power state control code with interrupts disabled
  */
-ktime_t tick_nohz_get_sleep_length(void)
+ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next)
 {
struct clock_event_device *dev = 
__this_cpu_read(tick_cpu_device.evtdev);
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
@@ -1064,12 +1065,14 @@ ktime_t tick_nohz_get_sleep_length(void)
 
WARN_ON_ONCE(!ts->inidle);
 
+   *delta_next = ktime_sub(dev->next_event, now);
+
if (!can_stop_idle_tick(cpu, ts))
-   goto out_dev;
+   return *delta_next;
 
next_event = tick_nohz_next_event(ts, cpu);
if (!next_event)
-   goto out_dev;
+   return *delta_next;
 
/*
 * If the next highres timer to expire is earlier than next_event, the
@@ -1079,9 +1082,6 @@ ktime_t tick_nohz_get_sleep_length(void)
   hrtimer_next_event_without(&ts->sched_timer));
 
return ktime_sub(next_event, now);
-
-out_dev:
-   return ktime_sub(dev->next_event, now);
 }
 
 /**
Index: linux-pm/drivers/cpuidle/governors/menu.c
===
--- linux-pm.orig/drivers/cpuidle/governors/menu.c
+++ linux-pm/drivers/cpuidle/governors/menu.c
@@ -295,6 +295,7 @@ static int menu_select(struct cpuidle_dr
unsigned int expected_interval;
unsigned long nr_iowaiters, cpu_load;
int resume_latency = dev_pm_qos_raw_read_value(device);
+   ktime_t delta_next;
 
if (data->needs_update) {
menu_update(drv, dev);
@@ -312,7 +313,7 @@ static int menu_select(struct cpuidle_dr
}
 
/* determine the expected residency time, round up */
-   data->next_timer_us = ktime_to_us(tick_nohz_get_sleep_length());
+   data->next_timer_us = 
ktime_to_us(tick_nohz_get_sleep_length(&delta_next));
 
get_iowait_load(&nr_iowaiters, &cpu_load);
data->bucket = which_bucket(data->next_timer_us, nr_iowaiters);
@@ -396,9 +397,31 @@ static int menu_select(struct cpuidle_dr
 * expected idle duration is shorter than the tick period length.
 */
if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
-   expected_interval < TICK_USEC)
+   expected_interval < TICK_USEC) {
+   unsigned int delta_next_us = ktime_to_us(delta_next);
+
*stop_tick = false;
 
+   if (!tick_nohz_tick_stopped() && idx > 0 &&
+   drv->states[idx].target_residency > 

[PATCH v9 04/10] jiffies: Introduce USER_TICK_USEC and redefine TICK_USEC

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

Since the subsequent changes will need a TICK_USEC definition
analogous to TICK_NSEC, rename the existing TICK_USEC as
USER_TICK_USEC, update its users and redefine TICK_USEC
accordingly.

Suggested-by: Peter Zijlstra 
Signed-off-by: Rafael J. Wysocki 
---

v8 -> v9: No changes.

---
 drivers/net/ethernet/sfc/mcdi.c |2 +-
 include/linux/jiffies.h |7 +--
 kernel/time/ntp.c   |2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

Index: linux-pm/include/linux/jiffies.h
===
--- linux-pm.orig/include/linux/jiffies.h
+++ linux-pm/include/linux/jiffies.h
@@ -62,8 +62,11 @@ extern int register_refined_jiffies(long
 /* TICK_NSEC is the time between ticks in nsec assuming SHIFTED_HZ */
 #define TICK_NSEC ((NSEC_PER_SEC+HZ/2)/HZ)
 
-/* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
-#define TICK_USEC ((100UL + USER_HZ/2) / USER_HZ)
+/* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */
+#define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ)
+
+/* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */
+#define USER_TICK_USEC ((100UL + USER_HZ/2) / USER_HZ)
 
 #ifndef __jiffy_arch_data
 #define __jiffy_arch_data
Index: linux-pm/drivers/net/ethernet/sfc/mcdi.c
===
--- linux-pm.orig/drivers/net/ethernet/sfc/mcdi.c
+++ linux-pm/drivers/net/ethernet/sfc/mcdi.c
@@ -375,7 +375,7 @@ static int efx_mcdi_poll(struct efx_nic
 * because generally mcdi responses are fast. After that, back off
 * and poll once a jiffy (approximately)
 */
-   spins = TICK_USEC;
+   spins = USER_TICK_USEC;
finish = jiffies + MCDI_RPC_TIMEOUT;
 
while (1) {
Index: linux-pm/kernel/time/ntp.c
===
--- linux-pm.orig/kernel/time/ntp.c
+++ linux-pm/kernel/time/ntp.c
@@ -31,7 +31,7 @@
 
 
 /* USER_HZ period (usecs): */
-unsigned long  tick_usec = TICK_USEC;
+unsigned long  tick_usec = USER_TICK_USEC;
 
 /* SHIFTED_HZ period (nsecs): */
 unsigned long  tick_nsec;



Re: [PATCH] lib: logic_pio: Fix potential NULL pointer dereference

2018-04-04 Thread John Garry

On 03/04/2018 22:15, Gustavo A. R. Silva wrote:

new_range is being dereferenced before it is null checked, hence
there is a potential null pointer dereference.

Fix this by moving the pointer dereference after new_range has
been properly null checked.



Hi Gustavo,

In fact we expect new_range to never be NULL. But, if we're going to 
check, then better make sure the check is correct...


Thanks,
John


Addresses-Coverity-ID: 1466163 ("Dereference before null check")
Fixes: 0a7198426259 ("lib: Add generic PIO mapping method")
Signed-off-by: Gustavo A. R. Silva 


Reviewed-by: John Garry 


---
 lib/logic_pio.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/logic_pio.c b/lib/logic_pio.c
index 29cedea..30dfdce 100644
--- a/lib/logic_pio.c
+++ b/lib/logic_pio.c
@@ -33,8 +33,8 @@ static DEFINE_MUTEX(io_range_mutex);
 int logic_pio_register_range(struct logic_pio_hwaddr *new_range)
 {
struct logic_pio_hwaddr *range;
-   resource_size_t start = new_range->hw_start;
-   resource_size_t end = new_range->hw_start + new_range->size;
+   resource_size_t start;
+   resource_size_t end;
resource_size_t mmio_sz = 0;
resource_size_t iio_sz = MMIO_UPPER_LIMIT;
int ret = 0;
@@ -42,6 +42,9 @@ int logic_pio_register_range(struct logic_pio_hwaddr 
*new_range)
if (!new_range || !new_range->fwnode || !new_range->size)
return -EINVAL;

+   start = new_range->hw_start;
+   end = new_range->hw_start + new_range->size;
+
mutex_lock(&io_range_mutex);
list_for_each_entry_rcu(range, &io_range_list, list) {
if (range->fwnode == new_range->fwnode) {






[PATCH v9 03/10] sched: idle: Do not stop the tick before cpuidle_idle_call()

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

Make cpuidle_idle_call() decide whether or not to stop the tick.

First, the cpuidle_enter_s2idle() path deals with the tick (and with
the entire timekeeping for that matter) by itself and it doesn't need
the tick to be stopped beforehand.

Second, to address the issue with short idle duration predictions
by the idle governor after the tick has been stopped, it will be
necessary to change the ordering of cpuidle_select() with respect
to tick_nohz_idle_stop_tick().  To prepare for that, put a
tick_nohz_idle_stop_tick() call in the same branch in which
cpuidle_select() is called.

Signed-off-by: Rafael J. Wysocki 
Reviewed-by: Frederic Weisbecker 
---

v8 -> v9:
 * No changes in the patch.
 * Tag from Frederic.

---
 kernel/sched/idle.c |   19 +++
 1 file changed, 15 insertions(+), 4 deletions(-)

Index: linux-pm/kernel/sched/idle.c
===
--- linux-pm.orig/kernel/sched/idle.c
+++ linux-pm/kernel/sched/idle.c
@@ -146,13 +146,15 @@ static void cpuidle_idle_call(void)
}
 
/*
-* Tell the RCU framework we are entering an idle section,
-* so no more rcu read side critical sections and one more
+* The RCU framework needs to be told that we are entering an idle
+* section, so no more rcu read side critical sections and one more
 * step to the grace period
 */
-   rcu_idle_enter();
 
if (cpuidle_not_available(drv, dev)) {
+   tick_nohz_idle_stop_tick();
+   rcu_idle_enter();
+
default_idle_call();
goto exit_idle;
}
@@ -169,16 +171,26 @@ static void cpuidle_idle_call(void)
 
if (idle_should_enter_s2idle() || dev->use_deepest_state) {
if (idle_should_enter_s2idle()) {
+   rcu_idle_enter();
+
entered_state = cpuidle_enter_s2idle(drv, dev);
if (entered_state > 0) {
local_irq_enable();
goto exit_idle;
}
+
+   rcu_idle_exit();
}
 
+   tick_nohz_idle_stop_tick();
+   rcu_idle_enter();
+
next_state = cpuidle_find_deepest_state(drv, dev);
call_cpuidle(drv, dev, next_state);
} else {
+   tick_nohz_idle_stop_tick();
+   rcu_idle_enter();
+
/*
 * Ask the cpuidle framework to choose a convenient idle state.
 */
@@ -245,7 +257,6 @@ static void do_idle(void)
tick_nohz_idle_restart_tick();
cpu_idle_poll();
} else {
-   tick_nohz_idle_stop_tick();
cpuidle_idle_call();
}
arch_cpu_idle_exit();



[PATCH v9 05/10] cpuidle: Return nohz hint from cpuidle_select()

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

Add a new pointer argument to cpuidle_select() and to the ->select
cpuidle governor callback to allow a boolean value indicating
whether or not the tick should be stopped before entering the
selected state to be returned from there.

Make the ladder governor ignore that pointer (to preserve its
current behavior) and make the menu governor return 'false" through
it if:
 (1) the idle exit latency is constrained at 0, or
 (2) the selected state is a polling one, or
 (3) the expected idle period duration is within the tick period
 range.

In addition to that, the correction factor computations in the menu
governor need to take the possibility that the tick may not be
stopped into account to avoid artificially small correction factor
values.  To that end, add a mechanism to record tick wakeups, as
suggested by Peter Zijlstra, and use it to modify the menu_update()
behavior when tick wakeup occurs.  Namely, if the CPU is woken up by
the tick and the return value of tick_nohz_get_sleep_length() is not
within the tick boundary, the predicted idle duration is likely too
short, so make menu_update() try to compensate for that by updating
the governor statistics as though the CPU was idle for a long time.

Since the value returned through the new argument pointer of
cpuidle_select() is not used by its caller yet, this change by
itself is not expected to alter the functionality of the code.

Signed-off-by: Rafael J. Wysocki 
---

v8 -> v9: No changes.

---
 drivers/cpuidle/cpuidle.c  |   10 +-
 drivers/cpuidle/governors/ladder.c |3 +
 drivers/cpuidle/governors/menu.c   |   59 +
 include/linux/cpuidle.h|8 +++--
 include/linux/tick.h   |2 +
 kernel/sched/idle.c|4 +-
 kernel/time/tick-sched.c   |   20 
 7 files changed, 87 insertions(+), 19 deletions(-)

Index: linux-pm/include/linux/cpuidle.h
===
--- linux-pm.orig/include/linux/cpuidle.h
+++ linux-pm/include/linux/cpuidle.h
@@ -135,7 +135,8 @@ extern bool cpuidle_not_available(struct
  struct cpuidle_device *dev);
 
 extern int cpuidle_select(struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+ struct cpuidle_device *dev,
+ bool *stop_tick);
 extern int cpuidle_enter(struct cpuidle_driver *drv,
 struct cpuidle_device *dev, int index);
 extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
@@ -167,7 +168,7 @@ static inline bool cpuidle_not_available
 struct cpuidle_device *dev)
 {return true; }
 static inline int cpuidle_select(struct cpuidle_driver *drv,
-struct cpuidle_device *dev)
+struct cpuidle_device *dev, bool *stop_tick)
 {return -ENODEV; }
 static inline int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index)
@@ -250,7 +251,8 @@ struct cpuidle_governor {
struct cpuidle_device *dev);
 
int  (*select)  (struct cpuidle_driver *drv,
-   struct cpuidle_device *dev);
+   struct cpuidle_device *dev,
+   bool *stop_tick);
void (*reflect) (struct cpuidle_device *dev, int index);
 };
 
Index: linux-pm/kernel/sched/idle.c
===
--- linux-pm.orig/kernel/sched/idle.c
+++ linux-pm/kernel/sched/idle.c
@@ -188,13 +188,15 @@ static void cpuidle_idle_call(void)
next_state = cpuidle_find_deepest_state(drv, dev);
call_cpuidle(drv, dev, next_state);
} else {
+   bool stop_tick = true;
+
tick_nohz_idle_stop_tick();
rcu_idle_enter();
 
/*
 * Ask the cpuidle framework to choose a convenient idle state.
 */
-   next_state = cpuidle_select(drv, dev);
+   next_state = cpuidle_select(drv, dev, &stop_tick);
entered_state = call_cpuidle(drv, dev, next_state);
/*
 * Give the governor an opportunity to reflect on the outcome
Index: linux-pm/drivers/cpuidle/cpuidle.c
===
--- linux-pm.orig/drivers/cpuidle/cpuidle.c
+++ linux-pm/drivers/cpuidle/cpuidle.c
@@ -272,12 +272,18 @@ int cpuidle_enter_state(struct cpuidle_d
  *
  * @drv: the cpuidle driver
  * @dev: the cpuidle device
+ * @stop_tick: indication on whether or not to stop the tick
  *
  * Returns the index of the idle state.  The return value must not be negative.
+ *
+ * The memory location pointed to by @stop_tick 

[PATCH v9 06/10] time: tick-sched: Split tick_nohz_stop_sched_tick()

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

In order to address the issue with short idle duration predictions
by the idle governor after the scheduler tick has been stopped, split
tick_nohz_stop_sched_tick() into two separate routines, one computing
the time to the next timer event and the other simply stopping the
tick when the time to the next timer event is known.

Prepare these two routines to be called separately, as one of them
will be called by the idle governor in the cpuidle_select() code
path after subsequent changes.

Update the former callers of tick_nohz_stop_sched_tick() to use
the new routines, tick_nohz_next_event() and tick_nohz_stop_tick(),
instead of it and move the updates of the sleep_length field in
struct tick_sched into __tick_nohz_idle_stop_tick() as it doesn't
need to be updated anywhere else.

There should be no intentional visible changes in functionality
resulting from this change.

Signed-off-by: Rafael J. Wysocki 
---

v8 -> v9: No changes.

---
 kernel/time/tick-sched.c |  128 +--
 kernel/time/tick-sched.h |4 +
 2 files changed, 84 insertions(+), 48 deletions(-)

Index: linux-pm/kernel/time/tick-sched.h
===
--- linux-pm.orig/kernel/time/tick-sched.h
+++ linux-pm/kernel/time/tick-sched.h
@@ -39,6 +39,8 @@ enum tick_nohz_mode {
  * @idle_sleeptime:Sum of the time slept in idle with sched tick stopped
  * @iowait_sleeptime:  Sum of the time slept in idle with sched tick stopped, 
with IO outstanding
  * @sleep_length:  Duration of the current idle sleep
+ * @timer_expires: Anticipated timer expiration time (in case sched tick 
is stopped)
+ * @timer_expires_base:Base time clock monotonic for @timer_expires
  * @do_timer_lst:  CPU was the last one doing do_timer before going idle
  */
 struct tick_sched {
@@ -60,6 +62,8 @@ struct tick_sched {
ktime_t iowait_sleeptime;
ktime_t sleep_length;
unsigned long   last_jiffies;
+   u64 timer_expires;
+   u64 timer_expires_base;
u64 next_timer;
ktime_t idle_expires;
int do_timer_last;
Index: linux-pm/kernel/time/tick-sched.c
===
--- linux-pm.orig/kernel/time/tick-sched.c
+++ linux-pm/kernel/time/tick-sched.c
@@ -652,13 +652,10 @@ static inline bool local_timer_softirq_p
return local_softirq_pending() & TIMER_SOFTIRQ;
 }
 
-static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
-ktime_t now, int cpu)
+static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
 {
-   struct clock_event_device *dev = 
__this_cpu_read(tick_cpu_device.evtdev);
u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
unsigned long seq, basejiff;
-   ktime_t tick;
 
/* Read jiffies and the time when jiffies were updated last */
do {
@@ -667,6 +664,7 @@ static ktime_t tick_nohz_stop_sched_tick
basejiff = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
+   ts->timer_expires_base = basemono;
 
/*
 * Keep the periodic tick, when RCU, architecture or irq_work
@@ -711,32 +709,20 @@ static ktime_t tick_nohz_stop_sched_tick
 * next period, so no point in stopping it either, bail.
 */
if (!ts->tick_stopped) {
-   tick = 0;
+   ts->timer_expires = 0;
goto out;
}
}
 
/*
-* If this CPU is the one which updates jiffies, then give up
-* the assignment and let it be taken by the CPU which runs
-* the tick timer next, which might be this CPU as well. If we
-* don't drop this here the jiffies might be stale and
-* do_timer() never invoked. Keep track of the fact that it
-* was the one which had the do_timer() duty last. If this CPU
-* is the one which had the do_timer() duty last, we limit the
-* sleep time to the timekeeping max_deferment value.
+* If this CPU is the one which had the do_timer() duty last, we limit
+* the sleep time to the timekeeping max_deferment value.
 * Otherwise we can sleep as long as we want.
 */
delta = timekeeping_max_deferment();
-   if (cpu == tick_do_timer_cpu) {
-   tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-   ts->do_timer_last = 1;
-   } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
-   delta = KTIME_MAX;
-   ts->do_timer_last = 0;
-   } else if (!ts->do_timer_last) {
+   if (cpu != tick_do_timer_cpu &&
+   

[PATCH] f2fs: enlarge block plug coverage

2018-04-04 Thread Chao Yu
This patch enlarges block plug coverage in __issue_discard_cmd, in
order to collect more pending bios before issuing them, to avoid
being disturbed by previous discard I/O in IO aware discard mode.

Signed-off-by: Chao Yu 
---
 fs/f2fs/segment.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 8f0b5ba46315..4287e208c040 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1208,10 +1208,12 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
pend_list = &dcc->pend_list[i];
 
mutex_lock(&dcc->cmd_lock);
+
+   blk_start_plug(&plug);
+
if (list_empty(pend_list))
goto next;
f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
-   blk_start_plug(&plug);
list_for_each_entry_safe(dc, tmp, pend_list, list) {
f2fs_bug_on(sbi, dc->state != D_PREP);
 
@@ -1227,8 +1229,9 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
if (++iter >= dpolicy->max_requests)
break;
}
-   blk_finish_plug(&plug);
 next:
+   blk_finish_plug(&plug);
+
mutex_unlock(&dcc->cmd_lock);
 
if (iter >= dpolicy->max_requests)
-- 
2.15.0.55.gc2ece9dc4de6



[PATCH v9 10/10] cpuidle: menu: Avoid selecting shallow states with stopped tick

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

If the scheduler tick has been stopped already and the governor
selects a shallow idle state, the CPU can spend a long time in that
state if the selection is based on an inaccurate prediction of idle
time.  That effect turns out to be relevant, so it needs to be
mitigated.

To that end, modify the menu governor to discard the result of the
idle time prediction if the tick is stopped and the predicted idle
time is less than the tick period length, unless the tick timer is
going to expire soon.

Signed-off-by: Rafael J. Wysocki 
---

v8 -> v9: No changes.

---
 drivers/cpuidle/governors/menu.c |   29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

Index: linux-pm/drivers/cpuidle/governors/menu.c
===
--- linux-pm.orig/drivers/cpuidle/governors/menu.c
+++ linux-pm/drivers/cpuidle/governors/menu.c
@@ -352,13 +352,28 @@ static int menu_select(struct cpuidle_dr
 */
data->predicted_us = min(data->predicted_us, expected_interval);
 
-   /*
-* Use the performance multiplier and the user-configurable
-* latency_req to determine the maximum exit latency.
-*/
-   interactivity_req = data->predicted_us / 
performance_multiplier(nr_iowaiters, cpu_load);
-   if (latency_req > interactivity_req)
-   latency_req = interactivity_req;
+   if (tick_nohz_tick_stopped()) {
+   /*
+* If the tick is already stopped, the cost of possible short
+* idle duration misprediction is much higher, because the CPU
+* may be stuck in a shallow idle state for a long time as a
+* result of it.  In that case say we might mispredict and try
+* to force the CPU into a state for which we would have stopped
+* the tick, unless the tick timer is going to expire really
+* soon anyway.
+*/
+   if (data->predicted_us < TICK_USEC)
+   data->predicted_us = min_t(unsigned int, TICK_USEC,
+  ktime_to_us(delta_next));
+   } else {
+   /*
+* Use the performance multiplier and the user-configurable
+* latency_req to determine the maximum exit latency.
+*/
+   interactivity_req = data->predicted_us / 
performance_multiplier(nr_iowaiters, cpu_load);
+   if (latency_req > interactivity_req)
+   latency_req = interactivity_req;
+   }
 
expected_interval = data->predicted_us;
/*



[PATCH v9 08/10] sched: idle: Select idle state before stopping the tick

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

In order to address the issue with short idle duration predictions
by the idle governor after the scheduler tick has been stopped,
reorder the code in cpuidle_idle_call() so that the governor idle
state selection runs before tick_nohz_idle_stop_tick() and use the
"nohz" hint returned by cpuidle_select() to decide whether or not
to stop the tick.

This isn't straightforward, because menu_select() invokes
tick_nohz_get_sleep_length() to get the time to the next timer
event and the number returned by the latter comes from
__tick_nohz_idle_stop_tick().  Fortunately, however, it is possible
to compute that number without actually stopping the tick and with
the help of the existing code.

Namely, tick_nohz_get_sleep_length() can be made to call
tick_nohz_next_event(), introduced earlier, to get the time to the
next non-highres timer event.  If that happens, tick_nohz_next_event()
need not be called by __tick_nohz_idle_stop_tick() again.

If it turns out that the scheduler tick cannot be stopped going
forward or the next timer event is too close for the tick to be
stopped, tick_nohz_get_sleep_length() can simply return the time to
the next event currently programmed into the corresponding clock
event device.

In addition to knowing the return value of tick_nohz_next_event(),
however, tick_nohz_get_sleep_length() needs to know the time to the
next highres timer event, but with the scheduler tick timer excluded,
which can be computed with the help of hrtimer_get_next_event().

The minimum of that number and the tick_nohz_next_event() return
value is the total time to the next timer event with the assumption
that the tick will be stopped.  It can be returned to the idle
governor which can use it for predicting idle duration (under the
assumption that the tick will be stopped) and deciding whether or
not it makes sense to stop the tick before putting the CPU into the
selected idle state.

With the above, the sleep_length field in struct tick_sched is not
necessary any more, so drop it.

Signed-off-by: Rafael J. Wysocki 
---

v8 -> v9: Rebase on top of the new [07/10].

---
 include/linux/tick.h |2 +
 kernel/sched/idle.c  |   11 ++--
 kernel/time/tick-sched.c |   61 +--
 kernel/time/tick-sched.h |2 -
 4 files changed, 59 insertions(+), 17 deletions(-)

Index: linux-pm/include/linux/tick.h
===
--- linux-pm.orig/include/linux/tick.h
+++ linux-pm/include/linux/tick.h
@@ -115,6 +115,7 @@ enum tick_dep_bits {
 extern bool tick_nohz_enabled;
 extern int tick_nohz_tick_stopped(void);
 extern void tick_nohz_idle_stop_tick(void);
+extern void tick_nohz_idle_retain_tick(void);
 extern void tick_nohz_idle_restart_tick(void);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
@@ -137,6 +138,7 @@ static inline void tick_nohz_idle_stop_t
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
 static inline void tick_nohz_idle_stop_tick(void) { }
+static inline void tick_nohz_idle_retain_tick(void) { }
 static inline void tick_nohz_idle_restart_tick(void) { }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
Index: linux-pm/kernel/sched/idle.c
===
--- linux-pm.orig/kernel/sched/idle.c
+++ linux-pm/kernel/sched/idle.c
@@ -190,13 +190,18 @@ static void cpuidle_idle_call(void)
} else {
bool stop_tick = true;
 
-   tick_nohz_idle_stop_tick();
-   rcu_idle_enter();
-
/*
 * Ask the cpuidle framework to choose a convenient idle state.
 */
next_state = cpuidle_select(drv, dev, &stop_tick);
+
+   if (stop_tick)
+   tick_nohz_idle_stop_tick();
+   else
+   tick_nohz_idle_retain_tick();
+
+   rcu_idle_enter();
+
entered_state = call_cpuidle(drv, dev, next_state);
/*
 * Give the governor an opportunity to reflect on the outcome
Index: linux-pm/kernel/time/tick-sched.c
===
--- linux-pm.orig/kernel/time/tick-sched.c
+++ linux-pm/kernel/time/tick-sched.c
@@ -930,16 +930,19 @@ static bool can_stop_idle_tick(int cpu,
 
 static void __tick_nohz_idle_stop_tick(struct tick_sched *ts)
 {
-   struct clock_event_device *dev = 
__this_cpu_read(tick_cpu_device.evtdev);
ktime_t expires;
int cpu = smp_processor_id();
 
-   WARN_ON_ONCE(ts->timer_expires_base);
-
-   if (!can_stop_idle_tick(cpu, ts))
-   goto out;
-
-   expires = tick_nohz_next_event(ts, cpu);
+   /*
+* If tick_nohz_get_sleep_length() ran tick_nohz_next_event(), the
+* tick timer expiration time is known alrea

[PATCH v9 07/10] time: hrtimer: Introduce hrtimer_next_event_without()

2018-04-04 Thread Rafael J. Wysocki
From: Rafael J. Wysocki 

The next set of changes will need to compute the time to the next
hrtimer event over all hrtimers except for the scheduler tick one.

To that end introduce a new helper function,
hrtimer_next_event_without(), for computing the time until the next
hrtimer event over all timers except for one and modify the underlying
code in __hrtimer_next_event_base() to prepare it for being called by
that new function.

No intentional changes in functionality.

Signed-off-by: Rafael J. Wysocki 
---

v8 -> v9:
 * Make fewer changes to the existing code.
 * Add a new helper function for the handling of the use case at hand.

---
 include/linux/hrtimer.h |1 
 kernel/time/hrtimer.c   |   55 ++--
 2 files changed, 54 insertions(+), 2 deletions(-)

Index: linux-pm/include/linux/hrtimer.h
===
--- linux-pm.orig/include/linux/hrtimer.h
+++ linux-pm/include/linux/hrtimer.h
@@ -426,6 +426,7 @@ static inline ktime_t hrtimer_get_remain
 }
 
 extern u64 hrtimer_get_next_event(void);
+extern u64 hrtimer_next_event_without(const struct hrtimer *exclude);
 
 extern bool hrtimer_active(const struct hrtimer *timer);
 
Index: linux-pm/kernel/time/hrtimer.c
===
--- linux-pm.orig/kernel/time/hrtimer.c
+++ linux-pm/kernel/time/hrtimer.c
@@ -490,6 +490,7 @@ __next_base(struct hrtimer_cpu_base *cpu
while ((base = __next_base((cpu_base), &(active
 
 static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base,
+const struct hrtimer *exclude,
 unsigned int active,
 ktime_t expires_next)
 {
@@ -502,9 +503,24 @@ static ktime_t __hrtimer_next_event_base
 
next = timerqueue_getnext(&base->active);
timer = container_of(next, struct hrtimer, node);
+   if (timer == exclude) {
+   /* Get to the next timer in the queue. */
+   struct rb_node *rbn = rb_next(&next->node);
+
+   next = rb_entry_safe(rbn, struct timerqueue_node, node);
+   if (!next)
+   continue;
+
+   timer = container_of(next, struct hrtimer, node);
+   }
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
if (expires < expires_next) {
expires_next = expires;
+
+   /* Skip cpu_base update if a timer is being excluded. */
+   if (exclude)
+   continue;
+
if (timer->is_soft)
cpu_base->softirq_next_timer = timer;
else
@@ -548,7 +564,8 @@ __hrtimer_get_next_event(struct hrtimer_
if (!cpu_base->softirq_activated && (active_mask & 
HRTIMER_ACTIVE_SOFT)) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
cpu_base->softirq_next_timer = NULL;
-   expires_next = __hrtimer_next_event_base(cpu_base, active, 
KTIME_MAX);
+   expires_next = __hrtimer_next_event_base(cpu_base, NULL,
+active, KTIME_MAX);
 
next_timer = cpu_base->softirq_next_timer;
}
@@ -556,7 +573,8 @@ __hrtimer_get_next_event(struct hrtimer_
if (active_mask & HRTIMER_ACTIVE_HARD) {
active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD;
cpu_base->next_timer = next_timer;
-   expires_next = __hrtimer_next_event_base(cpu_base, active, 
expires_next);
+   expires_next = __hrtimer_next_event_base(cpu_base, NULL, active,
+expires_next);
}
 
return expires_next;
@@ -1200,6 +1218,39 @@ u64 hrtimer_get_next_event(void)
 
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 
+   return expires;
+}
+
+/**
+ * hrtimer_next_event_without - time until next expiry event w/o one timer
+ * @exclude:   timer to exclude
+ *
+ * Returns the next expiry time over all timers except for the @exclude one or
+ * KTIME_MAX if none of them is pending.
+ */
+u64 hrtimer_next_event_without(const struct hrtimer *exclude)
+{
+   struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+   u64 expires = KTIME_MAX;
+   unsigned long flags;
+
+   raw_spin_lock_irqsave(&cpu_base->lock, flags);
+
+   if (__hrtimer_hres_active(cpu_base)) {
+   unsigned int active;
+
+   if (!cpu_base->softirq_activated) {
+   active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+   expires = __hrtimer_next_event_base(cpu_base, exclude,
+

Re: [PATCH] ARC: Improve cmpxchng syscall implementation

2018-04-04 Thread Alexey Brodkin
Hi Vineet, Peter,

On Wed, 2018-03-21 at 14:54 +0300, Alexey Brodkin wrote:
> Hi Vineet,
> 
> On Mon, 2018-03-19 at 11:29 -0700, Vineet Gupta wrote:
> > On 03/19/2018 04:00 AM, Alexey Brodkin wrote:
> > > arc_usr_cmpxchg syscall is supposed to be used on platforms
> > > that lack support of Load-Locked/Store-Conditional instructions
> > > in hardware. And in that case we mimic missing hardware features
> > > with help of kernel's syscall that "atomically" checks current
> > > value in memory and then if it matches caller expectation new
> > > value is written to that same location.
> > > 
> > 
> > ...
> > ...
> > 
> > > 
> > > 2. What's worse if we're dealing with data from not yet allocated
> > > page (think of pre-copy-on-write state) we'll successfully
> > > read data but on write we'll silently return to user-space
> > > with correct result 
> > 
> > This is technically incorrect, even for reading, you need a page, which 
> > could be 
> > common zero page in certain cases.
> 
> Ok I'll reword it like.
> 
> > 
> > (which we really read just before). That leads
> > > to very strange problems in user-space app further down the line
> > > because new value was never written to the destination.
> > > 
> > > 3. Regardless of what went wrong we'll return from syscall
> > > and user-space application will continue to execute.
> > > Even if user's pointer was completely bogus.
> > 
> > Again we are exaggerating (from technical correctness POV) - if user 
> > pointer was 
> > bogs, the read would not have worked in first place etc. So lets tone down 
> > the 
> > rhetoric.
> 
> Ok here I may rephrase it like that:
> --->8-
> 3. Regardless of what went wrong we'll return from syscall
>and user-space application will continue to execute.
> --->8-
> 
> > 
> > > In case of hardware LL/SC that app would have been killed
> > > by the kernel.
> > > 
> > > With that change we attempt to improve on all 3 items above:
> > > 
> > > 1. We still disable preemption around read-and-write of
> > > user's data but if we happen to fail with either of them
> > > we're enabling preemption and try to force page fault so
> > > that we have a correct mapping in the TLB. Then re-try
> > > again in "atomic" context.
> > > 
> > > 2. If real page fault fails or even access_ok() returns false
> > > we send SIGSEGV to the user-space process so if something goes
> > > seriously wrong we'll know about it much earlier.
> > > 
> > 
> > 
> > >   
> > >   /*
> > >* This is only for old cores lacking LLOCK/SCOND, which by 
> > > defintion
> > > @@ -60,23 +62,48 @@ SYSCALL_DEFINE3(arc_usr_cmpxchg, int *, uaddr, int, 
> > > expected, int, new)
> > >   /* Z indicates to userspace if operation succeded */
> > >   regs->status32 &= ~STATUS_Z_MASK;
> > >   
> > > - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
> > > - return -EFAULT;
> > > + ret = access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr));
> > > + if (!ret)
> > > + goto fail;
> > >   
> > > +again:
> > >   preempt_disable();
> > >   
> > > - if (__get_user(uval, uaddr))
> > > - goto done;
> > > -
> > > - if (uval == expected) {
> > > - if (!__put_user(new, uaddr))
> > > + ret = __get_user(val, uaddr);
> > > + if (ret == -EFAULT) {
> > 
> > 
> > Lets see if this warrants adding complexity ! This implies that TLB entry 
> > with 
> > Read permissions didn't exist for reading the var and page fault handler 
> > could not 
> > wire up even a zero page due to preempt_disable, meaning it was something 
> > not 
> > touched by userspace already - sort of uninitialized variable in user code.
> 
> Ok I completely missed the fact that fast path TLB miss handler is being
> executed even if we have preemption disabled. So given the mapping exist
> we do not need to retry with enabled preemption.
> 
> Still maybe I'm a bit paranoid here but IMHO it's good to be ready for a 
> corner-case
> when the pointer is completely bogus and there's no mapping for him.
> I understand that today we only expect this syscall to be used from libc's
> internals but as long as syscall exists nobody stops anybody from using it
> directly without libc. So maybe instead of doing get_user_pages_fast() just
> send a SIGSEGV to the process? At least user will realize there's some problem
> at earlier stage.
> 
> > Otherwise it is extremely unlikely to start with a TLB entry with Read 
> > permissions, followed by syscall Trap only to find the entry missing, 
> > unless a 
> > global TLB flush came from other cores, right in the middle. But this 
> > syscall is 
> > not guaranteed to work with SMP anyways, so lets ignore any SMP misdoings 
> > here.
> 
> Well but that's exactly the situation I was debugging: we start from data 
> from read-only
> page and on attempt to write back modif

[PATCH v4 4/9] vsprintf: Consolidate handling of unknown pointer specifiers

2018-04-04 Thread Petr Mladek
There are few printk formats that make sense only with two or more
specifiers. Also some specifiers make sense only when a kernel feature
is enabled.

The handling of unknown specifiers is strange, inconsistent, and
even leaking the address. For example, netdev_bits() prints the
non-hashed pointer value or clock() prints "(null)".

The best solution seems to be in flags_string(). It does not print any
misleading value. Instead it calls WARN_ONCE() describing the unknown
specifier. Therefore it clearly shows the problem and helps to find it.

Note that WARN_ONCE() used to cause recursive printk(). But it is safe
now because vscnprintf() is called in printk_safe context from
vprintk_emit().

Signed-off-by: Petr Mladek 
---
 lib/vsprintf.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index dd71738d7a09..5a0d01846a11 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -1484,9 +1484,9 @@ char *netdev_bits(char *buf, char *end, const void *addr, 
const char *fmt)
size = sizeof(netdev_features_t);
break;
default:
-   num = (unsigned long)addr;
-   size = sizeof(unsigned long);
-   break;
+   WARN_ONCE(1, "Unsupported pointer format specifier: %%pN%c\n",
+ fmt[1]);
+   return buf;
}
 
return special_hex_number(buf, end, num, size);
@@ -1517,7 +1517,12 @@ static noinline_for_stack
 char *clock(char *buf, char *end, struct clk *clk, struct printf_spec spec,
const char *fmt)
 {
-   if (!IS_ENABLED(CONFIG_HAVE_CLK) || !clk)
+   if (!IS_ENABLED(CONFIG_HAVE_CLK)) {
+   WARN_ONCE(1, "Unsupported pointer format specifier: %%pC\n");
+   return buf;
+   }
+
+   if (!clk)
return string(buf, end, NULL, spec);
 
switch (fmt[1]) {
@@ -1529,7 +1534,8 @@ char *clock(char *buf, char *end, struct clk *clk, struct 
printf_spec spec,
 #ifdef CONFIG_COMMON_CLK
return string(buf, end, __clk_get_name(clk), spec);
 #else
-   return special_hex_number(buf, end, (unsigned long)clk, 
sizeof(unsigned long));
+   WARN_ONCE(1, "Unsupported pointer format specifier: %%pC\n");
+   return buf;
 #endif
}
 }
@@ -1593,7 +1599,8 @@ char *flags_string(char *buf, char *end, void *flags_ptr, 
const char *fmt)
names = gfpflag_names;
break;
default:
-   WARN_ONCE(1, "Unsupported flags modifier: %c\n", fmt[1]);
+   WARN_ONCE(1, "Unsupported pointer format specifier: %%pG%c\n",
+ fmt[1]);
return buf;
}
 
-- 
2.13.6



[PATCH v4 1/9] vsprintf: Shuffle ptr_to_id() code

2018-04-04 Thread Petr Mladek
This patch just moves ptr_to_id() implementation up in the source
file. We will want to call it earlier.

This patch does not change the existing behavior.

Signed-off-by: Petr Mladek 
---
 lib/vsprintf.c | 134 -
 1 file changed, 67 insertions(+), 67 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index d7a708f82559..c0c677c6a833 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -602,6 +602,73 @@ char *string(char *buf, char *end, const char *s, struct 
printf_spec spec)
return widen_string(buf, len, end, spec);
 }
 
+static bool have_filled_random_ptr_key __read_mostly;
+static siphash_key_t ptr_key __read_mostly;
+
+static void fill_random_ptr_key(struct random_ready_callback *unused)
+{
+   get_random_bytes(&ptr_key, sizeof(ptr_key));
+   /*
+* have_filled_random_ptr_key==true is dependent on get_random_bytes().
+* ptr_to_id() needs to see have_filled_random_ptr_key==true
+* after get_random_bytes() returns.
+*/
+   smp_mb();
+   WRITE_ONCE(have_filled_random_ptr_key, true);
+}
+
+static struct random_ready_callback random_ready = {
+   .func = fill_random_ptr_key
+};
+
+static int __init initialize_ptr_random(void)
+{
+   int ret = add_random_ready_callback(&random_ready);
+
+   if (!ret) {
+   return 0;
+   } else if (ret == -EALREADY) {
+   fill_random_ptr_key(&random_ready);
+   return 0;
+   }
+
+   return ret;
+}
+early_initcall(initialize_ptr_random);
+
+/* Maps a pointer to a 32 bit unique identifier. */
+static char *ptr_to_id(char *buf, char *end, void *ptr, struct printf_spec 
spec)
+{
+   unsigned long hashval;
+   const int default_width = 2 * sizeof(ptr);
+
+   if (unlikely(!have_filled_random_ptr_key)) {
+   spec.field_width = default_width;
+   /* string length must be less than default_width */
+   return string(buf, end, "(ptrval)", spec);
+   }
+
+#ifdef CONFIG_64BIT
+   hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
+   /*
+* Mask off the first 32 bits, this makes explicit that we have
+* modified the address (and 32 bits is plenty for a unique ID).
+*/
+   hashval = hashval & 0x;
+#else
+   hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
+#endif
+
+   spec.flags |= SMALL;
+   if (spec.field_width == -1) {
+   spec.field_width = default_width;
+   spec.flags |= ZEROPAD;
+   }
+   spec.base = 16;
+
+   return number(buf, end, hashval, spec);
+}
+
 static noinline_for_stack
 char *dentry_name(char *buf, char *end, const struct dentry *d, struct 
printf_spec spec,
  const char *fmt)
@@ -1659,73 +1726,6 @@ char *pointer_string(char *buf, char *end, const void 
*ptr,
return number(buf, end, (unsigned long int)ptr, spec);
 }
 
-static bool have_filled_random_ptr_key __read_mostly;
-static siphash_key_t ptr_key __read_mostly;
-
-static void fill_random_ptr_key(struct random_ready_callback *unused)
-{
-   get_random_bytes(&ptr_key, sizeof(ptr_key));
-   /*
-* have_filled_random_ptr_key==true is dependent on get_random_bytes().
-* ptr_to_id() needs to see have_filled_random_ptr_key==true
-* after get_random_bytes() returns.
-*/
-   smp_mb();
-   WRITE_ONCE(have_filled_random_ptr_key, true);
-}
-
-static struct random_ready_callback random_ready = {
-   .func = fill_random_ptr_key
-};
-
-static int __init initialize_ptr_random(void)
-{
-   int ret = add_random_ready_callback(&random_ready);
-
-   if (!ret) {
-   return 0;
-   } else if (ret == -EALREADY) {
-   fill_random_ptr_key(&random_ready);
-   return 0;
-   }
-
-   return ret;
-}
-early_initcall(initialize_ptr_random);
-
-/* Maps a pointer to a 32 bit unique identifier. */
-static char *ptr_to_id(char *buf, char *end, void *ptr, struct printf_spec 
spec)
-{
-   unsigned long hashval;
-   const int default_width = 2 * sizeof(ptr);
-
-   if (unlikely(!have_filled_random_ptr_key)) {
-   spec.field_width = default_width;
-   /* string length must be less than default_width */
-   return string(buf, end, "(ptrval)", spec);
-   }
-
-#ifdef CONFIG_64BIT
-   hashval = (unsigned long)siphash_1u64((u64)ptr, &ptr_key);
-   /*
-* Mask off the first 32 bits, this makes explicit that we have
-* modified the address (and 32 bits is plenty for a unique ID).
-*/
-   hashval = hashval & 0x;
-#else
-   hashval = (unsigned long)siphash_1u32((u32)ptr, &ptr_key);
-#endif
-
-   spec.flags |= SMALL;
-   if (spec.field_width == -1) {
-   spec.field_width = default_width;
-   spec.flags |= ZEROPAD;
-   }
-   spec.base = 16;
-
-   return number(buf

[PATCH v4 8/9] vsprintf: Prevent crash when dereferencing invalid pointers

2018-04-04 Thread Petr Mladek
We already prevent crash when dereferencing some obviously broken
pointers. But the handling is not consistent. Sometimes we print "(null)"
only for pure NULL pointer, sometimes for pointers in the first
page and sometimes also for pointers in the last page (error codes).

Note that printk() calls this code under logbuf_lock. Any recursive
printks are redirected to the printk_safe implementation and the messages
are stored into per-CPU buffers. These buffers might be eventually flushed
in printk_safe_flush_on_panic() but it is not guaranteed.

This patch adds a check using probe_kernel_read(). It is not a fool-proof
test. But it should help to see the error message in 99% of situations where
the kernel would silently crash otherwise.

Also it makes the error handling unified for "%s" and the many %p*
specifiers that need to read the data from a given address. We print:

   + (null)   when accessing data on a pure NULL address
   + (efault) when accessing data on an invalid address

It does not affect the %p* specifiers that just print the given address
in some form, namely %pF, %pf, %pS, %ps, %pB, %pK, %px, and plain %p.

Note that we print (efault) for security reasons. In fact, the real
address can be seen only by %px or eventually %pK.

Signed-off-by: Petr Mladek 
---
 Documentation/core-api/printk-formats.rst |   7 ++
 lib/test_printf.c |  37 ++--
 lib/vsprintf.c| 137 ++
 3 files changed, 136 insertions(+), 45 deletions(-)

diff --git a/Documentation/core-api/printk-formats.rst 
b/Documentation/core-api/printk-formats.rst
index 934559b3c130..dc020087b12a 100644
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -50,6 +50,13 @@ A raw pointer value may be printed with %p which will hash 
the address
 before printing. The kernel also supports extended specifiers for printing
 pointers of different types.
 
+Some of the extended specifiers print the data on the given address instead
+of printing the address itself. In this case, the following error messages
+might be printed instead of the unreachable information::
+
+   (null)   data on plain NULL address
+   (efault) data on invalid address
+
 Plain Pointers
 --
 
diff --git a/lib/test_printf.c b/lib/test_printf.c
index 71ebfa43ad05..6bc386d7c7c6 100644
--- a/lib/test_printf.c
+++ b/lib/test_printf.c
@@ -209,12 +209,12 @@ test_string(void)
 #define ZEROS ""   /* hex 32 zero bits */
 
 static int __init
-plain_format(void)
+plain_format(void *ptr)
 {
char buf[PLAIN_BUF_SIZE];
int nchars;
 
-   nchars = snprintf(buf, PLAIN_BUF_SIZE, "%p", PTR);
+   nchars = snprintf(buf, PLAIN_BUF_SIZE, "%p", ptr);
 
if (nchars != PTR_WIDTH || strncmp(buf, ZEROS, strlen(ZEROS)) != 0)
return -1;
@@ -227,6 +227,7 @@ plain_format(void)
 #define PTR_WIDTH 8
 #define PTR ((void *)0x456789ab)
 #define PTR_STR "456789ab"
+#define ZEROS ""
 
 static int __init
 plain_format(void)
@@ -238,12 +239,12 @@ plain_format(void)
 #endif /* BITS_PER_LONG == 64 */
 
 static int __init
-plain_hash(void)
+plain_hash(void *ptr)
 {
char buf[PLAIN_BUF_SIZE];
int nchars;
 
-   nchars = snprintf(buf, PLAIN_BUF_SIZE, "%p", PTR);
+   nchars = snprintf(buf, PLAIN_BUF_SIZE, "%p", ptr);
 
if (nchars != PTR_WIDTH || strncmp(buf, PTR_STR, PTR_WIDTH) == 0)
return -1;
@@ -256,18 +257,18 @@ plain_hash(void)
  * after an address is hashed.
  */
 static void __init
-plain(void)
+plain(void *ptr)
 {
int err;
 
-   err = plain_hash();
+   err = plain_hash(ptr);
if (err) {
pr_warn("plain 'p' does not appear to be hashed\n");
failed_tests++;
return;
}
 
-   err = plain_format();
+   err = plain_format(ptr);
if (err) {
pr_warn("hashing plain 'p' has unexpected format\n");
failed_tests++;
@@ -275,6 +276,24 @@ plain(void)
 }
 
 static void __init
+null_pointer(void)
+{
+   plain(NULL);
+   test(ZEROS "", "%px", NULL);
+   test("(null)", "%pE", NULL);
+}
+
+#define PTR_INVALID ((void *)0x00ab)
+
+static void __init
+invalid_pointer(void)
+{
+   plain(PTR_INVALID);
+   test(ZEROS "00ab", "%px", PTR_INVALID);
+   test("(efault)", "%pE", PTR_INVALID);
+}
+
+static void __init
 symbol_ptr(void)
 {
 }
@@ -497,7 +516,9 @@ flags(void)
 static void __init
 test_pointer(void)
 {
-   plain();
+   plain(PTR);
+   null_pointer();
+   invalid_pointer();
symbol_ptr();
kernel_ptr();
struct_resource();
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 3551b7957d9e..1a080a75a825 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -599,12 +599,46 @@ char *__string(char *buf, char *end, const char *s, 
struct printf_spec spec)
return widen_string(buf, len, end, spec);

  1   2   3   4   5   6   7   8   9   10   >