Re: 3.7.10+: BUG Dentry still in use [unmount of cifs cifs]

2013-03-07 Thread Mateusz Guzik
On Tue, Mar 05, 2013 at 10:54:56AM -0800, Ben Greear wrote:
> In doing some CIFS testing (utilizing it's feature to bind to local
> address..but not sure that matters), we saw this error when trying
> to un-mount.
> 
> Our kernel is patched (nfs, some networking related patches), but there
> are no out-of-kernel patches to CIFS, so I don't *think* this is anything
> we could have caused.
> 
> This problem appears to be easily reproducible, so we will be happy
> to test patches if anyone has any suggestions.
> 
> BUG: Dentry 8800c07e43c0{i=45762,n=cifs2-01.7.lf-data} still in use (1) 
> [unmount of cifs cifs]

We encountered similar panic, but it was related to writes. In our case
the problem was that some cifsInodeInfo holding dentry references were
still around (in slow-work queue) during superblock destruction in unmount.

I reproduced the problem on 3.8 kernel (sorry, no 3.7 handy) with reads
as well, which should match your scenario.

I attached a patch that deals with this problem by grabbing refcounts to
cifs superblock on cifsInodeInfo creation. This delays sb destruction
until all cifsInodeInfos are gone. I didn't test it on 3.7.10 kernel but
it should work fine.

-- 
Mateusz Guzik
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 1a052c0..345e76b 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -91,6 +91,24 @@ struct workqueue_struct  *cifsiod_wq;
 __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
 #endif
 
+void
+cifs_sb_active(struct super_block *sb)
+{
+   struct cifs_sb_info *server = CIFS_SB(sb);
+
+   if (atomic_inc_return(&server->active) == 1)
+   atomic_inc(&sb->s_active);
+}
+
+void
+cifs_sb_deactive(struct super_block *sb)
+{
+   struct cifs_sb_info *server = CIFS_SB(sb);
+
+   if (atomic_dec_and_test(&server->active))
+   deactivate_super(sb);
+}
+
 static int
 cifs_read_super(struct super_block *sb)
 {
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7163419..0e32c34 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type;
 extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
+/* Functions related to super block operations */
+extern void cifs_sb_active(struct super_block *sb);
+extern void cifs_sb_deactive(struct super_block *sb);
+
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
 extern struct inode *cifs_root_iget(struct super_block *);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 8c0d855..7a0dd99 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -300,6 +300,8 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
mutex_init(&cfile->fh_mutex);
 
+   cifs_sb_active(inode->i_sb);
+
/*
 * If the server returned a read oplock and we have mandatory brlocks,
 * set oplock level to None.
@@ -349,7 +351,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
struct TCP_Server_Info *server = tcon->ses->server;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
-   struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+   struct super_block *sb = inode->i_sb;
+   struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifsLockInfo *li, *tmp;
struct cifs_fid fid;
struct cifs_pending_open open;
@@ -414,6 +417,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
 
cifs_put_tlink(cifs_file->tlink);
dput(cifs_file->dentry);
+   cifs_sb_deactive(sb);
kfree(cifs_file);
 }
 
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e7931cc..cba259a 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -93,6 +93,24 @@ struct workqueue_struct  *cifsiod_wq;
 __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
 #endif
 
+void
+cifs_sb_active(struct super_block *sb)
+{
+   struct cifs_sb_info *server = CIFS_SB(sb);
+
+   if (atomic_inc_return(&server->active) == 1)
+   atomic_inc(&sb->s_active);
+}
+
+void
+cifs_sb_deactive(struct super_block *sb)
+{
+   struct cifs_sb_info *server = CIFS_SB(sb);
+
+   if (atomic_dec_and_test(&server->active))
+   deactivate_super(sb);
+}
+
 static int
 cifs_read_super(struct super_block *sb)
 {
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 7163419..0e32c34 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -41,6 +41,10 @@ extern struct file_system_type cifs_fs_type;
 extern const struct address_space_operations cifs_addr_ops;
 extern const struct address_space_operations cifs_addr_ops_smallbuf;
 
+/* Functions related to super block operations */
+extern void cifs_sb_active(struct super_block *

Re: [PATCH] prctl: remove one-shot limitation for changing exe link

2016-07-30 Thread Mateusz Guzik
On Sat, Jul 30, 2016 at 12:31:40PM -0500, Eric W. Biederman wrote:
> So what I am requesting is very simple.  That the checks in
> prctl_set_mm_exe_file be tightened up to more closely approach what
> execve requires.  Thus preserving the value of the /proc/[pid]/exe for
> the applications that want to use the exe link.
> 
> Once the checks in prctl_set_mm_exe_file are tightened up please feel
> free to remove the one shot test.
> 

This is more fishy.

First of all exe_file is used by the audit subsystem. So someone has to
ask audit people what is the significance (if any) of the field.

All exe_file users but one use get_mm_exe_file and handle NULL
gracefully.

Even with the current limit of changing the field once, the user can
cause a transient failure of get_mm_exe_file which can fail to increment
the refcount before it drops to 0.

This transient failure can be used to get a NULL value stored in
->exe_file during fork (in dup_mmap):
RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm));

The one place which is not using get_mm_exe_file to get to the pointer
is audit_exe_compare:
rcu_read_lock();
exe_file = rcu_dereference(tsk->mm->exe_file);
ino = exe_file->f_inode->i_ino;
dev = exe_file->f_inode->i_sb->s_dev;
rcu_read_unlock();

This is buggy on 2 accounts:
1. exe_file can be NULL
2. rcu does not protect f_inode

The issue is made worse with allowing arbitrary number changes.

Modifying get_mm_exe_file to retry is trivial and in effect never return
NULL is trivial. With arbitrary number of changes allowed this may
require some cond_resched() or something.

For comments I cc'ed Richard Guy Briggs, who is both an audit person and
the author of audit_exe_compare.
-- 
Mateusz Guzik


Re: Is reading from /proc/self/smaps thread-safe?

2016-07-30 Thread Mateusz Guzik
On Tue, Jul 26, 2016 at 02:44:48PM +0200, Marcin Ślusarz wrote:
> Hey
> 
> I have a simple program that mmaps 8MB of anonymous memory, spawns 16
> threads, reads /proc/self/smaps in each thread and looks up whether
> mapped address can be found in smaps. From time to time it's not there.
> 
> Is this supposed to work reliably?
> 
> My guess is that libc functions allocate memory internally using mmap
> and modify process' address space while other thread is iterating over
> vmas.
> 
> I see that reading from smaps takes mmap_sem in read mode. I'm guessing
> vm modifications are done under mmap_sem in write mode.
> 
> Documentation/filesystem/proc.txt says reading from smaps is "slow but
> very precise" (although in context of RSS).
> 

Address space modification definitely happens as threads get their
stacks mmaped and unmapped.

If you run your program under strace you will see all threads perform
multiple read()s to get the content as the kernel keeps return short
reads (below 1 page size). In particular, seq_read imposes the limit
artificially.

Since there are multiple trips to the kernel, locks are dropped and
special measures are needed to maintain consistency of the result.

In m_start you can see there is a best-effort attempt: it is remembered
what vma was accessed by the previous run. But the vma can be unmapped
before we get here next time.

So no, reading the file when the content is bigger than 4k is not
guaranteed to give consistent results across reads.

I don't have a good idea how to fix it, and it is likely not worth
working on. This is not the only place which is unable to return
reliable information for sufficiently large dataset.

The obvious thing to try out is just storing all the necessary
information and generating the text form on read. Unfortunately even
that data is quite big -- over 100 bytes per vma. This can be shrinked
down significantly with encoding what information is present as opposed
to keeping all records). But with thousands of entries per application
this translates into kilobytes of memory which would have to allocated
just to hold it, sounds like a non-starter to me.

-- 
Mateusz Guzik


Re: [PACTH v4 1/3] mm, proc: Implement /proc//totmaps

2016-08-31 Thread Mateusz Guzik
On Wed, Aug 31, 2016 at 12:36:26PM -0400, Robert Foss wrote:
> On 2016-08-31 05:45 AM, Jacek Anaszewski wrote:
> > > +static void *m_totmaps_start(struct seq_file *p, loff_t *pos)
> > > +{
> > > +return NULL + (*pos == 0);
> > > +}
> > > +
> > > +static void *m_totmaps_next(struct seq_file *p, void *v, loff_t *pos)
> > > +{
> > > +++*pos;
> > > +return NULL;
> > > +}
> > > +
> > 
> > When reading totmaps of kernel processes the following NULL pointer
> > dereference occurs:
> > 
> > Unable to handle kernel NULL pointer dereference at virtual address
> > 0044
> > [] (down_read) from [] (totmaps_proc_show+0x2c/0x1e8)
> > [] (totmaps_proc_show) from [] (seq_read+0x1c8/0x4b8)
> > [] (seq_read) from [] (__vfs_read+0x2c/0x110)
> > [] (__vfs_read) from [] (vfs_read+0x8c/0x110)
> > [] (vfs_read) from [] (SyS_read+0x40/0x8c)
> > [] (SyS_read) from [] (ret_fast_syscall+0x0/0x3c)
> > 
> > It seems that some protection is needed for such processes, so that
> > totmaps would return empty string then, like in case of smaps.
> > 
> 
> Thanks for the testing Jacek!
> 
> I had a look around the corresponding smaps code, but I'm not seeing any
> checks, do you know where that check actually is made?
> 

See m_start in f/sproc/task_mmu.c. It not only check for non-null mm,
but also tries to bump ->mm_users and only then proceeds to walk the mm.

-- 
Mateusz Guzik


Re: [v2 PATCH] mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct

2018-03-26 Thread Mateusz Guzik
On Tue, Mar 27, 2018 at 02:20:39AM +0800, Yang Shi wrote:
> mmap_sem is on the hot path of kernel, and it very contended, but it is
> abused too. It is used to protect arg_start|end and evn_start|end when
> reading /proc/$PID/cmdline and /proc/$PID/environ, but it doesn't make
> sense since those proc files just expect to read 4 values atomically and
> not related to VM, they could be set to arbitrary values by C/R.
> 

They are not arbitrary - there is basic validation performed when
setting them.

> And, the mmap_sem contention may cause unexpected issue like below:
> 
> INFO: task ps:14018 blocked for more than 120 seconds.
>Tainted: GE 4.9.79-009.ali3000.alios7.x86_64 #1
>  "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this
> message.
>  ps  D0 14018  1 0x0004
>   885582f84000 885e8682f000 880972943000 885ebf499bc0
>   8828ee12 c900349bfca8 817154d0 0040
>   00ff812f872a 885ebf499bc0 024000d000948300 880972943000
>  Call Trace:
>   [] ? __schedule+0x250/0x730
>   [] schedule+0x36/0x80
>   [] rwsem_down_read_failed+0xf0/0x150
>   [] call_rwsem_down_read_failed+0x18/0x30
>   [] down_read+0x20/0x40
>   [] proc_pid_cmdline_read+0xd9/0x4e0
>   [] ? do_filp_open+0xa5/0x100
>   [] __vfs_read+0x37/0x150
>   [] ? security_file_permission+0x9b/0xc0
>   [] vfs_read+0x96/0x130
>   [] SyS_read+0x55/0xc0
>   [] entry_SYSCALL_64_fastpath+0x1a/0xc5
> 
> Both Alexey Dobriyan and Michal Hocko suggested to use dedicated lock
> for them to mitigate the abuse of mmap_sem.
> 

While switching to arg spinlock here will relieve mmap_sem to an extent,
it wont help with the problem you are seeing here.

proc_pid_cmdline_read -> access_process_vm -> __access_remote_vm and you
got yet another down_read(&mm->mmap_sem);.

i.e. the issue you ran into is well known and predates my change.

The problem does not stem from contention either, but blocking for a
long time while holding the lock - the most common example is dealing
with dead nfs mount vs mmaped areas.

I don't have good ideas how to fix the problem. The least bad I came up
with was to trylock with a timeout - after a failure either return an
error or resort to returning p_comm. ps/top could be modified to
fallback to snatching the name from /status.

Since the lock owner is now being stored in the semaphore, perhaps the
above routine can happily spin until it grabs the lock or the owner is
detected to have gone into uninterruptible sleep and react accordingly. 

I don't know whether it is feasible to somehow avoid the mmap lock
altogether.

If it has to be there no matter what the code can be refactored to grab
it once and relock only if copyout would fault. This would in particular
reduce the number of times it is taken to begin with and still provide
the current synchronisation against prctl. But the fundamental problem
will remain.

That said, refactoring above will have the same effect as your patch and
will avoid growing mm_struct.

That's my $0,03. MM overlords have to comment on what to do with this.

> So, introduce a new spinlock in mm_struct to protect the concurrent access
> to arg_start|end and env_start|end.
> 
> And, commit ddf1d398e517e660207e2c807f76a90df543a217 ("prctl: take mmap
> sem for writing to protect against others") changed down_read to
> down_write to avoid write race condition in prctl_set_mm(). Since we
> already have dedicated lock to protect them, it is safe to change back
> to down_read.
> 
> Signed-off-by: Yang Shi 
> Cc: Alexey Dobriyan 
> Cc: Michal Hocko 
> Cc: Matthew Wilcox 
> Cc: Mateusz Guzik 
> Cc: Cyrill Gorcunov 
> ---
> v1 --> v2:
> * Use spinlock instead of rwlock per Mattew's suggestion
> * Replace down_write to down_read in prctl_set_mm (see commit log for details)
> 
>  fs/proc/base.c   |  8 
>  include/linux/mm_types.h |  2 ++
>  kernel/fork.c|  1 +
>  kernel/sys.c | 14 ++
>  mm/init-mm.c |  1 +
>  5 files changed, 18 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 9298324..e0282b6 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -242,12 +242,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, 
> char __user *buf,
>   goto out_mmput;
>   }
>  
> - down_read(&mm->mmap_sem);
> + spin_lock(&mm->arg_lock);
>   arg_start = mm->arg_start;
>   arg_end = mm->arg_end;
>   env_start = mm->env_start;
>   env_end = mm->env_end;
> - up_read(&mm->mmap_sem);
> + spin_unlock(&mm->arg_lock);
>  
>   BUG_ON(arg_st

Re: [v2 PATCH] mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct

2018-03-27 Thread Mateusz Guzik
On Tue, Mar 27, 2018 at 08:29:39AM +0200, Michal Hocko wrote:
> On Tue 27-03-18 02:20:39, Yang Shi wrote:
> [...]
> The patch looks reasonable to me. Maybe it would be better to be more
> explicit about the purpose of the patch. As others noticed, this alone
> wouldn't solve the mmap_sem contention issues. I _think_ that if you
> were more explicit about the mmap_sem abuse it would trigger less
> questions.
> 

>From what I gather even with other fixes the kernel will still end up
grabbing the semaphore. In this case I don't see what's the upside of
adding the spinlock for args. The downside is growth of mm_struct.

i.e. the code can be refactored to just hold the lock and relock only if
necessary (unable to copy to user without faulting)

-- 
Mateusz Guzik


Re: linux-next: build failure after merge of the vfs tree

2018-03-19 Thread Mateusz Guzik
On Mon, Mar 19, 2018 at 05:06:27PM +1100, Stephen Rothwell wrote:
> Hi Al,
> 
> After merging the vfs tree, today's linux-next build (x86_64 allnoconfig)
> failed like this:
> 
> fs/super.c: In function 'do_thaw_all_callback':
> fs/super.c:942:3: error: implicit declaration of function 
> 'emergency_thaw_bdev'; did you mean 'emergency_remount'? 
> [-Werror=implicit-function-declaration]
>emergency_thaw_bdev(sb);
>^~~
>emergency_remount
> 
> Caused by commit
> 
>   92afc556e622 ("buffer.c: call thaw_super during emergency thaw")
> 
> I have reverted that commit for today.
> 

Oops, did not test with CONFIG_BLOCK disabled. The sysrq func itself is guarded
with it so imho the right fixup is to do the same thing:

diff --git a/fs/super.c b/fs/super.c
index 5fa9a8d..86b5575 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -935,6 +935,7 @@ void emergency_remount(void)
}
 }
 
+#ifdef CONFIG_BLOCK
 static void do_thaw_all_callback(struct super_block *sb)
 {
down_write(&sb->s_umount);
@@ -968,6 +969,7 @@ void emergency_thaw_all(void)
schedule_work(work);
}
 }
+#endif
 
 /*
  * Unnamed block devices are dummy devices used by virtual


-- 
Mateusz Guzik


Re: [PATCH] vfs: Add AT_EMPTY_PATH_NOCHECK as unchecked AT_EMPTY_PATH

2024-06-22 Thread Mateusz Guzik
+cc Linus

On Sat, Jun 22, 2024 at 06:56:08PM +0800, Xi Ruoyao wrote:
> It's cheap to check if the path is empty in the userspace, but expensive
> to check if a userspace string is empty from the kernel.  So using statx
> and AT_EMPTY_PATH to implement fstat is slower than a "native" fstat
> call.  But for arch/loongarch fstat does not exist so we have to use
> statx, and on all 32-bit architectures we must use statx after 2037.
> And seccomp also cannot audit AT_EMPTY_PATH properly because it cannot
> check if path is empty.
> 
> To resolve these issues, add a relaxed version of AT_EMPTY_PATH: it does
> not check if the path is empty, but just assumes the path is empty and
> then behaves like AT_EMPTY_PATH.
> 
> Link: https://sourceware.org/pipermail/libc-alpha/2023-September/151364.html
> Link: 
> https://lore.kernel.org/loongarch/599df4a3-47a4-49be-9c81-8e21ea1f9...@xen0n.name/

imo the thing to do is to add fstat for your arch and add fstatx for
everyone.

My argument for fstatx specifically is that Rust uses statx instead of
fstat and is growing in popularity.

To sum up the problem: stat and statx met with "" + AT_EMPTY_PATH have
more work to do than fstat and its hypotethical statx counterpart:
- buf alloc/free for the path
- userspace access (very painful on x86_64 + SMAP)
- lockref acquire/release

(and other things concerning lookup itself which I'm going to ignore
here)

Your patch avoids the peek at userspace, but the other overhead remains.
In particular the lockref cycle, apart from adding work single-threaded,
adds avoidable serialization against other threads issuing stat(x) on
the same file. iow your patch still leaves performance on the table and
I don't think it is necessary.

If the flag is the way to go (I don't see why though), I would suggest
something like AT_NO_PATH and requring NULL as the path argument (or
some other predefined "there is nothing here" constant).

I wanted to type up a proposal for fstatx (+ patch) some time ago, but
some refactoring was needed to make it happen and put it on the back
burner.

Perhaps you would be willing to pick it up, assuming the vfs folk are
oke with it.

Regardless of what happens with statx or this patch you should probably
add fstat anyway.

If there are any other perf-sensitive syscalls which don't have their
fd-only variants they should be plugged to, but I can't think of
anything.

> ---
>  fs/namei.c | 8 +++-
>  fs/stat.c  | 4 +++-
>  include/linux/namei.h  | 4 
>  include/trace/misc/fs.h| 1 +
>  include/uapi/linux/fcntl.h | 3 +++
>  5 files changed, 18 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/namei.c b/fs/namei.c
> index 37fb0a8aa09a..0c44a7ea5961 100644
> --- a/fs/namei.c
> +++ b/fs/namei.c
> @@ -147,7 +147,13 @@ getname_flags(const char __user *filename, int flags, 
> int *empty)
>   kname = (char *)result->iname;
>   result->name = kname;
>  
> - len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
> + if (!(flags & LOOKUP_EMPTY_NOCHECK))
> + len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX);
> + else {
> + len = 0;
> + kname[0] = '\0';
> + }
> +
>   if (unlikely(len < 0)) {
>   __putname(result);
>   return ERR_PTR(len);
> diff --git a/fs/stat.c b/fs/stat.c
> index 70bd3e888cfa..53944d3287cd 100644
> --- a/fs/stat.c
> +++ b/fs/stat.c
> @@ -210,6 +210,8 @@ int getname_statx_lookup_flags(int flags)
>   lookup_flags |= LOOKUP_AUTOMOUNT;
>   if (flags & AT_EMPTY_PATH)
>   lookup_flags |= LOOKUP_EMPTY;
> + if (flags & AT_EMPTY_PATH_NOCHECK)
> + lookup_flags |= LOOKUP_EMPTY | LOOKUP_EMPTY_NOCHECK;
>  
>   return lookup_flags;
>  }
> @@ -237,7 +239,7 @@ static int vfs_statx(int dfd, struct filename *filename, 
> int flags,
>   int error;
>  
>   if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
> -   AT_STATX_SYNC_TYPE))
> +   AT_STATX_SYNC_TYPE | AT_EMPTY_PATH_NOCHECK))
>   return -EINVAL;
>  
>  retry:
> diff --git a/include/linux/namei.h b/include/linux/namei.h
> index 967aa9ea9f96..def6a8a1b531 100644
> --- a/include/linux/namei.h
> +++ b/include/linux/namei.h
> @@ -45,9 +45,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
>  #define LOOKUP_IN_ROOT   0x10 /* Treat dirfd as fs root. */
>  #define LOOKUP_CACHED0x20 /* Only do cached lookup */
>  #define LOOKUP_LINKAT_EMPTY  0x40 /* Linkat request with empty path. */
> +
>  /* LOOKUP_* flags which do scope-related checks based on the dirfd. */
>  #define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)
>  
> +/* If this is set, LOOKUP_EMPTY must be set as well. */
> +#define LOOKUP_EMPTY_NOCHECK 0x80 /* Consider path empty. */
> +
>  extern int path_pts(struct path *path);
>  
>  extern int user_path_at_empty(int, const char __user *, unsigned, struct 
> 

Re: [PATCH] vfs: Add AT_EMPTY_PATH_NOCHECK as unchecked AT_EMPTY_PATH

2024-06-22 Thread Mateusz Guzik
On Sun, Jun 23, 2024 at 2:59 AM Xi Ruoyao  wrote:
>
> On Sat, 2024-06-22 at 15:41 -0700, Linus Torvalds wrote:
>
> > I do think that we should make AT_EMPTY_PATH with a NULL path
> > "JustWork(tm)", because the stupid "look if the pathname is empty" is
> > horrible.
> >
> > But moving that check into getname() is *NOT* the right answer,
> > because by the time you get to getname(), you have already lost.
>
> Oops.  I'll try to get around of getname() too...
>
> > So the short-cut in vfs_fstatat() to never get a pathname is
> > disgusting - people should have used 'fstat()' - but it's _important_
> > disgusting.
>
> The problem is we don't have fstat() for LoongArch, and it'll be
> unusable on all 32-bit arch after 2037.
>
> And Arnd hates the idea adding fstat() for LoongArch because there would
> be one more 32-bit arch broken in 2037.
>
> Or should we just add something like "fstat_2037()"?
>

In that case fstat is out of the question, but no problem.

It was suggested to make AT_EMPTY_PATH + NULL pathname do the right
thing and have the syscalls short-circuit as needed.

for statx it would look like this (except you are going to have
implement do_statx_by_fd):

diff --git a/fs/stat.c b/fs/stat.c
index 16aa1f5ceec4..0afe72b320cc 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -710,6 +710,9 @@ SYSCALL_DEFINE5(statx,
int ret;
struct filename *name;

+   if (flags == AT_EMPTY_PATH && filename == NULL)
+   return do_statx_by_fd(...);
+
name = getname_flags(filename, getname_statx_lookup_flags(flags));
ret = do_statx(dfd, name, flags, mask, buffer);
    putname(name);

and so on

Personally I would prefer if fstatx was added instead, fwiw most
massaging in the area will be the same regardless.

-- 
Mateusz Guzik 



Re: [PATCH] vfs: Add AT_EMPTY_PATH_NOCHECK as unchecked AT_EMPTY_PATH

2024-06-23 Thread Mateusz Guzik
On Sun, Jun 23, 2024 at 3:22 AM Xi Ruoyao  wrote:
>
> On Sun, 2024-06-23 at 03:07 +0200, Mateusz Guzik wrote:
> > On Sun, Jun 23, 2024 at 2:59 AM Xi Ruoyao  wrote:
> > >
> > > On Sat, 2024-06-22 at 15:41 -0700, Linus Torvalds wrote:
> > >
> > > > I do think that we should make AT_EMPTY_PATH with a NULL path
> > > > "JustWork(tm)", because the stupid "look if the pathname is empty" is
> > > > horrible.
> > > >
> > > > But moving that check into getname() is *NOT* the right answer,
> > > > because by the time you get to getname(), you have already lost.
> > >
> > > Oops.  I'll try to get around of getname() too...
> > >
> > > > So the short-cut in vfs_fstatat() to never get a pathname is
> > > > disgusting - people should have used 'fstat()' - but it's _important_
> > > > disgusting.
> > >
> > > The problem is we don't have fstat() for LoongArch, and it'll be
> > > unusable on all 32-bit arch after 2037.
> > >
> > > And Arnd hates the idea adding fstat() for LoongArch because there would
> > > be one more 32-bit arch broken in 2037.
> > >
> > > Or should we just add something like "fstat_2037()"?
> > >
> >
> > In that case fstat is out of the question, but no problem.
> >
> > It was suggested to make AT_EMPTY_PATH + NULL pathname do the right
> > thing and have the syscalls short-circuit as needed.
> >
> > for statx it would look like this (except you are going to have
> > implement do_statx_by_fd):
> >
> > diff --git a/fs/stat.c b/fs/stat.c
> > index 16aa1f5ceec4..0afe72b320cc 100644
> > --- a/fs/stat.c
> > +++ b/fs/stat.c
> > @@ -710,6 +710,9 @@ SYSCALL_DEFINE5(statx,
> > int ret;
> > struct filename *name;
> >
> > +   if (flags == AT_EMPTY_PATH && filename == NULL)
> > +   return do_statx_by_fd(...);
> > +
> > name = getname_flags(filename, getname_statx_lookup_flags(flags));
> > ret = do_statx(dfd, name, flags, mask, buffer);
> > putname(name);
> >
> > and so on
> >
> > Personally I would prefer if fstatx was added instead, fwiw most
> > massaging in the area will be the same regardless.
>
> I do agree.  But if we do it *now* would it be "breaking the userspace"
> if some stupid program relies on fstatx() to return some error when the
> path is NULL?  The "stupid programs" may just exist in the wild...
>

You mean statx? fstatx would not accept a path to begin with.

Worry about some code breaking is why I suggested a dedicated flag
(AT_NO_PATH) myself in case fstatx is a no-go.

I am not convinced messing with AT_* flags is justified to begin with.
Any syscall which does not have a fd-only variant and is found to be
routinely used with AT_EMPTY_PATH should get one instead.

As far as I know that's only stat(due to a perf bug in glibc, now
fixed) and increasingly statx.

Suppose AT_EMPTY_PATH + NULL are to land and stat + statx get the
treatment. What about all the other syscalls? Sorting all that out is
quite a big of churn which is probably not worth it. But then there is
a feature gap in that they EFAULT for this pair while the stat* ones
don't and that's bound to raise confusion. Then one could add the
check in the bowels of path lookup in similar way you do did to
maintain the same behavior (but without per-syscall churn) and a big
fat warning that anyone getting there often needs to get patched with
short-circuiting the entire thing. So I think that's either a lot of
churn or nasty additions.

Regardless, as noted above, either making fstatx a thing or
short-circuiting mostly the same patching has to be done for
statx-related stuff.

However, this is not my call to make.

-- 
Mateusz Guzik 



Re: [PATCH RFC v3 12/13] mm: add SLAB_TYPESAFE_BY_RCU to files_cache

2024-08-12 Thread Mateusz Guzik
On Mon, Aug 12, 2024 at 09:29:16PM -0700, Andrii Nakryiko wrote:
> Add RCU protection for file struct's backing memory by adding
> SLAB_TYPESAFE_BY_RCU flag to files_cachep. This will allow to locklessly
> access struct file's fields under RCU lock protection without having to
> take much more expensive and contended locks.
> 
> This is going to be used for lockless uprobe look up in the next patch.
> 
> Suggested-by: Matthew Wilcox 
> Signed-off-by: Andrii Nakryiko 
> ---
>  kernel/fork.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 76ebafb956a6..91ecc32a491c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -3157,8 +3157,8 @@ void __init proc_caches_init(void)
>   NULL);
>   files_cachep = kmem_cache_create("files_cache",
>   sizeof(struct files_struct), 0,
> - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
> - NULL);
> + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
> + SLAB_ACCOUNT, NULL);
>   fs_cachep = kmem_cache_create("fs_cache",
>   sizeof(struct fs_struct), 0,
>   SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,

Did you mean to add it to the cache backing 'struct file' allocations?

That cache is created in fs/file_table.c and already has the flag:
filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN |
SLAB_PANIC | SLAB_ACCOUNT, NULL);

The cache you are modifying in this patch contains the fd array et al
and is of no consequence to "uprobes: add speculative lockless VMA to
inode resolution".

iow this patch needs to be dropped



Re: [PATCH RFC v3 13/13] uprobes: add speculative lockless VMA to inode resolution

2024-08-12 Thread Mateusz Guzik
On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> attempting uprobe look up speculatively.
> 
> We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> validate that mm_struct stays intact for entire duration of this
> speculation. If not, we fall back to mmap_lock-protected lookup.
> 
> This allows to avoid contention on mmap_lock in absolutely majority of
> cases, nicely improving uprobe/uretprobe scalability.
> 

Here I have to admit to being mostly ignorant about the mm, so bear with
me. :>

I note the result of find_active_uprobe_speculative is immediately stale
in face of modifications.

The thing I'm after is that the mmap_lock_speculation business adds
overhead on archs where a release fence is not a de facto nop and I
don't believe the commit message justifies it. Definitely a bummer to
add merely it for uprobes. If there are bigger plans concerning it
that's a different story of course.

With this in mind I have to ask if instead you could perhaps get away
with the already present per-vma sequence counter?



Re: [PATCH RFC v3 13/13] uprobes: add speculative lockless VMA to inode resolution

2024-08-15 Thread Mateusz Guzik
On Tue, Aug 13, 2024 at 08:36:03AM -0700, Suren Baghdasaryan wrote:
> On Mon, Aug 12, 2024 at 11:18 PM Mateusz Guzik  wrote:
> >
> > On Mon, Aug 12, 2024 at 09:29:17PM -0700, Andrii Nakryiko wrote:
> > > Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> > > vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> > > attempting uprobe look up speculatively.
> > >
> > > We rely on newly added mmap_lock_speculation_{start,end}() helpers to
> > > validate that mm_struct stays intact for entire duration of this
> > > speculation. If not, we fall back to mmap_lock-protected lookup.
> > >
> > > This allows to avoid contention on mmap_lock in absolutely majority of
> > > cases, nicely improving uprobe/uretprobe scalability.
> > >
> >
> > Here I have to admit to being mostly ignorant about the mm, so bear with
> > me. :>
> >
> > I note the result of find_active_uprobe_speculative is immediately stale
> > in face of modifications.
> >
> > The thing I'm after is that the mmap_lock_speculation business adds
> > overhead on archs where a release fence is not a de facto nop and I
> > don't believe the commit message justifies it. Definitely a bummer to
> > add merely it for uprobes. If there are bigger plans concerning it
> > that's a different story of course.
> >
> > With this in mind I have to ask if instead you could perhaps get away
> > with the already present per-vma sequence counter?
> 
> per-vma sequence counter does not implement acquire/release logic, it
> relies on vma->vm_lock for synchronization. So if we want to use it,
> we would have to add additional memory barriers here. This is likely
> possible but as I mentioned before we would need to ensure the
> pagefault path does not regress. OTOH mm->mm_lock_seq already halfway
> there (it implements acquire/release logic), we just had to ensure
> mmap_write_lock() increments mm->mm_lock_seq.
> 
> So, from the release fence overhead POV I think whether we use
> mm->mm_lock_seq or vma->vm_lock, we would still need a proper fence
> here.
> 

Per my previous e-mail I'm not particularly familiar with mm internals,
so I'm going to handwave a little bit with my $0,03 concerning multicore
in general and if you disagree with it that's your business. For the
time being I have no interest in digging into any of this.

Before I do, to prevent this thread from being a total waste, here are
some remarks concerning the patch with the assumption that the core idea
lands.

>From the commit message:
> Now that files_cachep is SLAB_TYPESAFE_BY_RCU, we can safely access
> vma->vm_file->f_inode lockless only under rcu_read_lock() protection,
> attempting uprobe look up speculatively.

Just in case I'll note a nit that this paragraph will need to be removed
since the patch adding the flag is getting dropped.

A non-nit which may or may not end up mattering is that the flag (which
*is* set on the filep slab cache) makes things more difficult to
validate. Normal RCU usage guarantees that the object itself wont be
freed as long you follow the rules. However, the SLAB_TYPESAFE_BY_RCU
flag weakens it significantly -- the thing at hand will always be a
'struct file', but it may get reallocated to *another* file from under
you. Whether this aspect plays a role here I don't know.

> +static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
> +{
> + const vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
> + struct mm_struct *mm = current->mm;
> + struct uprobe *uprobe;
> + struct vm_area_struct *vma;
> + struct file *vm_file;
> + struct inode *vm_inode;
> + unsigned long vm_pgoff, vm_start;
> + int seq;
> + loff_t offset;
> +
> + if (!mmap_lock_speculation_start(mm, &seq))
> + return NULL;
> +
> + rcu_read_lock();
> +

I don't think there is a correctness problem here, but entering rcu
*after* deciding to speculatively do the lookup feels backwards.

> + vma = vma_lookup(mm, bp_vaddr);
> + if (!vma)
> + goto bail;
> +
> + vm_file = data_race(vma->vm_file);
> + if (!vm_file || (vma->vm_flags & flags) != VM_MAYEXEC)
> + goto bail;
> +

If vma teardown is allowed to progress and the file got fput'ed...

> + vm_inode = data_race(vm_file->f_inode);

... the inode can be NULL, I don't know if that's handled.

More importantly though, per my previous description of
SLAB_TYPESAFE_BY_RCU, by now the file could have been reallocated and
the inode you did find is completely unrelated

Re: [PATCH RFC v3 13/13] uprobes: add speculative lockless VMA to inode resolution

2024-08-15 Thread Mateusz Guzik
On Thu, Aug 15, 2024 at 10:45:45AM -0700, Suren Baghdasaryan wrote:
> >From all the above, my understanding of your objection is that
> checking mmap_lock during our speculation is too coarse-grained and
> you would prefer to use the VMA seq counter to check that the VMA we
> are working on is unchanged. I agree, that would be ideal. I had a
> quick chat with Jann about this and the conclusion we came to is that
> we would need to add an additional smp_wmb() barrier inside
> vma_start_write() and a smp_rmb() in the speculation code:
> 
> static inline void vma_start_write(struct vm_area_struct *vma)
> {
> int mm_lock_seq;
> 
> if (__is_vma_write_locked(vma, &mm_lock_seq))
> return;
> 
> down_write(&vma->vm_lock->lock);
> /*
>  * We should use WRITE_ONCE() here because we can have concurrent 
> reads
>  * from the early lockless pessimistic check in vma_start_read().
>  * We don't really care about the correctness of that early check, but
>  * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
>  */
> WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
> +smp_wmb();
> up_write(&vma->vm_lock->lock);
> }
> 
> Note: up_write(&vma->vm_lock->lock) in the vma_start_write() is not
> enough because it's one-way permeable (it's a "RELEASE operation") and
> later vma->vm_file store (or any other VMA modification) can move
> before our vma->vm_lock_seq store.
> 
> This makes vma_start_write() heavier but again, it's write-locking, so
> should not be considered a fast path.
> With this change we can use the code suggested by Andrii in
> https://lore.kernel.org/all/caef4bzzelg0wsyw2m7kfy0+aprpapvby7fbawb9vjca2+6k...@mail.gmail.com/
> with an additional smp_rmb():
> 
> rcu_read_lock()
> vma = find_vma(...)
> if (!vma) /* bail */
> 
> vm_lock_seq = smp_load_acquire(&vma->vm_lock_seq);
> mm_lock_seq = smp_load_acquire(&vma->mm->mm_lock_seq);
> /* I think vm_lock has to be acquired first to avoid the race */
> if (mm_lock_seq == vm_lock_seq)
> /* bail, vma is write-locked */
> ... perform uprobe lookup logic based on vma->vm_file->f_inode ...
> smp_rmb();
> if (vma->vm_lock_seq != vm_lock_seq)
> /* bail, VMA might have changed */
> 
> The smp_rmb() is needed so that vma->vm_lock_seq load does not get
> reordered and moved up before speculation.
> 
> I'm CC'ing Jann since he understands memory barriers way better than
> me and will keep me honest.
> 

So I briefly noted that maybe down_read on the vma would do it, but per
Andrii parallel lookups on the same vma on multiple CPUs are expected,
which whacks that out.

When I initially mentioned per-vma sequence counters I blindly assumed
they worked the usual way. I don't believe any fancy rework here is
warranted especially given that the per-mm counter thing is expected to
have other uses.

However, chances are decent this can still be worked out with per-vma
granualarity all while avoiding any stores on lookup and without
invasive (or complicated) changes. The lockless uprobe code claims to
guarantee only false negatives and the miss always falls back to the
mmap semaphore lookup. There may be something here, I'm going to chew on
it.

That said, thank you both for writeup so far.



Re: [PATCH RFC v3 13/13] uprobes: add speculative lockless VMA to inode resolution

2024-08-15 Thread Mateusz Guzik
On Thu, Aug 15, 2024 at 8:58 PM Jann Horn  wrote:
> Stupid question: Is this uprobe stuff actually such a hot codepath
> that it makes sense to optimize it to be faster than the page fault
> path?
>

That's what I implicitly asked, hoping a down_read on vma would do it,
but Andrii claims multiple parallel lookups on the same vma are a
problem.

Even so, I suspect something *simple* is doable here which avoids any
writes to vmas and does not need the mm-wide sequence counter. It may
be requirements are lax enough that merely observing some state is the
same before and after uprobe lookup will be sufficient, or maybe some
other hackery is viable without messing with fences in
vma_start_write.
-- 
Mateusz Guzik 



Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-21 Thread Mateusz Guzik
On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:
> On Mon, 2015-04-20 at 13:49 -0700, Eric Dumazet wrote:
> > On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> > > On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
> > > 
> > > > Sorry for spam but I came up with another hack. :)
> > > > 
> > > > The idea is that we can have a variable which would signify the that
> > > > given thread is playing with fd table in fd_install (kind of a lock
> > > > embedded into task_struct). We would also have a flag in files struct
> > > > indicating that a thread would like to resize it.
> > > > 
> > > > expand_fdtable would set the flag and iterate over all threads waiting
> > > > for all of them to have the var set to 0.
> > > 
> > > The opposite : you have to block them in some way and add a rcu_sched()
> > > or something.
> > 

What I described would block them, although it was a crappy approach
(iterating threads vs cpus). I was wondering if RCU could be abused for
this feature and apparently it can.

> > Here is the patch I cooked here but not yet tested.
> 
> In following version :
> 
> 1) I replaced the yield() hack by a proper wait queue.
> 
> 2) I do not invoke synchronize_sched() for mono threaded programs.
> 
> 3) I avoid multiple threads doing a resize and then only one wins the
> deal.
> 

One could argue this last bit could be committed separately (a different
logical change).

As I read up about synchronize_sched and rcu_read_lock_sched, the code
should be correct.

Also see nits below.

> (copying/clearing big amount of memory only to discover another guy did
> the same is a big waste of resources)
> 
> 
> This seems to run properly on my hosts.
> 
> Your comments/tests are most welcomed, thanks !
> 
>  fs/file.c   |   41 +-
>  include/linux/fdtable.h |3 ++
>  2 files changed, 39 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/file.c b/fs/file.c
> index 93c5f89c248b..e0e113a56444 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, int 
> nr)
>  
>   spin_unlock(&files->file_lock);
>   new_fdt = alloc_fdtable(nr);
> + /* make sure no __fd_install() are still updating fdt */
> + if (atomic_read(&files->count) > 1)
> + synchronize_sched();
>   spin_lock(&files->file_lock);
>   if (!new_fdt)
>   return -ENOMEM;
> @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, 
> int nr)
>   if (cur_fdt != &files->fdtab)
>   call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
>   } else {
> + WARN_ON_ONCE(1);
>   /* Somebody else expanded, so undo our attempt */
>   __free_fdtable(new_fdt);

The reader may be left confused why there is a warning while the comment
does not indicate anything is wrong.

>   }
> + /* coupled with smp_rmb() in __fd_install() */
> + smp_wmb();
>   return 1;
>  }
>  
> @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, 
> int nr)
>  static int expand_files(struct files_struct *files, int nr)
>  {
>   struct fdtable *fdt;
> + int expanded = 0;
>  
> +begin:
>   fdt = files_fdtable(files);
>  
>   /* Do we need to expand? */
>   if (nr < fdt->max_fds)
> - return 0;
> + return expanded;
>  
>   /* Can we expand? */
>   if (nr >= sysctl_nr_open)
>   return -EMFILE;
>  
> + while (unlikely(files->resize_in_progress)) {
> + spin_unlock(&files->file_lock);
> + expanded = 1;
> + wait_event(files->resize_wait, !files->resize_in_progress);
> + spin_lock(&files->file_lock);
> + goto begin;
> + }

This does not loop anymore, so s/while/if/ ?

> +
>   /* All good, so we try */
> - return expand_fdtable(files, nr);
> + files->resize_in_progress = true;
> + expanded = expand_fdtable(files, nr);
> + files->resize_in_progress = false;
> + wake_up_all(&files->resize_wait);
> + return expanded;
>  }
>  
>  static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
> @@ -256,6 +276,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, 
> int *errorp)
>   atomic_set(&newf->count, 1);
>  
>   spin_lock_init(&newf->file_lock);
> + newf->resize_in_progress = 0;
> +  

Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-21 Thread Mateusz Guzik
On Tue, Apr 21, 2015 at 10:06:24PM +0200, Mateusz Guzik wrote:
> On Tue, Apr 21, 2015 at 11:05:43AM -0700, Eric Dumazet wrote:
> > On Mon, 2015-04-20 at 13:49 -0700, Eric Dumazet wrote:
> > > On Mon, 2015-04-20 at 10:15 -0700, Eric Dumazet wrote:
> > > > On Mon, 2015-04-20 at 17:10 +0200, Mateusz Guzik wrote:
> > > > 
> > > > > Sorry for spam but I came up with another hack. :)
> > > > > 
> > > > > The idea is that we can have a variable which would signify the that
> > > > > given thread is playing with fd table in fd_install (kind of a lock
> > > > > embedded into task_struct). We would also have a flag in files struct
> > > > > indicating that a thread would like to resize it.
> > > > > 
> > > > > expand_fdtable would set the flag and iterate over all threads waiting
> > > > > for all of them to have the var set to 0.
> > > > 
> > > > The opposite : you have to block them in some way and add a rcu_sched()
> > > > or something.
> > > 
> 
> What I described would block them, although it was a crappy approach
> (iterating threads vs cpus). I was wondering if RCU could be abused for
> this feature and apparently it can.
> 
> > > Here is the patch I cooked here but not yet tested.
> > 
> > In following version :
> > 
> > 1) I replaced the yield() hack by a proper wait queue.
> > 
> > 2) I do not invoke synchronize_sched() for mono threaded programs.
> > 
> > 3) I avoid multiple threads doing a resize and then only one wins the
> > deal.
> > 
> 
> One could argue this last bit could be committed separately (a different
> logical change).
> 
> As I read up about synchronize_sched and rcu_read_lock_sched, the code
> should be correct.
> 
> Also see nits below.
> 

I'm utter crap with memory barriers, but I believe some are needed now:

in dup_fd:
   for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
if (f) {
get_file(f);

at least a data dependency barrier, or maybe smp_rmb for peace of mind

similarly in do_dup2:
tofree = fdt->fd[fd];
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;

> > (copying/clearing big amount of memory only to discover another guy did
> > the same is a big waste of resources)
> > 
> > 
> > This seems to run properly on my hosts.
> > 
> > Your comments/tests are most welcomed, thanks !
> > 
> >  fs/file.c   |   41 +-
> >  include/linux/fdtable.h |3 ++
> >  2 files changed, 39 insertions(+), 5 deletions(-)
> > 
> > diff --git a/fs/file.c b/fs/file.c
> > index 93c5f89c248b..e0e113a56444 100644
> > --- a/fs/file.c
> > +++ b/fs/file.c
> > @@ -147,6 +147,9 @@ static int expand_fdtable(struct files_struct *files, 
> > int nr)
> >  
> > spin_unlock(&files->file_lock);
> > new_fdt = alloc_fdtable(nr);
> > +   /* make sure no __fd_install() are still updating fdt */
> > +   if (atomic_read(&files->count) > 1)
> > +   synchronize_sched();
> > spin_lock(&files->file_lock);
> > if (!new_fdt)
> > return -ENOMEM;
> > @@ -170,9 +173,12 @@ static int expand_fdtable(struct files_struct *files, 
> > int nr)
> > if (cur_fdt != &files->fdtab)
> > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > } else {
> > +   WARN_ON_ONCE(1);
> > /* Somebody else expanded, so undo our attempt */
> > __free_fdtable(new_fdt);
> 
> The reader may be left confused why there is a warning while the comment
> does not indicate anything is wrong.
> 
> > }
> > +   /* coupled with smp_rmb() in __fd_install() */
> > +   smp_wmb();
> > return 1;
> >  }
> >  
> > @@ -187,19 +193,33 @@ static int expand_fdtable(struct files_struct *files, 
> > int nr)
> >  static int expand_files(struct files_struct *files, int nr)
> >  {
> > struct fdtable *fdt;
> > +   int expanded = 0;
> >  
> > +begin:
> > fdt = files_fdtable(files);
> >  
> > /* Do we need to expand? */
> > if (nr < fdt->max_fds)
> > -   return 0;
> > +   return expanded;
> >  
> > /* Can we expand? */
> > if (nr >= sysctl_nr_open)
> > return -EMFILE;
> >  
> > +   while (unlikely(files-&g

Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-22 Thread Mateusz Guzik
On Tue, Apr 21, 2015 at 02:06:53PM -0700, Eric Dumazet wrote:
> On Tue, 2015-04-21 at 22:12 +0200, Mateusz Guzik wrote:
> 
> > in dup_fd:
> >for (i = open_files; i != 0; i--) {
> > struct file *f = *old_fds++;
> > if (f) {
> > get_file(f);
> > 
> 
> I see no new requirement here. f is either NULL or not.
> multi threaded programs never had a guarantee dup_fd() would catch a non
> NULL pointer here.
> 

It's not about seeing NULL f or not, but using the right address for
dereference.

If I read memory-barriers.txt right (see 'DATA DEPENDENCY BARRIERS'), it
is possible that cpus like alpha will see a non-NULL pointer and then
proceed to dereference *the old* (here: NULL) value.

Hence I suspect this needs smp_read_barrier_depends (along with
ACCESS_ONCE).

Other consumers (e.g. procfs code) use rcu_dereference macro which does
ends up using lockless_dereference macro, which in turn does:
#define lockless_dereference(p) \
({ \
typeof(p) _p1 = ACCESS_ONCE(p); \
smp_read_barrier_depends(); /* Dependency order vs. p
above. */ \
(_p1); \
})

That said memory barriers are not exactly my strong suit, but I do
believe my suspicion here is justified enough to ask someone with solid
memory barrier-fu to comment.

> 
> > at least a data dependency barrier, or maybe smp_rmb for peace of mind
> > 
> > similarly in do_dup2:
> > tofree = fdt->fd[fd];
> >     if (!tofree && fd_is_open(fd, fdt))
> > goto Ebusy;
> 
> Same here.
> 
> 

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] fs/file.c: remove useless xchg and NULL check in close_files

2015-04-14 Thread Mateusz Guzik
Since the table is about to be freed, there is no reason to set file
pointer to NULL on closing.

At this point open_fd map is supposed to indicate whether a file is
installed, so NULL-checking it is unnecessary.

Signed-off-by: Mateusz Guzik 
---
 fs/file.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 93c5f89..35a024a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -364,11 +364,8 @@ static struct fdtable *close_files(struct files_struct * 
files)
set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
-   struct file * file = xchg(&fdt->fd[i], NULL);
-   if (file) {
-   filp_close(file, files);
-   cond_resched_rcu_qs();
-   }
+   filp_close(fdt->fd[i], files);
+   cond_resched_rcu_qs();
}
i++;
set >>= 1;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-16 Thread Mateusz Guzik
Hi,

Currently obtaining a new file descriptor results in locking fdtable
twice - once in order to reserve a slot and second time to fill it.

Hack below gets rid of the second lock usage.

It gives me a ~30% speedup (~300k ops -> ~400k ops) in a microbenchmark
where 16 threads create a pipe (2 fds) and call 2 * close.

Results are fluctuating and even with the patch sometimes drop to around
~300k ops. However, without the patch they never get higher.

I can come up with a better benchmark if that's necessary.

Comments?

==

Subject: [PATCH] fs: use a sequence counter instead of file_lock in fd_install

Because the lock is not held, it is possible that fdtable will be
reallocated as we fill it.

RCU is used to guarantee the old table is not freed just in case we
happen to write to it (which is harmless).

sequence counter is used to ensure we updated the right table.

Signed-off-by: Mateusz Guzik 
---
 fs/file.c   | 24 +++-
 include/linux/fdtable.h |  5 +
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/fs/file.c b/fs/file.c
index 93c5f89..bd1ef4c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, int 
nr)
cur_fdt = files_fdtable(files);
if (nr >= cur_fdt->max_fds) {
/* Continue as planned */
+   write_seqcount_begin(&files->fdt_seqcount);
copy_fdtable(new_fdt, cur_fdt);
rcu_assign_pointer(files->fdt, new_fdt);
+   write_seqcount_end(&files->fdt_seqcount);
if (cur_fdt != &files->fdtab)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
} else {
@@ -256,6 +258,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int 
*errorp)
atomic_set(&newf->count, 1);
 
spin_lock_init(&newf->file_lock);
+   seqcount_init(&newf->fdt_seqcount);
newf->next_fd = 0;
new_fdt = &newf->fdtab;
new_fdt->max_fds = NR_OPEN_DEFAULT;
@@ -429,6 +432,7 @@ void exit_files(struct task_struct *tsk)
 
 struct files_struct init_files = {
.count  = ATOMIC_INIT(1),
+   .fdt_seqcount   = SEQCNT_ZERO(fdt_seqcount),
.fdt= &init_files.fdtab,
.fdtab  = {
.max_fds= NR_OPEN_DEFAULT,
@@ -552,12 +556,22 @@ EXPORT_SYMBOL(put_unused_fd);
 void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
 {
+   unsigned long seq;
struct fdtable *fdt;
-   spin_lock(&files->file_lock);
-   fdt = files_fdtable(files);
-   BUG_ON(fdt->fd[fd] != NULL);
-   rcu_assign_pointer(fdt->fd[fd], file);
-   spin_unlock(&files->file_lock);
+
+   rcu_read_lock();
+   do {
+   seq = read_seqcount_begin(&files->fdt_seqcount);
+   fdt = files_fdtable_seq(files);
+   /*
+* Entry in the table can already be equal to file if we
+* had to restart and copy_fdtable picked up our update.
+*/
+   BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
+   rcu_assign_pointer(fdt->fd[fd], file);
+   smp_mb();
+   } while (__read_seqcount_retry(&files->fdt_seqcount, seq));
+   rcu_read_unlock();
 }
 
 void fd_install(unsigned int fd, struct file *file)
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 230f87b..9e41765 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -47,6 +48,7 @@ struct files_struct {
* read mostly part
*/
atomic_t count;
+   seqcount_t fdt_seqcount;
struct fdtable __rcu *fdt;
struct fdtable fdtab;
   /*
@@ -69,6 +71,9 @@ struct dentry;
 #define files_fdtable(files) \
rcu_dereference_check_fdtable((files), (files)->fdt)
 
+#define files_fdtable_seq(files) \
+   rcu_dereference((files)->fdt)
+
 /*
  * The caller must ensure that fd table isn't shared or hold rcu or file lock
  */
-- 
1.8.3.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-16 Thread Mateusz Guzik
On Thu, Apr 16, 2015 at 01:55:39PM -0700, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 13:42 -0700, Eric Dumazet wrote:
> > On Thu, 2015-04-16 at 19:09 +0100, Al Viro wrote:
> > > On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > > > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct 
> > > > *files, int nr)
> > > > cur_fdt = files_fdtable(files);
> > > > if (nr >= cur_fdt->max_fds) {
> > > > /* Continue as planned */
> > > > +   write_seqcount_begin(&files->fdt_seqcount);
> > > > copy_fdtable(new_fdt, cur_fdt);
> > > > rcu_assign_pointer(files->fdt, new_fdt);
> > > > +   write_seqcount_end(&files->fdt_seqcount);
> > > > if (cur_fdt != &files->fdtab)
> > > > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> > > 
> > > Interesting.  AFAICS, your test doesn't step anywhere near that path,
> > > does it?  So basically you never hit the retries during that...
> > 
> > Right, but then the table is almost never changed for a given process,
> > as we only increase it by power of two steps.
> > 
> > (So I scratch my initial comment, fdt_seqcount is really mostly read)
> 
> I tested Mateusz patch with my opensock program, mimicking a bit more
> what a server does (having lot of sockets)
> 
> 24 threads running, doing close(randomfd())/socket() calls like crazy.
> 
> Before patch :
> 
> # time ./opensock 
> 
> real  0m10.863s
> user  0m0.954s
> sys   2m43.659s
> 
> 
> After patch :
> 
> # time ./opensock
> 
> real  0m9.750s
> user  0m0.804s
> sys   2m18.034s
> 
> So this is an improvement for sure, but not massive.
> 
> perf record ./opensock ; report
> 
> 87.80%  opensock  [kernel.kallsyms]  [k] _raw_spin_lock   
>   
>|--52.70%-- __close_fd
>|--46.41%-- __alloc_fd

My crap benchmark is here: http://people.redhat.com/~mguzik/pipebench.c
(compile with -pthread, run with -s 10 -n 16 for 10 second test + 16
threads)

As noted earlier it tends to go from rougly 300k ops/s to 400.

The fundamental problem here seems to be this pesky POSIX requirement of
providing the lowest possible fd on each allocation (as a side note
Linux breaks this with parallel fd allocs, where one of these backs off
the reservation, not that I believe this causes trouble).

Ideally a process-wide switch could be implemented (e.g.
prctl(SCRATCH_LOWEST_FD_REQ)) which would grant the kernel the freedom
to return any fd it wants, so it would be possible to have fd ranges
per thread and the like.

Having only a O_SCRATCH_POSIX flag passed to syscalls would still leave
close() as a bottleneck.

In the meantime I consider the approach taken in my patch as an ok
temporary improvement.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-16 Thread Mateusz Guzik
On Thu, Apr 16, 2015 at 07:09:32PM +0100, Al Viro wrote:
> On Thu, Apr 16, 2015 at 02:16:31PM +0200, Mateusz Guzik wrote:
> > @@ -165,8 +165,10 @@ static int expand_fdtable(struct files_struct *files, 
> > int nr)
> > cur_fdt = files_fdtable(files);
> > if (nr >= cur_fdt->max_fds) {
> > /* Continue as planned */
> > +   write_seqcount_begin(&files->fdt_seqcount);
> > copy_fdtable(new_fdt, cur_fdt);
> > rcu_assign_pointer(files->fdt, new_fdt);
> > +   write_seqcount_end(&files->fdt_seqcount);
> > if (cur_fdt != &files->fdtab)
> > call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> 
> Interesting.  AFAICS, your test doesn't step anywhere near that path,
> does it?  So basically you never hit the retries during that...

well, yeah. In fact for non-shared tables one could go a step further
and just plop the pointer in, but I don't know if that makes much sense.
Other processes inspecting the table could get away with a data
dependency barrier. Closing would still take the lock, so you can only
suddenly see filp installed, but never one going away.

Now, as far as correctness goes, I think there is a bug in the patch
(which does not invalidate the idea though). Chances are I got a fix as
well.

Benchmark prog is here: http://people.redhat.com/~mguzik/pipebench.c
A modified version: http://people.redhat.com/~mguzik/fdi-fail.c

Benchmark is just doing pipe + close in a loop in multiple threads.

Modified version spawns threads, sleeps 100 ms and does dup(0, 300) to
reallocate the table while other threads continue the work.

This succesfully tested retries (along with cases where installed file
got copied and was encountered during retry).

However, I see sporadic close failures. I presume this is because of a
missing read barrier after write_seqcount_begin. Adding a smp_mb()
seems to solve the problem, but I could only test on 2 * 16 Intel(R)
Xeon(R) CPU E5-2660 0 @ 2.20GHz.

My memory barrier-fu is rather weak and I'm not that confident in my
crap suspicion here.

Thoughts?

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs/file.c: don't acquire files->file_lock in fd_install()

2015-04-27 Thread Mateusz Guzik
On Tue, Apr 21, 2015 at 09:59:28PM -0700, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> Mateusz Guzik reported :
> 
>  Currently obtaining a new file descriptor results in locking fdtable
>  twice - once in order to reserve a slot and second time to fill it.
> 
> Holding the spinlock in __fd_install() is needed in case a resize is
> done, or to prevent a resize.
> 
> Mateusz provided an RFC patch and a micro benchmark :
>   http://people.redhat.com/~mguzik/pipebench.c
> 
> A resize is an unlikely operation in a process lifetime,
> as table size is at least doubled at every resize.
> 
> We can use RCU instead of the spinlock :
> 
> __fd_install() must wait if a resize is in progress.
> 
> The resize must block new __fd_install() callers from starting,
> and wait that ongoing install are finished (synchronize_sched())
> 
> resize should be attempted by a single thread to not waste resources.
> 
> rcu_sched variant is used, as __fd_install() and expand_fdtable() run
> from process context.
> 
> It gives us a ~30% speedup using pipebench with 16 threads, and a ~10%
> speedup with another program using TCP sockets.
> 
> Signed-off-by: Eric Dumazet 
> Reported-by: Mateusz Guzik 

Reviewed-by: Mateusz Guzik 
Tested-by: Mateusz Guzik 

Thanks,

> ---
>  fs/file.c   |   66 +++---
>  include/linux/fdtable.h |3 +
>  2 files changed, 50 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/file.c b/fs/file.c
> index 93c5f89c248b..17cef5e52f16 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, 
> int nr)
>  
>   spin_unlock(&files->file_lock);
>   new_fdt = alloc_fdtable(nr);
> +
> + /* make sure all __fd_install() have seen resize_in_progress
> +  * or have finished their rcu_read_lock_sched() section.
> +  */
> + if (atomic_read(&files->count) > 1)
> + synchronize_sched();
> +
>   spin_lock(&files->file_lock);
>   if (!new_fdt)
>   return -ENOMEM;
> @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, 
> int nr)
>   __free_fdtable(new_fdt);
>   return -EMFILE;
>   }
> - /*
> -  * Check again since another task may have expanded the fd table while
> -  * we dropped the lock
> -  */
>   cur_fdt = files_fdtable(files);
> - if (nr >= cur_fdt->max_fds) {
> - /* Continue as planned */
> - copy_fdtable(new_fdt, cur_fdt);
> - rcu_assign_pointer(files->fdt, new_fdt);
> - if (cur_fdt != &files->fdtab)
> - call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> - } else {
> - /* Somebody else expanded, so undo our attempt */
> - __free_fdtable(new_fdt);
> - }
> + BUG_ON(nr < cur_fdt->max_fds);
> + copy_fdtable(new_fdt, cur_fdt);
> + rcu_assign_pointer(files->fdt, new_fdt);
> + if (cur_fdt != &files->fdtab)
> + call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
> + /* coupled with smp_rmb() in __fd_install() */
> + smp_wmb();
>   return 1;
>  }
>  
> @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, 
> int nr)
>   * The files->file_lock should be held on entry, and will be held on exit.
>   */
>  static int expand_files(struct files_struct *files, int nr)
> + __releases(files->file_lock)
> + __acquires(files->file_lock)
>  {
>   struct fdtable *fdt;
> + int expanded = 0;
>  
> +repeat:
>   fdt = files_fdtable(files);
>  
>   /* Do we need to expand? */
>   if (nr < fdt->max_fds)
> - return 0;
> + return expanded;
>  
>   /* Can we expand? */
>   if (nr >= sysctl_nr_open)
>   return -EMFILE;
>  
> + if (unlikely(files->resize_in_progress)) {
> + spin_unlock(&files->file_lock);
> + expanded = 1;
> + wait_event(files->resize_wait, !files->resize_in_progress);
> + spin_lock(&files->file_lock);
> + goto repeat;
> + }
> +
>   /* All good, so we try */
> - return expand_fdtable(files, nr);
> + files->resize_in_progress = true;
> + expanded = expand_fdtable(files, nr);
> + files->resize_in_progress = false;
> +
> + wake_up_all(&files->resize_wait);
> + return expanded;
>  }
>  
>  static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
> @@ -256,6 +273,8 @@ struct fil

Re: [RFC PATCH RESEND] vfs: Move security_inode_killpriv() after permission checks

2015-04-08 Thread Mateusz Guzik
;  int
>  xfs_setattr_nonsize(
> + struct dentry   *dentry,
>   struct xfs_inode*ip,
>   struct iattr*iattr,
>   int flags)
> @@ -554,6 +555,10 @@ xfs_setattr_nonsize(
>   error = inode_change_ok(inode, iattr);
>   if (error)
>   return error;
> +
> + error = setattr_killpriv(dentry, iattr);
> + if (error)
> + return error;
>   }
>  
>   ASSERT((mask & ATTR_SIZE) == 0);
> @@ -734,6 +739,7 @@ out_dqrele:
>   */
>  int
>  xfs_setattr_size(
> + struct dentry   *dentry,
>   struct xfs_inode*ip,
>   struct iattr*iattr)
>  {
> @@ -776,9 +782,13 @@ xfs_setattr_size(
>* Use the regular setattr path to update the timestamps.
>*/
>   iattr->ia_valid &= ~ATTR_SIZE;
> - return xfs_setattr_nonsize(ip, iattr, 0);
> + return xfs_setattr_nonsize(dentry, ip, iattr, 0);
>   }
>  
> + error = setattr_killpriv(dentry, iattr);
> + if (error)
> + return error;
> +
>   /*
>* Make sure that the dquots are attached to the inode.
>*/
> @@ -974,10 +984,10 @@ xfs_vn_setattr(
>  
>   if (iattr->ia_valid & ATTR_SIZE) {
>   xfs_ilock(ip, XFS_IOLOCK_EXCL);
> - error = xfs_setattr_size(ip, iattr);
> + error = xfs_setattr_size(dentry, ip, iattr);
>   xfs_iunlock(ip, XFS_IOLOCK_EXCL);
>   } else {
> - error = xfs_setattr_nonsize(ip, iattr, 0);
> + error = xfs_setattr_nonsize(dentry, ip, iattr, 0);
>   }
>  
>   return error;
> diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
> index 1c34e43..6994d3e 100644
> --- a/fs/xfs/xfs_iops.h
> +++ b/fs/xfs/xfs_iops.h
> @@ -32,8 +32,14 @@ extern void xfs_setup_inode(struct xfs_inode *);
>   */
>  #define XFS_ATTR_NOACL   0x01/* Don't call posix_acl_chmod */
>  
> -extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
> +/*
> + * XXX Several callers have to pass dentry = NULL and this should
> + * work but it's really ugly.
> + */
> +extern int xfs_setattr_nonsize(struct dentry *dentry,
> +struct xfs_inode *ip, struct iattr *vap,
>  int flags);
> -extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
> +extern int xfs_setattr_size(struct dentry *dentry,
> + struct xfs_inode *ip, struct iattr *vap);
>  
>  #endif /* __XFS_IOPS_H__ */
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 9ab779e..7cad5d1 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2663,6 +2663,7 @@ extern int buffer_migrate_page(struct address_space *,
>  extern int inode_change_ok(const struct inode *, struct iattr *);
>  extern int inode_newsize_ok(const struct inode *, loff_t offset);
>  extern void setattr_copy(struct inode *inode, const struct iattr *attr);
> +extern int setattr_killpriv(struct dentry *dentry, struct iattr *attr);
>  
>  extern int file_update_time(struct file *file);
>  
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 185836b..d1d4b9b 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -557,6 +557,10 @@ static int shmem_setattr(struct dentry *dentry, struct 
> iattr *attr)
>   if (error)
>   return error;
>  
> + error = setattr_killpriv(dentry, attr);
> + if (error)
> + return error;
> +
>   if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
>   loff_t oldsize = inode->i_size;
>   loff_t newsize = attr->ia_size;
> 
> 
> -- 
> Ben Hutchings
> The first rule of tautology club is the first rule of tautology club.



-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] Fix variable "error" missing initialization

2015-04-29 Thread Mateusz Guzik
On Thu, Apr 30, 2015 at 12:00:34AM +0800, Shawn Chang wrote:
> From: Shawn C 
> 
> Signed-off-by: Shawn C 
> ---
>  mm/mlock.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/mm/mlock.c b/mm/mlock.c
> index c7f6785..660e5c5 100644
> --- a/mm/mlock.c
> +++ b/mm/mlock.c
> @@ -557,7 +557,7 @@ static int do_mlock(unsigned long start, size_t len, int 
> on)
>  {
>   unsigned long nstart, end, tmp;
>   struct vm_area_struct * vma, * prev;
> - int error;
> + int error = 0;
>  
>   VM_BUG_ON(start & ~PAGE_MASK);
>   VM_BUG_ON(len != PAGE_ALIGN(len));
> -- 
> 1.9.1
> 

This change does not make sense.

The very first read of error is after it gets set.

I see you sent another patch which credited grsecurity. In their
patchset it makes sense - do_mlock is modified in a way which can
interrupt the loop upcoming loop before it gets the chance to set error.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-17 Thread Mateusz Guzik
On Fri, Apr 17, 2015 at 02:46:56PM -0700, Eric Dumazet wrote:
> On Thu, 2015-04-16 at 14:16 +0200, Mateusz Guzik wrote:
> > Hi,
> > 
> > Currently obtaining a new file descriptor results in locking fdtable
> > twice - once in order to reserve a slot and second time to fill it
> 
> ...
> 
> 
> >  void __fd_install(struct files_struct *files, unsigned int fd,
> > struct file *file)
> >  {
> > +   unsigned long seq;
> 
>   unsigned int seq;
> 
> > struct fdtable *fdt;
> > -   spin_lock(&files->file_lock);
> > -   fdt = files_fdtable(files);
> > -   BUG_ON(fdt->fd[fd] != NULL);
> > -   rcu_assign_pointer(fdt->fd[fd], file);
> > -   spin_unlock(&files->file_lock);
> > +
> > +   rcu_read_lock();
> > +   do {
> > +   seq = read_seqcount_begin(&files->fdt_seqcount);
> > +   fdt = files_fdtable_seq(files);
> > +   /*
> > +* Entry in the table can already be equal to file if we
> > +* had to restart and copy_fdtable picked up our update.
> > +*/
> > +   BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
> > +   rcu_assign_pointer(fdt->fd[fd], file);
> > +   smp_mb();
> > +   } while (__read_seqcount_retry(&files->fdt_seqcount, seq));
> > +   rcu_read_unlock();
> >  }
> >  
> 
> So one problem here is :
> 
> As soon as  rcu_assign_pointer(fdt->fd[fd], file) is done, and other cpu
> does one expand_fdtable() and releases files->file_lock, another cpu can
> close(fd).
> 
> Then another cpu can reuse the [fd] now empty slot and install a new
> file in it.
> 
> Then this cpu will crash here :
> 
> BUG_ON(!(fdt->fd[fd] == NULL || fdt->fd[fd] == file));
> 

Ouch, this is so obvious now that you mention it. Really stupid
mistake on my side.

I would say this makes the use of seq counter impossible. Even if we
decided to fall back to a lock on retry, we cannot know what to do if
the slot is reserved - it very well could be that something called
close, and something else reserved the slot, so putting the file inside
could be really bad. In fact we would be putting a file for which we
don't have a reference anymore.

However, not all hope is lost and I still think we can speed things up.

A locking primitive which only locks stuff for current cpu and has
another mode where it locks stuff for all cpus would do the trick just
fine. I'm not a linux guy, quick search suggests 'lglock' would do what
I want.

table reallocation is an extremely rare operation, so this should be
fine. It would take the lock 'globally' for given table.

I'll play with this.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-20 Thread Mateusz Guzik
On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> 
> > I would say this makes the use of seq counter impossible. Even if we
> > decided to fall back to a lock on retry, we cannot know what to do if
> > the slot is reserved - it very well could be that something called
> > close, and something else reserved the slot, so putting the file inside
> > could be really bad. In fact we would be putting a file for which we
> > don't have a reference anymore.
> > 
> > However, not all hope is lost and I still think we can speed things up.
> > 
> > A locking primitive which only locks stuff for current cpu and has
> > another mode where it locks stuff for all cpus would do the trick just
> > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > I want.
> > 
> > table reallocation is an extremely rare operation, so this should be
> > fine. It would take the lock 'globally' for given table.
> 
> It would also mean percpu_alloc() for each descriptor table...

Well as it was noted I have not checked how it's implemented at the time
of writing the message. I agree embedding something like this into files
struct is a non-starter.

I would say this could work with a small set of locks, selected by hashing
struct files pointer.

Table resizing is supposed to be extremely rare - most processes should
not need it at all (if they do, the default size is too small and should
be adjusted). Not only that, the lock is only needed if the process in
question is multithreaded.

So I would say this would not contend in real-world workloads, but still
looks crappy.

Unfortunately the whole thing loses original appeal of a simple hack
with no potential perfomrance drawbacks. Maybe I'll hack it up later and
run some tests anyway.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-20 Thread Mateusz Guzik
On Sat, Apr 18, 2015 at 12:41:38PM -0700, Eric Dumazet wrote:
> On Sat, 2015-04-18 at 00:02 +0100, Al Viro wrote:
> > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > 
> > > I would say this makes the use of seq counter impossible. Even if we
> > > decided to fall back to a lock on retry, we cannot know what to do if
> > > the slot is reserved - it very well could be that something called
> > > close, and something else reserved the slot, so putting the file inside
> > > could be really bad. In fact we would be putting a file for which we
> > > don't have a reference anymore.
> > > 
> > > However, not all hope is lost and I still think we can speed things up.
> > > 
> > > A locking primitive which only locks stuff for current cpu and has
> > > another mode where it locks stuff for all cpus would do the trick just
> > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > I want.
> > > 
> > > table reallocation is an extremely rare operation, so this should be
> > > fine. It would take the lock 'globally' for given table.
> > 
> > It would also mean percpu_alloc() for each descriptor table...
> 
> I would rather use an xchg() instead of rcu_assign_ponter()
> 
> old = xchg(&fdt->fd[fd], file);
> if (unlikely(old))
>   filp_close(old, files);
> 
> If threads are using close() on random fds, final result is not
> guaranteed anyway.
> 

Well I don't see how could this be used to fix the problem.

If you are retrying and see NULL, you don't know whether your previous
update was not picked up by memcpy OR the fd got closed, which also
unreferenced the file you are installing. But you can't tell what
happened.

If you see non-NULL and what you found is not the file you are
installing, you know the file was freed so you can't close the old file.

One could try to introduce an invariant that files installed in a
lockless manner have to start with refcount 1, you still can't infer
anything from the fact that the counter is 1 when you retry (even if you
take the lock). It could have been duped, or even sent over a unix
socket and closed (although that awould surely require a solid pause in
execution) and who knows what else.

In general I would say this approach is too hard to get right to be
worthwile given expected speedup.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-20 Thread Mateusz Guzik
On Mon, Apr 20, 2015 at 03:06:33PM +0200, Mateusz Guzik wrote:
> On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > 
> > > I would say this makes the use of seq counter impossible. Even if we
> > > decided to fall back to a lock on retry, we cannot know what to do if
> > > the slot is reserved - it very well could be that something called
> > > close, and something else reserved the slot, so putting the file inside
> > > could be really bad. In fact we would be putting a file for which we
> > > don't have a reference anymore.
> > > 
> > > However, not all hope is lost and I still think we can speed things up.
> > > 
> > > A locking primitive which only locks stuff for current cpu and has
> > > another mode where it locks stuff for all cpus would do the trick just
> > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > I want.
> > > 
> > > table reallocation is an extremely rare operation, so this should be
> > > fine. It would take the lock 'globally' for given table.
> > 
> > It would also mean percpu_alloc() for each descriptor table...
> 
> Well as it was noted I have not checked how it's implemented at the time
> of writing the message. I agree embedding something like this into files
> struct is a non-starter.
> 
> I would say this could work with a small set of locks, selected by hashing
> struct files pointer.
> 
> Table resizing is supposed to be extremely rare - most processes should
> not need it at all (if they do, the default size is too small and should
> be adjusted). Not only that, the lock is only needed if the process in
> question is multithreaded.
> 
> So I would say this would not contend in real-world workloads, but still
> looks crappy.
> 
> Unfortunately the whole thing loses original appeal of a simple hack
> with no potential perfomrance drawbacks. Maybe I'll hack it up later and
> run some tests anyway.
> 

I just came up with another stupid hack, but this time it could really
work just fine.

Note that the entire issue stems from the fact that the table can be
resized at any moment. If only we had a guarantee the table "stands
still", we would not even need that sequence couner. fd_install could
just plop the file in.

So a stupid hack which comes to mind tells the kernel to make sure the
table is big enough and then never resize it ever again (inherited on
fork, cleared on exec):
prctl(FDTABLE_SIZE_FIXED, BIGNUM);

or

dup2(0, BIGNUM); /* sizes the table appropriately */
close(BIGNUM);
prctl(FDTABLE_SIZE_FIXED);

Thoughts?
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] fs: use a sequence counter instead of file_lock in fd_install

2015-04-20 Thread Mateusz Guzik
On Mon, Apr 20, 2015 at 03:43:26PM +0200, Mateusz Guzik wrote:
> On Mon, Apr 20, 2015 at 03:06:33PM +0200, Mateusz Guzik wrote:
> > On Sat, Apr 18, 2015 at 12:02:52AM +0100, Al Viro wrote:
> > > On Sat, Apr 18, 2015 at 12:16:48AM +0200, Mateusz Guzik wrote:
> > > 
> > > > I would say this makes the use of seq counter impossible. Even if we
> > > > decided to fall back to a lock on retry, we cannot know what to do if
> > > > the slot is reserved - it very well could be that something called
> > > > close, and something else reserved the slot, so putting the file inside
> > > > could be really bad. In fact we would be putting a file for which we
> > > > don't have a reference anymore.
> > > > 
> > > > However, not all hope is lost and I still think we can speed things up.
> > > > 
> > > > A locking primitive which only locks stuff for current cpu and has
> > > > another mode where it locks stuff for all cpus would do the trick just
> > > > fine. I'm not a linux guy, quick search suggests 'lglock' would do what
> > > > I want.
> > > > 
> > > > table reallocation is an extremely rare operation, so this should be
> > > > fine. It would take the lock 'globally' for given table.
> > > 
> > > It would also mean percpu_alloc() for each descriptor table...
> > 
> > Well as it was noted I have not checked how it's implemented at the time
> > of writing the message. I agree embedding something like this into files
> > struct is a non-starter.
> > 
> > I would say this could work with a small set of locks, selected by hashing
> > struct files pointer.
> > 
> > Table resizing is supposed to be extremely rare - most processes should
> > not need it at all (if they do, the default size is too small and should
> > be adjusted). Not only that, the lock is only needed if the process in
> > question is multithreaded.
> > 
> > So I would say this would not contend in real-world workloads, but still
> > looks crappy.
> > 
> > Unfortunately the whole thing loses original appeal of a simple hack
> > with no potential perfomrance drawbacks. Maybe I'll hack it up later and
> > run some tests anyway.
> > 
> 
> I just came up with another stupid hack, but this time it could really
> work just fine.
> 
> Note that the entire issue stems from the fact that the table can be
> resized at any moment. If only we had a guarantee the table "stands
> still", we would not even need that sequence couner. fd_install could
> just plop the file in.
> 
> So a stupid hack which comes to mind tells the kernel to make sure the
> table is big enough and then never resize it ever again (inherited on
> fork, cleared on exec):
> prctl(FDTABLE_SIZE_FIXED, BIGNUM);
> 
> or
> 
> dup2(0, BIGNUM); /* sizes the table appropriately */
> close(BIGNUM);
> prctl(FDTABLE_SIZE_FIXED);
> 
> Thoughts?

Sorry for spam but I came up with another hack. :)

The idea is that we can have a variable which would signify the that
given thread is playing with fd table in fd_install (kind of a lock
embedded into task_struct). We would also have a flag in files struct
indicating that a thread would like to resize it.

expand_fdtable would set the flag and iterate over all threads waiting
for all of them to have the var set to 0.

fd_install would set the var, test the flag and if needed would just
unset the var and take the spin lock associated with the table.

This way the common case (nobody resizes the table) is lockless.

Resizing operation can get expensive but that should be totally fine.

As a hack in a hack we could abuse rcu's counter to server as the "lock".

Thoughts?

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH RESEND] vfs: Move security_inode_killpriv() after permission checks

2015-06-03 Thread Mateusz Guzik
On Mon, Apr 13, 2015 at 11:39:01AM +1000, James Morris wrote:
> On Wed, 8 Apr 2015, Mateusz Guzik wrote:
> 
> > This is still a problem. Any feedback about the patch?
> > 
> 
> I'd like to see feedback from vfs folk (Al).
> 

Ping? Are there any concerns with the patch?

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] kref: oops on zero or negative refcount

2014-02-20 Thread Mateusz Guzik
In use after free situations, it is possible for one thread to write to
memory that has just been reallocated to a new user. This could open up
potential security issues.

Close off those potential security holes by terminating the current
thread when kref encounters such a race condition or underflow.

Signed-off-by: Mateusz Guzik 
Cc: Rik van Riel 
---
 include/linux/kref.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/kref.h b/include/linux/kref.h
index 484604d..c3f8a0a 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -43,8 +43,10 @@ static inline void kref_get(struct kref *kref)
/* If refcount was 0 before incrementing then we have a race
 * condition when this kref is freeing by some other thread right now.
 * In this case one should use kref_get_unless_zero()
+*
+* Terminate the current thread to stop potential security exploits.
 */
-   WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
+   BUG_ON(atomic_inc_return(&kref->refcount) < 2);
 }
 
 /**
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] kref: oops on zero or negative refcount

2014-02-21 Thread Mateusz Guzik
On Thu, Feb 20, 2014 at 01:14:40PM -0500, Dave Jones wrote:
> On Thu, Feb 20, 2014 at 06:44:59PM +0100, Mateusz Guzik wrote:
>  > In use after free situations, it is possible for one thread to write to
>  > memory that has just been reallocated to a new user. This could open up
>  > potential security issues.
>  > 
>  > diff --git a/include/linux/kref.h b/include/linux/kref.h
>  > index 484604d..c3f8a0a 100644
>  > --- a/include/linux/kref.h
>  > +++ b/include/linux/kref.h
>  > @@ -43,8 +43,10 @@ static inline void kref_get(struct kref *kref)
>  >/* If refcount was 0 before incrementing then we have a race
>  > * condition when this kref is freeing by some other thread right now.
>  > * In this case one should use kref_get_unless_zero()
>  > +   *
>  > +   * Terminate the current thread to stop potential security exploits.
>  > */
>  > -  WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
>  > +  BUG_ON(atomic_inc_return(&kref->refcount) < 2);
> 
> This isn't "terminating the thread", this is "lock up the box".
> 

Well, extent of damage caused by non-panicking BUG_ON (if any) depends
on the state when kref_get was executed.

However, since this condition is already a sign of big trouble (and a
potential exploitation attempt), I think a WARN_ON_ONCE is not
sufficient.

That said, can you elaborate on your concers? You just don't like that
comment, don't want that BUG_ON (want a panic instead) or maybe you
don't like the change at all (or something else)?

Thanks,
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC PATCH] cmdline: Hide "debug" from /proc/cmdline

2014-04-02 Thread Mateusz Guzik
On Wed, Apr 02, 2014 at 02:42:19PM -0400, Steven Rostedt wrote:
> It has come to our attention that a system running a specific user
> space init program will not boot if you add "debug" to the kernel
> command line. What happens is that the user space tool parses the
> kernel command line, and if it sees "debug" it will spit out so much
> information that the system fails to boot. This basically renders the
> "debug" option for the kernel useless.
> 
> This bug has been reported to the developers of said tool
> here:
> 
>   https://bugs.freedesktop.org/show_bug.cgi?id=76935
> 
> The response is:
> 
> "Generic terms are generic, not the first user owns them."
> 
> That is, the "debug" statement on the *kernel* command line is not
> owned by the kernel just because it was the first user of it, and
> they refuse to fix their bug.
> 
> Well, my response is, we OWN the kernel command line, and as such, we
> can keep the users from seeing stuff on it if we so choose. And with
> that, I propose this patch, which hides "debug" from /proc/cmdline,
> such that we don't have to worry about tools parsing for it and causing
> hardship for those trying to debug the kernel.
> 

Well, parsing kernel cmdline by systemd is a bad idea, and hiding
"debug" is even worse. What will happen when the next keyword clashes?
And how should I check the kernel is booted with "debug"?

If there is a real need to pass arguments to systemd, how about a
dedicated option (initargs= or whatever, where it has to be last in
cmdline), then systemd would be spawned with these arguments and would
just go over its argv.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm/swap: fix race on swap_info reuse between swapoff and swapon

2014-01-12 Thread Mateusz Guzik
On Mon, Jan 13, 2014 at 11:51:42AM +0800, Weijie Yang wrote:
> On Mon, Jan 13, 2014 at 11:27 AM, Andrew Morton
>  wrote:
> > On Mon, 13 Jan 2014 11:08:58 +0800 Weijie Yang  
> > wrote:
> >
> >> >> --- a/mm/swapfile.c
> >> >> +++ b/mm/swapfile.c
> >> >> @@ -1922,7 +1922,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
> >> >> specialfile)
> >> >>   p->swap_map = NULL;
> >> >>   cluster_info = p->cluster_info;
> >> >>   p->cluster_info = NULL;
> >> >> - p->flags = 0;
> >> >>   frontswap_map = frontswap_map_get(p);
> >> >>   spin_unlock(&p->lock);
> >> >>   spin_unlock(&swap_lock);
> >> >> @@ -1948,6 +1947,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
> >> >> specialfile)
> >> >>   mutex_unlock(&inode->i_mutex);
> >> >>   }
> >> >>   filp_close(swap_file, NULL);
> >> >> +
> >> >> + /*
> >> >> + * clear SWP_USED flag after all resources freed
> >> >> + * so that swapon can reuse this swap_info in alloc_swap_info() 
> >> >> safely
> >> >> + * it is ok to not hold p->lock after we cleared its SWP_WRITEOK
> >> >> + */
> >> >> + spin_lock(&swap_lock);
> >> >> + p->flags = 0;
> >> >> + spin_unlock(&swap_lock);
> >> >> +
> >> >>   err = 0;
> >> >>   atomic_inc(&proc_poll_event);
> >> >>   wake_up_interruptible(&proc_poll_wait);
> > But do you agree that your
> > http://ozlabs.org/~akpm/mmots/broken-out/mm-swap-fix-race-on-swap_info-reuse-between-swapoff-and-swapon.patch
> > makes Krzysztof's
> > http://ozlabs.org/~akpm/mmots/broken-out/swap-fix-setting-page_size-blocksize-during-swapoff-swapon-race.patch
> > obsolete?
> 
> Yes, I agree.
> 
> > I've been sitting on Krzysztof's
> > swap-fix-setting-page_size-blocksize-during-swapoff-swapon-race.patch
> > for several months - Hugh had issues with it so I put it on hold and
> > nothing further happened.
> >
> >> I will try to resend a patchset to make lock usage in swapfile.c clear
> >> and fine grit
> >
> > OK, thanks.  In the meanwhile I'm planning on dropping Krzysztof's
> > patch and merging your patch into 3.14-rc1, which is why I'd like
> > confirmation that your patch addresses the issues which Krzysztof
> > identified?
> >
> 
> I think so, Krzysztof and I both try to fix the same issue(reuse
> swap_info while its
> previous resources are not cleared completely). The different is
> Krzysztof's patch
> uses a global swapon_mutex and its commit log only focuses on set_blocksize(),
> while my patch try to maintain the fine grit lock usage.
> 

Maybe I should get some sleep first, but I found some minor nits.

Newly introduced window:

p->swap_map == NULL && (p->flags & SWP_USED)

breaks swap_info_get:
if (!(p->flags & SWP_USED))
goto bad_device;
offset = swp_offset(entry);
if (offset >= p->max)
goto bad_offset;
if (!p->swap_map[offset])
goto bad_free;

so that would need a trivial adjustment.

Another nit is that swap_start and swap_next do the following:
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;

Testing for swap_map does not look very nice and regardless of your
patch the latter cannot be true if the former is not, thus the check
can be simplified to mere !si->swap_map.

I'm wondering if it would make sense to dedicate a flag (SWP_ALLOCATED?)
to control whether swapon can use give swap_info. That is, it would be
tested and set in alloc_swap_info & cleared like you clear SWP_USED now.
SWP_USED would be cleared as it is and would be set in _enable_swap_info

Then swap_info_get would be left unchanged and swap_* would test for
SWP_USED only.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] ipc: fix compat msgrcv with negative msgtyp

2014-01-15 Thread Mateusz Guzik
Compat function takes msgtyp argument as u32 and passes it down to
do_msgrcv which results in casting to long, thus the sign is lost
and we get a big positive number instead.

Cast the argument to signed type before passing it down.

Signed-off-by: Mateusz Guzik 
Reported-by: Gabriellla Schmidt 
Cc: Al Viro 
---
 ipc/compat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ipc/compat.c b/ipc/compat.c
index 892f658..d3b3760 100644
--- a/ipc/compat.c
+++ b/ipc/compat.c
@@ -381,7 +381,7 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, 
second,
uptr = compat_ptr(ipck.msgp);
fifth = ipck.msgtyp;
}
-   return do_msgrcv(first, uptr, second, fifth, third,
+   return do_msgrcv(first, uptr, second, (s32)fifth, third,
 compat_do_msg_fill);
}
case MSGGET:
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: disappearing listen()ed SO_REUSEPORT sockets across fork() when using epoll

2013-11-25 Thread Mateusz Guzik
On Mon, Nov 25, 2013 at 11:53:24AM -0800, Shawn Landden wrote:
> On Mon, Nov 25, 2013 at 10:05 AM, Jason Baron  wrote:
> > On 11/22/2013 12:53 PM, Shawn Landden wrote:
> >> Hello, when running the attached program on 3.12 child processes
> >> are missing a socket fd opened, set with SO_REUSEPORT, listen()ed to,
> >> and added to epoll_ctl().
> >>
> >> This is the output I get when pointing "wget http://localhost:/";
> >> at the attached program:
> >>
> >> main PID 31591
> >> PID 31634 started
> >> PID 31634 accept()ed connection
> >> PID 31635 started
> >> PID 31636 started
> >> PID 31635 accept() failed: Bad file descriptor
> >> PID 31636 accept() failed: Bad file descriptor
> >> PID 31634 accept()ed connection
> >> PID 31634 accept()ed connection
> >> PID 31634 accept()ed connection
> >> PID 31634 accept()ed connection
> >>
> >>
> >> While I would expect something like:
> >>
> >> main PID 31591
> >> PID 31634 started
> >> PID 31634 accept()ed connection
> >> PID 31635 started
> >> PID 31636 started
> >> PID 31635 accept()ed connection
> >> PID 31636 accept()ed connection
> >>
> >> -more new processes, but inversely proportional to number of listening 
> >> processes
> >> -accept() always returns successfully
> >>
> >>
> >
> > The 'close(sockfd);' looks to be racing with the accept() calls. Removing 
> > seems
> > to get the result you are looking for.
> Interesting. That works, but it shouldn't. The close() is operating in
> the parent, so it shouldn't affect the child,
> there is a leak here of process separation.
> 

You fork, then close sockfd in the parent. Thus, the very first child
can accept connectins just fine.

Subsequent forks give you children without sockfd, thus accept fails.
The first child continues to work just fine.

-- 
Thou shalt not follow the NULL pointer, for chaos and madness await thee
at its end. 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] vfs: Don't exchange filenames unconditionally.

2014-09-18 Thread Mateusz Guzik
On Thu, Sep 18, 2014 at 02:55:00PM +0400, Mikhail Efremov wrote:
> Only exchange source and destination filenames
> if flags contain RENAME_EXCHANGE.

But names are still exchanged without RENAME_EXCHANGE, provided they are
long enough.

da1ce0670c14 introduced this behaviour also for short names and this is
what you are reverting here.

That said, I think the patch is fine as an interim change, but should
note facts mentioned above.

For more info please see https://lkml.org/lkml/2014/9/7/6

I'll be posting a RFC patch for actual problem next week.

> In case if executable file was running and replaced by
> other file /proc/PID/exe should still show correct file name,
> not the old name of the file by which it was replaced.
> 




> The scenario when this bug manifests itself was like this:
> * ALT Linux uses rpm and start-stop-daemon;
> * during a package upgrade rpm creates a temporary file
>   for an executable to rename it upon successful unpacking;
> * start-stop-daemon is run subsequently and it obtains
>   the (nonexistant) temporary filename via /proc/PID/exe
>   thus failing to identify the running process.
> 
> Cc: Miklos Szeredi 
> Cc: Alexander Viro 
> Cc: linux-fsde...@vger.kernel.org
> 
> Fixes: da1ce0670c14 "vfs: add cross-rename"
> Signed-off-by: Mikhail Efremov 
> ---
>  fs/dcache.c | 27 +++
>  1 file changed, 19 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/dcache.c b/fs/dcache.c
> index 7a5b514..3218570 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -2372,7 +2372,8 @@ void dentry_update_name_case(struct dentry *dentry, 
> struct qstr *name)
>  }
>  EXPORT_SYMBOL(dentry_update_name_case);
>  
> -static void switch_names(struct dentry *dentry, struct dentry *target)
> +static void switch_names(struct dentry *dentry, struct dentry *target,
> +  bool exchange)
>  {
>   if (dname_external(target)) {
>   if (dname_external(dentry)) {
> @@ -2404,11 +2405,21 @@ static void switch_names(struct dentry *dentry, 
> struct dentry *target)
>   /*
>* Both are internal.
>*/
> - unsigned int i;
> - BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, 
> sizeof(long)));
> - for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
> - swap(((long *) &dentry->d_iname)[i],
> -  ((long *) &target->d_iname)[i]);
> + if (exchange) {
> + unsigned int i;
> +
> + BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN,
> +  sizeof(long)));
> + for (i = 0; i < DNAME_INLINE_LEN / sizeof(long);
> + i++) {
> + swap(((long *) &dentry->d_iname)[i],
> +  ((long *) &target->d_iname)[i]);
> + }
> + } else {
> + memcpy(dentry->d_iname, target->d_name.name,
> + target->d_name.len + 1);
> + dentry->d_name.len = target->d_name.len;
> + return;
>   }
>   }
>   }
> @@ -2510,7 +2521,7 @@ static void __d_move(struct dentry *dentry, struct 
> dentry *target,
>   list_del(&target->d_u.d_child);
>  
>   /* Switch the names.. */
> - switch_names(dentry, target);
> + switch_names(dentry, target, exchange);
>   swap(dentry->d_name.hash, target->d_name.hash);
>  
>   /* ... and switch the parents */
> @@ -2649,7 +2660,7 @@ static void __d_materialise_dentry(struct dentry 
> *dentry, struct dentry *anon)
>  
>   dparent = dentry->d_parent;
>  
> - switch_names(dentry, anon);
> + switch_names(dentry, anon, false);
>   swap(dentry->d_name.hash, anon->d_name.hash);
>  
>   dentry->d_parent = dentry;
> -- 
> 1.8.5.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RESEND][PATCHv2 1/2] procfs: show hierarchy of pid namespace

2014-09-22 Thread Mateusz Guzik
 pid_buf[32];
> + int i, curr_level;
> +
> + curr_ns = task_active_pid_ns(current);
> +
> + mutex_lock(&pidns_list_lock);
> + proc_pidns_list_refresh();
> +

So this reiterates everything for each open? Maybe a generation counter
could be employed here to avoid unnecessary work.

> + /* print pid namespace hierarchy */
> + list_for_each_entry_safe(pos, tmp, &pidns_tree, list) {
> + pid = pos->pid;
> + curr_level = -1;
> + ns = pid->numbers[pid->level].ns;
> + /* Check whether pid has relationship with current ns */
> + for (; ns != NULL; ns = ns->parent)
> + if (ns == curr_ns)
> + curr_level = curr_ns->level;
> +
> + if (curr_level == -1)
> + continue;
> +
> + for (i = curr_level + 1; i <= pid->level; i++) {
> + ns = pid->numbers[i].ns;
> + /* PID 1 in specific pid ns */
> + snprintf(pid_buf, 32, "/proc/%u/ns/pid",
> + pid_vnr(find_pid_ns(1, ns)));
> + seq_printf(m, "%s ", pid_buf);
> + }
> +
> + seq_putc(m, '\n');
> + }
> +
> + free_pidns_list(&pidns_tree);
> + mutex_unlock(&pidns_list_lock);
> +
> + return 0;
> +}
> +

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCHv3 1/2] procfs: show hierarchy of pid namespace

2014-09-24 Thread Mateusz Guzik
On Wed, Sep 24, 2014 at 06:00:26PM +0800, Chen Hanxiao wrote:
> +static int
> +pidns_list_filter(void)
> +{
> + struct pidns_list *pos, *pos_t;
> + struct pid_namespace *ns0, *ns1;
> + struct pid *pid0, *pid1;
> + int flag = 0;
> + int rc;
> +
> + /* screen pid with relationship
> +  * in pidns_list, we may add pids like
> +  * ns0   ns1   ns2
> +  * pid1->pid2->pid3
> +  * we should screen pid1, pid2 and keep pid3
> +  */
> + list_for_each_entry(pos, &pidns_list, list) {
> + list_for_each_entry(pos_t, &pidns_list, list) {

In the previous thread I tried to note this will be terribly inefficient
to use and adding a list of children to pid_namespace struct would deal
with the problem.

> + flag = 0;
> + pid0 = pos->pid;
> + pid1 = pos_t->pid;
> + ns0 = pid0->numbers[pid0->level].ns;
> + ns1 = pid1->numbers[pid1->level].ns;
> + if (pos->pid->level < pos_t->pid->level)
> + for (; ns1 != NULL; ns1 = ns1->parent)
> + if (ns0 == ns1) {
> + flag = 1;
> + break;
> + }
> + if (flag == 1)
> + break;
> + }
> +
> + if (flag == 0) {
> + rc = pidns_list_add(pos->pid, &pidns_tree);
> + if (rc)
> + goto out;
> + }
> + }
> +
> + /* Now all usefull stuff are in pidns_tree, free pidns_list*/
> + free_pidns_list(&pidns_list);
> +
> + return 0;
> +
> +out:
> + free_pidns_list(&pidns_tree);
> + return rc;
> +}
> +
> +/* collect pids in pidns_list,
> + * then remove duplicated ones,
> + * add the rest to pidns_tree
> + */
> +static int proc_pidns_list_refresh(void)
> +{
> + struct pid *pid;
> + struct task_struct *p;
> + int rc;
> +
> + /* collect pid in differet ns */
> + rcu_read_lock();
> + for_each_process(p) {
> + pid = task_pid(p);
> + if (pid && (pid->level > 0)) {
> + rc = pidns_list_add(pid, &pidns_list);
> + if (rc)
> + goto out;
> + }
> + }
> +
> + /* screen duplicate pids from list pidns_list
> + * and form a new list pidns_tree
> + */
> + rc = pidns_list_filter();
> + if (rc)
> + goto out;
> + rcu_read_unlock();
> +
> + return 0;
> +
> +out:
> + free_pidns_list(&pidns_list);
> + rcu_read_unlock();
> + return rc;
> +}
> +
> +static int nslist_proc_show(struct seq_file *m, void *v)
> +{
> + struct pidns_list *pos;
> + struct pid_namespace *ns, *curr_ns;
> + struct pid *pid;
> + char pid_buf[32];
> + int i, curr_level;
> + int rc;
> +
> + curr_ns = task_active_pid_ns(current);
> +
> + mutex_lock(&pidns_list_lock);
> + rc = proc_pidns_list_refresh();
> + if (rc) {
> + mutex_unlock(&pidns_list_lock);
> + return rc;
> + }
> +
> + /* print pid namespace hierarchy */
> + list_for_each_entry(pos, &pidns_tree, list) {

What keeps pid_namespace's safe to use? Similarly to previous patch,
here we hit a place where the code is not protected with rcu and
structures were just plugged into the list.

Recreating the list for each open seems quite unnecessary as well.

One could work around that by caching generated output and having a
generation counter for namespaces to know whether the content is stale.
But that still does not seem right.

It looks like in the original thread someone suggested hooking this up
under proc as a directory tree which sounds much better to me.

Just my $0,03.

> + pid = pos->pid;
> + curr_level = -1;
> + ns = pid->numbers[pid->level].ns;
> + /* Check whether a pid has relationship with current ns */
> + for (; ns != NULL; ns = ns->parent)
> + if (ns == curr_ns)
> + curr_level = curr_ns->level;
> +
> + if (curr_level == -1)
> + continue;
> +
> + for (i = curr_level + 1; i <= pid->level; i++) {
> +     ns = pid->numbers[i].ns;
> + /* show PID '

Re: /proc//exe symlink behavior change in >=3.15.

2014-09-11 Thread Mateusz Guzik
On Thu, Sep 11, 2014 at 06:39:58PM -0500, Chuck Ebbert wrote:
> On Sun, 7 Sep 2014 09:56:08 +0200
> Mateusz Guzik  wrote:
> 
> > On Sat, Sep 06, 2014 at 11:44:32PM +0200, Piotr Karbowski wrote:
> > > Hi,
> > > 
> > > Starting with kernel 3.15 the 'exe' symlink under /proc// acts 
> > > diffrent
> > > than it used to in all the pre-3.15 kernels.
> > > 
> > > The usecase:
> > > 
> > > run /root/testbin (app that just sleeps)
> > > cp /root/testbin /root/testbin.new
> > > mv /root/testbin.new /root/testbin
> > > ls -al /proc/`pidof testbin`/exe
> > > 
> > > <=3.14: /root/testbin (deleted)
> > > >=3.15: /root/testbin.new (deleted)
> > > 
> > > Was the change intentional? It does render my system unusable and I failed
> > > to find a information about such change in the ChangeLog.
> > > 
> > 
> > It looks like this was already broken for "long" (> DNAME_INLINE_LEN)
> > names.
> > 
> > Short names share the problem since da1ce0670c14d8 "vfs: add
> > cross-rename".
> > 
> > The following change to switch_names is the culprit:
> > 
> > -   memcpy(dentry->d_iname, target->d_name.name,
> > -   target->d_name.len + 1);
> > -   dentry->d_name.len = target->d_name.len;
> > -   return;
> > +   unsigned int i;
> > +   BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
> > +   for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
> > + swap(((long *) &dentry->d_iname)[i],
> > +   ((long *) &target->d_iname)[i]);
> > +   }
> > 
> > 
> > Dentries can have names from embedded structure or from an external buffer.
> > 
> > If you take a look around you will see the code just swaps pointers for
> > "both external" case. But this results in the same behavoiur you are seeing.
> > 
> 
> Looks like the real problem here is that __d_materialise_dentry() needs the
> old behavior of switch_names() . At least that's how it got fixed in 
> grsecurity.

No.

Regression in question is an effect of swap instead of memcpy in
switch_names, as called by d_move. Fix in grsecurity reverts to previous
behaviour when needed and imho should be applied for the time being.

The real problem is that __d_move always switches parent dentry and
calls switch_names, which actually switches names in some cases.

Without the regression you get expected results only for short names
when you move stuff around within the same directory.

For instance with current code:
mv /foo/bar/baz /1/2/3

will replace the whole path.

Previous behavoiur would result in /foo/bar/3 as the new path, which is
clearly still incorrect

Leaving the old dentry under the same parent would mean that the "tree"
associated with now moved dentry will possibly need to be freed.

In addition to that one has to deal with the need of having renamed
dentry the new name which possibly came from an external buffer. An idea
I came up with (atomic_t refcount; char name[0]; with ->name assigned to
dentry) may require adding an additional field to struct dentry, which
would be bad.

I didn't have the time yet to look at this stuff properly.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Clarification needed on use of put_user inside a loop

2014-04-25 Thread Mateusz Guzik
On Fri, Apr 25, 2014 at 09:39:57PM +0530, Kumar Gaurav wrote:
> Hence when transferring data involves loops then checking permission
> (using access_ok()) once should be good to go then after we can
> simply transfer data using __put_user(), instead of using put_user()
> itself in loop.
> 

Well, I can't tell you whether this is a good idea, but:

This looks correct and other code is doing this already.

However, put_user calls might_fault, but __put_user consumers I found
(e.g. copy_siginfo_to_user) don't do that.

While it has only debugging purposes and would not change anything for
those consumers, it seems to be a bug to not include it.

Thus I suggest adding access_ok variant which calls might_fault.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] drm: make variable named "refcount" atomic, like most refcounts in the kernel.

2014-04-26 Thread Mateusz Guzik
On Sat, Apr 26, 2014 at 06:06:15PM +0200, Lionel Debroux wrote:
> Based on PaX.
> 
> ---
> 
> From 7c712cadd97d43d03ff3d7ca04fd85bd8c6eb34a Mon Sep 17 00:00:00 2001
> From: Lionel Debroux 
> Date: Sat, 26 Apr 2014 15:53:55 +0200
> Subject: drm: make variable named "refcount" atomic, like most refcounts in
>  the kernel.
> 
> Extracted from the PaX patch.
> 
> 
[snip]
>   mutex_lock(&item->mutex);
> - BUG_ON(item->refcount == 0);
> + BUG_ON(atomic_read(&item->refcount) == 0);
>   BUG_ON(ref->object != item->object);
> - if (--item->refcount == 0) {
> + if (atomic_dec_and_test(&item->refcount)) {
>   ref->release(ref);
>   item->object = NULL;
>   }

I believe this change is in grsecurity so that overflow detector can be
used, there is clearly no reason to use mere atomic ops.

It may be that kernel devs would accept a patch implementing generic
refcount manipulation primitives without atomicity guarantees, which
could be used in cases like this.

Then atomic and non-atomic versions could be used to detect overflows and
overputs at least in debug kernels.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] staging: line6: fix possible overrun

2014-04-26 Thread Mateusz Guzik
On Sat, Apr 26, 2014 at 07:09:22PM +0200, Laurent Navet wrote:
> The strcpy operation may write past the end of the fixed-size destination
> buffer if the source buffer is too large.
> 
> Found by coverity scan : CID 144979
> 
> Signed-off-by: Laurent Navet 
> ---
> build tested only
> 
>  drivers/staging/line6/audio.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/staging/line6/audio.c b/drivers/staging/line6/audio.c
> index 171d80c..65f5cd4 100644
> --- a/drivers/staging/line6/audio.c
> +++ b/drivers/staging/line6/audio.c
> @@ -32,9 +32,10 @@ int line6_init_audio(struct usb_line6 *line6)
>  
>   line6->card = card;
>  
> - strcpy(card->id, line6->properties->id);
> + strncpy(card->id, line6->properties->id, (sizeof(card->id)-1));
>   strcpy(card->driver, DRIVER_NAME);
> - strcpy(card->shortname, line6->properties->name);
> + strncpy(card->shortname, line6->properties->name,
> + (sizeof(card->shortname)-1));
>   /* longname is 80 chars - see asound.h */
>   sprintf(card->longname, "Line6 %s at USB %s", line6->properties->name,
>       dev_name(line6->ifcdev));

Would not it be better to return -EINVAL (or some other error) instead?

Now you will possibly truncate the name.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] staging: line6: fix possible overrun

2014-04-26 Thread Mateusz Guzik
On Sun, Apr 27, 2014 at 12:36:21AM +0300, Dan Carpenter wrote:
> On Sat, Apr 26, 2014 at 10:47:05PM +0200, Mateusz Guzik wrote:
> > On Sat, Apr 26, 2014 at 07:09:22PM +0200, Laurent Navet wrote:
> > > The strcpy operation may write past the end of the fixed-size destination
> > > buffer if the source buffer is too large.
> > > 
> > > Found by coverity scan : CID 144979
> > > 
> > > Signed-off-by: Laurent Navet 
> > > ---
> > > build tested only
> > > 
> > >  drivers/staging/line6/audio.c | 5 +++--
> > >  1 file changed, 3 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/drivers/staging/line6/audio.c b/drivers/staging/line6/audio.c
> > > index 171d80c..65f5cd4 100644
> > > --- a/drivers/staging/line6/audio.c
> > > +++ b/drivers/staging/line6/audio.c
> > > @@ -32,9 +32,10 @@ int line6_init_audio(struct usb_line6 *line6)
> > >  
> > >   line6->card = card;
> > >  
> > > - strcpy(card->id, line6->properties->id);
> > > + strncpy(card->id, line6->properties->id, (sizeof(card->id)-1));
> > >   strcpy(card->driver, DRIVER_NAME);
> > > - strcpy(card->shortname, line6->properties->name);
> > > + strncpy(card->shortname, line6->properties->name,
> > > + (sizeof(card->shortname)-1));
> > >   /* longname is 80 chars - see asound.h */
> > >   sprintf(card->longname, "Line6 %s at USB %s", line6->properties->name,
> > >   dev_name(line6->ifcdev));
> > 
> > Would not it be better to return -EINVAL (or some other error) instead?
> > 
> > Now you will possibly truncate the name.
> > 
> 
> These don't come from the user, they come from the kernel.
> 
> drivers/staging/line6/driver.c
> 59  
> 60  #define L6PROP(dev_bit, dev_id, dev_name, dev_cap)\
> 61  {.device_bit = LINE6_BIT_##dev_bit, .id = dev_id,\
> 62   .name = dev_name, .capabilities = LINE6_BIT_##dev_cap}
> 63  
> 64  /* *INDENT-OFF* */
> 65  static const struct line6_properties line6_properties_table[] = {
> 66  L6PROP(BASSPODXT, "BassPODxt", "BassPODxt",
> CTRL_PCM_HW),
> 67  L6PROP(BASSPODXTLIVE, "BassPODxtLive", "BassPODxt Live",   
> CTRL_PCM_HW),
> 68  L6PROP(BASSPODXTPRO,  "BassPODxtPro",  "BassPODxt Pro",
> CTRL_PCM_HW),
> 69  L6PROP(GUITARPORT,"GuitarPort","GuitarPort",   
> PCM),
> 70  L6PROP(POCKETPOD, "PocketPOD", "Pocket POD",   
> CONTROL),
> 71  L6PROP(PODHD300,  "PODHD300",  "POD HD300",
> CTRL_PCM_HW),
> 72  L6PROP(PODHD400,  "PODHD400",  "POD HD400",
> CTRL_PCM_HW),
> 73  L6PROP(PODHD500,  "PODHD500",  "POD HD500",
> CTRL_PCM_HW),
> 74  L6PROP(PODSTUDIO_GX,  "PODStudioGX",   "POD Studio GX",
> PCM),
> 75  L6PROP(PODSTUDIO_UX1, "PODStudioUX1",  "POD Studio UX1",   
> PCM),
> 76  L6PROP(PODSTUDIO_UX2, "PODStudioUX2",  "POD Studio UX2",   
> PCM),
> 77  L6PROP(PODX3, "PODX3", "POD X3",   
> PCM),
> 78  L6PROP(PODX3LIVE, "PODX3Live", "POD X3 Live",  
> PCM),
> 79  L6PROP(PODXT, "PODxt", "PODxt",
> CTRL_PCM_HW),
> 80  L6PROP(PODXTLIVE, "PODxtLive", "PODxt Live",   
> CTRL_PCM_HW),
> 81  L6PROP(PODXTPRO,  "PODxtPro",  "PODxt Pro",
> CTRL_PCM_HW),
> 82  L6PROP(TONEPORT_GX,   "TonePortGX","TonePort GX",  
> PCM),
> 83  L6PROP(TONEPORT_UX1,  "TonePortUX1",   "TonePort UX1", 
> PCM),
> 84  L6PROP(TONEPORT_UX2,  "TonePortUX2",   "TonePort UX2", 
> PCM),
> 85  L6PROP(VARIAX,"Variax","Variax Workbench", 
> CONTROL),
> 86  };
> 87  /* *INDENT-ON* */
> 88  
> 
> And sadly enough some of those ->id strings are more than 15 characters
> and a NUL which will fit in card->id.  So this overflow is real.  The
> card->shortname is a 32 char array so none of those overflow.
> 
> If we want to sovle the truncation issue then we need to think of
> shorter names for BassPODxtLive, BassPODxtPro, PODStudioUX1, and
> PODStudioUX2.
> 

In that case I suggest compile time assertions that ids and names fit and
a WARN_ON + -EINVAL in line6_init_audio to catch future offenders.

As a side note I'm not sure if pod_try_init from drivers/staging/line6/pod.c
cleans up properly after failed line6_init_audio.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] drm: make variable named "refcount" atomic, like most refcounts in the kernel.

2014-04-26 Thread Mateusz Guzik
On Sun, Apr 27, 2014 at 12:00:56AM +0100, Al Viro wrote:
> (..)Non-atomic variant would be
>   if (++*p < 0) {
>   --*p;
>   whine
>   send SIGKILL to ourselves
>   }
> which is nowhere near a sane mitigation in this case.  Much saner one would
> be (*IF* the overflow is, indeed, possible and if simple s/int/long/ wouldn't
> be a better fix) something along the lines of
>   mutex_lock(&item->mutex);
>   if (unlikely(item->refcount == INT_MAX)) {
>   ret = -EINVAL;
>   
>   goto out_err;
>   }
>   
> 
> Mind you, I would be quite surprised if it turned out to be even
> theoretically possible to get that overflow here (judging by the look
> of callers, you'll run out of device numbers long before), but that's
> a separate story.
> 
> The point is, your "useful feature" is anything but, when applied without
> careful analysis of the situation; it's *not* a universal improvement.

I would argue even functions doing mere ptr->count++ and so on would be
useful.

For instance people who are fuzzing kernels looking for refcount
leaks/overupts could place low thresholds in these functions (along with
atomic ops) to increase chances that problems will manifest themselves.

(and this would help more people who report such problems)

The kernel locking itself up afterwards is not a problem for them.

That is provided there is enough hand-coded refcount code, if this would
be the only consumer (which will most likely never leak anyway) then this
is defnitely not worth it.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] staging: line6: fix possible overrun

2014-04-27 Thread Mateusz Guzik
On Sun, Apr 27, 2014 at 08:39:32PM +0300, Dan Carpenter wrote:
> On Sat, Apr 26, 2014 at 11:59:46PM +0200, Mateusz Guzik wrote:
> > > And sadly enough some of those ->id strings are more than 15 characters
> > > and a NUL which will fit in card->id.  So this overflow is real.  The
> > > card->shortname is a 32 char array so none of those overflow.
> > > 
> > > If we want to sovle the truncation issue then we need to think of
> > > shorter names for BassPODxtLive, BassPODxtPro, PODStudioUX1, and
> > > PODStudioUX2.
> > > 
> > 
> > In that case I suggest compile time assertions that ids and names fit
> 
> That sounds like some magic code which I would love to see.  :)
> 

Just asserting something on compile time is not a problem.

The kernel has BUILD_BUG_ON macro. I didn't check why, but it doesn't
use _Static_assert. Instead it produces some code which makes it
unusable in this context.

Aforementoined _Static_assert is available at least in gcc and clang and
you can call it outside of any function, e.g.:
_Static_assert(sizeof(meh) < 42, "oh no");

Unfortnately I failed to come up with a macro which would allow me to use
it in the initializer. :/

One could change line6_properties's definition so that it contains
arrays instead of pointers, that would introduce automagic checking and
I don't think memory waste (if any) would be problematic.

> >  and a WARN_ON + -EINVAL in line6_init_audio to catch future
> >  offenders.
> 
> Returning -EINVAL is a bad idea because it would break the driver
> completely and make it unusable.
> 

Well I would vote for returning the error anyway. Something is wrong,
better fix it as it is instead of risking additional bugs resulting
from truncation.


> > 
> > As a side note I'm not sure if pod_try_init from drivers/staging/line6/pod.c
> > cleans up properly after failed line6_init_audio.
> 
> Yeah.  It doesn't seem to clean up at all.
> 

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-15 Thread Mateusz Guzik
On Fri, May 16, 2014 at 08:21:35AM +1000, Dave Chinner wrote:
> On Thu, May 15, 2014 at 12:47:48PM +0200, Mateusz Guzik wrote:
> > On Thu, May 15, 2014 at 12:40:19PM +0200, Lukáš Czerner wrote:
> > > On Wed, 14 May 2014, Eric Sandeen wrote:
> > > 
> > > > Date: Wed, 14 May 2014 17:40:22 -0500
> > > > From: Eric Sandeen 
> > > > Reply-To: sand...@redhat.com
> > > > To: Dave Chinner , Jan Kara 
> > > > Cc: Mateusz Guzik , linux-kernel@vger.kernel.org,
> > > > linux-fsde...@vger.kernel.org, Josef Bacik ,
> > > > Al Viro , Joe Perches 
> > > > Subject: Re: [PATCH V2 2/2] fs: print a message when freezing/unfreezing
> > > > filesystems
> > > > 
> > > > On 5/14/14, 5:37 PM, Dave Chinner wrote:
> > > > > On Thu, May 15, 2014 at 08:00:52AM +1000, Dave Chinner wrote:
> > > > >> On Wed, May 14, 2014 at 01:39:45PM +0200, Jan Kara wrote:
> > > > >>> On Wed 14-05-14 13:26:21, Mateusz Guzik wrote:
> > > > >>>> On Wed, May 14, 2014 at 01:14:49PM +0200, Jan Kara wrote:
> > > > >>>>> On Wed 14-05-14 00:04:43, Mateusz Guzik wrote:
> > > > >>>>>> This helps hang troubleshooting efforts when only dmesg is 
> > > > >>>>>> available.
> > > > >>>>>>
> > > > >>>>>> While here remove code duplication with MS_RDONLY case and fix a
> > > > >>>>>> whitespace nit.
> > > > >>>>>   I'm somewhat undecided here I have to say. On one hand I don't 
> > > > >>>>> like
> > > > >>>>> printing to kernel log when everything is fine and kernel is 
> > > > >>>>> operating
> > > > >>>>> normally. On the other hand I've seen quite a few cases where 
> > > > >>>>> people have
> > > > >>>>> shot themselves in the foot with filesystem freezing so having 
> > > > >>>>> some trace
> > > > >>>>> of this in the log doesn't seem like a completely bad thing 
> > > > >>>>> either. What do
> > > > >>>>> other people think?
> > > > >>>>>
> > > > >>>>
> > > > >>>> I would like to note that the kernel already prints messages when 
> > > > >>>> e.g.
> > > > >>>> filesystems get mounted.
> > > > >>>   Yeah, that's a fair point.
> > > > >>
> > > > >> But filesystems choose to output that info, not the VFS. When you do
> > > > >> a remount,ro there is no output in syslog, because filesystems don't
> > > > >> need to dump any output - the state change is reflected in
> > > > >> /proc/self/mounts. IMO frozen should state should be communicated
> > > > >> the same way so that it is silent when it just works, and the state
> > > > >> can easily be determined when something goes wrong.
> > > > > 
> > > > > Say, like this:
> > > > > 
> > > > > $ grep /mnt/test /proc/mounts
> > > > > /dev/vda /mnt/test xfs rw,relatime,attr2,inode64,noquota 0 0
> > > > > $ sudo xfs_freeze -f /mnt/test
> > > > > $ grep /mnt/test /proc/mounts
> > > > > /dev/vda /mnt/test xfs rw,frozen,relatime,attr2,inode64,noquota 0 0
> > > > > $ sudo xfs_freeze -u /mnt/test
> > > > > $ grep /mnt/test /proc/mounts
> > > > > /dev/vda /mnt/test xfs rw,relatime,attr2,inode64,noquota 0 0
> > > > > $
> > > > 
> > > > I'm not totally convinced that including a non-mount option in what
> > > > has always (?) been a list of mount options is a great idea.
> > > 
> > > I do not like it either. Mixing this together with other mount
> > > options does not seem like a great idea, however we really need a
> > > way to report this and I guess we can not just change the
> > > /proc/self/mounts, or /proc/self/mountinfo format.
> > > 
> > > So what about crating a new file /proc/self/frozen with the list of
> > > frozen file systems in the same format what mounts, or mountinfo has
> > > ?
> > > 
> > 
> > As the time goes on there will be more data to present about given
> > mountpoint. So either mountinfo should be extendable to support this
> > (can't see why not) or new generic file with that property should be
> > provided. I vote for the former.
> > 
> > IOW, a new column in mountinfo. For frozen filesystems it would contain
> > 'frozen_by=[%s]:[%d]' (escaped comm, pid).
> 
> I really don't see that the process that froze the filesystem is
> particularly useful - it many cases that process is long gone (e.g.
> fsfreeze is being used to allow a HW array to take a snapshot). Just
> the fact it is in the process of freezing (if stuck, stack trace in
> sysrq-w should be present) or frozen (freezing process may be long
> gone, and is mostly irrelevant because you're now tracking down why
> a thaw hasn't happened)...
> 

There are deamons which perform freezing and unfreezing on their own.
Thus storing the name along with pid helps to determine whether someone
went behind such daemon's back, or maybe it's the daemon which "forgot" to
unfreeze after all.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-15 Thread Mateusz Guzik
On Fri, May 16, 2014 at 08:51:41AM +1000, Dave Chinner wrote:
> On Fri, May 16, 2014 at 12:34:40AM +0200, Mateusz Guzik wrote:
> > On Fri, May 16, 2014 at 08:21:35AM +1000, Dave Chinner wrote:
> > > > IOW, a new column in mountinfo. For frozen filesystems it would contain
> > > > 'frozen_by=[%s]:[%d]' (escaped comm, pid).
> > > 
> > > I really don't see that the process that froze the filesystem is
> > > particularly useful - it many cases that process is long gone (e.g.
> > > fsfreeze is being used to allow a HW array to take a snapshot). Just
> > > the fact it is in the process of freezing (if stuck, stack trace in
> > > sysrq-w should be present) or frozen (freezing process may be long
> > > gone, and is mostly irrelevant because you're now tracking down why
> > > a thaw hasn't happened)...
> > 
> > There are deamons which perform freezing and unfreezing on their own.
> > Thus storing the name along with pid helps to determine whether someone
> > went behind such daemon's back, or maybe it's the daemon which "forgot" to
> > unfreeze after all.
> 
> Such a daemon should be logging the fact that it's freezing and
> thawing the filesystem. The kernel is not the place to track what
> buggy userspace applications are doing wrong.
> 

Except there is no log entry if /var got frozen (and this is not an
imaginary example). Grabbig a debugger to inspect daemon's state is not
exactly what your typical support associate can or should do.

But this was a side request, I'm not going to argue about including
this since turns out there is a better way.

Somewhere in the thread an idea to log long-standing freezes was
mentioned which would provide sufficient information as far as
troubleshooting this stuff is concerned.

Thanks and sorry :->
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-15 Thread Mateusz Guzik
On Fri, May 16, 2014 at 10:11:56AM +1000, Dave Chinner wrote:
> On Fri, May 16, 2014 at 01:19:09AM +0200, Mateusz Guzik wrote:
> > Except there is no log entry if /var got frozen (and this is not an
> > imaginary example).
> 
> Freezing the filesystem that the freezing daemon logs to is, well, a
> major application architecture fail. Sorry, catering for the lowest
> common denominator (i.e. stupidity) is not an valid argument for
> adding stuff to the kernel
> 

I'm only saying what you can encounter in varous companies. If aiding this
problem the way I proposed is not a good idea (and it turns out there is
a much better way), I'm not insisting.

> > Grabbig a debugger to inspect daemon's state is not
> > exactly what your typical support associate can or should do.
> 
> No, but they can read /proc/self/mountinfo, and grab sysrq-w output.
> And they should be able to read that and tell that there is a freeze
> hang from that info. This "filesystem hang triage 101" stuff
> 
> > But this was a side request, I'm not going to argue about including
> > this since turns out there is a better way.
> > 
> > Somewhere in the thread an idea to log long-standing freezes was
> > mentioned which would provide sufficient information as far as
> 
> You've already got the hung task timer firing when a fs is frozen
> for too long. You'll see processes hung in sb_write_wait(), and that
> tells you the filesystem is frozen. Then look at
> /proc/self/mountinfo to find which fs is frozen
> 

But additional question was what initiated the freeze and it is not
answered by this. Hopefully a warning for long-standing freezes will be
implemented and that will answer the question.

Once more, I'm fine with mere 'frozen' in mountinfo, so I suggest we
drop this now side subject. If you really want to continue we can
discuss this in private. :->

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] kref: warn on uninitialized kref

2014-05-17 Thread Mateusz Guzik
On Sat, May 17, 2014 at 06:53:17AM -0400, Mikulas Patocka wrote:
> I found a memory leak in iSCSI target that was caused by kref initialized
> to zero (the memory object was allocated with kzalloc, kref_init was not
> called and kref_put_spinlock_irqsave was called which changed "0" to "-1"
> and didn't free the object).
> 
> Similar bugs may exist in other kernel areas, so I submit this patch that
> adds a check to kref.h. If the value is zero or negative, we can assume
> that it is uninitialized and we warn about it.
> 
> Signed-off-by: Mikulas Patocka 
> 
> ---
>  include/linux/kref.h |4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> Index: linux-3.15-rc5/include/linux/kref.h
> ===
> --- linux-3.15-rc5.orig/include/linux/kref.h  2013-07-02 22:23:38.0 
> +0200
> +++ linux-3.15-rc5/include/linux/kref.h   2014-05-16 18:56:18.0 
> +0200
> @@ -69,7 +69,7 @@ static inline int kref_sub(struct kref *
>void (*release)(struct kref *kref))
>  {
>   WARN_ON(release == NULL);
> -
> + WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
>   if (atomic_sub_and_test((int) count, &kref->refcount)) {
>   release(kref);
>   return 1;
> @@ -119,6 +119,7 @@ static inline int kref_put_spinlock_irqs
>   unsigned long flags;
>  
>   WARN_ON(release == NULL);
> + WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
>   if (atomic_add_unless(&kref->refcount, -1, 1))
>   return 0;
>   spin_lock_irqsave(lock, flags);
> @@ -136,6 +137,7 @@ static inline int kref_put_mutex(struct 
>struct mutex *lock)
>  {
>   WARN_ON(release == NULL);
> + WARN_ON_ONCE(atomic_read(&kref->refcount) <= 0);
>   if (unlikely(!atomic_add_unless(&kref->refcount, -1, 1))) {
>   mutex_lock(lock);
>   if (unlikely(!atomic_dec_and_test(&kref->refcount))) {

This has a side effect of detecting some overputs, which is nice.

However, could be made better if kref_sub checked that refs you want to
take don't put the count below 0.

i.e.
WARN_ON(count > atomic_read(&kref->refcount));

this also detects your original problem.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs: Cleanup string initializations (char[] instead of char *)

2014-05-17 Thread Mateusz Guzik
On Sat, May 17, 2014 at 05:00:18PM +0200, Manuel Schölling wrote:
> Initializations like 'char *foo = "bar"' will create two variables: a static
> string and a pointer (foo) to that static string. Instead 'char foo[] = "bar"'
> will declare a single variable and will end up in shorter
> assembly (according to Jeff Garzik on the KernelJanitor's TODO list).
> 

This is a greatly oversimplifying things, this may or may not happen.

Out of curiosity I checked my kernel on x86-64 and it has this
optimized:

0xa00a9629 : movabs $0x203a7367616c66,%rcx
crash> ascii 0x203a7367616c66
00203a7367616c66: flags: 


>  fs/binfmt_misc.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
> index b605003..2a10529 100644
> --- a/fs/binfmt_misc.c
> +++ b/fs/binfmt_misc.c
> @@ -419,7 +419,7 @@ static void entry_status(Node *e, char *page)
>  {
>   char *dp;
>   char *status = "disabled";
> - const char * flags = "flags: ";
> + const char flags[] = "flags: ";
>  
>   if (test_bit(Enabled, &e->flags))
>   status = "enabled";

This particular function would be better of with removing this variable
and replacing all pairs like:
sprintf(dp, ...);
dp += strlen(...)

with:
dp += sprintf(dp, ...);

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] i2c: busses: i2c-pxa.c: Fix for possible null pointer dereference

2014-05-17 Thread Mateusz Guzik
On Sat, May 17, 2014 at 07:14:08PM +0200, Rickard Strandqvist wrote:
> There is otherwise a risk of a possible null pointer dereference.
> 
> Was largely found by using a static code analysis program called cppcheck.
> 
> Signed-off-by: Rickard Strandqvist 
> ---
>  drivers/i2c/busses/i2c-pxa.c |4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c
> index bbe6dfb..dbe5ebe 100644
> --- a/drivers/i2c/busses/i2c-pxa.c
> +++ b/drivers/i2c/busses/i2c-pxa.c
> @@ -1269,7 +1269,9 @@ eremap:
>  eclk:
>   kfree(i2c);
>  emalloc:
> - release_mem_region(res->start, resource_size(res));
> + if(res) {
> + release_mem_region(res->start, resource_size(res));
> + }
>   return ret;
>  }
>  

This looks incorrect.

request_mem_region, which I believe is a prerequisite for this function
may not be called when you jump here.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs: Cleanup string initializations (char[] instead of char *)

2014-05-17 Thread Mateusz Guzik
On Sat, May 17, 2014 at 06:21:09PM +0100, Al Viro wrote:
> On Sat, May 17, 2014 at 05:44:28PM +0200, Mateusz Guzik wrote:
> > This particular function would be better of with removing this variable
> > and replacing all pairs like:
> > sprintf(dp, ...);
> > dp += strlen(...)
> > 
> > with:
> > dp += sprintf(dp, ...);
> 
> Sigh...  Premature optimisation and all such... (..)

Well, I was interested in getting rid of this error-prone style, which
results in stuff like:
sprintf(dp, "\nmask ");
dp += 6;

... and cleaning up the rest for consistency, will note next time.

I'm new to linux and didn't know about seq_ thingy, will grep some more
next time.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] aio: simplify exit_aio

2014-02-18 Thread Mateusz Guzik
Since there are no other callers which can manipulate ioctx_table at
this stage it is not necessary to reload table pointer each time nor
to traverse it under rcu_read_lock

While here convert the loop to more readable for () and fix the
comment (aio_free_ring -> kill_ioctx).

Signed-off-by: Mateusz Guzik 
---
 fs/aio.c | 33 +++--
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 062a5f6..0701181 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -766,38 +766,35 @@ void exit_aio(struct mm_struct *mm)
 {
struct kioctx_table *table;
struct kioctx *ctx;
-   unsigned i = 0;
+   unsigned i;
 
-   while (1) {
-   rcu_read_lock();
-   table = rcu_dereference(mm->ioctx_table);
-
-   do {
-   if (!table || i >= table->nr) {
-   rcu_read_unlock();
-   rcu_assign_pointer(mm->ioctx_table, NULL);
-   if (table)
-   kfree(table);
-   return;
-   }
+   rcu_read_lock();
+   table = rcu_dereference(mm->ioctx_table);
+   rcu_read_unlock();
 
-   ctx = table->table[i++];
-   } while (!ctx);
+   if (!table)
+   return;
 
-   rcu_read_unlock();
+   for (i = 0; i < table->nr; i++) {
+   ctx = table->table[i];
+   if (!ctx)
+   continue;
 
/*
 * We don't need to bother with munmap() here -
 * exit_mmap(mm) is coming and it'll unmap everything.
-* Since aio_free_ring() uses non-zero ->mmap_size
+* Since kill_ioctx() uses non-zero ->mmap_size
 * as indicator that it needs to unmap the area,
-* just set it to 0; aio_free_ring() is the only
+* just set it to 0; kill_ioctx() is the only
 * place that uses ->mmap_size, so it's safe.
 */
ctx->mmap_size = 0;
 
kill_ioctx(mm, ctx);
}
+
+   rcu_assign_pointer(mm->ioctx_table, NULL);
+   kfree(table);
 }
 
 static void put_reqs_available(struct kioctx *ctx, unsigned nr)
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] staging: line6: fix possible overrun

2014-04-29 Thread Mateusz Guzik
On Tue, Apr 29, 2014 at 04:47:11PM +0200, Takashi Iwai wrote:
> At Mon, 28 Apr 2014 01:44:25 +0300,
> Dan Carpenter wrote:
> > 
> > On Sun, Apr 27, 2014 at 10:00:43PM +0200, Mateusz Guzik wrote:
> > > > >  and a WARN_ON + -EINVAL in line6_init_audio to catch future
> > > > >  offenders.
> > > > 
> > > > Returning -EINVAL is a bad idea because it would break the driver
> > > > completely and make it unusable.
> > > > 
> > > 
> > > Well I would vote for returning the error anyway.
> > 
> > I'm trying to be polite, but you are talking about adding regressions
> > deliberately...
> > 
> > It's very rare for people to deliberately add regressions to the kernel.
> > I have only seen it one time before.
> 
> I don't think Dan would be against returning -EINVAL if all the
> offender codes have been fixed (e.g. truncating strings to fit with
> the fixed arrays) at first.  Then it'd be a good help to catch any
> future bugs.  But, having -EINVAL without fixing the caller side means
> essentially that you're introducing the breakage intentionally
> although you know it certainly breaks, which is obviously bad.
> 
> 

We clearly have a serious miscommunication here (and apparently it
started with me not addressing the concern of complete driver breakage).

line6_init_audio consumers have to be fixed first, no doubt about that.

I was only commenting on catching *future* offenders, which I thought
would implictly mean *afterwards*.

With that in mind it would seem we are in agreement after all. :-)

As far getting this done maybe OP is interested.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] aio: change exit_aio() to load mm->ioctx_table once and avoid rcu_read_lock()

2014-04-29 Thread Mateusz Guzik
On Tue, Apr 29, 2014 at 05:22:22PM -0400, Benjamin LaHaise wrote:
> On Tue, Apr 29, 2014 at 04:42:17PM -0400, Benjamin LaHaise wrote:
> > > Signed-off-by: Oleg Nesterov 
> > 
> > Your patch does not apply because it is whitespace damaged.  Please resend 
> > and verify that it applies with 'git am'.
> 
> Whoops, it's not whitespace damange, but rather that it doesn't apply with 
> the other changes that are queued up in the aio-next tree.  You can find a 
> copy of that tree at git://git.kvack.org/~bcrl/aio-next.git .  The change 
> that conflicts is an additional parameter to kill_ioctx().

While here is there any reason for:
rcu_assign_pointer(mm->ioctx_table, NULL);

Nothing looks at this pointer afterwards and mm is about to be freed.

I thought it would be used to sanity check that everything was cleared
before freeing, but that is nod one and not every pointer is nullified
anyway.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] aio: fix potential leak in aio_run_iocb().

2014-04-30 Thread Mateusz Guzik
On Thu, May 01, 2014 at 03:31:28AM +, Leon Yu wrote:
> iovec should be reclaimed whenever caller of rw_copy_check_uvector() returns,
> but it doesn't hold when failure happens right after aio_setup_vectored_rw().
> 

There is a proposal (wich a patch) to modify semantics of rw_copy_check_uvector
so that it frees stuff on error, taking care of this case as well:

https://lkml.org/lkml/2014/4/25/778

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] staging: rtl8723au: fix potential leak in update_bcn_wps_ie()

2014-05-01 Thread Mateusz Guzik
On Thu, May 01, 2014 at 01:57:27PM +0200, Christian Engelmayer wrote:
> Fix a potential leak in the error path of function update_bcn_wps_ie().
> Make sure that allocated memory for 'pbackup_remainder_ie' is freed
> upon return. Detected by Coverity - CID 1077718.
> 

if (remainder_ielen > 0) {
pbackup_remainder_ie = kmalloc(remainder_ielen, GFP_ATOMIC);
if (pbackup_remainder_ie)
memcpy(pbackup_remainder_ie, premainder_ie,
   remainder_ielen);
}

pwps_ie_src = pmlmepriv->wps_beacon_ie;
if (pwps_ie_src == NULL)
return;


Maybe just check pwps_ie_src earlier?

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/1] mm/zswap.c: add lock helper

2014-05-02 Thread Mateusz Guzik
On Fri, May 02, 2014 at 06:35:10PM +0200, Fabian Frederick wrote:
> &tree->lock is used all over the place
> 
[..]
> + spinlock_t *lock = &tree->lock;
>  
>   if (!tree)
>   return;
>  

Rather fishy, although I'm unsure if this is a real problem.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/2] fs: include device name in error messages about freezing

2014-05-13 Thread Mateusz Guzik
While here use pr_err instead of printk(KERN_ERR...)

Signed-off-by: Mateusz Guzik 
Cc: linux-fsde...@vger.kernel.org
Cc: Josef Bacik 
Cc: Jan Kara 
Cc: Al Viro 
Cc: Eric Sandeen 

---
 fs/super.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 48377f7..017e10a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1323,8 +1323,7 @@ int freeze_super(struct super_block *sb)
if (sb->s_op->freeze_fs) {
ret = sb->s_op->freeze_fs(sb);
if (ret) {
-   printk(KERN_ERR
-   "VFS:Filesystem freeze failed\n");
+   pr_err("VFS:Filesystem %s freeze failed\n", sb->s_id);
sb->s_writers.frozen = SB_UNFROZEN;
smp_wmb();
wake_up(&sb->s_writers.wait_unfrozen);
@@ -1364,8 +1363,7 @@ int thaw_super(struct super_block *sb)
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
-   printk(KERN_ERR
-   "VFS:Filesystem thaw failed\n");
+   pr_err("VFS:Filesystem %s thaw failed\n", sb->s_id);
up_write(&sb->s_umount);
return error;
}
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-13 Thread Mateusz Guzik
This helps hang troubleshooting efforts when only dmesg is available.

Signed-off-by: Mateusz Guzik 
Cc: linux-fsde...@vger.kernel.org
Cc: Josef Bacik 
Cc: Jan Kara 
Cc: Al Viro 
Cc: Eric Sandeen 

---
 fs/super.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 017e10a..581c008 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1289,12 +1289,9 @@ int freeze_super(struct super_block *sb)
return 0;   /* sic - it's "nothing to do" */
}
 
-   if (sb->s_flags & MS_RDONLY) {
+   if (sb->s_flags & MS_RDONLY)
/* Nothing to do really... */
-   sb->s_writers.frozen = SB_FREEZE_COMPLETE;
-   up_write(&sb->s_umount);
-   return 0;
-   }
+   goto out;
 
/* From now on, no new normal writers can start */
sb->s_writers.frozen = SB_FREEZE_WRITE;
@@ -1335,8 +1332,10 @@ int freeze_super(struct super_block *sb)
 * This is just for debugging purposes so that fs can warn if it
 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
 */
+out:
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
up_write(&sb->s_umount);
+   pr_info("VFS:Filesystem %s thawed\n", sb->s_id);
return 0;
 }
 EXPORT_SYMBOL(freeze_super);
@@ -1374,7 +1373,7 @@ out:
smp_wmb();
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
-
+   pr_info("VFS:Filesystem %s thawed\n", sb->s_id);
return 0;
 }
 EXPORT_SYMBOL(thaw_super);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-13 Thread Mateusz Guzik
On Tue, May 13, 2014 at 11:39:31AM -0700, Joe Perches wrote:
> On Tue, 2014-05-13 at 20:31 +0200, Mateusz Guzik wrote:
> > This helps hang troubleshooting efforts when only dmesg is available.
> []
> > diff --git a/fs/super.c b/fs/super.c
> []
> > @@ -1289,12 +1289,9 @@ int freeze_super(struct super_block *sb)
> > return 0;   /* sic - it's "nothing to do" */
> > }
> >  
> > -   if (sb->s_flags & MS_RDONLY) {
> > +   if (sb->s_flags & MS_RDONLY)
> > /* Nothing to do really... */
> > -   sb->s_writers.frozen = SB_FREEZE_COMPLETE;
> > -   up_write(&sb->s_umount);
> > -   return 0;
> > -   }
> > +   goto out;
> 
> Should this block really be part of this patch?
> If so, please add a small note to the commit log.
> 

This is the same code which you can find at the end of the function.
I added the label so that I can write freeze printk only once.

> I'm not sure making the reader goto out for this
> bit of the code needs doing as there are multiple
> blocks with return above here.
> 

There are 2. One just returns an error and second one does not change
s_writers.frozen, so unreezing of something like this would fail.

However, the case which got modified results in unfreezable fs, so not
printing the message for this case would result in unmatched entries in
the log.

> Trivially, when there is a comment in a single line
> like this, then braces are often used too.
>  

Thanks, I'll send a modified patch later. Waiting for some other comments.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-13 Thread Mateusz Guzik
On Tue, May 13, 2014 at 12:00:21PM -0700, Joe Perches wrote:
> On Tue, 2014-05-13 at 20:53 +0200, Mateusz Guzik wrote:
> > This is the same code which you can find at the end of the function.
> > I added the label so that I can write freeze printk only once.
> 
> Yes, but for a read-only filesystem is the message useful?
> 

I see no harm in printing it and it seems more correct given that
s_writers.frozen changed, but I'm fine either way.

btw, I just noticed a copy-pasto - the same message for freezing and
unfreezing found its way into the patch. sigh.
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH V2 1/2] fs: include device name in error messages about freezing

2014-05-13 Thread Mateusz Guzik
While here use pr_err instead of printk(KERN_ERR...)

Signed-off-by: Mateusz Guzik 
Cc: linux-fsde...@vger.kernel.org
Cc: Josef Bacik 
Cc: Jan Kara 
Cc: Al Viro 
Cc: Eric Sandeen 
---
 fs/super.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 48377f7..017e10a 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1323,8 +1323,7 @@ int freeze_super(struct super_block *sb)
if (sb->s_op->freeze_fs) {
ret = sb->s_op->freeze_fs(sb);
if (ret) {
-   printk(KERN_ERR
-   "VFS:Filesystem freeze failed\n");
+   pr_err("VFS:Filesystem %s freeze failed\n", sb->s_id);
sb->s_writers.frozen = SB_UNFROZEN;
smp_wmb();
wake_up(&sb->s_writers.wait_unfrozen);
@@ -1364,8 +1363,7 @@ int thaw_super(struct super_block *sb)
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
-   printk(KERN_ERR
-   "VFS:Filesystem thaw failed\n");
+   pr_err("VFS:Filesystem %s thaw failed\n", sb->s_id);
up_write(&sb->s_umount);
return error;
}
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH V2 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-13 Thread Mateusz Guzik
This helps hang troubleshooting efforts when only dmesg is available.

While here remove code duplication with MS_RDONLY case and fix a whitespace nit.

Signed-off-by: Mateusz Guzik 
Cc: linux-fsde...@vger.kernel.org
Cc: Josef Bacik 
Cc: Jan Kara 
Cc: Al Viro 
Cc: Eric Sandeen 
Cc: Joe Perches 
---
since v1:
fix copy-pasto which found its way into the patch
restore curly brackets in MS_RDONLY case
 fs/super.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index 017e10a..4dd7356 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1291,9 +1291,7 @@ int freeze_super(struct super_block *sb)
 
if (sb->s_flags & MS_RDONLY) {
/* Nothing to do really... */
-   sb->s_writers.frozen = SB_FREEZE_COMPLETE;
-   up_write(&sb->s_umount);
-   return 0;
+   goto out;
}
 
/* From now on, no new normal writers can start */
@@ -1335,8 +1333,10 @@ int freeze_super(struct super_block *sb)
 * This is just for debugging purposes so that fs can warn if it
 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
 */
+out:
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
up_write(&sb->s_umount);
+   pr_info("VFS:Filesystem %s frozen\n", sb->s_id);
return 0;
 }
 EXPORT_SYMBOL(freeze_super);
@@ -1374,7 +1374,7 @@ out:
smp_wmb();
wake_up(&sb->s_writers.wait_unfrozen);
deactivate_locked_super(sb);
-
+   pr_info("VFS:Filesystem %s thawed\n", sb->s_id);
return 0;
 }
 EXPORT_SYMBOL(thaw_super);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] x86: kaslr to avoid setup_data regions

2014-05-14 Thread Mateusz Guzik
On Wed, May 14, 2014 at 04:02:01PM +0800, Dave Young wrote:
> X86 will pass setup_data region while necessary, these regions could be
> overwitten by kernel due to kaslr.
> 
> Thus iterate and add setup regions to mem_avoid[] in this patch.
> Set max mem avoid entries 32, hopefully it will be enough.
> 
> Signed-off-by: Dave Young 
> ---
>  arch/x86/boot/compressed/aslr.c |   30 --
>  1 file changed, 28 insertions(+), 2 deletions(-)
> 
> Index: linux-2.6/arch/x86/boot/compressed/aslr.c
> ===
> --- linux-2.6.orig/arch/x86/boot/compressed/aslr.c
> +++ linux-2.6/arch/x86/boot/compressed/aslr.c
> @@ -110,8 +110,9 @@ struct mem_vector {
>   unsigned long size;
>  };
>  
> -#define MEM_AVOID_MAX 5
> +#define MEM_AVOID_MAX 32
>  static struct mem_vector mem_avoid[MEM_AVOID_MAX];
> +static int mem_avoid_nr;
>  
>  static bool mem_contains(struct mem_vector *region, struct mem_vector *item)
>  {
> @@ -135,6 +136,28 @@ static bool mem_overlaps(struct mem_vect
>   return true;
>  }
>  
> +static void mem_avoid_setup_data(void)
> +{
> + struct setup_data *data;
> + u64 pa_data;
> +
> + if (mem_avoid_nr >= MEM_AVOID_MAX) {
> + debug_putstr("KASLR: too many setup_data ranges.\n");
> + return;
> + }
> +
> + pa_data = real_mode->hdr.setup_data;
> + while (pa_data) {
> + data = (struct setup_data *)pa_data;
> + if (pa_data < CONFIG_RANDOMIZE_BASE_MAX_OFFSET) {
> + mem_avoid[mem_avoid_nr].start = pa_data;
> + mem_avoid[mem_avoid_nr].size = sizeof(*data) + 
> data->len;
> + mem_avoid_nr++;
> + }
> + pa_data = data->next;
> + }
> +}
> +

I have no idea if this real_mode->hdr.setup_data examination is correct,
so not commenting on that.

What prevents the loop from overflowing the table?

Additinally reaching the limit should at least result in a big fat warning.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-14 Thread Mateusz Guzik
On Wed, May 14, 2014 at 01:14:49PM +0200, Jan Kara wrote:
> On Wed 14-05-14 00:04:43, Mateusz Guzik wrote:
> > This helps hang troubleshooting efforts when only dmesg is available.
> > 
> > While here remove code duplication with MS_RDONLY case and fix a
> > whitespace nit.
>   I'm somewhat undecided here I have to say. On one hand I don't like
> printing to kernel log when everything is fine and kernel is operating
> normally. On the other hand I've seen quite a few cases where people have
> shot themselves in the foot with filesystem freezing so having some trace
> of this in the log doesn't seem like a completely bad thing either. What do
> other people think?
> 

I would like to note that the kernel already prints messages when e.g.
filesystems get mounted.

And yes, situations where I/O is frozen and hung task detector comes into
play happen more often than you may think. I can hide these behind a
sysctl if this helps.

Finally, somewhat related to the first mail, my preferred message format
(which I was somewhat reluctant to propose earlier) would be:

VFS (%s): Filesystem frozen by %s:%d

with device, process and pid respectively.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-15 Thread Mateusz Guzik
On Thu, May 15, 2014 at 07:54:57AM +1000, Dave Chinner wrote:
> On Tue, May 13, 2014 at 08:31:02PM +0200, Mateusz Guzik wrote:
> > This helps hang troubleshooting efforts when only dmesg is available.
> 
> I really don't think that spamming dmesg every time a filesystem is
> frozen or thawed is a good idea. This happens a *lot* when systems
> are using snapshots, and for the most part nobody cares about
> freeze/thaw cycles because they almost always work just fine.
> 

I agree it may get noisy.

> I'd think that /proc/self/mounts would be a much better place to
> indicate that the fs is frozen. After all, that's where we tell
> people whether the filesystem is ro or rw, and frozen is just
> a temporary, non-invasive ro state...
> 

Except you can't inspect /proc/self/mounts when the only thing you got
is dmesg, so this does not really help my case.

That said, I'll try to come up with a different solution.

Poorly reported side-effects of frozen I/O are only a part of the real
problem which is hung task detector being able to typically report
backtraces of "victims" only. I came up with printks becuase these are
a cheap way and would help us out in a lot of cases.

So general idea is to support providing callbacks when setting tasks to
TASK_UNINTERRUPTIBLE which could either tell the user what's up directly
or would perform some heuristics. I'll post this in a separate thread
later, maybe with PoC.

Thanks,
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH V2 2/2] fs: print a message when freezing/unfreezing filesystems

2014-05-15 Thread Mateusz Guzik
On Thu, May 15, 2014 at 12:40:19PM +0200, Lukáš Czerner wrote:
> On Wed, 14 May 2014, Eric Sandeen wrote:
> 
> > Date: Wed, 14 May 2014 17:40:22 -0500
> > From: Eric Sandeen 
> > Reply-To: sand...@redhat.com
> > To: Dave Chinner , Jan Kara 
> > Cc: Mateusz Guzik , linux-kernel@vger.kernel.org,
> > linux-fsde...@vger.kernel.org, Josef Bacik ,
> > Al Viro , Joe Perches 
> > Subject: Re: [PATCH V2 2/2] fs: print a message when freezing/unfreezing
> > filesystems
> > 
> > On 5/14/14, 5:37 PM, Dave Chinner wrote:
> > > On Thu, May 15, 2014 at 08:00:52AM +1000, Dave Chinner wrote:
> > >> On Wed, May 14, 2014 at 01:39:45PM +0200, Jan Kara wrote:
> > >>> On Wed 14-05-14 13:26:21, Mateusz Guzik wrote:
> > >>>> On Wed, May 14, 2014 at 01:14:49PM +0200, Jan Kara wrote:
> > >>>>> On Wed 14-05-14 00:04:43, Mateusz Guzik wrote:
> > >>>>>> This helps hang troubleshooting efforts when only dmesg is available.
> > >>>>>>
> > >>>>>> While here remove code duplication with MS_RDONLY case and fix a
> > >>>>>> whitespace nit.
> > >>>>>   I'm somewhat undecided here I have to say. On one hand I don't like
> > >>>>> printing to kernel log when everything is fine and kernel is operating
> > >>>>> normally. On the other hand I've seen quite a few cases where people 
> > >>>>> have
> > >>>>> shot themselves in the foot with filesystem freezing so having some 
> > >>>>> trace
> > >>>>> of this in the log doesn't seem like a completely bad thing either. 
> > >>>>> What do
> > >>>>> other people think?
> > >>>>>
> > >>>>
> > >>>> I would like to note that the kernel already prints messages when e.g.
> > >>>> filesystems get mounted.
> > >>>   Yeah, that's a fair point.
> > >>
> > >> But filesystems choose to output that info, not the VFS. When you do
> > >> a remount,ro there is no output in syslog, because filesystems don't
> > >> need to dump any output - the state change is reflected in
> > >> /proc/self/mounts. IMO frozen should state should be communicated
> > >> the same way so that it is silent when it just works, and the state
> > >> can easily be determined when something goes wrong.
> > > 
> > > Say, like this:
> > > 
> > > $ grep /mnt/test /proc/mounts
> > > /dev/vda /mnt/test xfs rw,relatime,attr2,inode64,noquota 0 0
> > > $ sudo xfs_freeze -f /mnt/test
> > > $ grep /mnt/test /proc/mounts
> > > /dev/vda /mnt/test xfs rw,frozen,relatime,attr2,inode64,noquota 0 0
> > > $ sudo xfs_freeze -u /mnt/test
> > > $ grep /mnt/test /proc/mounts
> > > /dev/vda /mnt/test xfs rw,relatime,attr2,inode64,noquota 0 0
> > > $
> > 
> > I'm not totally convinced that including a non-mount option in what
> > has always (?) been a list of mount options is a great idea.
> 
> I do not like it either. Mixing this together with other mount
> options does not seem like a great idea, however we really need a
> way to report this and I guess we can not just change the
> /proc/self/mounts, or /proc/self/mountinfo format.
> 
> So what about crating a new file /proc/self/frozen with the list of
> frozen file systems in the same format what mounts, or mountinfo has
> ?
> 

As the time goes on there will be more data to present about given
mountpoint. So either mountinfo should be extendable to support this
(can't see why not) or new generic file with that property should be
provided. I vote for the former.

IOW, a new column in mountinfo. For frozen filesystems it would contain
'frozen_by=[%s]:[%d]' (escaped comm, pid).

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: WARN in do_rt_tgsigqueueinfo()

2014-05-15 Thread Mateusz Guzik
On Thu, May 15, 2014 at 03:38:34PM +0200, Peter Zijlstra wrote:
> Hi Thomas, Oleg,
> 
> 
> trinity$ MALLOC_CHECK_=0 ./trinity -xinit_module -xreboot -xshutdown 
> -xunshare -xnfsservctl -xclock_nanosleep -xuselib -xumount -xmount -m --quiet 
> -C 400 -l off -xmremap
> 
> [watchdog] kernel became tainted! (512/0) Last seed was 4072360471
> 
> [15908.562512] [ cut here ]
> [15908.567245] WARNING: CPU: 22 PID: 3312 at 
> /usr/src/linux-2.6/kernel/signal.c:3060 do_rt_tgsigqueueinfo+0xb4/0xc0()
> [15908.577715] Modules linked in:
> [15908.580857] CPU: 22 PID: 3312 Comm: trinity-c198 Not tainted 
> 3.15.0-rc5-01700-g505011124ad0-dirty #1070
> [15908.590412] Hardware name: Supermicro X8DTN/X8DTN, BIOS 4.6.3 01/08/2010
> [15908.597228]  0009 880428715e40 81682052 
> 
> [15908.604834]  880428715e78 8109808c 0001 
> 
> [15908.612402]  efdcefbf efdcefbf 0129 
> 880428715e88
> [15908.619941] Call Trace:
> [15908.622423]  [] dump_stack+0x4e/0x7a
> [15908.627600]  [] warn_slowpath_common+0x8c/0xc0
> [15908.633658]  [] warn_slowpath_null+0x1a/0x20
> [15908.639535]  [] do_rt_tgsigqueueinfo+0xb4/0xc0
> [15908.645591]  [] SYSC_rt_tgsigqueueinfo+0x61/0x80
> [15908.651851]  [] SyS_rt_tgsigqueueinfo+0xe/0x10
> [15908.657946]  [] system_call_fastpath+0x16/0x1b
> [15908.664036] ---[ end trace 02450728e2526283 ]---
> 
> 
> This is apparently a very common thing to hit according to google.

Well, it warns about an argument received from userland, so nothing fishy
going on as far as kernel consistency is concerned if that's what you mean.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: WARN in do_rt_tgsigqueueinfo()

2014-05-15 Thread Mateusz Guzik
On Thu, May 15, 2014 at 11:54:15AM -0400, Dave Jones wrote:
> On Thu, May 15, 2014 at 05:04:12PM +0200, Peter Zijlstra wrote:
> 
>  > > > trinity$ MALLOC_CHECK_=0 ./trinity -xinit_module -xreboot -xshutdown 
> -xunshare -xnfsservctl -xclock_nanosleep -xuselib -xumount -xmount -m --quiet 
> -C 400 -l off -xmremap
>  > > > 
>  > > > [watchdog] kernel became tainted! (512/0) Last seed was 4072360471
>  > > > 
>  > > > [15908.562512] [ cut here ]
>  > > > [15908.567245] WARNING: CPU: 22 PID: 3312 at 
> /usr/src/linux-2.6/kernel/signal.c:3060 do_rt_tgsigqueueinfo+0xb4/0xc0()
>  > > > 
>  > > > This is apparently a very common thing to hit according to google.
>  > > 
>  > > Well, it warns about an argument received from userland, so nothing fishy
>  > > going on as far as kernel consistency is concerned if that's what you 
> mean.
>  > 
>  > I hadn't looked that far.. I just manged to hit it two times in a row
>  > while waiting for my bug to trigger.
> 
> There's a ton of 'noise' like this that fuzzing turns up.
> I've been collecting some of the stuff to shut it up..
> 
> http://codemonkey.org.uk/junk/silence-fuzz-testing-noise.patch
> http://codemonkey.org.uk/junk/silence-noisy-deprecated-warnings-while-fuzzing.patch
> 
> I'm of the opinion that a user-triggerable WARN is a bug, but aparently
> not everyone else feels that way.
> 

imho warning sysadmins about userspace using deprecated stuff is fine as
long as users are not able to spam the console with it.

Warning reported here just looks scary and provides no information for
sysadmins so I guess it hould be changed.

I see no value in removing this stuff. If it shows up in production you
most likely want to know. Filtering this out during fuzzing should be an
issue.

my $0,03
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] net: ipv4: current group_info should be put after using.

2014-05-11 Thread Mateusz Guzik
On Sun, Apr 13, 2014 at 10:54:07PM -0400, David Miller wrote:
> From: "Wang, Xiaoming" 
> Date: Mon, 14 Apr 2014 12:30:45 -0400
> 
> > Plug a group_info refcount leak in ping_init.
> > group_info is only needed during initialization and 
> > the code failed to release the reference on exit.
> > While here move grabbing the reference to a place 
> > where it is actually needed.
> > 
> > Signed-off-by: Chuansheng Liu 
> > Signed-off-by: Zhang Dongxing 
> > Signed-off-by: xiaoming wang 
> 
> Applied and queued up for -stable, thanks.

I see this only made its way to 3.12 kernel, what is holding up inclusion
in other kernels? (or maybe this question should be directed at someone
else?)

The bug was introduced with:
commit c319b4d76b9e583a5d88d6bf190e079c4e43213d
Author: Vasiliy Kulikov 
Date:   Fri May 13 10:01:00 2011 +0000
net: ipv4: add IPPROTO_ICMP socket kind

starting with 3.0.

Thanks,
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH] vfs: add closefrom(2) syscall

2014-04-08 Thread Mateusz Guzik
On Tue, Apr 08, 2014 at 03:12:22PM +0800, Zheng Liu wrote:
>  
> +int __close_fds(struct files_struct *files, int lowfd)
> +{
> + struct file *file;
> + struct fdtable *fdt;
> + int fd;
> +
> + if (lowfd < 0)
> + lowfd = 0;
> + spin_lock(&files->file_lock);
> + fdt = files_fdtable(files);
> + if (lowfd >= fdt->max_fds)
> + goto out_unlock;
> + for (fd = lowfd; fd < fdt->max_fds; fd++) {
> + file = fdt->fd[fd];
> + if (!file)
> + continue;
> +
> + rcu_assign_pointer(fdt->fd[fd], NULL);
> + __clear_close_on_exec(fd, fdt);
> + __put_unused_fd(files, fd);
> + spin_unlock(&files->file_lock);
> + filp_close(file, files);
> + spin_lock(&files->file_lock);
> + }
> +
> +out_unlock:
> + spin_unlock(&files->file_lock);
> + return 0;
> +}
> +

Can't comment on the usefulness of the patch, but I would like to note:

1. fdt could be freed after you drop the lock, but you never reload the
pointer, thus this looks like use-after-free
2. most of this looks like __close_fd, maybe some parts could be moved
to an inline function so that code duplication is reduced?

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] net: ipv4: current group_info should be put after using.

2014-04-11 Thread Mateusz Guzik
On Fri, Apr 11, 2014 at 01:37:08PM -0400, Wang, Xiaoming wrote:
> There is a memory leak in ping. Current group_info had been got in 
> ping_init_sock and group_info->usage increased. 
> But the usage hasn't decreased anywhere in ping.
> This will make this group_info never freed and cause memory leak.
> 

Memory leak is only one of possible side-effects, thus I believe commit
message should be adjusted.

This is a typical refcount leak exploitable by unprivileged users, so
side effects can range from nothing through memory leaks and crashes to
possibly privilege escalation.

That said losing ' and cause memory leak' from your commit message
would be fine in my opinion.

See also a nit below.

> ---
>  net/ipv4/ping.c |   11 ---
>  1 files changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
> index f4b19e5..2af7b1f 100644
> --- a/net/ipv4/ping.c
> +++ b/net/ipv4/ping.c
> @@ -255,23 +255,28 @@ int ping_init_sock(struct sock *sk)
>   struct group_info *group_info = get_current_groups();
>   int i, j, count = group_info->ngroups;
>   kgid_t low, high;
> + int ret = 0;
>  
>   inet_get_ping_group_range_net(net, &low, &high);
>   if (gid_lte(low, group) && gid_lte(group, high))
> - return 0;
> + goto out_release_group;
>  

Since group_info is not even used here maybe it would be better to leave
return 0 as it is and call get_current_groups before the loop?

>   for (i = 0; i < group_info->nblocks; i++) {
>   int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
>   for (j = 0; j < cp_count; j++) {
>   kgid_t gid = group_info->blocks[i][j];
>   if (gid_lte(low, gid) && gid_lte(gid, high))
> - return 0;
> + goto out_release_group;
>   }
>  
>   count -= cp_count;
>   }
>  
> - return -EACCES;
> + ret = -EACCES;
> +
> +out_release_group:
> + put_group_info(group_info);
> + return ret;
>  }
>  EXPORT_SYMBOL_GPL(ping_init_sock);
>  

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] net: ipv4: current group_info should be put after using.

2014-04-11 Thread Mateusz Guzik
On Fri, Apr 11, 2014 at 10:35:33AM +0200, Mateusz Guzik wrote:
> On Fri, Apr 11, 2014 at 01:37:08PM -0400, Wang, Xiaoming wrote:
> > There is a memory leak in ping. Current group_info had been got in 
> > ping_init_sock and group_info->usage increased. 
> > But the usage hasn't decreased anywhere in ping.
> > This will make this group_info never freed and cause memory leak.
> > 
> 
> Memory leak is only one of possible side-effects, thus I believe commit
> message should be adjusted.
> 
> This is a typical refcount leak exploitable by unprivileged users, so
> side effects can range from nothing through memory leaks and crashes to
> possibly privilege escalation.
> 
> That said losing ' and cause memory leak' from your commit message
> would be fine in my opinion.
> 

Huh, not sure why I wrote that last sentence, it does not make much
sense. Sorry.

There is a pending CVE request for this bug:
http://seclists.org/oss-sec/2014/q2/97

The bug was introduced with:
commit c319b4d76b9e583a5d88d6bf190e079c4e43213d
Author: Vasiliy Kulikov 
Date:   Fri May 13 10:01:00 2011 +

net: ipv4: add IPPROTO_ICMP socket kind

starting with 3.0 kernel.


> See also a nit below.
> 
> > ---
> >  net/ipv4/ping.c |   11 ---
> >  1 files changed, 8 insertions(+), 3 deletions(-)
> > 
> > diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
> > index f4b19e5..2af7b1f 100644
> > --- a/net/ipv4/ping.c
> > +++ b/net/ipv4/ping.c
> > @@ -255,23 +255,28 @@ int ping_init_sock(struct sock *sk)
> > struct group_info *group_info = get_current_groups();
> > int i, j, count = group_info->ngroups;
> > kgid_t low, high;
> > +   int ret = 0;
> >  
> > inet_get_ping_group_range_net(net, &low, &high);
> > if (gid_lte(low, group) && gid_lte(group, high))
> > -   return 0;
> > +   goto out_release_group;
> >  
> 
> Since group_info is not even used here maybe it would be better to leave
> return 0 as it is and call get_current_groups before the loop?
> 
> > for (i = 0; i < group_info->nblocks; i++) {
> > int cp_count = min_t(int, NGROUPS_PER_BLOCK, count);
> > for (j = 0; j < cp_count; j++) {
> > kgid_t gid = group_info->blocks[i][j];
> > if (gid_lte(low, gid) && gid_lte(gid, high))
> > -   return 0;
> > +   goto out_release_group;
> > }
> >  
> > count -= cp_count;
> > }
> >  
> > -   return -EACCES;
> > +   ret = -EACCES;
> > +
> > +out_release_group:
> > +   put_group_info(group_info);
> > +   return ret;
> >  }
> >  EXPORT_SYMBOL_GPL(ping_init_sock);
> >  
> 
> -- 
> Mateusz Guzik

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] net: ipv4: current group_info should be put after using.

2014-04-11 Thread Mateusz Guzik
On Fri, Apr 11, 2014 at 10:53:21PM -0400, Wang, Xiaoming wrote:
> This is a typical refcount leak exploitable by unprivileged users.
> Current group_info had been got in ping_init_sock and
> group_info->usage increased. But the usage hasn't decreased
> anywhere in ping. This will make this group_info never freed.
> 

The patch is fine, however I had a brainfart with my last sentence about
commit message, sorry for that.

group_info *can be freed* by malicious user while still being pointed to
by something, that's the biggest problem with refcount leaks, therefore
this message needs some reworking.

I think that discussion about various consequences of refcount leak in
commit message is not necessary.

how about:

Plug a group_info refcount leak in ping_init.

group_info is only needed during initialization and the code failed to
release the reference on exit.

While here move grabbing the reference to a place where it is actually
needed.



Please cc: me if you resend the patch.

Thanks,
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2] hid-appleir: Fix kernel panic due to null pointer

2014-06-30 Thread Mateusz Guzik
On Tue, Jul 01, 2014 at 08:39:06AM +0200, Jiri Kosina wrote:
> On Mon, 30 Jun 2014, Nicholas Krause wrote:
> 
> > Fixes a null pointer in appleir_input_configured due to reading
> > into wrong size array. Changed the variable to input_dev->keycodemax.
> 
> This is a stale changelog from the previous buggy version.
> 
> > Signed-off-by: Nicholas Krause 
> > ---
> >  drivers/hid/hid-appleir.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/hid/hid-appleir.c b/drivers/hid/hid-appleir.c
> > index 0e6a42d..cc02df4 100644
> > --- a/drivers/hid/hid-appleir.c
> > +++ b/drivers/hid/hid-appleir.c
> > @@ -272,7 +272,7 @@ static void appleir_input_configured(struct hid_device 
> > *hid,
> > input_dev->evbit[0] = BIT(EV_KEY) | BIT(EV_REP);
> >  
> > memcpy(appleir->keymap, appleir_key_table, sizeof(appleir->keymap));
> > -   for (i = 0; i < ARRAY_SIZE(appleir_key_table); i++)
> > +   for (i = 0; i < appleir->keymap; i++)
> 
> This is wrong. appleir->keymap is an array, and you want to count its 
> elements.
> 

But where the claim of 'null pointer' is coming from (or reading/writing
past the array for that matter)?

Replacing appleir_key_table with appleir->keymap is a noop anyway
because:
unsigned short keymap[ARRAY_SIZE(appleir_key_table)];

although one may argue is slightly nicer since it is used in memcpy line
earlier.
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: lib/argv_split.c : should argv be kfree'ed ?

2014-06-28 Thread Mateusz Guzik
On Sat, Jun 28, 2014 at 11:52:37PM +0200, Toralf Förster wrote:
> /me wonders if this patch is needed here :
> 
> 
> diff --git a/lib/argv_split.c b/lib/argv_split.c
> index e927ed0..7de4cb4 100644
> --- a/lib/argv_split.c
> +++ b/lib/argv_split.c
> @@ -85,6 +85,7 @@ char **argv_split(gfp_t gfp, const char *str, int *argcp)
> *argv++ = argv_str;
> }
> }
> +   kfree (argv);
> *argv = NULL;
> 
>     if (argcp)
> 

No, see argv_free.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: lib/argv_split.c : should argv be kfree'ed ?

2014-06-29 Thread Mateusz Guzik
On Sun, Jun 29, 2014 at 04:40:17PM +0200, Toralf Förster wrote:
> On 06/29/2014 12:04 AM, Mateusz Guzik wrote:
> > On Sat, Jun 28, 2014 at 11:52:37PM +0200, Toralf Förster wrote:
> >> /me wonders if this patch is needed here :
> >>
> >>
> >> diff --git a/lib/argv_split.c b/lib/argv_split.c
> >> index e927ed0..7de4cb4 100644
> >> --- a/lib/argv_split.c
> >> +++ b/lib/argv_split.c
> >> @@ -85,6 +85,7 @@ char **argv_split(gfp_t gfp, const char *str, int *argcp)
> >> *argv++ = argv_str;
> >> }
> >> }
> >> +   kfree (argv);
> >> *argv = NULL;
> >>
> >> if (argcp)
> >>
> > 
> > No, see argv_free.
> > 
> Ah, understood, it is in the responsibility of the caller to avoid the 
> memleak.
> BTW may I ask you about your opinion about this warning of cppcheck in 
> lib/flex_array.c:
> 
> for (part_nr = start_part; part_nr <= end_part; part_nr++) {<--- 
> Memory leak: part
> part = __fa_get_part(fa, part_nr, flags);
> if (!part)
> return -ENOMEM;
> }
> return 0;
> 


static struct flex_array_part *
__fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags)
{
struct flex_array_part *part = fa->parts[part_nr];
if (!part) {
part = kmalloc(sizeof(struct flex_array_part), flags);
if (!part)
return NULL;
if (!(flags & __GFP_ZERO))
memset(part, FLEX_ARRAY_FREE,
sizeof(struct flex_array_part));
fa->parts[part_nr] = part;

    ^^
}
return part;
}


Allocated memory is not leaked. It is stored in 'fa' and is perfectly
reachable afterwards.

'part' in flex_array_prealloc is only used for error checking.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] NFS: populate ->net in mount data when remounting

2014-06-10 Thread Mateusz Guzik
Otherwise the kernel oopses when remounting with IPv6 server because
net is dereferenced in dev_get_by_name.

Use net ns of current thread so that dev_get_by_name does not operate on
foreign ns. Changing the address is prohibited anyway so this should not
affect anything.

Signed-off-by: Mateusz Guzik 
Cc: linux-...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: sta...@vger.kernel.org # 3.4+
---
 fs/nfs/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2cb5694..104ef01 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2248,6 +2248,7 @@ nfs_remount(struct super_block *sb, int *flags, char 
*raw_data)
data->nfs_server.addrlen = nfss->nfs_client->cl_addrlen;
data->version = nfsvers;
data->minorversion = nfss->nfs_client->cl_minorversion;
+   data->net = current->nsproxy->net_ns;
memcpy(&data->nfs_server.address, &nfss->nfs_client->cl_addr,
data->nfs_server.addrlen);
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] sched: fix possible divide by zero in avg_atom calculation

2014-06-14 Thread Mateusz Guzik
proc_sched_show_task does:
if (nr_switches)
do_div(avg_atom, nr_switches);

nr_switches is unsigned long and do_div truncates it to 32 bits, which
means it can test non-zero on e.g. x86-64 and be truncated to zero for
division.

Fix the problem by using div64_ul instead.

As a side effect calculations of avg_atom for big nr_switches are now correct.

Signed-off-by: Mateusz Guzik 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: sta...@vger.kernel.org
---
 kernel/sched/debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 695f977..627b3c3 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -608,7 +608,7 @@ void proc_sched_show_task(struct task_struct *p, struct 
seq_file *m)
 
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
-   do_div(avg_atom, nr_switches);
+   avg_atom = div64_ul(avg_atom, nr_switches);
else
avg_atom = -1LL;
 
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] Check for Null return from logfs_readpage_nolock in btree_write_block

2014-06-16 Thread Mateusz Guzik
On Mon, Jun 16, 2014 at 03:47:01PM -0400, Nicholas Krause wrote:
> Signed-off-by: Nicholas Krause 
> ---
>  fs/logfs/readwrite.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
> index 4814031..adb9233 100644
> --- a/fs/logfs/readwrite.c
> +++ b/fs/logfs/readwrite.c
> @@ -2210,6 +2210,8 @@ void btree_write_block(struct logfs_block *block)
>   page = logfs_get_write_page(inode, block->bix, block->level);
>  
>   err = logfs_readpage_nolock(page);
> + if (!err)
> + return -ENOMEM;
>   BUG_ON(err);
>   BUG_ON(!PagePrivate(page));
>   BUG_ON(logfs_block(page) != block);

This function returns 0 on success, which you turn into error condition
and return -ENOMEM.
But the function returns void, thus it cannot return the error.
It does not allocate anything, thus -ENOMEM would not be appropriate in
the first place.

Since the function returns error, nobody would check the condition in
the first place.

Even if it was not void, it would either have to return the error or
oops on BUG_ON(err).

Read the code more carefully and at least compile-test your changes.
Instructions how to compile the kernel can be found here:
http://kernelnewbies.org/FAQ/KernelCompilation

I would also suggest letting the patch wait few hours and have another
look before sending.

Cheers,
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs: clear close-on-exec flag as part of put_unused_fd()

2013-12-11 Thread Mateusz Guzik
On Wed, Dec 11, 2013 at 10:08:27PM +0100, Yann Droneaud wrote:
> close-on-exec flag is set by get_unused_fd_flags(), it makes
> sense to clear it in the opposite function, eg. put_unused_fd().
> 
> Additionally, since the close_on_exec bit array is always initialized
> to 0, it can be safely assumed that any newly allocated file descriptor
> has close-on-exec flag not set, so there's no need to clear it
> explicitly.
> 
> Signed-off-by: Yann Droneaud 
> ---
>  fs/file.c | 7 +--
>  1 file changed, 1 insertion(+), 6 deletions(-)
> 
> diff --git a/fs/file.c b/fs/file.c
> index 4a78f981557a..e98f5a5b1050 100644
> --- a/fs/file.c
> +++ b/fs/file.c
> @@ -500,8 +500,6 @@ repeat:
>   __set_open_fd(fd, fdt);
>   if (flags & O_CLOEXEC)
>   __set_close_on_exec(fd, fdt);
> - else
> - __clear_close_on_exec(fd, fdt);
>   error = fd;
>  #if 1
>   /* Sanity check */
> @@ -530,6 +528,7 @@ EXPORT_SYMBOL(get_unused_fd_flags);
>  static void __put_unused_fd(struct files_struct *files, unsigned int fd)
>  {
>   struct fdtable *fdt = files_fdtable(files);
> + __clear_close_on_exec(fd, fdt);
>   __clear_open_fd(fd, fdt);
>   if (fd < files->next_fd)
>   files->next_fd = fd;
> @@ -599,7 +598,6 @@ int __close_fd(struct files_struct *files, unsigned fd)
>   if (!file)
>   goto out_unlock;
>   rcu_assign_pointer(fdt->fd[fd], NULL);
> - __clear_close_on_exec(fd, fdt);
>   __put_unused_fd(files, fd);
>   spin_unlock(&files->file_lock);
>   return filp_close(file, files);
> @@ -625,7 +623,6 @@ void do_close_on_exec(struct files_struct *files)
>   set = fdt->close_on_exec[i];
>   if (!set)
>   continue;
> - fdt->close_on_exec[i] = 0;
>   for ( ; set ; fd++, set >>= 1) {
>   struct file *file;
>   if (!(set & 1))
> @@ -806,8 +803,6 @@ static int do_dup2(struct files_struct *files,
>   __set_open_fd(fd, fdt);
>   if (flags & O_CLOEXEC)
>   __set_close_on_exec(fd, fdt);
> - else
> - __clear_close_on_exec(fd, fdt);
>   spin_unlock(&files->file_lock);
>  
>   if (tofree)

>From my reading this will break at least the following:
fd = open(..., .. | O_CLOEXEC);
dup2(whatever, fd);

now fd has O_CLOEXEC even though it should not

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3.10] aio: restore locking of ioctx list on removal

2013-12-05 Thread Mateusz Guzik
Commit 36f5588905c10a8c4568a210d601fe8c3c27e0f0
"aio: refcounting cleanup" resulted in ioctx_lock not being held
during ctx removal, leaving the list susceptible to corruptions.

In mainline kernel the issue went away as a side effect of
db446a08c23d5475e6b08c87acca79ebb20f283c "aio: convert the ioctx list to
table lookup v3".

Fix the problem by restoring appropriate locking.

Signed-off-by: Mateusz Guzik 
Reported-by: Eryu Guan 
Cc: Jeff Moyer 
Cc: Kent Overstreet 
Cc: linux-...@kvack.org
Cc: linux-kernel@vger.kernel.org

---
 fs/aio.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 2bbcacf..ebd06fd 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -423,10 +423,12 @@ static void kill_ioctx_rcu(struct rcu_head *head)
  * when the processes owning a context have all exited to encourage
  * the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 {
if (!atomic_xchg(&ctx->dead, 1)) {
+   spin_lock(&mm->ioctx_lock);
hlist_del_rcu(&ctx->list);
+   spin_unlock(&mm->ioctx_lock);
 
/*
 * It'd be more correct to do this in free_ioctx(), after all
@@ -494,7 +496,7 @@ void exit_aio(struct mm_struct *mm)
 */
ctx->mmap_size = 0;
 
-   kill_ioctx(ctx);
+   kill_ioctx(mm, ctx);
}
 }
 
@@ -852,7 +854,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, 
aio_context_t __user *, ctxp)
if (!IS_ERR(ioctx)) {
ret = put_user(ioctx->user_id, ctxp);
if (ret)
-   kill_ioctx(ioctx);
+   kill_ioctx(current->mm, ioctx);
put_ioctx(ioctx);
}
 
@@ -870,7 +872,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
struct kioctx *ioctx = lookup_ioctx(ctx);
if (likely(NULL != ioctx)) {
-   kill_ioctx(ioctx);
+   kill_ioctx(current->mm, ioctx);
put_ioctx(ioctx);
return 0;
}
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3.10] aio: restore locking of ioctx list on removal

2013-12-06 Thread Mateusz Guzik
On Thu, Dec 05, 2013 at 05:03:47PM -0800, Greg KH wrote:
> On Thu, Dec 05, 2013 at 11:09:02AM +0100, Mateusz Guzik wrote:
> > Commit 36f5588905c10a8c4568a210d601fe8c3c27e0f0
> > "aio: refcounting cleanup" resulted in ioctx_lock not being held
> > during ctx removal, leaving the list susceptible to corruptions.
> > 
> > In mainline kernel the issue went away as a side effect of
> > db446a08c23d5475e6b08c87acca79ebb20f283c "aio: convert the ioctx list to
> > table lookup v3".
> > 
> > Fix the problem by restoring appropriate locking.
> 
> Why can't I just take db446a08c23d5475e6b08c87acca79ebb20f283c instead?
> Does it not work well enough, or is there other issues involved in it
> that would keep it out of stable?
> 
> Also, it seems like the performance increase of that patch would be good
> to have backported, right?
> 

Sorry, should have noted this in my original message:
db446a08c23d5475e6b08c87acca79ebb20f283c is not trivial and applying it
results in some conflicts, in addition to that the patch itself had bugs
which were fixed in:
da90382c2ec367aac88ff6aa76afb659ee0e4235
f30d704fe1244c44a984d3d1f47bc648bcc6c9f7
77d30b14d24e557f89c41980011d72428514d729
d9b2c8714aef102dea95544a8cd9372b21af463f

It may be that the most convienent way to deal with this backport would
be to just sync the file with mainline.

As such, I think backporting is too risky at this stage.

Additionally my understanding of Documentation/stable_kernel_rules.txt
was that somewhat simpler patches are preferred.

So in the end I decided to fix the problem just by adding locking.

Unfortunately at this time I can't volunteer to do the work if
backporting is preferred.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [RFC][PATCH 3/3] audit: Audit proc cmdline value

2014-01-06 Thread Mateusz Guzik
I can't comment on the concept, but have one nit.

On Mon, Jan 06, 2014 at 07:30:30AM -0800, William Roberts wrote:
> +static void audit_log_cmdline(struct audit_buffer *ab, struct task_struct 
> *tsk,
> +  struct audit_context *context)
> +{
> + int res;
> + char *buf;
> + char *msg = "(null)";
> + audit_log_format(ab, " cmdline=");
> +
> + /* Not  cached */
> + if (!context->cmdline) {
> + buf = kmalloc(PATH_MAX, GFP_KERNEL);
> + if (!buf)
> + goto out;
> + res = get_cmdline(tsk, buf, PATH_MAX);
> + /* Ensure NULL terminated */
> + if (buf[res-1] != '\0')
> + buf[res-1] = '\0';

This accesses memory below the buffer if get_cmdline returned 0, which I
believe will be the case when someone jokingly unmaps the area (all
maybe when it is swapped out but can't be swapped in due to I/O errors).

Also since you are just putting 0 in there anyway I don't see much point
in testing for it.

> + context->cmdline = buf;
> + }
> + msg = context->cmdline;
> +out:
> + audit_log_untrustedstring(ab, msg);
> +}
> +



-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: /proc//exe symlink behavior change in >=3.15.

2014-09-07 Thread Mateusz Guzik
On Sat, Sep 06, 2014 at 11:44:32PM +0200, Piotr Karbowski wrote:
> Hi,
> 
> Starting with kernel 3.15 the 'exe' symlink under /proc// acts diffrent
> than it used to in all the pre-3.15 kernels.
> 
> The usecase:
> 
> run /root/testbin (app that just sleeps)
> cp /root/testbin /root/testbin.new
> mv /root/testbin.new /root/testbin
> ls -al /proc/`pidof testbin`/exe
> 
> <=3.14: /root/testbin (deleted)
> >=3.15: /root/testbin.new (deleted)
> 
> Was the change intentional? It does render my system unusable and I failed
> to find a information about such change in the ChangeLog.
> 

It looks like this was already broken for "long" (> DNAME_INLINE_LEN)
names.

Short names share the problem since da1ce0670c14d8 "vfs: add
cross-rename".

The following change to switch_names is the culprit:

-   memcpy(dentry->d_iname, target->d_name.name,
-   target->d_name.len + 1);
-   dentry->d_name.len = target->d_name.len;
-   return;
+   unsigned int i;
+   BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long)));
+   for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) {
+ swap(((long *) &dentry->d_iname)[i],
+   ((long *) &target->d_iname)[i]);
+   }


Dentries can have names from embedded structure or from an external buffer.

If you take a look around you will see the code just swaps pointers for
"both external" case. But this results in the same behavoiur you are seeing.

Not sure how to fix it. Name in 'target' needs to be preserved, but memory
allocation which may be needed for this purpose can fail and switch_names
returns void, just like its callers (not to mention locks held around this).

One crap idea would be to have external buffers with a reference counter.
d_inode would still be set to the buffer and freeing funcs would use
container_of to get to the counter.

I can implement that later if it sounds sane enough.

Note this behaviour seems to be a requirement for cross-rename to work.

At least restoring previous behaviour while keeping cross-rename is not hard,
I can write it later.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v2 1/1] vfs: Respect MS_RDONLY at bind mount creation

2014-08-01 Thread Mateusz Guzik
On Fri, Aug 01, 2014 at 02:12:24PM -0400, Richard Yao wrote:
> `mount -o bind,ro ...` suffers from a silent failure where the readonly
> flag is ignored. The bind mount will be created rw whenever the target
> is rw. Users typically workaround this by remounting readonly, but that
> does not work when you want to define readonly bind mounts in fstab.
> This is a major annoyance when dealing with recursive bind mounts
> because the userland mount command does not expose the option to
> recursively remount a subtree as readonly.
> 
> Signed-off-by: Richard Yao 
> ---
>  fs/namespace.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 182bc41..0d23525 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -1827,11 +1827,12 @@ static bool has_locked_children(struct mount *mnt, 
> struct dentry *dentry)
>   * do loopback mount.
>   */
>  static int do_loopback(struct path *path, const char *old_name,
> - int recurse)
> + unsigned long flags)
>  {
>   struct path old_path;
> - struct mount *mnt = NULL, *old, *parent;
> + struct mount *mnt = NULL, *old, *parent, *m;
>   struct mountpoint *mp;
> + int recurse = flags & MS_REC;
>   int err;
>   if (!old_name || !*old_name)
>   return -EINVAL;
> @@ -1871,6 +1872,10 @@ static int do_loopback(struct path *path, const char 
> *old_name,
>   goto out2;
>   }
>  
> + if (flags & MS_RDONLY)
> + for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
> + mnt_make_readonly(m);
> +
>   mnt->mnt.mnt_flags &= ~MNT_LOCKED;
>  
>   err = graft_tree(mnt, parent, mp);
> @@ -2444,7 +2449,8 @@ long do_mount(const char *dev_name, const char 
> *dir_name,
>   retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
>   data_page);
>   else if (flags & MS_BIND)
> - retval = do_loopback(&path, dev_name, flags & MS_REC);
> + retval = do_loopback(&path, dev_name, flags & (MS_REC |
> +MS_RDONLY));
>   else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
>   retval = do_change_type(&path, flags);
>   else if (flags & MS_MOVE)

I don't really know this code, but have to ask.

Would not it be much better to pass down info about rdonly request to
copy_tree/clone_mnt (perhaps CL_MOUNT_RDONLY flag or a separate flags
argument) and handle it there?

This would avoid fishy-looking traversal before graft_tree, which even
if correct should not be necessary.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH v3 1/1] vfs: Respect MS_RDONLY at bind mount creation

2014-08-02 Thread Mateusz Guzik
On Fri, Aug 01, 2014 at 11:55:12PM -0400, Richard Yao wrote:
> `mount -o bind,ro ...` suffers from a silent failure where the readonly
> flag is ignored. The bind mount will be created rw whenever the target
> is rw. Users typically workaround this by remounting readonly, but that
> does not work when you want to define readonly bind mounts in fstab.
> This is a major annoyance when dealing with recursive bind mounts
> because the userland mount command does not expose the option to
> recursively remount a subtree as readonly.
> 
> Signed-off-by: Richard Yao 
> ---
>  fs/namespace.c | 15 +++
>  fs/pnode.h | 17 +
>  2 files changed, 20 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 182bc41..6b3a566 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -921,6 +921,9 @@ static struct mount *clone_mnt(struct mount *old, struct 
> dentry *root,
>   if (flag & CL_MAKE_SHARED)
>   set_mnt_shared(mnt);
>  
> + if (flag & CL_MAKE_RDONLY)
> + mnt_make_readonly(mnt);
> +

Is not this more heavyweight than necessary?

i.e. would not mnt->mnt.mnt_flags |= MNT_READONLY suffice?

Since this mount point is not attached anywhere there cannot be any
writers to synchronize against (and if there could be some, you would
have to check error code from mnt_make_readonly).

That said, apart from this thingy and cosmetic issues looks good to me,
but note I'm just some random guy commenting on a patch.

>   /* stick the duplicate mount on the same expiry list
>* as the original if that was on one */
>   if (flag & CL_EXPIRE) {
> @@ -1827,11 +1830,13 @@ static bool has_locked_children(struct mount *mnt, 
> struct dentry *dentry)
>   * do loopback mount.
>   */
>  static int do_loopback(struct path *path, const char *old_name,
> - int recurse)
> + unsigned long flags)
>  {
>   struct path old_path;
>   struct mount *mnt = NULL, *old, *parent;
>   struct mountpoint *mp;
> + int recurse = flags & MS_REC;
> + int clflags = (flags & MS_RDONLY) ? CL_MAKE_RDONLY : 0;
>   int err;
>   if (!old_name || !*old_name)
>   return -EINVAL;
> @@ -1862,9 +1867,10 @@ static int do_loopback(struct path *path, const char 
> *old_name,
>   goto out2;
>  
>   if (recurse)
> - mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE);
> + mnt = copy_tree(old, old_path.dentry, CL_COPY_MNT_NS_FILE |
> + clflags);
>   else
> - mnt = clone_mnt(old, old_path.dentry, 0);
> + mnt = clone_mnt(old, old_path.dentry, clflags);
>  
>   if (IS_ERR(mnt)) {
>   err = PTR_ERR(mnt);
> @@ -2444,7 +2450,8 @@ long do_mount(const char *dev_name, const char 
> *dir_name,
>   retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
>   data_page);
>   else if (flags & MS_BIND)
> - retval = do_loopback(&path, dev_name, flags & MS_REC);
> + retval = do_loopback(&path, dev_name, flags & (MS_REC |
> +MS_RDONLY));
>   else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
>   retval = do_change_type(&path, flags);
>   else if (flags & MS_MOVE)
> diff --git a/fs/pnode.h b/fs/pnode.h
> index 4a24635..7d32196 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -20,14 +20,15 @@
>  #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED)
>  #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED)
>  
> -#define CL_EXPIRE0x01
> -#define CL_SLAVE 0x02
> -#define CL_COPY_UNBINDABLE   0x04
> -#define CL_MAKE_SHARED   0x08
> -#define CL_PRIVATE   0x10
> -#define CL_SHARED_TO_SLAVE   0x20
> -#define CL_UNPRIVILEGED  0x40
> -#define CL_COPY_MNT_NS_FILE  0x80
> +#define CL_EXPIRE0x001
> +#define CL_SLAVE 0x002
> +#define CL_COPY_UNBINDABLE   0x004
> +#define CL_MAKE_SHARED   0x008
> +#define CL_PRIVATE   0x010
> +#define CL_SHARED_TO_SLAVE   0x020
> +#define CL_UNPRIVILEGED  0x040
> +#define CL_COPY_MNT_NS_FILE  0x080
> +#define CL_MAKE_RDONLY   0x100
>  
>  #define CL_COPY_ALL  (CL_COPY_UNBINDABLE | CL_COPY_MNT_NS_FILE)
>  
> -- 
> 1.8.5.5
> 

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] scatterlist.h: Change CONFIG_DEBUG_SG for ifdef statement in sg_set_bf

2014-08-02 Thread Mateusz Guzik
On Sat, Aug 02, 2014 at 10:56:13PM -0400, Nicholas Krause wrote:
> This changes the ifdef statement in sg_set_bg to !CONFIG_DEBUG_SG in order
> to avoid a bug with xhci dequence/enquence functions.
> 
> Signed-off-by: Nicholas Krause 
> ---
>  include/linux/scatterlist.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
> index adae88f..62de7b3 100644
> --- a/include/linux/scatterlist.h
> +++ b/include/linux/scatterlist.h
> @@ -111,7 +111,7 @@ static inline struct page *sg_page(struct scatterlist *sg)
>  static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
> unsigned int buflen)
>  {
> -#ifdef CONFIG_DEBUG_SG
> +#ifdef !CONFIG_DEBUG_SG
>   BUG_ON(!virt_addr_valid(buf));
>  #endif

Have you tried compiling this? IIRC you said you would compile your
stuff, what hapened to that?

What exactly were you trying to achieve? Did this BUG_ON detect a
problem on your system and now you are trying to silence it?

The change would be wrong even if it compiled since it would just
execute the assertion only when debug is disabled.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] scatterlist.h: Change CONFIG_DEBUG_SG for ifdef statement in sg_set_bf

2014-08-02 Thread Mateusz Guzik
On Sun, Aug 03, 2014 at 12:31:30AM -0400, Nick Krause wrote:
> On Sat, Aug 2, 2014 at 11:59 PM, Mateusz Guzik  wrote:
> > On Sat, Aug 02, 2014 at 10:56:13PM -0400, Nicholas Krause wrote:
> >> This changes the ifdef statement in sg_set_bg to !CONFIG_DEBUG_SG in order
> >> to avoid a bug with xhci dequence/enquence functions.
> >>
> >> Signed-off-by: Nicholas Krause 
> >> ---
> >>  include/linux/scatterlist.h | 2 +-
> >>  1 file changed, 1 insertion(+), 1 deletion(-)
> >>
> >> diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
> >> index adae88f..62de7b3 100644
> >> --- a/include/linux/scatterlist.h
> >> +++ b/include/linux/scatterlist.h
> >> @@ -111,7 +111,7 @@ static inline struct page *sg_page(struct scatterlist 
> >> *sg)
> >>  static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
> >> unsigned int buflen)
> >>  {
> >> -#ifdef CONFIG_DEBUG_SG
> >> +#ifdef !CONFIG_DEBUG_SG
> >>   BUG_ON(!virt_addr_valid(buf));
> >>  #endif
> >
> > Have you tried compiling this? IIRC you said you would compile your
> > stuff, what hapened to that?
> >
> > What exactly were you trying to achieve? Did this BUG_ON detect a
> > problem on your system and now you are trying to silence it?
> >
> > The change would be wrong even if it compiled since it would just
> > execute the assertion only when debug is disabled.
> >
> > --
> > Mateusz Guzik
> This is the mailing theme I am getting this from,[xhci] kernel BUG at
> include/linux/scatterlist.h:115.
> I hope this answers your question about the BUG_ON and yes I did
> compile check it with make
> M=include/. I also checked usb and usb net directories too.

So how have you verified it tests you change? Why didn't you perform a
full build?

This is a syntax error, I suggest you read up about C preprocessor.

Your change attempts to flip the condition. Now virt_addr_valid(buf) is
tested only with debug disabled. When you enable debug it is suddenly
not tested - definitely does not make sense.

I'm assuming you are talking about https://lkml.org/lkml/2014/7/30/810

If you actually read the thread you will note:
> Looks like I either need specify valid addresses to sg_set_buf(), or
> just make the unit test depend on !CONFIG_DEBUG_SG. 

1. It is acknowleged the problem is in the caller
2. There is a suggestion to ensure that the UNIT TEST is not executed if
CONFIG_DEBUG_SG  is enabled (this part was shortened to "!CONFIG_DEBUG_SG"
but nobody claims you can use this in if/if[n]def statements)

UNIT TEST as in the thingy which resulted in passing down a buffer
failing on this BUG_ON.

There is no suggestion to do anything with sg_set_buf itself.

You were advised several times to find a simpler project. Also people
noted that a "beginner kernel programmer" actually means "seasoned
programmer learning the kernel". It is clear you are not a seasoned
programmer, so why do you insist on doing kernel work?

I can only recommend you play with userspace programs for now. These are
much easier to debug and experiment with, not to mention have a lot lower
entry point.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] scatterlist.h: Change CONFIG_DEBUG_SG for ifdef statement in sg_set_bf

2014-08-03 Thread Mateusz Guzik
of them
were harder to test in runtime, but then why were you trying to address
some obscure issue?

Most of your patches also show you didn't perform due dilligence in
understanding the code you are modifying. This happens with beginners
quite often, I definitely make such mistakes myself. Even experienced
people make mistakes, that's again nothing to be worried about.

The problem is there if you repeatedly make the same mistakes.

Let me repeat:

There is nothing wrong with creating bad code, everyone has to start
somewhere. It is wrong though to send out such code to the world over
and over again, even though recipients repeatedly tell you that:
- the patch is incorrect
AND/OR
- it does not compile
AND
- you should take a step back and focus on something less demanding for
  now

It's great you are interested in kernel development. But in order to do
it, you have to have solid programming skills. Various patches and
questions you sent over last 2 months suggest you don't have them yet.

So people advised you how to get them: userspace development.

Userspace development is well equipped with tools for beginners. It is
way easier to experiment there, there are better debugging tools,
simpler problems to solve (in fact, you can come up with approachable
toy problems at any time).

Some people were rude to you (and some of them apologized). This is
quite unfortunate, but the point still stands: kernel development is not 
a playground for people who are new to programming.

There are trivial problems here and there, but you have to find them
yourself. And it is unlikely if you are not proficient to some extent
already.

So, I strongly recommend you take the advice. The sooner the better.
You are not going to become a proficient programmer in finite time with
your current approach.

This is not a 'go away', this is an actual advice to give you a chance
to improve.

Example programs you can write for fun:
- a tcp proxy using poll, epoll and select. backend can be chosen at
  startup. implement everything yourself, do not use wrappers
- a simple shell supporting output redirection and pipes
- pick your favourite utility from and implement your own version. can
  be echo, cat, rev, kill, strings; then you can move to maybe stat, du,
  ls, chmod

I hope this helps,
-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] xhci: Merge and Update debugging for patches from 3.6 kernel tree

2014-08-05 Thread Mateusz Guzik
On Tue, Aug 05, 2014 at 01:55:34AM -0400, Nick Krause wrote:
> On Tue, Aug 5, 2014 at 1:45 AM, Greg KH  wrote:
> > On Tue, Aug 05, 2014 at 12:56:57AM -0400, Nicholas Krause wrote:
> >> I am adding the fixes to the tree send for adding debugging to the kernel
> >> tree from a patch sent in 2013 on the the 3.6 release. The patch adds
> >> debugging over xhci capable debugging usb ports and needed to be updated
> >> into the latest rc tree. The patch was first sent in this thread,
> >> http://marc.info/?l=linux-usb&m=135948845511047.
> >>
> >> Signed-off-by: Nicholas Krause 
> >
> > If you send one more patch, I am going to have to ask vger.kernel.org to
> > ban you from their servers.  You are actively bothering people and
> > causing problems and wasting time.
> >
> > You have been told _numerous_ times to stop, yet you refuse to listen.
> > You hold the record for getting kicked out of the Eudyptula challenge in
> > a matter of hours, something no one else has ever had happen.  You
> > ignore lots of very valid comments and suggestions, for no good reason.
> > You flail about, making mistakes that are now starting to bother users,
> > which is not acceptable at all.
> >
> > I will not respond to any more of your emails, and ask everyone else to
> > now stop as well.
> >
> > good bye.
> >
> > *plonk*
> >
> > greg k-h
> 
> Greg,
> You haven't even checked my patch, our you just going to assume it's wrong?


I don't know this code and I'm not interested in changing that, so just I'm
just pointing out obvious problems.

1. You somehow managed to have changes like his all over the patch:
-#define TT_HS_OVERHEAD (31 + 94)
-#define TT_DMI_OVERHEAD (25 + 12)
+#define TT_HS_OVERHEAD (31  94)
+#define TT_DMI_OVERHEAD (25  12)

Clearly, this will cause compilation problems. But maybe this weird
corruption happened after you tested your patch?

2.  In the original patch there were some additions to struct xhci_hcd,
e.g. dbg_cap_reg.

Let's look at a random user:
+void xhci_teardown_dbg_cap(struct xhci_hcd *xhci, struct device *dev)
+{
+   u32 val;
+
+   xhci_dbg(xhci, "xhci_teardown_dbg_cap()\n");
+   if (!xhci->dbg_cap_regs)
+   return;

xhci is a pointer to struct xhci_hcd.

Except in your patch all these additions landed in struct xhci_dbg_cap_ctx,
thus this could not possibly compile.

But what's the most important thing here is this: writing a compilable
patch is not a problem, writing a working patch is.

Forward porting, like this one, demands that you:
- understand the patch as applied to the original tree
- understand current state of the tree and how it influences the patch

As such, foward porting is typically not a task for beginners.

I can only recommend one last time you leave the kernel alone for the
time being and focus on userspace.

There. This is my last reply, have fun.

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace

2014-11-05 Thread Mateusz Guzik
 

> + /*
> +  * Only one pid found as the child reaper,
> +  * so current pid namespace do not have sub-namespace,
> +  * return 0 directly.
> +  */
> + if (list_is_singular(pidns_pid_list)) {
> + rc = 0;
> + goto out;
> + }
> +
> + /*
> +  * screen duplicate pids from pidns_pid_list
> +  * and form a new list pidns_pid_tree.
> +  */
> + rc = pidns_list_filter(pidns_pid_list, pidns_pid_tree);
> + if (rc)
> + goto out;
> +
> + return 0;
> +
> +out:
> + free_pidns_list(pidns_pid_list);
> + return rc;
> +}
> +
> +static int nslist_proc_show(struct seq_file *m, void *v)
> +{
> + struct pidns_list *pos;
> + struct pid_namespace *ns, *curr_ns;
> + struct pid *pid;
> + char pid_buf[16];
> + int i, rc;
> +
> + LIST_HEAD(pidns_pid_list);
> + LIST_HEAD(pidns_pid_tree);
> +
> + curr_ns = task_active_pid_ns(current);
> +
> + rc = proc_pidns_list_refresh(curr_ns, &pidns_pid_list, &pidns_pid_tree);
> + if (rc)
> + return rc;
> +
> + /* print pid namespace's hierarchy */
> + list_for_each_entry(pos, &pidns_pid_tree, list) {
> + pid = pos->pid;
> + for (i = curr_ns->level + 1; i <= pid->level; i++) {
> + ns = pid->numbers[i].ns;
> + /* show PID '1' in specific pid ns */
> + snprintf(pid_buf, 16, "%u",
> + pid_vnr(find_pid_ns(1, ns)));
> + seq_printf(m, "%s ", pid_buf);
> + }
> +
> + seq_putc(m, '\n');
> + }
> +
> + free_pidns_list(&pidns_pid_tree);
> +
> + return 0;
> +}
> +
> +static int nslist_proc_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, nslist_proc_show, NULL);
> +}
> +
> +static const struct file_operations proc_nspid_nslist_fops = {
> + .open   = nslist_proc_open,
> + .read   = seq_read,
> + .llseek = seq_lseek,
> + .release= single_release,
> +};
> +
> +static int __init pidns_hierarchy_init(void)
> +{
> + proc_create(NS_HIERARCHY, S_IWUGO,
> + NULL, &proc_nspid_nslist_fops);
> +
> + return 0;
> +}
> +fs_initcall(pidns_hierarchy_init);
> -- 
> 1.9.3
> 

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] gpu: drm: drm_dp_mst_topology.c: Fix improper use of strncat.

2014-10-05 Thread Mateusz Guzik
On Sun, Oct 05, 2014 at 11:46:03PM +0200, Rickard Strandqvist wrote:
> I have now eliminate the need to use the temporary string,
> and therefore also the use of strncat.
> And I think this code is clearer and more effective.
> 
The code is definitely less clear.

> Signed-off-by: Rickard Strandqvist 
> ---
>  drivers/gpu/drm/drm_dp_mst_topology.c |   15 ---
>  1 file changed, 8 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c 
> b/drivers/gpu/drm/drm_dp_mst_topology.c
> index ac3c273..c3f6571 100644
> --- a/drivers/gpu/drm/drm_dp_mst_topology.c
> +++ b/drivers/gpu/drm/drm_dp_mst_topology.c
> @@ -997,17 +997,18 @@ static void build_mst_prop_path(struct drm_dp_mst_port 
> *port,
>   struct drm_dp_mst_branch *mstb,
>   char *proppath)
>  {
> - int i;
> - char temp[8];
> - snprintf(proppath, 255, "mst:%d", mstb->mgr->conn_base_id);
> + int i, len;
> + static const int proppath_len = 255;

Length should be passed by the caller.

> + memset(proppath, 0, proppath_len);

Why?
> + len = snprintf(proppath, proppath_len, "mst:%d", 
> mstb->mgr->conn_base_id);
>   for (i = 0; i < (mstb->lct - 1); i++) {
>   int shift = (i % 2) ? 0 : 4;
>   int port_num = mstb->rad[i / 2] >> shift;
> - snprintf(temp, 8, "-%d", port_num);
> - strncat(proppath, temp, 255);
> + len += snprintf(&proppath[(len < proppath_len ? len : 0)],
> + proppath_len - len, "-%d", 
> port_num);

And if the space ran out you just overwrite from the start of the buffer
and provide negative length.

Since this function returns void (and so does its caller) it is quite
unclear what to do if the space ran out. I would just WARN.

>   }
> - snprintf(temp, 8, "-%d", port->port_num);
> - strncat(proppath, temp, 255);
> + snprintf(&proppath[(len < proppath_len ? len : 0)], proppath_len - len,
> + "-%d", port->port_num);
>  }
>  
>  static void drm_dp_add_port(struct drm_dp_mst_branch *mstb,

-- 
Mateusz Guzik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] fs: add the FIGETFROZEN ioctl call

2016-04-14 Thread Mateusz Guzik
On Thu, Apr 14, 2016 at 09:57:07AM +0200, Florian Margaine wrote:
> This lets userland get the filesystem freezing status, aka whether the
> filesystem is frozen or not. This is so that an application can know if
> it should freeze the filesystem or if it isn't necessary when taking a
> snapshot.

The feature may be useful in general, I don't know.

However, I'm confused why programs would depend on it. If you froze
a particular subsystem, you don't have to check. If you did not, what
prevents whoever originaly froze it from unfreezing as you access it?

As such, maybe the feature you are looking for would count how many
times the fs is frozen.

-- 
Mateusz Guzik


Re: [PATCH] proc: prevent accessing /proc//environ until it's ready

2016-04-28 Thread Mateusz Guzik
On Thu, Apr 28, 2016 at 09:04:18PM +0200, Mathias Krause wrote:
> If /proc//environ gets read before the envp[] array is fully set
> up in create_{aout,elf,elf_fdpic,flat}_tables(), we might end up trying
> to read more bytes than are actually written, as env_start will already
> be set but env_end will still be zero, making the range calculation
> underflow, allowing to read beyond the end of what has been written.
> 
> Fix this as it is done for /proc//cmdline by testing env_end for
> zero. It is, apparently, intentionally set last in create_*_tables().
> 
> This bug was found by the PaX size_overflow plugin that detected the
> arithmetic underflow of 'this_len = env_end - (env_start + src)' when
> env_end is still zero.
> 
> Reported-at: https://forums.grsecurity.net/viewtopic.php?f=3&t=4363
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=116461
> ---
>  fs/proc/base.c |3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index 4f764c2ac1a5..45f2162e55b2 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -955,7 +955,8 @@ static ssize_t environ_read(struct file *file, char 
> __user *buf,
>   struct mm_struct *mm = file->private_data;
>   unsigned long env_start, env_end;
>  
> - if (!mm)
> + /* Ensure the process spawned far enough to have an environment. */
> + if (!mm || !mm->env_end)
>   return 0;
>  
>   page = (char *)__get_free_page(GFP_TEMPORARY);

In this case get_cmdline in mm/util.c should also be patched for
completness. It tests for arg_end, but later accesses env_end.

-- 
Mateusz Guzik


Re: linux-next: Tree for May 2 [WARNING: at fs/dcache.c]

2016-05-02 Thread Mateusz Guzik
On Mon, May 02, 2016 at 07:15:24PM +0900, Sergey Senozhatsky wrote:
> On (05/02/16 18:40), Stephen Rothwell wrote:
> > Hi all,
> > 
> > Changes since 20160429
> 
> Hello,
> 
> [0.368791] [ cut here ]
> [0.368850] WARNING: CPU: 0 PID: 1 at fs/dcache.c:1688 d_set_d_op+0x5e/0xcc
> [0.368911] Modules linked in:
> [0.369002] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 
> 4.6.0-rc6-next-20160502-dbg-5-gf58c9da-dirty #404
> [0.369161]   880133067908 811b8202 
> 
> [0.369371]   880133067948 81039365 
> 0698e5dffe26
> [0.369580]  880132c090c0 81613680 880132c040a0 
> 880132c08000
> [0.369791] Call Trace:
> [0.369846]  [] dump_stack+0x4d/0x63
> [0.369904]  [] __warn+0xb8/0xd3
> [0.369962]  [] warn_slowpath_null+0x18/0x1a
> [0.370021]  [] d_set_d_op+0x5e/0xcc
> [0.370079]  [] simple_lookup+0x2e/0x45

The issue is that 2 macros have the same value:

#define DCACHE_OP_REAL  0x0800

#define DCACHE_PAR_LOOKUP   0x0800 /* being looked up
(with parent locked shared) */

Verified with switching one to 0x1000 and the warning went away.

-- 
Mateusz Guzik


[PATCH] rlimit: locking tidy ups

2016-05-04 Thread Mateusz Guzik
rlimits are stored in task->signal and are guaranteed to remain valid as
long as the task struct is valid. All modifications are protected by
locking task->group_leader. Additionally changes to RLIMIT_CPU need
task->sighand.

do_prlimit takes tasklist_lock, which as a side effect gurantees stable
->sighand however, there is no need to take the lock for any limit other
than RLIMIT_CPU and even then we can get away with locking sighand itself.

proc_pid_limits takes ->sighand lock prior to accessing rlimits, but it
serves no purpose as it does not prevent modifications.

Both functions effectively always perform ->sighand != NULL check, but it
is only of concern when RLIMIT_CPU is being set. ->sighand is only cleared
when the process is reaped, so a dedicated check only makes it less likely
to access limits of a dead process.

As such, eliminate the unneeded check and:
- do_prlimit: stop taking tasklist_lock at all and only lock sighand when
necessary
- proc_pid_limits: lock group leader in order to obtain a stable copy

Signed-off-by: Mateusz Guzik 
---
 fs/proc/base.c |  6 ++
 kernel/sys.c   | 22 ++
 kernel/time/posix-cpu-timers.c |  3 +--
 security/selinux/hooks.c   |  4 +++-
 4 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 704ae63..3d4963e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -618,14 +618,12 @@ static int proc_pid_limits(struct seq_file *m, struct 
pid_namespace *ns,
   struct pid *pid, struct task_struct *task)
 {
unsigned int i;
-   unsigned long flags;
 
struct rlimit rlim[RLIM_NLIMITS];
 
-   if (!lock_task_sighand(task, &flags))
-   return 0;
+   task_lock(task->group_leader);
memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
-   unlock_task_sighand(task, &flags);
+   task_unlock(task->group_leader);
 
/*
 * print the file header
diff --git a/kernel/sys.c b/kernel/sys.c
index 89d5be4..1c8a67d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1361,7 +1361,9 @@ int do_prlimit(struct task_struct *tsk, unsigned int 
resource,
struct rlimit *new_rlim, struct rlimit *old_rlim)
 {
struct rlimit *rlim;
+   unsigned long flags;
int retval = 0;
+   int sighand_locked = 0;
 
if (resource >= RLIM_NLIMITS)
return -EINVAL;
@@ -1373,15 +1375,17 @@ int do_prlimit(struct task_struct *tsk, unsigned int 
resource,
return -EPERM;
}
 
-   /* protect tsk->signal and tsk->sighand from disappearing */
-   read_lock(&tasklist_lock);
-   if (!tsk->sighand) {
-   retval = -ESRCH;
-   goto out;
+   task_lock(tsk->group_leader);
+   if (new_rlim && resource == RLIMIT_CPU &&
+   new_rlim->rlim_cur != RLIM_INFINITY) {
+   if (!lock_task_sighand(tsk, &flags)) {
+   retval = -ESRCH;
+   goto out;
+   }
+   sighand_locked = 1;
}
 
rlim = tsk->signal->rlim + resource;
-   task_lock(tsk->group_leader);
if (new_rlim) {
/* Keep the capable check against init_user_ns until
   cgroups can contain all limits */
@@ -1407,7 +1411,6 @@ int do_prlimit(struct task_struct *tsk, unsigned int 
resource,
if (new_rlim)
*rlim = *new_rlim;
}
-   task_unlock(tsk->group_leader);
 
/*
 * RLIMIT_CPU handling.   Note that the kernel fails to return an error
@@ -1418,8 +1421,11 @@ int do_prlimit(struct task_struct *tsk, unsigned int 
resource,
 if (!retval && new_rlim && resource == RLIMIT_CPU &&
 new_rlim->rlim_cur != RLIM_INFINITY)
update_rlimit_cpu(tsk, new_rlim->rlim_cur);
+
+   if (sighand_locked)
+   unlock_task_sighand(tsk, &flags);
 out:
-   read_unlock(&tasklist_lock);
+   task_unlock(tsk->group_leader);
return retval;
 }
 
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 1cafba8..fc38417 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -23,9 +23,8 @@ void update_rlimit_cpu(struct task_struct *task, unsigned 
long rlim_new)
 {
cputime_t cputime = secs_to_cputime(rlim_new);
 
-   spin_lock_irq(&task->sighand->siglock);
+   lockdep_assert_held(&task->sighand->siglock);
set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL);
-   spin_unlock_irq(&task->sighand->siglock);
 }
 
 static int check_clock(const clockid_t which_clock)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index a86d537

  1   2   >