Re: [PATCH 2/3] resource: Add release_mem_region_adjustable()

2013-04-02 Thread Ram Pai
> + } else if (res->start == start && res->end != end) {
> + /* adjust the start */
> + ret = __adjust_resource(res, end+1,
> + res->end - end);
> + } else if (res->start != start && res->end == end) {
> + /* adjust the end */
> + ret = __adjust_resource(res, res->start,
> + start - res->start);
> + } else {
> + /* split into two entries */
> + new = kzalloc(sizeof(struct resource), GFP_KERNEL);
> + if (!new) {
> + ret = -ENOMEM;
> + break;
> + }
> + new->name = res->name;
> + new->start = end + 1;
> + new->end = res->end;
> + new->flags = res->flags;
> + new->parent = res->parent;
> + new->sibling = res->sibling;
> + new->child = NULL;
> +
> + ret = __adjust_resource(res, res->start,
> + start - res->start);
> + if (ret) {
> + kfree(new);
> + break;
> + }
> + res->sibling = new;
> + }
> +
> + break;
> + }
> +
> + write_unlock(&resource_lock);
> + return ret;
> +}
> +
>  /*
>   * Managed region resource
>   */
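
For orientation, the three branches above handle releasing the range
[start..end] from a busy resource spanning [rs..re] as follows: if
rs == start, the freed chunk is at the front and the entry is shrunk to
[end+1..re]; if re == end, the freed chunk is at the back and the entry
is shrunk to [rs..start-1]; otherwise the freed chunk is in the middle,
so the entry is shrunk to [rs..start-1] and a newly allocated sibling
entry covers [end+1..re].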

-- 
Ram Pai



Re: [PATCH 2/3] resource: Add release_mem_region_adjustable()

2013-04-03 Thread Ram Pai
On Wed, Apr 03, 2013 at 01:55:05PM -0600, Toshi Kani wrote:
> On Wed, 2013-04-03 at 13:37 +0800, Ram Pai wrote:
> > On Tue, Apr 02, 2013 at 10:17:29AM -0600, Toshi Kani wrote:
> > > + while ((res = *p)) {

...snip...

> > > + if (res->start > start || res->end < end) {
> > 
> > This check looks sub-optimal; possibly wrong, to me.  If res->start
> > is greater than 'start', then obviously its sibling's start will
> > also be greater than 'start'. So it will loop through all the
> > resources unnecessarily.
> 
> I think this check is necessary to check if the requested range fits
> into a resource.  It needs to check both sides to verify this.  I will
> add some comment on this check.
> 
> >   you might want something like
> > 
> > if (start >= res->end) {
> 
> I agree that this list is sorted, so we can optimize an error case (i.e.
> no matching entry is found) with an additional check.  I will add the
> following check at the beginning of the while loop.  
> 
> if (res->start >= end)
> break;
> 
> I also realized that the function returns 0 when no matching entry is
> found.  I will change it to return -EINVAL as well.  

ok. this will take care of it.

> 
> > 
> > > + p = &res->sibling;
> > > + continue;
> > > + }
> > > +
> > > + if (!(res->flags & IORESOURCE_MEM)) {
> > > + ret = -EINVAL;
> > > + break;
> > > + }
> > > +
> > > + if (!(res->flags & IORESOURCE_BUSY)) {
> > > + p = &res->child;
> > > + continue;
> > > + }
> > > +
> > > + if (res->start == start && res->end == end) {
> > > + /* free the whole entry */
> > > + *p = res->sibling;
> > > + kfree(res);
> > 
> > This is incomplete. The prev resource's sibling should now point to
> > this resource's sibling. The parent's child has to be updated if
> > this resource is the first child resource. no?
> 
> If this resource is the first child, *p is set to &parent->child.  So,
> it will update the parents' child.

But what if the resource is not the parent's first child? Will it update
the previous sibling's ->sibling?
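
To make the idiom in question concrete, here is a minimal standalone sketch
(plain userspace C, not the kernel code) of how walking with a pointer to
the link, rather than to the node, makes the first-child and middle-of-list
cases identical:

	#include <stdlib.h>

	struct node { int val; struct node *sibling; };

	/* p always points at the link that refers to the current node:
	 * head itself for the first node, &prev->sibling for every later
	 * one.  Unlinking is "*p = n->sibling" in both cases, so there is
	 * no separate prev pointer and no special head handling. */
	static void unlink_val(struct node **head, int val)
	{
		struct node **p = head, *n;

		while ((n = *p)) {
			if (n->val == val) {
				*p = n->sibling;
				free(n);
				return;
			}
			p = &n->sibling;
		}
	}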

-- 
Ram Pai



Re: [PATCH] kernel/resource.c: fix stack overflow in __reserve_region_with_split

2012-08-27 Thread Ram Pai
On Mon, Aug 27, 2012 at 06:47:54PM -0600, T Makphaibulchoke wrote:
> Using a recursive call to try adding a non-conflicting region in the function
> __reserve_region_with_split() could result in a stack overflow in the case
> that the recursive calls are too deep.  Convert the recursive calls to
> an iterative loop to avoid the problem.
> 
> Signed-off-by: T Makphaibulchoke 
> ---
>  kernel/resource.c |   32 ++--
>  1 files changed, 18 insertions(+), 14 deletions(-)
> 
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 34d4588..d6e9f9c 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -768,25 +768,29 @@ static void __init __reserve_region_with_split(struct 
> resource *root,
>   return;
> 
>   res->name = name;
> - res->start = start;
> - res->end = end;
>   res->flags = IORESOURCE_BUSY;
> 
> - conflict = __request_resource(parent, res);
> - if (!conflict)
> - return;
> + while (1) {
> + res->start = start;
> + res->end = end;
> 
> - /* failed, split and try again */
> - kfree(res);
> + conflict = __request_resource(parent, res);
> + if (!conflict)
> + break;
> 
> - /* conflict covered whole area */
> - if (conflict->start <= start && conflict->end >= end)
> - return;
> + /* conflict covered whole area */
> + if (conflict->start <= start && conflict->end >= end) {
> + kfree(res);
> + break;
> + }
> +
> + /* failed, split and try again */
> + if (conflict->start > start)
> + end = conflict->start - 1;
> + if (conflict->end < end)
> + start = conflict->end + 1;
> + }

Earlier the code reserved all areas from 'start' to 'end', skipping any
conflicting intermediate regions.  Your patch will reserve just the
first available fragment before the conflicting range, but will not
reserve any fragments after it.

For example:
if the region requested is 1 to 100, but 20-30 is already reserved, then
the earlier behavior would reserve 1-19 and 31-100. With your
patch, it will just reserve 1-19.

RP



Re: [PATCH -v12 02/15] resources: Add probe_resource()

2012-08-29 Thread Ram Pai
On Tue, Aug 28, 2012 at 05:10:43PM -0700, Linus Torvalds wrote:
> On Tue, Aug 28, 2012 at 10:05 AM, Linus Torvalds
>  wrote:
> >
> > Ugh. Ok, looking closer at this,
> 
> Btw, looking at that code, I also found what looks like a potential
> locking bug in allocate_resource().
> 
> The code does
> 
> if (new->parent)
>.. reallocate ..
> 
> to check whether a resource was already allocated. HOWEVER, it does so
> without actually holding the resource lock. Which means that
> "new->parent" might in theory change.
> 

:( it was my mistake.

BTW: adjust_resource() also has the same problem. It is also
accessing res->parent without holding the lock.

The following patch enhances your patch to fix that potential race too.


 kernel/resource.c |   81 +++++++++++++++++++++++++++++++++++++++++++++++------------------------------------
 1 file changed, 45 insertions(+), 36 deletions(-)

diff --git a/kernel/resource.c b/kernel/resource.c
index 34d4588..427ed48 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -462,50 +462,59 @@ static int find_resource(struct resource *root, struct 
resource *new,
return  __find_resource(root, NULL, new, size, constraint);
 }
 
-/**
- * reallocate_resource - allocate a slot in the resource tree given range & 
alignment.
- * The resource will be relocated if the new size cannot be reallocated in 
the
- * current location.
- *
- * @root: root resource descriptor
- * @old:  resource descriptor desired by caller
- * @newsize: new size of the resource descriptor
- * @constraint: the size and alignment constraints to be met.
- */
-int reallocate_resource(struct resource *root, struct resource *old,
+
+static int __reallocate_resource(struct resource *root, struct resource *old,
resource_size_t newsize,
struct resource_constraint  *constraint)
 {
-   int err=0;
+   int err;
struct resource new = *old;
struct resource *conflict;
 
-   write_lock(&resource_lock);
-
-   if ((err = __find_resource(root, old, &new, newsize, constraint)))
-   goto out;
+   err = __find_resource(root, old, &new, newsize, constraint);
+   if (err)
+   return err;
 
if (resource_contains(&new, old)) {
old->start = new.start;
old->end = new.end;
-   goto out;
+   return 0;
}
 
-   if (old->child) {
-   err = -EBUSY;
-   goto out;
-   }
+   if (old->child)
+   return -EBUSY;
 
if (resource_contains(old, &new)) {
old->start = new.start;
old->end = new.end;
-   } else {
-   __release_resource(old);
-   *old = new;
-   conflict = __request_resource(root, old);
-   BUG_ON(conflict);
+   return 0;
}
-out:
+
+   __release_resource(old);
+   *old = new;
+   conflict = __request_resource(root, old);
+   BUG_ON(conflict);
+   return 0;
+}
+
+/**
+ * reallocate_resource - allocate a slot in the resource tree given range
+ * & alignment. The resource will be relocated if the new size cannot be
+ * reallocated in the current location.
+ *
+ * @root: root resource descriptor
+ * @old:  resource descriptor desired by caller
+ * @newsize: new size of the resource descriptor
+ * @constraint: the size and alignment constraints to be met.
+ */
+int reallocate_resource(struct resource *root, struct resource *old,
+   resource_size_t newsize,
+   struct resource_constraint  *constraint)
+{
+   int err;
+
+   write_lock(&resource_lock);
+   err = __reallocate_resource(root, old, newsize, constraint);
write_unlock(&resource_lock);
return err;
 }
@@ -544,16 +553,16 @@ int allocate_resource(struct resource *root, struct 
resource *new,
constraint.alignf = alignf;
constraint.alignf_data = alignf_data;
 
-   if ( new->parent ) {
+   write_lock(&resource_lock);
+   if (new->parent) {
/* resource is already allocated, try reallocating with
   the new constraints */
-   return reallocate_resource(root, new, size, &constraint);
+   err = __reallocate_resource(root, new, size, &constraint);
+   } else {
+   err = find_resource(root, new, size, &constraint);
+   if (err >= 0 && __request_resource(root, new))
+   err = -EBUSY;
}
-
-   write_lock(&resource_lock);
-   err = find_resource(root, new, size, &constraint);
-   if (err >= 0 && __request_resource(root, new))
-   err = -EBUSY;
write_unlock(&resource_lock);
return err;
 }
@@ -718,12 +727,12 @@ void insert_resource_expand_to_fit(struct resource *root, 
struct resource *new)
  */
 int adjust_resource(struct resource *res, resource_size_t start, 
resource_size_t size)
 {
-   struc
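
The refactoring above follows the usual kernel pattern: the body moves into
a __helper() that assumes the lock is already held, and the public wrapper
just takes the lock and delegates, so a lock-holding caller (here
allocate_resource()) can reuse the body without racily inspecting shared
state first.  A minimal standalone sketch of the pattern, using a pthreads
rwlock in place of the kernel's resource_lock:

	#include <pthread.h>

	static pthread_rwlock_t resource_lock = PTHREAD_RWLOCK_INITIALIZER;
	static int shared_state;

	/* Assumes resource_lock is write-held by the caller. */
	static int __change_state(int arg)
	{
		shared_state = arg;
		return 0;
	}

	/* Public entry point: shared state (like new->parent above) is
	 * examined and modified only while the lock is held. */
	int change_state(int arg)
	{
		int err;

		pthread_rwlock_wrlock(&resource_lock);
		err = __change_state(arg);
		pthread_rwlock_unlock(&resource_lock);
		return err;
	}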

Re: [PATCH v2] kernel/resource.c: fix stack overflow in __reserve_region_with_split

2012-09-02 Thread Ram Pai
On Fri, Aug 31, 2012 at 03:04:25PM -0600, T Makphaibulchoke wrote:
> Using a recursive call to try adding a non-conflicting region in the function
> __reserve_region_with_split() could result in a stack overflow in the case
> that the recursive calls are too deep.  Convert the recursive calls to
> an iterative loop to avoid the problem.
> 
> Signed-off-by: T Makphaibulchoke 
> 
> --
> Change since v1:
> * Fixing __reserve_region_with_split() to ensure a region reserve request is
>   satisfied to the fullest extent, minus any overlapping conflicting regions.
> ---
>  kernel/resource.c |   47 +++
>  1 files changed, 35 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 34d4588..f0cdeb6 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct 
> resource *root,
>   struct resource *parent = root;
>   struct resource *conflict;
>   struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
> + struct resource *next_res = NULL;
> 
>   if (!res)
>   return;
> @@ -772,21 +773,43 @@ static void __init __reserve_region_with_split(struct 
> resource *root,
>   res->end = end;
>   res->flags = IORESOURCE_BUSY;
> 
> - conflict = __request_resource(parent, res);
> - if (!conflict)
> - return;
> + while (1) {
> 
> - /* failed, split and try again */
> - kfree(res);
> + conflict = __request_resource(parent, res);
> + if (!conflict) {
> + if (!next_res)
> + break;
> + res = next_res;
> + next_res = NULL;
> + continue;
> + }
> 
> - /* conflict covered whole area */
> - if (conflict->start <= start && conflict->end >= end)
> - return;
> + /* conflict covered whole area */
> + if (conflict->start <= res->start &&
> + conflict->end >= res->end) {
> + kfree(res);
> + WARN_ON(next_res);
> + break;
> + }
> +
> + /* failed, split and try again */
> + if (conflict->start > res->start) {
> + end = res->end;
> + res->end = conflict->start - 1;
> + if (conflict->end < end) {
> + next_res = kzalloc(sizeof(*res), GFP_ATOMIC);
> + if (!next_res) {
> + kfree(res);
> + break;
> + }
> + next_res->start = conflict->end + 1;
> + next_res->end = end;

The new resource's name and flags have to be set here.
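
Concretely, the two missing assignments would be along these lines (this is
what the v3 version of this patch ends up doing):

	next_res->name = name;
	next_res->flags = IORESOURCE_BUSY;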

Otherwise looks correct to me. Certainly some testing will be needed.
RP



Re: [patch] vfs: create /proc//mountinfo

2008-01-31 Thread Ram Pai
On Thu, 2008-01-31 at 10:17 +0100, Miklos Szeredi wrote:
> > > From: Ram Pai <[EMAIL PROTECTED]>

...snipped...

> IDR ids are 'int' but they are always positive (AFAICT), but yeah,
> maybe this is confusing.
> 
> > The new exported-to-everyone dentry_path() probably could do with a bit
> > more documentation - it's the sort of thing which people keep on wanting
> > and using.
> 
> OK.
> 
> > How does dentry_path() differ from d_path() and why do we need both and can
> > we get some sharing/consolidation happening here?

d_path displays the path from the rootfs, whereas dentry_path displays
the path from the root of that filesystem: for a file on a device mounted
at /mnt/usb, for example, d_path gives /mnt/usb/docs/a.txt while
dentry_path gives /docs/a.txt.

> 
> Tried that but not easy, without removing some of the
> microoptimizations in d_path(), which I'm not sure are really
> important, but...

this patch was initially developed with Al Viro. He preferred to keep the
two functions separate. BTW: this patch owes credits to Al Viro for his
initial set of ideas.

> 
> > Why do d_path() and dentry_path() have differing conventions for displaying
> > a deleted file and can we fix that?
> 
> I think Ram chose a different convention in dentry_path() in order to
make sure there was no space in the resulting path.  But spaces would
> be escaped anyway, so this isn't really important.  So yes, this could
> be fixed.


my patch was generated about a year or so back, probably using the
2.6.18 code base which had the "//deleted" convention. That got copied
in my patch. But since then I see that the original code has changed to
use the " (deleted)" convention. 

Yes this patch has to be changed to be consistent with the existing
code. 


> 
> > This patch adds a lot of code which is, I guess, unused if
> > CONFIG_PROC_FS=n.  Fixable?

yes. good observation. I will send a patch with this optimization and
the above mentioned change. 

RP

> 
> Possibly yes.  A good chunk of namespace.c could be surrounded by an
> #ifdef, which would save even more, than was added by this particular
> patch.


> Thanks,
> Miklos



[RFC PATCH] vfs: optimization to /proc//mountinfo patch

2008-02-04 Thread Ram Pai
1) reports a deleted inode in dentry_path() consistently with __d_path()
2) modified __d_path() to use prepend(), reducing the size of __d_path()
3) moved all the functionality that reports mount information in /proc under
CONFIG_PROC_FS.

Could not verify if the code would work with CONFIG_PROC_FS=n, since it was
impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable
CONFIG_PROC_FS.



Signed-off-by: Ram Pai <[EMAIL PROTECTED]>
---
 fs/dcache.c  |   59 +++
 fs/namespace.c   |2 +
 fs/seq_file.c|2 +
 include/linux/dcache.h   |3 ++
 include/linux/seq_file.h |3 ++
 5 files changed, 34 insertions(+), 35 deletions(-)

Index: linux-2.6.23/fs/dcache.c
===
--- linux-2.6.23.orig/fs/dcache.c
+++ linux-2.6.23/fs/dcache.c
@@ -1747,6 +1747,17 @@ shouldnt_be_hashed:
goto shouldnt_be_hashed;
 }
 
+static int prepend(char **buffer, int *buflen, const char *str,
+ int namelen)
+{
+   *buflen -= namelen;
+   if (*buflen < 0)
+   return 1;
+   *buffer -= namelen;
+   memcpy(*buffer, str, namelen);
+   return 0;
+}
+
 /**
  * d_path - return the path of a dentry
  * @dentry: dentry to report
@@ -1768,17 +1779,11 @@ static char *__d_path(struct dentry *den
 {
char * end = buffer+buflen;
char * retval;
-   int namelen;
 
-   *--end = '\0';
-   buflen--;
-   if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
-   buflen -= 10;
-   end -= 10;
-   if (buflen < 0)
+   prepend(&end, &buflen, "\0", 1);
+   if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+   prepend(&end, &buflen, " (deleted)", 10))
goto Elong;
-   memcpy(end, " (deleted)", 10);
-   }
 
if (buflen < 1)
goto Elong;
@@ -1805,13 +1810,10 @@ static char *__d_path(struct dentry *den
}
parent = dentry->d_parent;
prefetch(parent);
-   namelen = dentry->d_name.len;
-   buflen -= namelen + 1;
-   if (buflen < 0)
+   if (prepend(&end, &buflen, dentry->d_name.name,
+   dentry->d_name.len) ||
+   prepend(&end, &buflen, "/", 1))
goto Elong;
-   end -= namelen;
-   memcpy(end, dentry->d_name.name, namelen);
-   *--end = '/';
retval = end;
dentry = parent;
}
@@ -1819,12 +1821,9 @@ static char *__d_path(struct dentry *den
return retval;
 
 global_root:
-   namelen = dentry->d_name.len;
-   buflen -= namelen;
-   if (buflen < 0)
-   goto Elong;
-   retval -= namelen-1;/* hit the slash */
-   memcpy(retval, dentry->d_name.name, namelen);
+   retval += 1;/* hit the slash */
+   if (prepend(&retval, &buflen, dentry->d_name.name, dentry->d_name.len))
+   goto Elong;
return retval;
 Elong:
return ERR_PTR(-ENAMETOOLONG);
@@ -1890,17 +1889,8 @@ char *dynamic_dname(struct dentry *dentr
return memcpy(buffer, temp, sz);
 }
 
-static int prepend(char **buffer, int *buflen, const char *str,
- int namelen)
-{
-   *buflen -= namelen;
-   if (*buflen < 0)
-   return 1;
-   *buffer -= namelen;
-   memcpy(*buffer, str, namelen);
-   return 0;
-}
 
+#ifdef CONFIG_PROC_FS
 /*
  * Write full pathname from the root of the filesystem into the buffer.
  */
@@ -1910,11 +1900,9 @@ char *dentry_path(struct dentry *dentry,
char *retval;
 
spin_lock(&dcache_lock);
-   prepend(&end, &buflen, "\0", 1);
-   if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
-   if (prepend(&end, &buflen, "//deleted", 9))
+   if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+   prepend(&end, &buflen, " (deleted)", 10))
goto Elong;
-   }
if (buflen < 1)
goto Elong;
/* Get '/' right */
@@ -1943,6 +1931,7 @@ Elong:
spin_unlock(&dcache_lock);
return ERR_PTR(-ENAMETOOLONG);
 }
+#endif /* CONFIG_PROC_FS */
 
 /*
  * NOTE! The user-level library version returns a
Index: linux-2.6.23/fs/namespace.c
===
--- linux-2.6.23.orig/fs/namespace.c
+++ linux-2.6.23/fs/namespace.c
@@ -609,6 +609,7 @@ void mnt_unpin(struct vfsmount *mnt)
 
 EXPORT_SYMBOL(mnt_unpin);
 
+#ifdef CONFIG_PROC_FS
 /* iterator */
 static 
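
For reference, the prepend() helper introduced in this patch builds the path
right-to-left from the end of a caller-supplied buffer; a minimal standalone
illustration (ordinary userspace C, not the kernel code):

	#include <stdio.h>
	#include <string.h>

	static int prepend(char **buffer, int *buflen, const char *str,
			   int namelen)
	{
		*buflen -= namelen;
		if (*buflen < 0)
			return 1;	/* out of room: caller reports -ENAMETOOLONG */
		*buffer -= namelen;
		memcpy(*buffer, str, namelen);
		return 0;
	}

	int main(void)
	{
		char buf[32];
		char *end = buf + sizeof(buf);
		int buflen = sizeof(buf);

		/* build "/usr/bin" from the leaf upwards, as __d_path() does */
		prepend(&end, &buflen, "\0", 1);
		prepend(&end, &buflen, "bin", 3);
		prepend(&end, &buflen, "/", 1);
		prepend(&end, &buflen, "usr", 3);
		prepend(&end, &buflen, "/", 1);
		printf("%s\n", end);	/* prints /usr/bin */
		return 0;
	}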

Re: [RFC PATCH] vfs: optimization to /proc//mountinfo patch

2008-02-10 Thread Ram Pai
On Mon, 2008-02-04 at 01:28 -0800, Andrew Morton wrote:
> On Mon, 04 Feb 2008 01:15:05 -0800 Ram Pai <[EMAIL PROTECTED]> wrote:
> 
> > 1) reports deleted inode in dentry_path() consistent with that in __d_path()
> > 2) modified __d_path() to use prepend(), reducing the size of __d_path()
> > 3) moved all the functionality that reports mount information in /proc under
> > CONFIG_PROC_FS.
> > 
> > Could not verify if the code would work with CONFIG_PROC_FS=n, since it was
> > impossible to disable CONFIG_PROC_FS. Looking for ideas on how to disable
> > CONFIG_PROC_FS.

this worked. thanks. There was one place in ipv4 that failed compilation
with proc_fs disabled. Fixed that. Otherwise everything compiled
cleanly.

> 
> Do `make menuconfig', then hit '/' and search for "proc_fs".
> 
> It'll tell you that you need to set EMBEDDED=y to disable procfs.
> 
> >  fs/dcache.c  |   59 
> > +++
> >  fs/namespace.c   |2 +
> >  fs/seq_file.c|2 +
> >  include/linux/dcache.h   |3 ++
> >  include/linux/seq_file.h |3 ++
> 
> Please resend after testing that, thanks.

with procfs disabled, the boot fails, since nash (Fedora's startup
script), which links with the libc.so library, has dependencies on /proc.
Nash segfaults and so does init.  Looking for ideas.

without passing this hurdle, its hard to test the patch :(
RP



Re: [RFC v2 05/10] vfs: introduce one hash table

2012-09-25 Thread Ram Pai
On Sun, Sep 23, 2012 at 08:56:30PM +0800, zwu.ker...@gmail.com wrote:
> From: Zhi Yong Wu 
> 
>   Adds a hash table structure which contains
> many hash lists and is used to efficiently
> look up the data temperature of a file or its
> ranges.
>   In each hash list of the hash table, the hash nodes
> keep track of temperature info.
> 
> Signed-off-by: Zhi Yong Wu 
> ---
>  fs/hot_tracking.c|   77 -
>  include/linux/hot_tracking.h |   35 +++
>  2 files changed, 110 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/hot_tracking.c b/fs/hot_tracking.c
> index fa89f70..5f96442 100644
> --- a/fs/hot_tracking.c
> +++ b/fs/hot_tracking.c
> @@ -16,6 +16,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -24,6 +25,9 @@

...snip...

> +/* Hash list heads for hot hash table */
> +struct hot_hash_head {
> + struct hlist_head hashhead;
> + rwlock_t rwlock;
> + u32 temperature;
> +};
> +
> +/* Nodes stored in each hash list of hash table */
> +struct hot_hash_node {
> + struct hlist_node hashnode;
> + struct list_head node;
> + struct hot_freq_data *hot_freq_data;
> + struct hot_hash_head *hlist;
> + spinlock_t lock; /* protects hlist */
> +
> + /*
> +  * number of references to this node
> +  * equals 1 (hashlist entry)
> +  */
> + struct kref refs;
> +};

I don't see why you need yet another datastructure to hash the inode_item
and the range_item into a hash list.  You can just add another
hlist_node in the inode_item and range_item. This field can then be used
to link into the corresponding hash list.

You can use container_of() to get to the inode_item or the range_item
from the hlist_node field.

You can thus eliminate a lot of code.
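
For illustration, the kind of lookup the embedded field enables (a sketch
only; it assumes the field is named head_node as suggested below, and that
root is the struct hot_info from this patch):

	struct hot_inode_item *item;
	struct hlist_node *pos;

	/* walk one bucket; each entry is recovered from its embedded
	 * hlist_node, so no separate hot_hash_node is ever allocated */
	hlist_for_each(pos, &root->heat_inode_hl[i].hashhead) {
		item = container_of(pos, struct hot_inode_item, head_node);
		/* ... use item->hot_freq_data ... */
	}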

> +
>  /* An item representing an inode and its access frequency */
>  struct hot_inode_item {
>   /* node for hot_inode_tree rb_tree */
> @@ -68,6 +93,8 @@ struct hot_inode_item {
>   spinlock_t lock;
>   /* prevents kfree */
>   struct kref refs;
> + /* hashlist node for this inode */
> + struct hot_hash_node *heat_node;

this can be just
struct hlist_node head_node; /* lookup hot_inode hash list */

Use this field to link it into the corresponding hashlist.

>  };
> 
>  /*
> @@ -91,6 +118,8 @@ struct hot_range_item {
>   spinlock_t lock;
>   /* prevents kfree */
>   struct kref refs;
> + /* hashlist node for this range */
> + struct hot_hash_node *heat_node;

this can be just 
struct hlist_node head_node; /* lookup hot_range hash list */


>  };
> 
>  struct hot_info {
> @@ -98,6 +127,12 @@ struct hot_info {
> 
>   /* red-black tree that keeps track of fs-wide hot data */
>   struct hot_inode_tree hot_inode_tree;
> +
> + /* hash map of inode temperature */
> + struct hot_hash_head heat_inode_hl[HEAT_HASH_SIZE];
> +
> + /* hash map of range temperature */
> + struct hot_hash_head heat_range_hl[HEAT_HASH_SIZE];
>  };
> 
>  #endif  /* _LINUX_HOTTRACK_H */



Re: [RFC v2 01/10] vfs: introduce private rb structures

2012-09-25 Thread Ram Pai
On Sun, Sep 23, 2012 at 08:56:26PM +0800, zwu.ker...@gmail.com wrote:
> From: Zhi Yong Wu 
> 
>   One root structure, hot_info, is defined, hooked
> up in super_block, and will be used to hold the rb tree
> roots, hash list roots and some other information, etc.
>   Adds hot_inode_tree struct to keep track of
> frequently accessed files, and be keyed by {inode, offset}.
> Trees contain hot_inode_items representing those files
> and ranges.
>   Having these trees means that vfs can quickly determine the
> temperature of some data by doing some calculations on the
> hot_freq_data struct that hangs off of the tree item.
>   Define two items hot_inode_item and hot_range_item,
> one of them represents one tracked file
> to keep track of its access frequency and the tree of
> ranges in this file, while the latter represents
> a file range of one inode.
>   Each of the two structures contains a hot_freq_data
> struct with its frequency of access metrics (number of
> {reads, writes}, last {read,write} time, frequency of
> {reads,writes}).
>   Also, each hot_inode_item contains one hot_range_tree
> struct which is keyed by {inode, offset, length}
> and used to keep track of all the ranges in this file.
> 
> Signed-off-by: Zhi Yong Wu 
> ---
> +
..snip..

> +/* A tree that sits on the hot_info */
> +struct hot_inode_tree {
> + struct rb_root map;
> + rwlock_t lock;
> +};
> +
> +/* A tree of ranges for each inode in the hot_inode_tree */
> +struct hot_range_tree {
> + struct rb_root map;
> + rwlock_t lock;
> +};

You can as well have a single generic datastructure called hot_tree
instead of two different datastructures which are basically the same.
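
Something like the following, derived directly from the two definitions
above:

	/* one tree type serves both the inode tree and the range trees */
	struct hot_tree {
		struct rb_root map;
		rwlock_t lock;
	};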

> +
> +/* A frequency data struct holds values that are used to
> + * determine temperature of files and file ranges. These structs
> + * are members of hot_inode_item and hot_range_item
> + */
> +struct hot_freq_data {
> + struct timespec last_read_time;
> + struct timespec last_write_time;
> + u32 nr_reads;
> + u32 nr_writes;
> + u64 avg_delta_reads;
> + u64 avg_delta_writes;
> + u8 flags;
> + u32 last_temperature;
> +};
> +
> +/* An item representing an inode and its access frequency */
> +struct hot_inode_item {
> + /* node for hot_inode_tree rb_tree */
> + struct rb_node rb_node;
> + /* tree of ranges in this inode */
> + struct hot_range_tree hot_range_tree;
> + /* frequency data for this inode */
> + struct hot_freq_data hot_freq_data;
> + /* inode number, copied from inode */
> + unsigned long i_ino;
> + /* used to check for errors in ref counting */
> + u8 in_tree;
> + /* protects hot_freq_data, i_no, in_tree */
> + spinlock_t lock;
> + /* prevents kfree */
> + struct kref refs;
> +};
> +
> +/*
> + * An item representing a range inside of an inode whose frequency
> + * is being tracked
> + */
> +struct hot_range_item {
> + /* node for hot_range_tree rb_tree */
> + struct rb_node rb_node;
> + /* frequency data for this range */
> + struct hot_freq_data hot_freq_data;
> + /* the hot_inode_item associated with this hot_range_item */
> + struct hot_inode_item *hot_inode;
> + /* starting offset of this range */
> + u64 start;
> + /* length of this range */
> + u64 len;
> + /* used to check for errors in ref counting */
> + u8 in_tree;
> + /* protects hot_freq_data, start, len, and in_tree */
> + spinlock_t lock;
> + /* prevents kfree */
> + struct kref refs;
> +};

Might as well have just one generic datastructure called hot_item with
all the common fields, and then have

	struct hot_inode_item {
		struct hot_item hot_inode;
		struct hot_tree hot_range_tree;
		unsigned long i_ino;
	};

and

	struct hot_range_item {
		struct hot_item hot_range;
		u64 start;
		u64 len;	/* length of this range */
	};

This should help you eliminate some duplicate code as well.


RP



Re: [PATCH v3] kernel/resource.c: fix stack overflow in __reserve_region_with_split

2012-09-09 Thread Ram Pai
On Fri, Sep 07, 2012 at 04:08:19PM -0600, T Makphaibulchoke wrote:
> Using a recursive call to try adding a non-conflicting region in the function
> __reserve_region_with_split() could result in a stack overflow in the case
> that the recursive calls are too deep.  Convert the recursive calls to
> an iterative loop to avoid the problem.
> 
> Tested on a machine containing 135 regions.  The kernel no longer panicked
> with stack overflow.
> 
> Also tested with code arbitrarily adding regions with no conflict, embedding
> two consecutive conflicts and embedding two non-consecutive conflicts.
> 
> Signed-off-by: T Makphaibulchoke 
> 
> ---
> Change since v2:
> * Initializing both the name and flags fields of the newly allocated region
>   request.
> 
> Change since v1:
> * Fixing __reserve_region_with_split() to ensure a region reserve request is
>   satisfied to the fullest extent, minus any overlapping conflicting regions.
> ---
>  kernel/resource.c |   50 ++
>  1 files changed, 38 insertions(+), 12 deletions(-)
> 
> diff --git a/kernel/resource.c b/kernel/resource.c
> index 34d4588..73f35d4 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct 
> resource *root,
>   struct resource *parent = root;
>   struct resource *conflict;
>   struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
> + struct resource *next_res = NULL;
> 
>   if (!res)
>   return;
> @@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct 
> resource *root,
>   res->end = end;
>   res->flags = IORESOURCE_BUSY;
> 
> - conflict = __request_resource(parent, res);
> - if (!conflict)
> - return;
> + while (1) {
> 
> - /* failed, split and try again */
> - kfree(res);
> + conflict = __request_resource(parent, res);
> + if (!conflict) {
> + if (!next_res)
> + break;
> + res = next_res;
> + next_res = NULL;
> + continue;
> + }
> 
> - /* conflict covered whole area */
> - if (conflict->start <= start && conflict->end >= end)
> - return;
> + /* conflict covered whole area */
> + if (conflict->start <= res->start &&
> + conflict->end >= res->end) {
> + kfree(res);
> + WARN_ON(next_res);
> + break;
> + }
> +
> + /* failed, split and try again */
> + if (conflict->start > res->start) {
> + end = res->end;
> + res->end = conflict->start - 1;
> + if (conflict->end < end) {
> + next_res = kzalloc(sizeof(*next_res),
> + GFP_ATOMIC);
> + if (!next_res) {
> + kfree(res);
> + break;
> + }
> + next_res->name = name;
> + next_res->start = conflict->end + 1;
> + next_res->end = end;
> + next_res->flags = IORESOURCE_BUSY;
> + }
> + } else {
> + res->start = conflict->end + 1;
> + }
> + }
> 
> - if (conflict->start > start)
> - __reserve_region_with_split(root, start, conflict->start-1, 
> name);
> - if (conflict->end < end)
> - __reserve_region_with_split(root, conflict->end+1, end, name);
>  }
> 
>  void __init reserve_region_with_split(struct resource *root,

Reviewed-by: Ram Pai 



Re: [PATCH 33/58] kernel/resource.c: Make internal function reallocate_resource static

2012-11-18 Thread Ram Pai
On Sun, Nov 18, 2012 at 09:28:12PM -0800, Josh Triplett wrote:
> Nothing outside of kernel/resource.c references reallocate_resource, so
> mark it static.  This eliminates warnings from GCC
> (-Wmissing-prototypes) and Sparse (-Wdecl).
> 
> kernel/resource.c:476:5: warning: no previous prototype for 
> ‘reallocate_resource’ [-Wmissing-prototypes]
> 
> Signed-off-by: Josh Triplett 
> ---
>  kernel/resource.c |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/kernel/resource.c b/kernel/resource.c
> index f593f18..0cc07d4 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -473,7 +473,7 @@ static int find_resource(struct resource *root, struct 
> resource *new,
>   * @newsize: new size of the resource descriptor
>   * @constraint: the size and alignment constraints to be met.
>   */
> -int reallocate_resource(struct resource *root, struct resource *old,
> +static int reallocate_resource(struct resource *root, struct resource *old,
>   resource_size_t newsize,
>           struct resource_constraint  *constraint)
>  {

Acked-by: Ram Pai 



Re: [PATCH 2/3] resource: Add release_mem_region_adjustable()

2013-04-06 Thread Ram Pai
On Thu, Apr 04, 2013 at 08:07:44AM -0600, Toshi Kani wrote:
> On Thu, 2013-04-04 at 14:48 +0800, Ram Pai wrote:
> > On Wed, Apr 03, 2013 at 01:55:05PM -0600, Toshi Kani wrote:
> > > On Wed, 2013-04-03 at 13:37 +0800, Ram Pai wrote:
> > > > On Tue, Apr 02, 2013 at 10:17:29AM -0600, Toshi Kani wrote:
> > > > > + while ((res = *p)) {
> > 
> > ...snip...
> > 
> > > > > + if (res->start > start || res->end < end) {
> > > > 
> > > > This check looks sub-optimal; possibly wrong, to me.  If res->start
> > > > is greater than 'start', then obviously its sibling's start will
> > > > also be greater than 'start'. So it will loop through all the
> > > > resources unnecessarily.
> > > 
> > > I think this check is necessary to check if the requested range fits
> > > into a resource.  It needs to check both sides to verify this.  I will
> > > add some comment on this check.
> > > 
> > > >   you might want something like
> > > > 
> > > > if (start >= res->end) {
> > > 
> > > I agree that this list is sorted, so we can optimize an error case (i.e.
> > > no matching entry is found) with an additional check.  I will add the
> > > following check at the beginning of the while loop.  
> > > 
> > > if (res->start >= end)
> > > break;
> > > 
> > > I also realized that the function returns 0 when no matching entry is
> > > found.  I will change it to return -EINVAL as well.  
> > 
> > ok. this will take care of it.
> > 
> > > 
> > > > 
> > > > > + p = &res->sibling;
> > > > > + continue;
> > > > > + }
> > > > > +
> > > > > + if (!(res->flags & IORESOURCE_MEM)) {
> > > > > + ret = -EINVAL;
> > > > > + break;
> > > > > + }
> > > > > +
> > > > > + if (!(res->flags & IORESOURCE_BUSY)) {
> > > > > + p = &res->child;
> > > > > + continue;
> > > > > + }
> > > > > +
> > > > > + if (res->start == start && res->end == end) {
> > > > > + /* free the whole entry */
> > > > > + *p = res->sibling;
> > > > > + kfree(res);
> > > > 
> > > > This is incomplete. The prev resource's sibling should now point to
> > > > this resource's sibling. The parent's child has to be updated if
> > > > this resource is the first child resource. no?
> > > 
> > > If this resource is the first child, *p is set to &parent->child.  So,
> > > it will update the parents' child.
> > 
> > But what if the resource is not the parent's first child? Will it update the
> > previous sibling's ->sibling?
> 
> Yes.  When it continues in the while loop, p is set to &res->sibling.
> So, it will update the previous sibling's ->sibling.

You are right. It does update the pointers correctly. I misread the
code.
RP



Re: [PATCH v3 03/27] PCI: pci resource iterator

2013-04-08 Thread Ram Pai
On Thu, Apr 04, 2013 at 04:18:01PM -0600, Bjorn Helgaas wrote:
> On Wed, Mar 13, 2013 at 5:27 PM, Yinghai Lu  wrote:
> > From: Ram Pai 
> >
> > Currently pci_dev structure holds an array of 17 PCI resources; six base
> > BARs, one ROM BAR, four BRIDGE BARs, six sriov BARs.  This is wasteful.
> > A bridge device just needs the 4 bridge resources. A non-bridge device
> > just needs the six base resources and one ROM resource. The sriov
> > resources are needed only if the device has SRIOV capability.
> >
> > The pci_dev structure needs to be re-organized to avoid unnecessary
> > bloating.  However too much code outside the pci-bus driver, assumes the
> > internal details of the pci_dev structure, thus making it hard to
> > re-organize the datastructure.
> >
> > As a first step this patch provides generic methods to access the
> > resource structure of the pci_dev.
> >
> > Finally we can re-organize the resource structure in the pci_dev
> > structure and correspondingly update the methods.
> >
> > -v2: Consolidated iterator interface as per Bjorn's suggestion.
> > -v3: Add the idx back - Yinghai Lu
> > -v7: Change to use bitmap for searching - Yinghai Lu
> > -v8: Fix acpiphp module compiling error that is found by
> > Steven Newbury  - Yinghai Lu
> >
> > Signed-off-by: Ram Pai 
> > Signed-off-by: Yinghai Lu 
> > ---
> >  drivers/pci/probe.c |   48 
> >  include/linux/pci.h |   24 
> >  2 files changed, 72 insertions(+)
> >
> > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> > index 1df75f7..ac751a6 100644
> > --- a/drivers/pci/probe.c
> > +++ b/drivers/pci/probe.c
> > @@ -123,6 +123,54 @@ int pci_dev_resource_idx(struct pci_dev *dev, struct 
> > resource *res)
> > return -1;
> >  }
> >
> > +static void __init_res_idx_mask(unsigned long *mask, int flag)
> > +{
> > +   bitmap_zero(mask, PCI_NUM_RESOURCES);
> > +   if (flag & PCI_STD_RES)
> > +   bitmap_set(mask, PCI_STD_RESOURCES,
> > +   PCI_STD_RESOURCE_END - PCI_STD_RESOURCES + 1);
> > +   if (flag & PCI_ROM_RES)
> > +   bitmap_set(mask, PCI_ROM_RESOURCE, 1);
> > +#ifdef CONFIG_PCI_IOV
> > +   if (flag & PCI_IOV_RES)
> > +   bitmap_set(mask, PCI_IOV_RESOURCES,
> > +   PCI_IOV_RESOURCE_END - PCI_IOV_RESOURCES + 1);
> > +#endif
> > +   if (flag & PCI_BRIDGE_RES)
> > +   bitmap_set(mask, PCI_BRIDGE_RESOURCES,
> > +   PCI_BRIDGE_RESOURCE_END - PCI_BRIDGE_RESOURCES + 1);
> > +}
> > +
> > +static DECLARE_BITMAP(res_idx_mask[1 << PCI_RES_BLOCK_NUM], 
> > PCI_NUM_RESOURCES);
> > +static int __init pci_res_idx_mask_init(void)
> > +{
> > +   int i;
> > +
> > +   for (i = 0; i < (1 << PCI_RES_BLOCK_NUM); i++)
> > +   __init_res_idx_mask(res_idx_mask[i], i);
> > +
> > +   return 0;
> > +}
> > +postcore_initcall(pci_res_idx_mask_init);
> > +
> > +static inline unsigned long *get_res_idx_mask(int flag)
> > +{
> > +   return res_idx_mask[flag & ((1 << PCI_RES_BLOCK_NUM) - 1)];
> > +}
> > +
> > +int pci_next_resource_idx(int i, int flag)
> > +{
> > +   i++;
> > +   if (i < PCI_NUM_RESOURCES)
> > +   i = find_next_bit(get_res_idx_mask(flag), 
> > PCI_NUM_RESOURCES, i);
> > +
> > +   if (i < PCI_NUM_RESOURCES)
> > +   return i;
> > +
> > +   return -1;
> > +}
> > +EXPORT_SYMBOL(pci_next_resource_idx);
> > +
> >  static u64 pci_size(u64 base, u64 maxbase, u64 mask)
> >  {
> > u64 size = mask & maxbase;  /* Find the significant bits */
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index aefff8b..127a856 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -341,6 +341,30 @@ struct pci_dev {
> >  struct resource *pci_dev_resource_n(struct pci_dev *dev, int n);
> >  int pci_dev_resource_idx(struct pci_dev *dev, struct resource *res);
> >
> > +#define PCI_STD_RES(1<<0)
> > +#define PCI_ROM_RES(1<<1)
> > +#define PCI_IOV_RES(1<<2)
> > +#define PCI_BRIDGE_RES (1<<3)
> > +#define PCI_RES_BLOCK_NUM  4
> > +
> > +#define PCI_ALL_RES(PCI_STD_RES | PCI_ROM_RES | PCI_IOV_RES | PCI_BRIDGE_RES)
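
For context, the intended usage of the iterator would look something like
the sketch below (illustrative only, built from pci_dev_resource_n() and
pci_next_resource_idx() shown above; dev is a struct pci_dev *):

	int i;
	struct resource *res;

	/* walk the standard BARs of dev: -1 primes the search and a
	 * negative return value terminates it */
	for (i = pci_next_resource_idx(-1, PCI_STD_RES); i >= 0;
	     i = pci_next_resource_idx(i, PCI_STD_RES)) {
		res = pci_dev_resource_n(dev, i);
		/* ... use res ... */
	}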

Re: [UPDATE][PATCH v2 2/3] resource: Add release_mem_region_adjustable()

2013-04-08 Thread Ram Pai
On Mon, Apr 08, 2013 at 03:47:35PM -0600, Toshi Kani wrote:
> Added release_mem_region_adjustable(), which releases a requested
> region from a currently busy memory resource.  This interface
> adjusts the matched memory resource accordingly even if the
> requested region does not match exactly but still fits into it.
> 
> This new interface is intended for memory hot-delete.  During
> bootup, memory resources are inserted from the boot descriptor
> table, such as EFI Memory Table and e820.  Each memory resource
> entry usually covers the whole contiguous memory range.  Memory
> hot-delete request, on the other hand, may target a particular
> range of memory resource, and its size can be much smaller than
> the whole contiguous memory.  Since the existing release interfaces
> like __release_region() require a requested region to be exactly
> matched to a resource entry, they do not allow a partial resource
> to be released.
> 
> There is no change to the existing interfaces since their restriction
> is valid for I/O resources.
> 
> Signed-off-by: Toshi Kani 
> Reviewed-by : Yasuaki Ishimatsu 
> ---
> 
> Added #ifdef CONFIG_MEMORY_HOTPLUG as suggested by Andrew Morton.
> 
> ---
>  include/linux/ioport.h |4 ++
>  kernel/resource.c  |   96 
> 
>  2 files changed, 100 insertions(+)
> 
> diff --git a/include/linux/ioport.h b/include/linux/ioport.h
> index 85ac9b9b..961d4dc 100644
> --- a/include/linux/ioport.h
> +++ b/include/linux/ioport.h
> @@ -192,6 +192,10 @@ extern struct resource * __request_region(struct 
> resource *,
>  extern int __check_region(struct resource *, resource_size_t, 
> resource_size_t);
>  extern void __release_region(struct resource *, resource_size_t,
>   resource_size_t);
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +extern int release_mem_region_adjustable(struct resource *, resource_size_t,
> + resource_size_t);
> +#endif
> 
>  static inline int __deprecated check_region(resource_size_t s,
>   resource_size_t n)
> diff --git a/kernel/resource.c b/kernel/resource.c
> index ae246f9..25b945c 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -1021,6 +1021,102 @@ void __release_region(struct resource *parent, 
> resource_size_t start,
>  }
>  EXPORT_SYMBOL(__release_region);
> 
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +/**
> + * release_mem_region_adjustable - release a previously reserved memory 
> region
> + * @parent: parent resource descriptor
> + * @start: resource start address
> + * @size: resource region size
> + *
> + * This interface is intended for memory hot-delete.  The requested region is
> + * released from a currently busy memory resource.  It adjusts the matched
> + * busy memory resource accordingly even if the requested region does not
> + * match exactly but still fits into.  Existing children of the busy memory
> + * resource must be immutable in this request.
> + *
> + * Note, when the busy memory resource gets split into two entries, the code
> + * assumes that all children remain in the lower address entry for 
> simplicity.
> + * Enhance this logic when necessary.
> + */
> +int release_mem_region_adjustable(struct resource *parent,
> + resource_size_t start, resource_size_t size)
> +{
> + struct resource **p;
> + struct resource *res, *new;
> + resource_size_t end;
> + int ret = -EINVAL;
> +
> + end = start + size - 1;
> + if ((start < parent->start) || (end > parent->end))
> + return ret;
> +
> + p = &parent->child;
> + write_lock(&resource_lock);
> +
> + while ((res = *p)) {
> + if (res->start >= end)
> + break;
> +
> + /* look for the next resource if it does not fit into */
> + if (res->start > start || res->end < end) {
> + p = &res->sibling;
> + continue;
> + }

What if the resource only partially overlaps? In other words, what if
res->start > start but res->end > end?

Also, do you handle the case where the range spans multiple adjacent
resources (say two busy entries [A..B] and [B+1..C], with a request
covering [A..C])?

-- 
Ram Pai



Re: [PATCH v3 2/3] resource: Add release_mem_region_adjustable()

2013-04-24 Thread Ram Pai
On Thu, Apr 11, 2013 at 10:30:02AM -0600, Toshi Kani wrote:
> On Wed, 2013-04-10 at 15:24 -0700, Andrew Morton wrote:
> > On Wed, 10 Apr 2013 15:08:29 -0700 (PDT) David Rientjes 
> >  wrote:
> > 
> > > On Wed, 10 Apr 2013, Toshi Kani wrote:
> > > 
> > > > > I'll switch it to GFP_ATOMIC.  Which is horridly lame but the
> > > > > allocation is small and alternatives are unobvious.
> > > > 
> > > > Great!  Again, thanks for the update!
> > > 
> > > release_mem_region_adjustable() allocates at most one struct resource, so 
> > > why not do kmalloc(sizeof(struct resource), GFP_KERNEL) before taking 
> > > resource_lock and then testing whether it's NULL or not when splitting?  
> > > It unnecessarily allocates memory when there's no split, but 
> > > __remove_pages() shouldn't be a hotpath.
> > 
> > yup.
> > 
> > --- a/kernel/resource.c~resource-add-release_mem_region_adjustable-fix-fix
> > +++ a/kernel/resource.c
> > @@ -1046,7 +1046,8 @@ int release_mem_region_adjustable(struct
> > resource_size_t start, resource_size_t size)
> >  {
> > struct resource **p;
> > -   struct resource *res, *new;
> > +   struct resource *res;
> > +   struct resource *new_res;
> > resource_size_t end;
> > int ret = -EINVAL;
> >  
> > @@ -1054,6 +1055,9 @@ int release_mem_region_adjustable(struct
> > if ((start < parent->start) || (end > parent->end))
> > return ret;
> >  
> > +   /* The kzalloc() result gets checked later */
> > +   new_res = kzalloc(sizeof(struct resource), GFP_KERNEL);
> > +
> > p = &parent->child;
> > write_lock(&resource_lock);
> >  
> > @@ -1091,32 +1095,33 @@ int release_mem_region_adjustable(struct
> > start - res->start);
> > } else {
> > /* split into two entries */
> > -   new = kzalloc(sizeof(struct resource), GFP_ATOMIC);
> > -   if (!new) {
> > +   if (!new_res) {
> > ret = -ENOMEM;
> > break;
> > }
> > -   new->name = res->name;
> > -   new->start = end + 1;
> > -   new->end = res->end;
> > -   new->flags = res->flags;
> > -   new->parent = res->parent;
> > -   new->sibling = res->sibling;
> > -   new->child = NULL;
> > +   new_res->name = res->name;
> > +   new_res->start = end + 1;
> > +   new_res->end = res->end;
> > +   new_res->flags = res->flags;
> > +   new_res->parent = res->parent;
> > +   new_res->sibling = res->sibling;
> > +   new_res->child = NULL;
> >  
> > ret = __adjust_resource(res, res->start,
> > start - res->start);
> > if (ret) {
> > -   kfree(new);
> > +   kfree(new_res);
> > break;
> > }
> 
> The kfree() in the if-statement above is not necessary since kfree() is
> called before the return at the end.  That is, the if-statement needs to
> be:
>   if (ret)
>   break;
> 
> With this change, I confirmed that all my test cases passed (with all
> the config debug options this time :).  With the change:
> 
> Reviewed-by: Toshi Kani 

I am not comfortable with the assumption that, when a split takes
place, the children remain in the lower entry. A warning to that
effect would help quickly nail down the problem if such a case is
ever encountered.
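
For instance, something along these lines after the split (a sketch of the
suggested warning, not a tested patch):

	/* every child must still fall inside the shrunken lower entry */
	struct resource *child;

	for (child = res->child; child; child = child->sibling)
		WARN_ON(child->end > res->end);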

Otherwise this looks fine. Sorry for the delayed reply. Was out.

Reviewed-by: Ram Pai 



Re: [PATCH v3 03/27] PCI: pci resource iterator

2013-04-24 Thread Ram Pai
On Wed, Apr 10, 2013 at 09:22:48AM -0600, Bjorn Helgaas wrote:
> On Mon, Apr 8, 2013 at 10:51 PM, Ram Pai  wrote:
> > On Thu, Apr 04, 2013 at 04:18:01PM -0600, Bjorn Helgaas wrote:
> >> On Wed, Mar 13, 2013 at 5:27 PM, Yinghai Lu  wrote:
> >> > From: Ram Pai 
> >> >
> >> > Currently pci_dev structure holds an array of 17 PCI resources; six base
> >> > BARs, one ROM BAR, four BRIDGE BARs, six sriov BARs.  This is wasteful.
> >> > A bridge device just needs the 4 bridge resources. A non-bridge device
> >> > just needs the six base resources and one ROM resource. The sriov
> >> > resources are needed only if the device has SRIOV capability.
> >> >
> >> > The pci_dev structure needs to be re-organized to avoid unnecessary
> >> > bloating.  However too much code outside the pci-bus driver, assumes the
> >> > internal details of the pci_dev structure, thus making it hard to
> >> > re-organize the datastructure.
> >> >
> >> > As a first step this patch provides generic methods to access the
> >> > resource structure of the pci_dev.
> >> >
> >> > Finally we can re-organize the resource structure in the pci_dev
> >> > structure and correspondingly update the methods.
> >> >
> >> > -v2: Consolidated iterator interface as per Bjorn's suggestion.
> >> > -v3: Add the idx back - Yinghai Lu
> >> > -v7: Change to use bitmap for searching - Yinghai Lu
> >> > -v8: Fix acpiphp module compiling error that is found by
> >> > Steven Newbury  - Yinghai Lu
> >> >
> >> > Signed-off-by: Ram Pai 
> >> > Signed-off-by: Yinghai Lu 
> >> > ---
> >> >  drivers/pci/probe.c |   48 
> >> > 
> >> >  include/linux/pci.h |   24 
> >> >  2 files changed, 72 insertions(+)
> >> >
> >> > diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
> >> > index 1df75f7..ac751a6 100644
> >> > --- a/drivers/pci/probe.c
> >> > +++ b/drivers/pci/probe.c
> >> > @@ -123,6 +123,54 @@ int pci_dev_resource_idx(struct pci_dev *dev, 
> >> > struct resource *res)
> >> > return -1;
> >> >  }
> >> >
> >> > +static void __init_res_idx_mask(unsigned long *mask, int flag)
> >> > +{
> >> > +   bitmap_zero(mask, PCI_NUM_RESOURCES);
> >> > +   if (flag & PCI_STD_RES)
> >> > +   bitmap_set(mask, PCI_STD_RESOURCES,
> >> > +   PCI_STD_RESOURCE_END - PCI_STD_RESOURCES + 1);
> >> > +   if (flag & PCI_ROM_RES)
> >> > +   bitmap_set(mask, PCI_ROM_RESOURCE, 1);
> >> > +#ifdef CONFIG_PCI_IOV
> >> > +   if (flag & PCI_IOV_RES)
> >> > +   bitmap_set(mask, PCI_IOV_RESOURCES,
> >> > +   PCI_IOV_RESOURCE_END - PCI_IOV_RESOURCES + 1);
> >> > +#endif
> >> > +   if (flag & PCI_BRIDGE_RES)
> >> > +   bitmap_set(mask, PCI_BRIDGE_RESOURCES,
> >> > +   PCI_BRIDGE_RESOURCE_END - PCI_BRIDGE_RESOURCES + 
> >> > 1);
> >> > +}
> >> > +
> >> > +static DECLARE_BITMAP(res_idx_mask[1 << PCI_RES_BLOCK_NUM], 
> >> > PCI_NUM_RESOURCES);
> >> > +static int __init pci_res_idx_mask_init(void)
> >> > +{
> >> > +   int i;
> >> > +
> >> > +   for (i = 0; i < (1 << PCI_RES_BLOCK_NUM); i++)
> >> > +   __init_res_idx_mask(res_idx_mask[i], i);
> >> > +
> >> > +   return 0;
> >> > +}
> >> > +postcore_initcall(pci_res_idx_mask_init);
> >> > +
> >> > +static inline unsigned long *get_res_idx_mask(int flag)
> >> > +{
> >> > +   return res_idx_mask[flag & ((1 << PCI_RES_BLOCK_NUM) - 1)];
> >> > +}
> >> > +
> >> > +int pci_next_resource_idx(int i, int flag)
> >> > +{
> >> > +   i++;
> >> > +   if (i < PCI_NUM_RESOURCES)
> >> > +   i = find_next_bit(get_res_idx_mask(flag), 
> >> > PCI_NUM_RESOURCES, i);
> >> > +
> >> > +   if (i < PCI_NUM_RESOURCES)
> >> > +   return i;
> >> > +
> >> > +   return -1;
> >

Re: [PATCH v3 03/27] PCI: pci resource iterator

2013-04-27 Thread Ram Pai
On Thu, Apr 25, 2013 at 11:22:59AM -0600, Bjorn Helgaas wrote:
> On Wed, Apr 24, 2013 at 9:55 PM, Ram Pai  wrote:
> > On Wed, Apr 10, 2013 at 09:22:48AM -0600, Bjorn Helgaas wrote:
> >> On Mon, Apr 8, 2013 at 10:51 PM, Ram Pai  wrote:
> >> > On Thu, Apr 04, 2013 at 04:18:01PM -0600, Bjorn Helgaas wrote:
> >> >> On Wed, Mar 13, 2013 at 5:27 PM, Yinghai Lu  wrote:
> >> >> > From: Ram Pai 
> >> >> >
> >> >> > Currently pci_dev structure holds an array of 17 PCI resources; six 
> >> >> > base
> >> >> > BARs, one ROM BAR, four BRIDGE BARs, six sriov BARs.  This is 
> >> >> > wasteful.
> >> >> > A bridge device just needs the 4 bridge resources. A non-bridge device
> >> >> > just needs the six base resources and one ROM resource. The sriov
> >> >> > resources are needed only if the device has SRIOV capability.
> >> >> >
> ...
> >> > I agree. There are two cleanups needed.
> >> >
> >> > a) pci drivers should not assume the internal organization of the
> >> > resources in the struct pci_dev.
> >>
> >> Do you mean that drivers should not use "pci_dev->resource[i]"?  If
> >> so, I agree that it would be great if we had an accessor for BARs, but
> >> it seems impractical to change all the drivers that use the current
> >> style.
> >
> > Sorry for the delay. Was vacationing.  I mean, we cannot let drivers
> > assume anything about how the resources are organized.
> >
> > The only thing the drivers should know is that there are 6 normal
> > resources, 4 bridge resources, 1 ROM resource and 6 iov resources.
> >
> > Currently the drivers assume that the ROM resource follows the normal
> > resources, followed by IOV, followed by bridge. These assumptions make it hard
> > to re-organize the layout of resources in struct pci_dev.
> >
> > I think we need to expose the following interfaces to drivers.
> >
> > a) return the nth normal resource
> 
> I think this needs to remain "pci_dev->resource[n]", because so many
> drivers do this that it would be impractical to change them all.

Scanning through the entire kernel tree, I did find about 40 different
drivers that are accessing pci_dev->resource[n]. These drivers can
be changed to use the newer interface. Out-of-tree drivers can continue
to access it directly, but they will break when the
datastructure is eventually re-organized.

I was thinking of an interface, something like

pci_get_std_resource(dev,i) which is implemented internally as

#define pci_get_std_resource(dev,i) dev->resource[i]

> 
> > b) return the nth iov resource
> 
> I could imagine a new interface for this, given that I only see a
> dozen SR-IOV drivers in the tree.  There might be a few out-of-tree,
> but there probably aren't many.
> 
> > c) return the rom resource
> 
> There are only about 30 drivers in the tree that reference
> PCI_ROM_RESOURCE.  Fewer than I expected, but I'd still be hesitant
> about make "pci_dev->resource[PCI_ROM_RESOURCE]" stop working.

It will keep working until the datastructure is eventually re-organized.

Again the interface will be something like

pci_get_rom_resource(dev) which is implemented internally as

#define pci_get_rom_resource(dev)  dev->resource[PCI_ROM_RESOURCE]

> 
> > d) return the nth bridge resource
> 
> I think it's reasonable to have a new interface for this because
> bridges are handled almost entirely in the PCI core and architecture
> code, and I doubt there are many, if any, drivers that care.
> 
> > e) return the type and index of a given resource, where 'index' is
> > the index w.r.t. that resource type; not w.r.t. all
> > the resources of the device.
> > f) ability to loop through all resources of the given type/types.
> 
> We do loop through resources in the core when we're assigning, fixing
> up, etc., and that makes some sense to me.  But I actually don't see
> the use case for *drivers* to loop through resources.  All a driver
> knows is "BAR X means Y", and it generally doesn't need to iterate and
> do something generic to all of them.

Yes mostly true. However I have seen a couple of drivers looping through
the resources. An example is ..

yenta_free_resources()

RP



Re: [PATCH] resource: make sure requested range intersects root range

2012-07-10 Thread Ram Pai
On Tue, Jul 10, 2012 at 02:33:48PM -0700, Andrew Morton wrote:
> On Sat, 30 Jun 2012 15:00:57 +0300
> Octavian Purdila  wrote:
> 
> > When the requested and root ranges do not intersect the logic in
> > __reserve_region_with_split will cause an infinite recursion which
> > will overflow the stack as seen in the warning below.
> > 
> > This particular stack overflow was caused by requesting the
> > (100000000-107ffffff) range while the root range was (0-ffffffff). In
> > this case __request_resource would return the whole root range as
> > conflict range (i.e. 0-ffffffff). Then, the logic in
> > __reserve_region_with_split would continue the recursion requesting
> > the new range as (conflict->end+1, end) which incidentally in this
> > case equals the originally requested range.
> > 
> > This patch aborts looking for a usable range when the requested one is
> > completely outside the root range to avoid the infinite recursion, and
> > since this indicates a problem in the layers above, it also prints an
> > error message indicating the requested and root range in order to make
> > the problem more easily traceable.
> 
> I think we should also emit a stack trace so the faulty caller can be
> pinpointed.
> 
> > ...
> >
> > --- a/kernel/resource.c
> > +++ b/kernel/resource.c
> > @@ -789,7 +789,13 @@ void __init reserve_region_with_split(struct resource 
> > *root,
> > const char *name)
> >  {
> > write_lock(&resource_lock);
> > -   __reserve_region_with_split(root, start, end, name);
> > +   if (start > root->end || end < root->start)
> > +   pr_err("Requested range (0x%llx-0x%llx) not in root range 
> > (0x%llx-0x%llx)\n",
> > +  (unsigned long long)start, (unsigned long long)end,
> > +  (unsigned long long)root->start,
> > +  (unsigned long long)root->end);
> > +   else
> > +   __reserve_region_with_split(root, start, end, name);
> > write_unlock(&resource_lock);
> >  }
> 
> The fancy way of doing that is
> 
>   if (!WARN(start > root->end || end < root->start),
> "Requested range (0x%llx-0x%llx) not in root range 
> (0x%llx-0x%llx)\n",
>  (unsigned long long)start, (unsigned long long)end,
>  (unsigned long long)root->start,
>  (unsigned long long)root->end)
>   __reserve_region_with_split(root, start, end, name);
> 
> but that's quite the eyesore.  How about doing it the simple way?
> 
> --- 
> a/kernel/resource.c~resource-make-sure-requested-range-intersects-root-range-fix
> +++ a/kernel/resource.c
> @@ -792,13 +792,15 @@ void __init reserve_region_with_split(st
>   const char *name)
>  {
>   write_lock(&resource_lock);
> - if (start > root->end || end < root->start)
> + if (start > root->end || end < root->start) {
>   pr_err("Requested range (0x%llx-0x%llx) not in root range 
> (0x%llx-0x%llx)\n",
>  (unsigned long long)start, (unsigned long long)end,
>  (unsigned long long)root->start,
>  (unsigned long long)root->end);
> - else
> + dump_stack();
> + } else {
>   __reserve_region_with_split(root, start, end, name);
> + }

Wait.. I am not sure this will fix the problem entirely. The above check
will handle the case where the range requested is entirely out of the
root's range.  But if the requested range overlaps that of the root
range, we will still call __reserve_region_with_split() and end up with
a recursion if there is an overflow. Won't we?
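
To make the failure mode concrete, here is a minimal user-space model of
that runaway recursion (illustrative only, not the kernel code): the
conflict returned by __request_resource() is the whole root range, and
the request starts exactly at root->end + 1.

    #include <stdio.h>
    #include <stdint.h>

    static void reserve_split(uint64_t root_end, uint64_t start,
                              uint64_t end, int depth)
    {
        /* __request_resource() hands back the whole root as the conflict */
        uint64_t conflict_end = root_end;

        if (depth > 3) {
            printf("... and so on, until the stack overflows\n");
            return;
        }
        printf("depth %d: request [0x%llx-0x%llx]\n", depth,
               (unsigned long long)start, (unsigned long long)end);
        if (conflict_end < end)
            /* conflict_end + 1 == start again: the request never shrinks */
            reserve_split(root_end, conflict_end + 1, end, depth + 1);
    }

    int main(void)
    {
        /* root is [0, 0xffff]; the request starts at root_end + 1 */
        reserve_split(0xffff, 0x10000, 0x17fff, 0);
        return 0;
    }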


>   write_unlock(&resource_lock);
>  }
> 
RP

-- 
Ram Pai



Re: [PATCH] resource: make sure requested range intersects root range

2012-07-11 Thread Ram Pai
On Wed, Jul 11, 2012 at 02:06:10PM +0300, Purdila, Octavian wrote:
> On Wed, Jul 11, 2012 at 5:09 AM, Ram Pai  wrote:
> 
> >
> > Wait.. I am not sure this will fix the problem entirely. The above check
> > will handle the case where the range requested is entirey out of the
> > root's range.  But if the requested range overlapps that of the root
> > range, we will still call __reserve_region_with_split() and end up with
> > a recursion if there is a overflow. Wont we?
> >
> 
> Good catch. I will fix this as well as address Andrew's and Joe's
> comments in a new patch. The only question is how to handle the
> overlap case:
> 
> (a) abort the whole request or
> 
> (b) try to reserve the part that overlaps (and adjust the request to
> avoid the overflow)
> 
> I think (b) is more in line with the current implementation for reservations.


I prefer (b).  The following patch should handle that.

diff --git a/kernel/resource.c b/kernel/resource.c
index e1d2b8e..dd87fde 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -780,6 +780,10 @@ static void __init __reserve_region_with_split(struct 
resource *root,
 
if (conflict->start > start)
__reserve_region_with_split(root, start, conflict->start-1, 
name);
+
+   if (conflict->end == parent->end )
+   return;
+
if (conflict->end < end)
__reserve_region_with_split(root, conflict->end+1, end, name);
 }
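
To spell out why that check terminates the recursion, here is a symbolic
trace (illustrative, assuming root = [0, R] and a request [R+1, E] lying
entirely above it):

    /*
     * conflict = [0, R]             (the whole root is returned)
     * conflict->start > start?  no  (0 is not greater than R+1)
     * conflict->end < end?      yes -> recurse on (conflict->end+1, E)
     *                                  == (R+1, E), the original request
     *
     * With the extra check, conflict->end == parent->end (== R) makes
     * the function bail out instead of recursing forever, since nothing
     * beyond the root can ever be reserved.
     */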

RP



Re: [PATCH] resource: make sure requested range intersects root range

2012-07-11 Thread Ram Pai
On Wed, Jul 11, 2012 at 06:26:49PM +0300, Purdila, Octavian wrote:
> On Wed, Jul 11, 2012 at 5:54 PM, Ram Pai  wrote:
> > On Wed, Jul 11, 2012 at 02:06:10PM +0300, Purdila, Octavian wrote:
> >> On Wed, Jul 11, 2012 at 5:09 AM, Ram Pai  wrote:
> >>
> >> >
> >> > Wait.. I am not sure this will fix the problem entirely. The above check
> >> > will handle the case where the range requested is entirey out of the
> >> > root's range.  But if the requested range overlapps that of the root
> >> > range, we will still call __reserve_region_with_split() and end up with
> >> > a recursion if there is a overflow. Wont we?
> >> >
> >>
> >> Good catch. I will fix this as well as address Andrew's and Joe's
> >> comments in a new patch. The only question is how to handle the
> >> overlap case:
> >>
> >> (a) abort the whole request or
> >>
> >> (b) try to reserve the part that overlaps (and adjust the request to
> >> avoid the overflow)
> >>
> >> I think (b) is more in line with the current implementation for 
> >> reservations.
> >
> >
> > I prefer (b).  following patch should handle that.
> >
> > diff --git a/kernel/resource.c b/kernel/resource.c
> > index e1d2b8e..dd87fde 100644
> > --- a/kernel/resource.c
> > +++ b/kernel/resource.c
> > @@ -780,6 +780,10 @@ static void __init __reserve_region_with_split(struct 
> > resource *root,
> >
> > if (conflict->start > start)
> > __reserve_region_with_split(root, start, conflict->start-1, 
> > name);
> > +
> > +   if (conflict->end == parent->end )
> > +   return;
> > +
> > if (conflict->end < end)
> > __reserve_region_with_split(root, conflict->end+1, end, 
> > name);
> >  }
> >
> 
> I don't think this covers all cases, e.g. if root range starts
> somewhere above 0 and the request is below the root start point.

__reserve_region_with_split() is expected to reserve all of the available
requested range within the root's range. Correct?

If that is the case, the above patch will reserve the range from the
start of the root's range to the request's end; in other words, whatever
overlaps and is available. No?

> 
> What about something like below? It is maybe too verbose, but it
> should make it easier to find the offender.
> 
> diff --git a/kernel/resource.c b/kernel/resource.c
> index e1d2b8e..0d71983 100644
> --- a/kernel/resource.c
> +++ b/kernel/resource.c
> @@ -788,8 +788,29 @@ void __init reserve_region_with_split(struct
> resource *root,
>   resource_size_t start, resource_size_t end,
>   const char *name)
>  {
> + int abort = 0;
> +
>   write_lock(&resource_lock);
> - __reserve_region_with_split(root, start, end, name);
> + if (!(root->start >= start && root->end >= end)) {

This is checking whether the request overlaps the beginning of
the root's range?


> + pr_err("Requested range (0x%llx-0x%llx) not in root %pr\n",
> +(unsigned long long)start, (unsigned long long)end,
> +root);
> + if (start > root->end || end < root->start) {

and here it is checking whether the requested range has no overlap with the
root's range, which will always be false.


> + abort = 1;
> + pr_err("Unable to fix request, aborting\n");
> + } else {
> + if (end > root->end)
> + end = root->end;
> + else if (start < root->start)
> + start = root->start;
> + pr_err("Request trimmed to (0x%llx-0x%llx)\n",
> +(unsigned long long)start,
> +(unsigned long long)end);

Yes it is too verbose :), and feels wrong.

> + }
> + dump_stack();
> + }
> + if (!abort)
> + __reserve_region_with_split(root, start, end, name);
>   write_unlock(&resource_lock);
>  }

I think your original patch with Andrew's modification and my above
proposal should solve the problem. 

RP



Re: [PATCH] resource: make sure requested range intersects root range

2012-07-12 Thread Ram Pai
On Thu, Jul 12, 2012 at 10:02:06AM +0800, Ram Pai wrote:
> On Wed, Jul 11, 2012 at 06:26:49PM +0300, Purdila, Octavian wrote:
> > On Wed, Jul 11, 2012 at 5:54 PM, Ram Pai  wrote:
> > > On Wed, Jul 11, 2012 at 02:06:10PM +0300, Purdila, Octavian wrote:
> > >> On Wed, Jul 11, 2012 at 5:09 AM, Ram Pai  wrote:
> > >>
> > >> >
> > >> > Wait.. I am not sure this will fix the problem entirely. The above 
> > >> > check
> > >> > will handle the case where the range requested is entirey out of the
> > >> > root's range.  But if the requested range overlapps that of the root
> > >> > range, we will still call __reserve_region_with_split() and end up with
> > >> > a recursion if there is a overflow. Wont we?
> > >> >
> > >>
> > >> Good catch. I will fix this as well as address Andrew's and Joe's
> > >> comments in a new patch. The only question is how to handle the
> > >> overlap case:
> > >>
> > >> (a) abort the whole request or
> > >>
> > >> (b) try to reserve the part that overlaps (and adjust the request to
> > >> avoid the overflow)
> > >>
> > >> I think (b) is more in line with the current implementation for 
> > >> reservations.
> > >
> > >
> > > I prefer (b).  following patch should handle that.
> > >
> > > diff --git a/kernel/resource.c b/kernel/resource.c
> > > index e1d2b8e..dd87fde 100644
> > > --- a/kernel/resource.c
> > > +++ b/kernel/resource.c
> > > @@ -780,6 +780,10 @@ static void __init 
> > > __reserve_region_with_split(struct resource *root,
> > >
> > > if (conflict->start > start)
> > > __reserve_region_with_split(root, start, 
> > > conflict->start-1, name);
> > > +
> > > +   if (conflict->end == parent->end )
> > > +   return;
> > > +
> > > if (conflict->end < end)
> > > __reserve_region_with_split(root, conflict->end+1, end, 
> > > name);
> > >  }
> > >
> > 
> > I don't think this covers all cases, e.g. if root range starts
> > somewhere above 0 and the request is below the root start point.
> 

Ok. I see your point.  Here is a proposal that incorporates the best of
all the proposals so far.


diff --git a/kernel/resource.c b/kernel/resource.c
index e1d2b8e..c6f4958 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -789,7 +789,19 @@ void __init reserve_region_with_split(struct resource 
*root,
const char *name)
 {
write_lock(&resource_lock);
-   __reserve_region_with_split(root, start, end, name);
+   if (start > root->end || end < root->start) {
+   pr_err("Requested range (0x%llx-0x%llx) not in root range 
(0x%llx-0x%llx)\n",
+  (unsigned long long)start, (unsigned long long)end,
+  (unsigned long long)root->start,
+  (unsigned long long)root->end);
+   dump_stack();
+   } else {
+   if (start < root->start)
+   start = root->start;
+   if (end > root->end)
+   end = root->end;
+   __reserve_region_with_split(root, start, end, name);
+   }
write_unlock(&resource_lock);
 }
 

Of course, it does not warn when the request is partially out of the root's
range. But that should be OK, because it's still a valid request.
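
For illustration, assuming root = [0x1000, 0x1fff], the patched
reserve_region_with_split() behaves like this:

    request [0x0800, 0x17ff] -> trimmed to [0x1000, 0x17ff], then reserved
    request [0x1800, 0x2fff] -> trimmed to [0x1800, 0x1fff], then reserved
    request [0x2000, 0x2fff] -> no intersection: pr_err() plus dump_stack()
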
RP



[RFC-2 PATCH 2/8] shared subtree

2005-07-18 Thread Ram Pai
 Adds the ability to unclone a vfs tree. An uncloned vfs tree will not be
 clonable, and hence cannot be bind/rbind mounted onto any other mountpoint.
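
A minimal user-space sketch of how the flag would be exercised
(illustrative only; MS_UNCLONE and its value are taken from this patch,
and the operation corresponds to what eventually became MS_UNBINDABLE
in mainline):

    #include <stdio.h>
    #include <sys/mount.h>

    #ifndef MS_UNCLONE
    #define MS_UNCLONE (1 << 17)    /* from this patch's fs.h change */
    #endif

    int main(void)
    {
        /* recursively mark the tree under /mnt unclonable; /mnt is
         * assumed to be a mountpoint, and CAP_SYS_ADMIN is required */
        if (mount("none", "/mnt", NULL, MS_UNCLONE | MS_REC, NULL) != 0)
            perror("mount MS_UNCLONE");
        return 0;
    }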

 RP


 fs/namespace.c|   15 ++-
 include/linux/fs.h|1 +
 include/linux/mount.h |   15 +++
 3 files changed, 30 insertions(+), 1 deletion(-)

Index: 2.6.12.work1/fs/namespace.c
===
--- 2.6.12.work1.orig/fs/namespace.c
+++ 2.6.12.work1/fs/namespace.c
@@ -678,6 +678,14 @@ static int do_make_private(struct vfsmou
 	return 0;
 }
 
+static int do_make_unclone(struct vfsmount *mnt)
+{
+	if(mnt->mnt_pnode)
+		pnode_disassociate_mnt(mnt);
+	set_mnt_unclone(mnt);
+	return 0;
+}
+
 /*
  * recursively change the type of the mountpoint.
  */
@@ -687,6 +695,7 @@ static int do_change_type(struct nameida
 	int err=0;
 
 	if (!(flag & MS_SHARED) && !(flag & MS_PRIVATE)
+			&& !(flag & MS_UNCLONE)
 			&& !(flag & MS_SLAVE))
 		return -EINVAL;
 
@@ -705,6 +714,9 @@ static int do_change_type(struct nameida
 		case MS_PRIVATE:
 			err = do_make_private(m);
 			break;
+		case MS_UNCLONE:
+			err = do_make_unclone(m);
+			break;
 		}
 	}
 	spin_unlock(&vfsmount_lock);
@@ -1145,7 +1157,8 @@ long do_mount(char * dev_name, char * di
 data_page);
 	else if (flags & MS_BIND)
 		retval = do_loopback(&nd, dev_name, flags & MS_REC);
-	else if (flags & MS_SHARED || flags & MS_PRIVATE || flags & MS_SLAVE)
+	else if (flags & MS_SHARED || flags & MS_UNCLONE ||
+			flags & MS_PRIVATE || flags & MS_SLAVE)
 		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&nd, dev_name);
Index: 2.6.12.work1/include/linux/fs.h
===
--- 2.6.12.work1.orig/include/linux/fs.h
+++ 2.6.12.work1/include/linux/fs.h
@@ -102,6 +102,7 @@ extern int dir_notify_enable;
 #define MS_MOVE		8192
 #define MS_REC		16384
 #define MS_VERBOSE	32768
+#define MS_UNCLONE	(1<<17) /* recursively change to unclonnable */
 #define MS_PRIVATE	(1<<18) /* recursively change to private */
 #define MS_SLAVE	(1<<19) /* recursively change to slave */
 #define MS_SHARED	(1<<20) /* recursively change to shared */
Index: 2.6.12.work1/include/linux/mount.h
===
--- 2.6.12.work1.orig/include/linux/mount.h
+++ 2.6.12.work1/include/linux/mount.h
@@ -22,15 +22,18 @@
 #define MNT_PRIVATE	0x10  /* if the vfsmount is private, by default it is private*/
 #define MNT_SLAVE	0x20  /* if the vfsmount is a slave mount of its pnode */
 #define MNT_SHARED	0x40  /* if the vfsmount is a slave mount of its pnode */
+#define MNT_UNCLONE	0x80  /* if the vfsmount is unclonable */
 #define MNT_PNODE_MASK	0xf0  /* propogation flag mask */
 
 #define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
 #define IS_MNT_SLAVE(mnt) (mnt->mnt_flags & MNT_SLAVE)
 #define IS_MNT_PRIVATE(mnt) (mnt->mnt_flags & MNT_PRIVATE)
+#define IS_MNT_UNCLONE(mnt) (mnt->mnt_flags & MNT_UNCLONE)
 
 #define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SHARED))
 #define CLEAR_MNT_PRIVATE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_PRIVATE))
 #define CLEAR_MNT_SLAVE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SLAVE))
+#define CLEAR_MNT_UNCLONE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_UNCLONE))
 
 struct vfsmount
 {
@@ -59,6 +62,7 @@ static inline void set_mnt_shared(struct
 	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SHARED;
 	CLEAR_MNT_PRIVATE(mnt);
 	CLEAR_MNT_SLAVE(mnt);
+	CLEAR_MNT_UNCLONE(mnt);
 }
 
 static inline void set_mnt_private(struct vfsmount *mnt)
@@ -66,6 +70,16 @@ static inline void set_mnt_private(struc
 	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_PRIVATE;
 	CLEAR_MNT_SLAVE(mnt);
 	CLEAR_MNT_SHARED(mnt);
+	CLEAR_MNT_UNCLONE(mnt);
+	mnt->mnt_pnode = NULL;
+}
+
+static inline void set_mnt_unclone(struct vfsmount *mnt)
+{
+	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_UNCLONE;
+	CLEAR_MNT_SLAVE(mnt);
+	CLEAR_MNT_SHARED(mnt);
+	CLEAR_MNT_PRIVATE(mnt);
 	mnt->mnt_pnode = NULL;
 }
 
@@ -74,6 +88,7 @@ static inline void set_mnt_slave(struct 
 	mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SLAVE;
 	CLEAR_MNT_PRIVATE(mnt);
 	CLEAR_MNT_SHARED(mnt);
+	CLEAR_MNT_UNCLONE(mnt);
 }
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)


[RFC-2 PATCH 0/8] shared subtree

2005-07-18 Thread Ram Pai
Enclosed are 8 patches that implement the shared subtree functionality
detailed in Al Viro's RFC found at http://lwn.net/Articles/119232/

I have incorporated all the comments received earlier in the first round.
Thanks to Miklos and Pekka for the valuable comments.  Also, I have optimized
lots of code, especially in pnode.c.  The code is unit tested; however, in its
current form it does not handle ENOMEM errors gracefully. I am working on it.

The incremental patches provide the following functionality:

1) shared_private_slave.patch : Provides the ability to mark a subtree as
shared, private, or slave (a user-space sketch follows this list).

2) unclone.patch : provides the ability to mark a subtree as unclonable.  NOTE:
this feature is an addition to Al Viro's RFC, to solve the vfsmount explosion.
The problem is  detailed here:
http://www.ussg.iu.edu/hypermail/linux/kernel/0502.0/0468.html

3) rbind.patch : this patch adds the ability to propagate binds/rbinds across
vfsmounts.

4) move.patch : this patch provides the ability to move a
shared/private/slave/unclonable subtree to some other mount-point. It also
provides the same feature to pivot_root()

5) umount.patch: this patch provides the ability to propagate unmounts.

6) namespace.patch: this patch provides the ability to clone a namespace, with
propagation set up on the vfsmounts in the new namespace.

7) automount.patch: this patch provides automatic propagation for
mounts/unmounts done through the automounter.

8) pnode_opt.patch: this patch optimizes the redundant code in pnode.c .
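
As a concrete companion to (1), marking a subtree shared from user space
would look roughly like this (illustrative sketch; the MS_* values are
the ones these patches introduce):

    #include <stdio.h>
    #include <sys/mount.h>

    #ifndef MS_SHARED
    #define MS_PRIVATE (1 << 18)
    #define MS_SLAVE   (1 << 19)
    #define MS_SHARED  (1 << 20)
    #endif

    int main(void)
    {
        /* recursively mark the tree under /mnt shared, so that later
         * rbinds of it propagate mounts between the copies; /mnt is
         * assumed to be a mountpoint */
        if (mount("none", "/mnt", NULL, MS_SHARED | MS_REC, NULL) != 0)
            perror("make-shared");
        return 0;
    }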

Looking forward to comments,
RP
---

CHANGES DONE IN RESPONSE TO COMMENTS RECEIVED IN 1ST ROUND,


Response to Pekka J Enberg's comments:


>>Inlining the patches to email would be greatly appreciated. Here are
>>some comments.

done

> +int
> +_do_make_mounted(struct nameidata *nd, struct vfsmount **mnt)

>>Use two underscores to follow naming conventions.

Yes, done. In fact this function is renamed make_mounted because
that seemed to make more sense. But in general, throughout the patches,
I have changed all newly introduced functions that started with one
underscore to start with two underscores.


> Index: 2.6.12/fs/pnode.c
> ===
> --- /dev/null
> +++ 2.6.12/fs/pnode.c
> @@ -0,0 +1,362 @@
> +
> +#define PNODE_MEMBER_VFS  0x01
> +#define PNODE_SLAVE_VFS   0x02

>>Enums, please.

done


> +
> +static kmem_cache_t * pnode_cachep;
> +
> +/* spinlock for pnode related operations */
> + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
> +
> +
> +static void
> +pnode_init_fn(void *data, kmem_cache_t *cachep, unsigned long flags)
> +{
> + struct vfspnode *pnode = (struct vfspnode *)data;

>>Redundant cast.

yes. removed.

> + INIT_LIST_HEAD(&pnode->pnode_vfs);
> + INIT_LIST_HEAD(&pnode->pnode_slavevfs);
> + INIT_LIST_HEAD(&pnode->pnode_slavepnode);
> + INIT_LIST_HEAD(&pnode->pnode_peer_slave);
> + pnode->pnode_master = NULL;
> + pnode->pnode_flags = 0;
> + atomic_set(&pnode->pnode_count,0);
> +}
> +
> +void __init
> +pnode_init(unsigned long mempages)
> +{
> + pnode_cachep = kmem_cache_create("pnode_cache",
> +   sizeof(struct vfspnode), 0,
> +   SLAB_HWCACHE_ALIGN|SLAB_PANIC, pnode_init_fn, NULL);
> +}
> +
> +
> +struct vfspnode *
> +pnode_alloc(void)
> +{
> + struct vfspnode *pnode =  (struct vfspnode *)kmem_cache_alloc(
> + pnode_cachep, GFP_KERNEL);

>>Redundant cast.

yes removed.

> +struct inoutdata {

>>Wants a better name.

This data structure is gone after optimizing pnode.c.


> + void *my_data; /* produced and consumed by me */
> + void *in_data; /* produced by master, consumed by slave */
> + void *out_data; /* produced by slave, comsume by master */
> +};
> +
> +struct pcontext {
> + struct vfspnode *start;
> + int flag;
> + int traversal;
> + int level;
> + struct vfspnode *master_pnode;
> + struct vfspnode *pnode;
> + struct vfspnode *slave_pnode;
> +};
> +
> +
> +#define PNODE_UP 1
> +#define PNODE_DOWN 2
> +#define PNODE_MID 3

>>Enums, please.

These #defines are gone after the optimizations and cleanup.

> +
> +/*
> + * Walk the pnode tree for each pnode encountered.  A given pnode in the tree
> + * can be returned a minimum of 2 times.  First time the pnode is 
> encountered,
> + * it is returned with the flag PNODE_DOWN. Every time the pnode is 
> encountered
> + * after having traversed through each of its children, it is returned with 
> the
> + * flag PNODE_MID.  And finally when the pnode is encountered after having
> + * walked all of its children, it is returned with the flag PNODE_UP.
> + *
> + * @context: provides context on the state of the last walk in the pnode
> + *   tree.
> + */
> +static int inline
> +pnode_next(struct pcontext *context)

>>Rather large function to be an inline.

yes. done.

> +{
> + 

[RFC-2 PATCH 7/8] shared subtree

2005-07-18 Thread Ram Pai
Adds support for mount/umount propagation for autofs-initiated operations.
RP


 fs/namespace.c|  151 +-
 fs/pnode.c|   13 ++--
 include/linux/pnode.h |3 
 3 files changed, 61 insertions(+), 106 deletions(-)

Index: 2.6.12.work1/fs/namespace.c
===
--- 2.6.12.work1.orig/fs/namespace.c
+++ 2.6.12.work1/fs/namespace.c
@@ -215,6 +215,9 @@ struct vfsmount *do_attach_prepare_mnt(s
 		if(!(child_mnt = clone_mnt(template_mnt,
 template_mnt->mnt_root)))
 			return NULL;
+		spin_lock(&vfsmount_lock);
+		list_del_init(&child_mnt->mnt_fslink);
+		spin_unlock(&vfsmount_lock);
 	} else
 		child_mnt = template_mnt;
 
@@ -352,38 +355,16 @@ struct seq_operations mounts_op = {
  * open files, pwds, chroots or sub mounts that are
  * busy.
  */
-//TOBEFIXED
 int may_umount_tree(struct vfsmount *mnt)
 {
-	struct list_head *next;
-	struct vfsmount *this_parent = mnt;
-	int actual_refs;
-	int minimum_refs;
+	int actual_refs=0;
+	int minimum_refs=0;
+	struct vfsmount *p;
 
 	spin_lock(&vfsmount_lock);
-	actual_refs = atomic_read(&mnt->mnt_count);
-	minimum_refs = 2;
-repeat:
-	next = this_parent->mnt_mounts.next;
-resume:
-	while (next != &this_parent->mnt_mounts) {
-		struct vfsmount *p = list_entry(next, struct vfsmount, mnt_child);
-
-		next = next->next;
-
+	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += atomic_read(&p->mnt_count);
 		minimum_refs += 2;
-
-		if (!list_empty(&p->mnt_mounts)) {
-			this_parent = p;
-			goto repeat;
-		}
-	}
-
-	if (this_parent != mnt) {
-		next = this_parent->mnt_child.next;
-		this_parent = this_parent->mnt_parent;
-		goto resume;
 	}
 	spin_unlock(&vfsmount_lock);
 
@@ -395,18 +376,18 @@ resume:
 
 EXPORT_SYMBOL(may_umount_tree);
 
-int mount_busy(struct vfsmount *mnt)
+int mount_busy(struct vfsmount *mnt, int refcnt)
 {
 	struct vfspnode *parent_pnode;
 
 	if (mnt == mnt->mnt_parent || !IS_MNT_SHARED(mnt->mnt_parent))
-		return do_refcount_check(mnt, 2);
+		return do_refcount_check(mnt, refcnt);
 
 	parent_pnode = mnt->mnt_parent->mnt_pnode;
 	BUG_ON(!parent_pnode);
 	return pnode_mount_busy(parent_pnode,
 			mnt->mnt_mountpoint,
-			mnt->mnt_root, mnt);
+			mnt->mnt_root, mnt, refcnt);
 }
 
 /**
@@ -424,7 +405,7 @@ int mount_busy(struct vfsmount *mnt)
  */
 int may_umount(struct vfsmount *mnt)
 {
-	if (mount_busy(mnt))
+	if (mount_busy(mnt, 2))
 		return -EBUSY;
 	return 0;
 }
@@ -445,6 +426,25 @@ void do_detach_mount(struct vfsmount *mn
 	spin_lock(&vfsmount_lock);
 }
 
+void umount_mnt(struct vfsmount *mnt)
+{
+	if (mnt->mnt_parent != mnt &&
+		IS_MNT_SHARED(mnt->mnt_parent)) {
+		struct vfspnode *parent_pnode
+			= mnt->mnt_parent->mnt_pnode;
+		BUG_ON(!parent_pnode);
+		pnode_umount(parent_pnode,
+			mnt->mnt_mountpoint,
+			mnt->mnt_root);
+	} else {
+		if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
+			BUG_ON(!mnt->mnt_pnode);
+			pnode_disassociate_mnt(mnt);
+		}
+		do_detach_mount(mnt);
+	}
+}
+
 void umount_tree(struct vfsmount *mnt)
 {
 	struct vfsmount *p;
@@ -459,21 +459,7 @@ void umount_tree(struct vfsmount *mnt)
 		mnt = list_entry(kill.next, struct vfsmount, mnt_list);
 		list_del_init(&mnt->mnt_list);
 		list_del_init(&mnt->mnt_fslink);
-		if (mnt->mnt_parent != mnt &&
-			IS_MNT_SHARED(mnt->mnt_parent)) {
-			struct vfspnode *parent_pnode
-= mnt->mnt_parent->mnt_pnode;
-			BUG_ON(!parent_pnode);
-			pnode_umount(parent_pnode,
-mnt->mnt_mountpoint,
-mnt->mnt_root);
-		} else {
-			if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
-BUG_ON(!mnt->mnt_pnode);
-pnode_disassociate_mnt(mnt);
-			}
-			do_detach_mount(mnt);
-		}
+		umount_mnt(mnt);
 	}
 }
 
@@ -568,7 +554,7 @@ int do_umount(struct vfsmount *mnt, int 
 		spin_lock(&vfsmount_lock);
 	}
 	retval = -EBUSY;
-	if (flags & MNT_DETACH || !mount_busy(mnt)) {
+	if (flags & MNT_DETACH || !mount_busy(mnt, 2)) {
 		if (!list_empty(&mnt->mnt_list))
 			umount_tree(mnt);
 		retval = 0;
@@ -1490,6 +1476,8 @@ void mark_mounts_for_expiry(struct list_
 	if (list_empty(mounts))
 		return;
 
+	down_write(&namespace_sem);
+
 	spin_lock(&vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
@@ -1499,8 +1487,7 @@ void mark_mounts_for_expiry(struct list_
 	 *   cleared by mntput())
 	 */
 	list_for_each_entry_safe(mnt, next, mounts, mnt_fslink) {
-		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
-		atomic_read(&mnt->mnt_count) != 1)
+		if (!xchg(&mnt->mnt_expiry_mark, 1) || mount_busy(mnt, 1))
 			continue;
 
 		mntget(mnt);
@@ -1508,12 +1495,13 @@ void mark_mounts_for_expiry(struct list_
 	}
 
 	/*
-	 * go through the vfsmounts we've just consigned to the graveyard to
-	 * - check that they're still dead
+	 * go through the vfsmounts we've just consigned to the graveyard
 	 * - delete the vfsmount from the appropriate namespace under lock
 	 * - dispose of the corpse
 	 */
 	while (!list_empty(&graveyard)) {
+		struct super_block *sb;
+
 		mnt

[RFC-2 PATCH 8/8] shared subtree

2005-07-18 Thread Ram Pai
	Code optimization for pnode.c.


 fs/pnode.c |  478 -
 1 files changed, 224 insertions(+), 254 deletions(-)

Index: 2.6.12.work1/fs/pnode.c
===
--- 2.6.12.work1.orig/fs/pnode.c
+++ 2.6.12.work1/fs/pnode.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 
+
 enum pnode_vfs_type {
 	PNODE_MEMBER_VFS = 0x01,
 	PNODE_SLAVE_VFS = 0x02
@@ -34,7 +35,7 @@ enum pnode_vfs_type {
 static kmem_cache_t * pnode_cachep;
 
 /* spinlock for pnode related operations */
-  __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
 
 
 void __init pnode_init(unsigned long mempages)
@@ -58,7 +59,7 @@ struct vfspnode * pnode_alloc(void)
 	return pnode;
 }
 
-void pnode_free(struct vfspnode *pnode)
+void inline pnode_free(struct vfspnode *pnode)
 {
 	kmem_cache_free(pnode_cachep, pnode);
 }
@@ -147,7 +148,6 @@ static int pnode_next(struct pcontext *c
 	return 1;
 }
 
-
 static void inline pnode_add_mnt(struct vfspnode *pnode,
 		struct vfsmount *mnt, int slave)
 {
@@ -180,6 +180,111 @@ void pnode_add_slave_mnt(struct vfspnode
 	pnode_add_mnt(pnode, mnt, 1);
 }
 
+/*
+ * traverse the pnode tree and, at each pnode encountered, execute the
+ * pnode_pre_func()/pnode_post_func() callbacks. For each vfsmount
+ * encountered, call vfs_func().
+ *
+ * @pnode: pnode tree to be traversed
+ * @in_data: input data
+ * @out_data: output data
+ * @pnode_pre_func: called the first time a pnode is encountered.
+ * @pnode_post_func: called after the pnode's vfsmounts have been visited.
+ * @vfs_func: called on each slave and member vfs belonging
+ * 		to the pnode.
+ */
+static int pnode_traverse(struct vfspnode *pnode,
+		void *in_data,
+		void **out_data,
+		int (*pnode_pre_func)(struct vfspnode *,
+			void *, void **, va_list),
+		int (*pnode_post_func)(struct vfspnode *,
+			void *, va_list),
+		int (*vfs_func)(struct vfsmount *,
+			enum pnode_vfs_type, void *,  va_list),
+		...)
+{
+	va_list args;
+	int ret = 0, level;
+	void *my_data, *data_from_master;
+ 	struct vfspnode *master_pnode;
+ 	struct vfsmount *slave_mnt, *member_mnt, *t_m;
+	struct pcontext context;
+	static void *p_array[PNODE_MAX_SLAVE_LEVEL];
+
+	context.start = pnode;
+	context.pnode = NULL;
+	/*
+	 * determine whether to process vfs first or the
+	 * slave pnode first
+	 */
+	while (pnode_next(&context)) {
+		level = context.level;
+		pnode = context.pnode;
+		master_pnode = context.master_pnode;
+
+		if (master_pnode) {
+			data_from_master = p_array[level-1];
+			my_data = NULL;
+		} else {
+			data_from_master = NULL;
+			my_data = in_data;
+		}
+
+		if (pnode_pre_func) {
+			va_start(args, vfs_func);
+			if((ret = pnode_pre_func(pnode,
+data_from_master, &my_data, args)))
+goto error;
+			va_end(args);
+		}
+
+		// traverse member vfsmounts
+		spin_lock(&vfspnode_lock);
+		list_for_each_entry_safe(member_mnt,
+			t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
+
+			spin_unlock(&vfspnode_lock);
+			va_start(args, vfs_func);
+			if ((ret = vfs_func(member_mnt,
+PNODE_MEMBER_VFS, my_data, args)))
+goto error;
+			va_end(args);
+			spin_lock(&vfspnode_lock);
+		}
+		list_for_each_entry_safe(slave_mnt, t_m,
+			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+
+			spin_unlock(&vfspnode_lock);
+			va_start(args, vfs_func);
+			if ((ret = vfs_func(slave_mnt, PNODE_SLAVE_VFS,
+my_data, args)))
+goto error;
+			va_end(args);
+			spin_lock(&vfspnode_lock);
+		}
+		spin_unlock(&vfspnode_lock);
+
+		if (pnode_post_func) {
+			va_start(args, vfs_func);
+			if((ret = pnode_post_func(pnode,
+my_data, args)))
+goto error;
+			va_end(args);
+		}
+
+		p_array[level] = my_data;
+	}
+out:
+	if (out_data)
+		*out_data = p_array[0];
+	return ret;
+error:
+	va_end(args);
+	if (out_data)
+		*out_data = NULL;
+	goto out;
+}
+
+
 void pnode_add_slave_pnode(struct vfspnode *pnode,
 		struct vfspnode *slave_pnode)
 {
@@ -219,6 +324,7 @@ void pnode_del_member_mnt(struct vfsmoun
 	CLEAR_MNT_SHARED(mnt);
 }
 
+
 void pnode_disassociate_mnt(struct vfsmount *mnt)
 {
 	if (!mnt)
@@ -228,6 +334,7 @@ void pnode_disassociate_mnt(struct vfsmo
 	CLEAR_MNT_SLAVE(mnt);
 }
 
+
 // merge pnode into peer_pnode and get rid of pnode
 int pnode_merge_pnode(struct vfspnode *pnode, struct vfspnode *peer_pnode)
 {
@@ -268,15 +375,18 @@ int pnode_merge_pnode(struct vfspnode *p
 	 * with some cost.
 	 */
 	for (i=0 ; i mnt_mounts)) {
 		if (origmnt == child_mnt)
 			ret = do_refcount_check(child_mnt, refcnt+1);
-  		else
+		else
 			ret = do_refcount_check(child_mnt, refcnt);
 	}
 	mntput(child_mnt);
@@ -300,52 +410,32 @@ static int vfs_busy(struct vfsmount *mnt
 int pnode_mount_busy(struct vfspnode *pnode, struct dentry *mntpt,
 		struct dentry *root, struct vfsmount *mnt, int refcnt)
 {
-	int ret=0;
- 	struct vfsmount *slave_mnt, *member_mnt, *t_m;
-	struct pcontext context;
-
-	context.start = pnode;
-	context.pnode = NULL;
-	while (pnode_next(&context)) {
-		pnode = context.pnode;
-
-		// tr

[RFC-2 PATCH 3/8] shared subtree

2005-07-18 Thread Ram Pai
Adds the ability to bind/rbind a shared/private/slave subtree and set up
propagation wherever needed.

RP

Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c|  559 --
 fs/pnode.c|  416 +-
 include/linux/dcache.h|2 
 include/linux/fs.h|4 
 include/linux/namespace.h |1 
 include/linux/pnode.h |5 
 6 files changed, 906 insertions(+), 81 deletions(-)

Index: 2.6.12.work1/fs/namespace.c
===
--- 2.6.12.work1.orig/fs/namespace.c
+++ 2.6.12.work1/fs/namespace.c
@@ -42,7 +42,8 @@ static inline int sysfs_init(void)
 
 static struct list_head *mount_hashtable;
 static int hash_mask, hash_bits;
-static kmem_cache_t *mnt_cache; 
+static kmem_cache_t *mnt_cache;
+static struct rw_semaphore namespace_sem;
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -54,7 +55,7 @@ static inline unsigned long hash(struct 
 
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
-	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); 
+	struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
 		memset(mnt, 0, sizeof(struct vfsmount));
 		atomic_set(&mnt->mnt_count,1);
@@ -86,7 +87,8 @@ void free_vfsmnt(struct vfsmount *mnt)
  * Now, lookup_mnt increments the ref count before returning
  * the vfsmount struct.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
+		struct dentry *root)
 {
 	struct list_head * head = mount_hashtable + hash(mnt, dentry);
 	struct list_head * tmp = head;
@@ -99,7 +101,8 @@ struct vfsmount *lookup_mnt(struct vfsmo
 		if (tmp == head)
 			break;
 		p = list_entry(tmp, struct vfsmount, mnt_hash);
-		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+		if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry &&
+(root == NULL || p->mnt_root == root)) {
 			found = mntget(p);
 			break;
 		}
@@ -108,6 +111,37 @@ struct vfsmount *lookup_mnt(struct vfsmo
 	return found;
 }
 
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+{
+	return __lookup_mnt(mnt, dentry, NULL);
+}
+
+static struct vfsmount *
+clone_mnt(struct vfsmount *old, struct dentry *root)
+{
+	struct super_block *sb = old->mnt_sb;
+	struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
+
+	if (mnt) {
+		mnt->mnt_flags = old->mnt_flags;
+		atomic_inc(&sb->s_active);
+		mnt->mnt_sb = sb;
+		mnt->mnt_root = dget(root);
+		mnt->mnt_mountpoint = mnt->mnt_root;
+		mnt->mnt_parent = mnt;
+		mnt->mnt_namespace = old->mnt_namespace;
+		mnt->mnt_pnode = get_pnode(old->mnt_pnode);
+
+		/* stick the duplicate mount on the same expiry list
+		 * as the original if that was on one */
+		spin_lock(&vfsmount_lock);
+		if (!list_empty(&old->mnt_fslink))
+			list_add(&mnt->mnt_fslink, &old->mnt_fslink);
+		spin_unlock(&vfsmount_lock);
+	}
+	return mnt;
+}
+
 static inline int check_mnt(struct vfsmount *mnt)
 {
 	return mnt->mnt_namespace == current->namespace;
@@ -128,11 +162,70 @@ static void attach_mnt(struct vfsmount *
 {
 	mnt->mnt_parent = mntget(nd->mnt);
 	mnt->mnt_mountpoint = dget(nd->dentry);
+	mnt->mnt_namespace = nd->mnt->mnt_namespace;
 	list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
 	list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
 	nd->dentry->d_mounted++;
 }
 
+static struct vfsmount *do_attach_mnt(struct vfsmount *mnt,
+		struct dentry *dentry,
+		struct vfsmount *child_mnt)
+{
+	struct nameidata nd;
+	LIST_HEAD(head);
+
+	nd.mnt = mnt;
+	nd.dentry = dentry;
+	attach_mnt(child_mnt, &nd);
+	list_add_tail(&head, &child_mnt->mnt_list);
+	list_splice(&head, child_mnt->mnt_namespace->list.prev);
+	return child_mnt;
+}
+
+static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
+{
+	mnt->mnt_parent = mntget(nd->mnt);
+	mnt->mnt_mountpoint = dget(nd->dentry);
+	nd->dentry->d_mounted++;
+}
+
+void do_attach_real_mnt(struct vfsmount *mnt)
+{
+	struct vfsmount *parent = mnt->mnt_parent;
+	BUG_ON(parent==mnt);
+	if(list_empty(&mnt->mnt_hash))
+		list_add(&mnt->mnt_hash,
+			mount_hashtable+hash(parent, mnt->mnt_mountpoint));
+	if(list_empty(&mnt->mnt_child))
+		list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
+	mnt->mnt_namespace = parent->mnt_namespace;
+	list_add_tail(&mnt->mnt_list, &mnt->mnt_namespace->list);
+}
+
+struct vfsmount *do_attach_prepare_mnt(struct vfsmount *mnt,
+		struct dentry *dentry,
+		struct vfsmount *template_mnt,
+		int clone_flag)
+{
+	struct vfsmount *child_mnt;
+	struct nameidata nd;
+
+	if (

[RFC-2 PATCH 1/8] shared subtree

2005-07-18 Thread Ram Pai
This patch adds the shared/private/slave support for VFS trees.

Signed by Ram Pai ([EMAIL PROTECTED])

 fs/Makefile   |2 
 fs/dcache.c   |2 
 fs/namespace.c|   98 +++
 fs/pnode.c|  158 ++
 include/linux/fs.h|5 +
 include/linux/mount.h |   44 -
 include/linux/pnode.h |   80 +
 7 files changed, 385 insertions(+), 4 deletions(-)

Index: 2.6.12.work1/fs/namespace.c
===
--- 2.6.12.work1.orig/fs/namespace.c
+++ 2.6.12.work1/fs/namespace.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -62,6 +63,7 @@ struct vfsmount *alloc_vfsmnt(const char
 		INIT_LIST_HEAD(&mnt->mnt_mounts);
 		INIT_LIST_HEAD(&mnt->mnt_list);
 		INIT_LIST_HEAD(&mnt->mnt_fslink);
+		INIT_LIST_HEAD(&mnt->mnt_pnode_mntlist);
 		if (name) {
 			int size = strlen(name)+1;
 			char *newname = kmalloc(size, GFP_KERNEL);
@@ -615,6 +617,100 @@ out_unlock:
 	return err;
 }
 
+static int do_make_shared(struct vfsmount *mnt)
+{
+	int err=0;
+	struct vfspnode *old_pnode = NULL;
+	/*
+	 * if the mount is already a slave mount,
+	 * allocated a new pnode and make it
+	 * a slave pnode of the original pnode.
+	 */
+	if (IS_MNT_SLAVE(mnt)) {
+		old_pnode = mnt->mnt_pnode;
+		pnode_del_slave_mnt(mnt);
+	}
+	if(!IS_MNT_SHARED(mnt)) {
+		mnt->mnt_pnode = pnode_alloc();
+		if(!mnt->mnt_pnode) {
+			pnode_add_slave_mnt(old_pnode, mnt);
+			err = -ENOMEM;
+			goto out;
+		}
+		pnode_add_member_mnt(mnt->mnt_pnode, mnt);
+	}
+	if(old_pnode)
+		pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
+	set_mnt_shared(mnt);
+out:
+	return err;
+}
+
+static int do_make_slave(struct vfsmount *mnt)
+{
+	int err=0;
+	struct vfspnode *old_pnode = NULL;
+
+	if (IS_MNT_SLAVE(mnt))
+		goto out;
+	/*
+	 * only shared mounts can
+	 * be made slave
+	 */
+	if (!IS_MNT_SHARED(mnt)) {
+		err = -EINVAL;
+		goto out;
+	}
+	old_pnode = mnt->mnt_pnode;
+	pnode_del_member_mnt(mnt);
+	pnode_add_slave_mnt(old_pnode, mnt);
+	set_mnt_slave(mnt);
+
+out:
+	return err;
+}
+
+static int do_make_private(struct vfsmount *mnt)
+{
+	if(mnt->mnt_pnode)
+		pnode_disassociate_mnt(mnt);
+	set_mnt_private(mnt);
+	return 0;
+}
+
+/*
+ * recursively change the type of the mountpoint.
+ */
+static int do_change_type(struct nameidata *nd, int flag)
+{
+	struct vfsmount *m, *mnt = nd->mnt;
+	int err=0;
+
+	if (!(flag & MS_SHARED) && !(flag & MS_PRIVATE)
+			&& !(flag & MS_SLAVE))
+		return -EINVAL;
+
+	if (nd->dentry != nd->mnt->mnt_root)
+		return -EINVAL;
+
+	spin_lock(&vfsmount_lock);
+	for (m = mnt; m; m = next_mnt(m, mnt)) {
+		switch (flag) {
+		case MS_SHARED:
+			err = do_make_shared(m);
+			break;
+		case MS_SLAVE:
+			err = do_make_slave(m);
+			break;
+		case MS_PRIVATE:
+			err = do_make_private(m);
+			break;
+		}
+	}
+	spin_unlock(&vfsmount_lock);
+	return err;
+}
+
 /*
  * do loopback mount.
  */
@@ -1049,6 +1145,8 @@ long do_mount(char * dev_name, char * di
 data_page);
 	else if (flags & MS_BIND)
 		retval = do_loopback(&nd, dev_name, flags & MS_REC);
+	else if (flags & MS_SHARED || flags & MS_PRIVATE || flags & MS_SLAVE)
+		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
 		retval = do_move_mount(&nd, dev_name);
 	else
Index: 2.6.12.work1/fs/pnode.c
===
--- /dev/null
+++ 2.6.12.work1/fs/pnode.c
@@ -0,0 +1,158 @@
+/*
+ *  linux/fs/pnode.c
+ *
+ * (C) Copyright IBM Corporation 2005.
+ *	Released under GPL v2.
+ *	Author : Ram Pai ([EMAIL PROTECTED])
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+static kmem_cache_t * pnode_cachep;
+
+/* spinlock for pnode related operations */
+ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
+
+
+void __init pnode_init(unsigned long mempages)
+{
+	pnode_cachep = kmem_cache_create("pnode_cache",
+   sizeof(struct vfspnode), 0,
+   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
+}
+
+
+struct vfspnode * pnode_alloc(void)
+{
+	struct vfspnode *pnode =  kmem_cache_alloc(pnode_cachep, GFP_KERNEL);
+	INIT_LIST_HEAD(&pnode->pnode_vfs);
+	INIT_LIST_HEAD(&pnode->pnode_slavevfs);
+	INIT_LIST_HEAD(&pnode->pnode_slavepnode);
+	INIT_LIST_HEAD(&pnode->pnode_peer_slave);
+	pnode->pnode_master = NULL;
+	pnode->pnode_flags = 0;
+	atomic_set(&pnode->pnode_count,0);
+	return pnode;
+}
+
+void pnode_free(struct vfspnode *pnode)
+{
+	kmem_cache_free(pnode_cachep, pnode);
+}
+
+/*
+ * __put_pnode() should be called with vfspnode_lo

[RFC-2 PATCH 4/8] shared subtree

2005-07-18 Thread Ram Pai
Adds the ability to move a shared/private/slave/unclone tree to any other
shared/private/slave/unclone tree. Also incorporates the same behavior
for pivot_root().

RP


Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c |  150 +++--
 1 files changed, 125 insertions(+), 25 deletions(-)

Index: 2.6.12.work1/fs/namespace.c
===
--- 2.6.12.work1.orig/fs/namespace.c
+++ 2.6.12.work1/fs/namespace.c
@@ -664,9 +664,12 @@ static struct vfsmount *copy_tree(struct
 	return NULL;
 }
 
+
  /*
  *  @source_mnt : mount tree to be attached
  *  @nd		: place the mount tree @source_mnt is attached
+ *  @move	: use the move semantics if set, else use normal attach semantics
+ *as explained below
  *
 *  NOTE: the table below explains the semantics when a source vfsmount
 *  of a given type is attached to a destination vfsmount of a given type.
@@ -699,16 +702,44 @@ static struct vfsmount *copy_tree(struct
  * (+)  the mount will be propogated to the destination vfsmount
  *	  and the new mount will be added to the source vfsmount's pnode.
  *
+ *
+ *  -
+ *  |MOVE MOUNT OPERATION			|
+ *  |***|
+ *  |  dest --> | shared	|	private	 |  slave   |unclonable	|
+ *  | source	|		|   	 |   	|	|
+ *  |   |   	|		|   	 |   	|	|
+ *  |   v 	|		|   	 |   	|	|
+ *  |***|
+ *  |	 	|		|   	 |   	|	|
+ *  |  shared	| shared (++) 	|  shared (+)|shared (+)| shared (+)|
+ *  |		|		|   	 |   	|	|
+ *  |		|		|   	 |   	|	|
+ *  | private	| shared (+)	|  private	 | private  | private  	|
+ *  |		|		|   	 |   	|	|
+ *  |		|		|   	 |   	|	|
+ *  | slave	| shared (+++)	|  slave | slave| slave  	|
+ *  |		|		|   	 |   	|	|
+ *  |		|		|   	 |   	|	|
+ *  | unclonable| unclonable	| unclonable |unclonable| unclonable|
+ *  |		|		|   	 |   	|	|
+ *  |		|		|   	 |   	|	|
+ *   
+ *
+ * (+++)  the mount will be propogated to all the vfsmounts in the pnode tree
+ *	  of the destination vfsmount, and all the new mounts will be
+ *	  added to a new pnode , which will be a slave pnode of the
+ *	  source vfsmount's pnode.
+ *
  * if the source mount is a tree, the operations explained above is
- * applied to each
- * vfsmount in the tree.
+ * applied to each vfsmount in the tree.
  *
  * Should be called without spinlocks held, because this function can sleep
  * in allocations.
  *
   */
 static int attach_recursive_mnt(struct vfsmount *source_mnt,
-		struct nameidata *nd)
+		struct nameidata *nd, int move)
 {
 	struct vfsmount *mntpt_mnt, *m, *p;
 	struct vfspnode *src_pnode, *t_p, *dest_pnode, *tmp_pnode;
@@ -718,7 +749,9 @@ static int attach_recursive_mnt(struct v
 
 	mntpt_mnt = nd->mnt;
 	dest_pnode = IS_MNT_SHARED(mntpt_mnt) ? mntpt_mnt->mnt_pnode : NULL;
-	src_pnode = IS_MNT_SHARED(source_mnt) ? source_mnt->mnt_pnode : NULL;
+	src_pnode = IS_MNT_SHARED(source_mnt) ||
+		(move && IS_MNT_SLAVE(source_mnt)) ?
+		source_mnt->mnt_pnode : NULL;
 
 	if (!dest_pnode && !src_pnode) {
 		LIST_HEAD(head);
@@ -739,6 +772,7 @@ static int attach_recursive_mnt(struct v
 	p = NULL;
 	for (m = source_mnt; m; m = next_mnt(m, source_mnt)) {
 		int unclone = IS_MNT_UNCLONE(m);
+		int slave = IS_MNT_SLAVE(m);
 
 		list_del_init(&m->mnt_list);
 
@@ -756,7 +790,7 @@ static int attach_recursive_mnt(struct v
 		p=m;
 		dest_pnode = IS_MNT_SHARED(mntpt_mnt) ?
 			mntpt_mnt->mnt_pnode : NULL;
-		src_pnode = (IS_MNT_SHARED(m))?
+		src_pnode = (IS_MNT_SHARED(m) || (move && slave))?
 m->mnt_pnode : NULL;
 
 		m->mnt_pnode = NULL;
@@ -772,19 +806,35 @@ static int attach_recursive_mnt(struct v
 			if ((ret = pnode_prepare_mount(dest_pnode, tmp_pnode,
 	mntpt_dentry, m, mntpt_mnt)))
 return ret;
+			if (move && dest_pnode && slave)
+SET_PNODE_SLAVE(tmp_pnode);
 		} else {
 			if (m == m->mnt_parent)
 do_attach_prepare_mnt(mntpt_mnt,
 	mntpt_dentry, m, 0);
-			pnode_add_member_mnt(tmp_pnode, m);
-			if (unclone) {
-set_mnt_unclone(m);
-m->mnt_pnode = tmp_pnode;
-SET_PNODE_DELETE(tmp_pnode);
-			} else if (!src_pnode) {
-set_mnt_private(m);
-m->mnt_pnode = tmp_pnode;
-SET_PNODE_DELETE(tmp_pnode);
+			if (move && slave)
+pnode_add_slave_mnt(tmp_pnode, m);
+			else {
+pnode_add_member_mnt(tmp_pnode, m);
+if (unclone) {
+	BUG_ON(!move);
+	set_mnt_unclone(m);
+	m->mnt_pnode = tmp_pnode;
+	SET_PNODE_DELETE(tmp_pnode);
+} else if (!sr

[RFC-2 PATCH 5/8] shared subtree

2005-07-18 Thread Ram Pai
Adds the ability to unmount a shared/slave/unclone/private tree.

RP

Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c|   68 +-
 fs/pnode.c|  112 ++
 include/linux/fs.h|3 +
 include/linux/pnode.h |5 ++
 4 files changed, 177 insertions(+), 11 deletions(-)

Index: 2.6.12.work1/fs/pnode.c
===
--- 2.6.12.work1.orig/fs/pnode.c
+++ 2.6.12.work1/fs/pnode.c
@@ -273,6 +273,117 @@ int pnode_merge_pnode(struct vfspnode *p
 	return 0;
 }
 
+static int vfs_busy(struct vfsmount *mnt, struct dentry *dentry,
+		struct dentry *rootdentry, struct vfsmount *origmnt)
+{
+	struct vfsmount *child_mnt;
+	int ret=0;
+
+	spin_unlock(&vfsmount_lock);
+	child_mnt = __lookup_mnt(mnt, dentry, rootdentry);
+	spin_lock(&vfsmount_lock);
+
+	if (!child_mnt)
+		return 0;
+
+	if (list_empty(&child_mnt->mnt_mounts)) {
+		if (origmnt == child_mnt)
+			ret = do_refcount_check(child_mnt, 3);
+		else
+			ret = do_refcount_check(child_mnt, 2);
+	}
+	mntput(child_mnt);
+	return ret;
+}
+
+int pnode_mount_busy(struct vfspnode *pnode, struct dentry *mntpt,
+		struct dentry *root, struct vfsmount *mnt)
+{
+	int ret=0;
+ 	struct vfsmount *slave_mnt, *member_mnt, *t_m;
+	struct pcontext context;
+
+	context.start = pnode;
+	context.pnode = NULL;
+	while (pnode_next(&context)) {
+		pnode = context.pnode;
+
+		// traverse member vfsmounts
+		spin_lock(&vfspnode_lock);
+		list_for_each_entry_safe(member_mnt,
+			t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
+			spin_unlock(&vfspnode_lock);
+			if ((ret = vfs_busy(member_mnt, mntpt,
+	root, mnt)))
+goto out;
+			spin_lock(&vfspnode_lock);
+		}
+		list_for_each_entry_safe(slave_mnt, t_m,
+			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+			spin_unlock(&vfspnode_lock);
+			if ((ret = vfs_busy(slave_mnt, mntpt,
+	root, mnt)))
+goto out;
+			spin_lock(&vfspnode_lock);
+		}
+		spin_unlock(&vfspnode_lock);
+	}
+out:
+	return ret;
+}
+
+int vfs_umount(struct vfsmount *mnt, struct dentry *dentry,
+		struct dentry *rootdentry)
+{
+	struct vfsmount *child_mnt;
+
+	spin_unlock(&vfsmount_lock);
+	child_mnt = __lookup_mnt(mnt, dentry, rootdentry);
+	spin_lock(&vfsmount_lock);
+	mntput(child_mnt);
+	if (child_mnt && list_empty(&child_mnt->mnt_mounts)) {
+		do_detach_mount(child_mnt);
+		if (child_mnt->mnt_pnode)
+			pnode_disassociate_mnt(child_mnt);
+	}
+	return 0;
+}
+
+int pnode_umount(struct vfspnode *pnode, struct dentry *dentry,
+			struct dentry *rootdentry)
+{
+	int ret=0;
+ 	struct vfsmount *slave_mnt, *member_mnt, *t_m;
+	struct pcontext context;
+
+	context.start = pnode;
+	context.pnode = NULL;
+	while (pnode_next(&context)) {
+		pnode = context.pnode;
+		// traverse member vfsmounts
+		spin_lock(&vfspnode_lock);
+		list_for_each_entry_safe(member_mnt,
+			t_m, &pnode->pnode_vfs, mnt_pnode_mntlist) {
+			spin_unlock(&vfspnode_lock);
+			if ((ret = vfs_umount(member_mnt,
+	dentry, rootdentry)))
+goto out;
+			spin_lock(&vfspnode_lock);
+		}
+		list_for_each_entry_safe(slave_mnt, t_m,
+			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
+			spin_unlock(&vfspnode_lock);
+			if ((ret = vfs_umount(slave_mnt,
+	dentry, rootdentry)))
+goto out;
+			spin_lock(&vfspnode_lock);
+		}
+		spin_unlock(&vfspnode_lock);
+	}
+out:
+	return ret;
+}
+
 /*
  * @pnode: pnode that contains the vfsmounts, on which the
  *  		new mount is created at dentry 'dentry'
@@ -532,6 +643,7 @@ int pnode_real_mount(struct vfspnode *pn
 			if ((ret = vfs_real_mount_func(member_mnt,
 			flag)))
 goto out;
+			spin_lock(&vfspnode_lock);
 		}
 		list_for_each_entry_safe(slave_mnt, t_m,
 			&pnode->pnode_slavevfs, mnt_pnode_mntlist) {
Index: 2.6.12.work1/fs/namespace.c
===
--- 2.6.12.work1.orig/fs/namespace.c
+++ 2.6.12.work1/fs/namespace.c
@@ -352,6 +352,7 @@ struct seq_operations mounts_op = {
  * open files, pwds, chroots or sub mounts that are
  * busy.
  */
+//TOBEFIXED
 int may_umount_tree(struct vfsmount *mnt)
 {
 	struct list_head *next;
@@ -394,6 +395,20 @@ resume:
 
 EXPORT_SYMBOL(may_umount_tree);
 
+int mount_busy(struct vfsmount *mnt)
+{
+	struct vfspnode *parent_pnode;
+
+	if (mnt == mnt->mnt_parent || !IS_MNT_SHARED(mnt->mnt_parent))
+		return do_refcount_check(mnt, 2);
+
+	parent_pnode = mnt->mnt_parent->mnt_pnode;
+	BUG_ON(!parent_pnode);
+	return pnode_mount_busy(parent_pnode,
+			mnt->mnt_mountpoint,
+			mnt->mnt_root, mnt);
+}
+
 /**
  * may_umount - check if a mount point is busy
  * @mnt: root of mount
@@ -409,13 +424,27 @@ EXPORT_SYMBOL(may_umount_tree);
  */
 int may_umount(struct vfsmount *mnt)
 {
-	if (atomic_read(&mnt->mnt_count) > 2)
+	if (mount_busy(mnt))
 		return -EBU

[RFC-2 PATCH 6/8] shared subtree

2005-07-18 Thread Ram Pai
Adds the ability to clone a namespace that has shared/private/slave/unclone
subtrees in it.
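
For illustration, this code path is exercised whenever a task is created
with CLONE_NEWNS; a minimal sketch (not part of the patch, and it needs
CAP_SYS_ADMIN):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <signal.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/wait.h>

    static char stack[64 * 1024];

    static int child(void *arg)
    {
        /* this task now owns a copy of the parent's mount tree, with
         * the shared/slave relationships re-established by this patch */
        return 0;
    }

    int main(void)
    {
        /* the stack grows down on most architectures, so pass the top */
        pid_t pid = clone(child, stack + sizeof(stack),
                          CLONE_NEWNS | SIGCHLD, NULL);
        if (pid < 0) {
            perror("clone");
            exit(1);
        }
        waitpid(pid, NULL, 0);
        return 0;
    }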

RP


Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c |9 +
 1 files changed, 9 insertions(+)

Index: 2.6.12.work1/fs/namespace.c
===
--- 2.6.12.work1.orig/fs/namespace.c
+++ 2.6.12.work1/fs/namespace.c
@@ -1763,6 +1763,13 @@ int copy_namespace(int flags, struct tas
 	q = new_ns->root;
 	while (p) {
 		q->mnt_namespace = new_ns;
+
+		if (IS_MNT_SHARED(q))
+			pnode_add_member_mnt(q->mnt_pnode, q);
+		else if (IS_MNT_SLAVE(q))
+			pnode_add_slave_mnt(q->mnt_pnode, q);
+		put_pnode(q->mnt_pnode);
+
 		if (fs) {
 			if (p == fs->rootmnt) {
 rootmnt = p;
@@ -2129,6 +2136,8 @@ void __put_namespace(struct namespace *n
 	spin_lock(&vfsmount_lock);
 
 	list_for_each_entry(mnt, &namespace->list, mnt_list) {
+		if (mnt->mnt_pnode)
+			pnode_disassociate_mnt(mnt);
 		mnt->mnt_namespace = NULL;
 	}
 


Re: shared subtrees implementation writeup

2005-07-18 Thread Ram Pai
On Mon, 2005-07-18 at 04:06, Miklos Szeredi wrote:
> Thanks for the writeup, it helps to understand things a bit better.
> However I still don't understand a few things:
> 
> 
> > Section 1. mount:
> > 
> > to begin with we have the following mount tree 
> > 
> >              root
> >        /   /  |  \   \
> >      t0  t1  t2  t3   t4
> > 
> > note: 
> > t0, t1, t2, t3, t4 all contain mounts.
> > t1, t2, t3 are slaves of t0.
> > t4 is a slave of t2.
> > t4 and t3 are marked as shared.
> > 
> > The corresponding propagation tree will be:
> > 
> >          p0
> >         /  \
> >       p1    p2
> >      /
> >    p3
> > 
> > 
> > ***
> >   p0 contains the mount t0, and contains the slave mount t1
> >   p1 contains the mount t2
> >   p3 contains the mount t4
> >   p2 contains the mount t3
> > 
> >   NOTE: you may need to look at this multiple time as you try to
> > understand the various scenarios.
> > ***
> 
> Why you have p2 and p3?  They contain a single mount only, which could
> directly be slaves to p0 and p1 respectively.  Does it have something
> to do with being shared?

Yes. If the mounts were just slaves then they could be slave members of
their corresponding master pnodes, i.e. p0 and p1 respectively. But
in my example above they are also shared. And a shared mount can be
bind mounted with propagation set in either direction; hence they
deserve a separate pnode.  If it were just a slave mount, binding to
it would not set up any propagation, and hence there would be no need
for a separate pnode to track the propagation.

Just for clarification:
1. a slave mount is represented as a slave member of a pnode.
2. a shared mount is represented as a member of a pnode.
3. a mount that is both slave and shared is represented as a member of a
separate pnode, which is itself a slave pnode.
4. a private mount is not part of any pnode.
5. an unclone mount is also not part of any pnode.
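
In terms of the lists introduced in patch 1/8, that maps roughly to
(an illustrative summary, not code from the patches):

    /*
     * shared mount       -> on its pnode's pnode_vfs list
     * slave mount        -> on the master pnode's pnode_slavevfs list
     * shared + slave     -> member (pnode_vfs) of its own pnode, and that
     *                       pnode sits on the master's pnode_slavepnode list
     * private or unclone -> mnt->mnt_pnode == NULL
     */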


> 
> BTW, is there a reason not to include the pnode info in 'struct
> vfsmount'?  That would simplify a lot of allocation error cases.
> 
> > The key point to be noted in the above set of operations is:
> > each pnode does three different operations corresponding to each stage.
> > 
> > A. when the pnode is encountered the first time, it has to create
> > a new pnode for its child mounts.
> > B. when the pnode is encountered again after it has traversed down
> >each slave pnode, it has to associate the slave pnode's newly created
> >pnode with the pnode's newly created pnode.
> > C. when the pnode is encountered finally after having traversed through
> > all its slave pnodes, it has to create new child mounts
> > for each of its member mounts.
> 
> Now why is this needed?  Couldn't each of these be done in a single step?
> 
> I still can't see the reason for having these things done at different
> stages of the traversal.

Yes. This can be done in a single step. And in fact, in my latest patches
that I sent yesterday, I did exactly that. It works. All that messy
PNODE_UP/PNODE_DOWN/PNODE_MID handling is gone, and the code has become
much simpler.

The reason it was there earlier was that I was thinking we might need
all these phases for some operations like umount and make_mounted.
But as I understand the operations better I am convinced that it is not
required, and you have reconfirmed that point :)

Thanks,
RP
> 
> Thanks,
> Miklos



[no subject]

2005-07-25 Thread Ram Pai
Subject: [PATCH 4/7] shared subtree

Adds ability to move a shared/private/slave/unclone tree to any other
shared/private/slave/unclone tree. Also incorporates the same behavior
for pivot_root()

RP


Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c|  196 +++---
 include/linux/mount.h |2 
 2 files changed, 173 insertions(+), 25 deletions(-)

Index: 2.6.12.work2/fs/namespace.c
===
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -772,9 +772,12 @@ static void abort_attach_recursive_mnt(s
list_del_init(head);
 }
 
+
  /*
  *  @source_mnt : mount tree to be attached
  *  @nd: place the mount tree @source_mnt is attached
+ *  @move  : use the move semantics if set, else use normal attach 
semantics
+ *as explained below
  *
 *  NOTE: the table below explains the semantics when a source vfsmount
 *  of a given type is attached to a destination vfsmount of a given type.
@@ -801,12 +804,41 @@ static void abort_attach_recursive_mnt(s
  *  |  |   ||  |   |
  *   
  *
- * (++)  the mount will be propogated to all the vfsmounts in the pnode tree
+ * (++)  the mount is propogated to all the vfsmounts in the pnode tree
  *   of the destination vfsmount, and all the non-slave new mounts in
  *   destination vfsmount will be added the source vfsmount's pnode.
- * (+)  the mount will be propogated to the destination vfsmount
+ * (+)  the mount is propogated to the destination vfsmount
  *   and the new mount will be added to the source vfsmount's pnode.
  *
+ *  -
+ *  |  MOVE MOUNT OPERATION|
+ *  |***|
+ *  |  dest --> | shared   |   private  |  slave   |unclonable |
+ *  | source   |   ||  |   |
+ *  |   |  |   ||  |   |
+ *  |   v  |   ||  |   |
+ *  |***|
+ *  |  |   ||  |   |
+ *  |  shared  | shared (++)   |  shared (+)|shared (+)| shared (+)|
+ *  |  |   ||  |   |
+ *  |  |   ||  |   |
+ *  | private  | shared (+)|  private   | private  | private   |
+ *  |  |   ||  |   |
+ *  |  |   ||  |   |
+ *  | slave| shared (+++)  |  slave | slave| slave |
+ *  |  |   ||  |   |
+ *  |  |   ||  |   |
+ *  | unclonable|  invalid | unclonable |unclonable| unclonable|
+ *  |  |   ||  |   |
+ *  |  |   ||  |   |
+ *   
+ *
+ * (+++)  the mount is propogated to all the vfsmounts in the pnode tree
+ *   of the destination vfsmount, and all the new mounts is
+ *   added to a new pnode , which is a slave pnode of the
+ *   source vfsmount's pnode.
+ *
+ *
  * if the source mount is a tree, the operations explained above is
  * applied to each vfsmount in the tree.
  *
@@ -815,7 +847,7 @@ static void abort_attach_recursive_mnt(s
  *
   */
 static int attach_recursive_mnt(struct vfsmount *source_mnt,
-   struct nameidata *nd)
+   struct nameidata *nd, int move)
 {
struct vfsmount *mntpt_mnt, *last, *m, *p;
struct vfspnode *src_pnode, *dest_pnode, *tmp_pnode;
@@ -849,8 +881,8 @@ static int attach_recursive_mnt(struct v
list_add_tail(&mnt_list_head, &source_mnt->mnt_list);
 
for (m = source_mnt; m; m = next_mnt(m, source_mnt)) {
-
-   BUG_ON(IS_MNT_UNCLONE(m));
+   int unclone = IS_MNT_UNCLONE(m);
+   int slave = IS_MNT_SLAVE(m);
 
while (p && p != m->mnt_parent)
p = p->mnt_parent;
@@ -866,7 +898,7 @@ static int attach_recursive_mnt(struct v
 
dest_pnode = IS_MNT_SHARED(mntpt_mnt) ?
mntpt_mnt->mnt_pnode : NULL;
-   src_p

[no subject]

2005-07-25 Thread Ram Pai
Subject: [PATCH 3/7] shared subtree

Adds the ability to bind/rbind a shared/private/slave subtree and set up
propagation wherever needed.

RP

Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c|  660 --
 fs/pnode.c|  235 
 include/linux/dcache.h|2 
 include/linux/fs.h|5 
 include/linux/namespace.h |1 
 5 files changed, 826 insertions(+), 77 deletions(-)

Index: 2.6.12.work2/fs/namespace.c
===
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -42,7 +42,8 @@ static inline int sysfs_init(void)
 
 static struct list_head *mount_hashtable;
 static int hash_mask, hash_bits;
-static kmem_cache_t *mnt_cache; 
+static kmem_cache_t *mnt_cache;
+static struct rw_semaphore namespace_sem;
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
@@ -54,7 +55,7 @@ static inline unsigned long hash(struct 
 
 struct vfsmount *alloc_vfsmnt(const char *name)
 {
-   struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); 
+   struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
if (mnt) {
memset(mnt, 0, sizeof(struct vfsmount));
atomic_set(&mnt->mnt_count,1);
@@ -86,7 +87,8 @@ void free_vfsmnt(struct vfsmount *mnt)
  * Now, lookup_mnt increments the ref count before returning
  * the vfsmount struct.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
+   struct dentry *root)
 {
struct list_head * head = mount_hashtable + hash(mnt, dentry);
struct list_head * tmp = head;
@@ -99,7 +101,8 @@ struct vfsmount *lookup_mnt(struct vfsmo
if (tmp == head)
break;
p = list_entry(tmp, struct vfsmount, mnt_hash);
-   if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry) {
+   if (p->mnt_parent == mnt && p->mnt_mountpoint == dentry &&
+   (root == NULL || p->mnt_root == root)) {
found = mntget(p);
break;
}
@@ -108,6 +111,37 @@ struct vfsmount *lookup_mnt(struct vfsmo
return found;
 }
 
+struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+{
+   return __lookup_mnt(mnt, dentry, NULL);
+}
+
+static struct vfsmount *
+clone_mnt(struct vfsmount *old, struct dentry *root)
+{
+   struct super_block *sb = old->mnt_sb;
+   struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
+
+   if (mnt) {
+   mnt->mnt_flags = old->mnt_flags;
+   atomic_inc(&sb->s_active);
+   mnt->mnt_sb = sb;
+   mnt->mnt_root = dget(root);
+   mnt->mnt_mountpoint = mnt->mnt_root;
+   mnt->mnt_parent = mnt;
+   mnt->mnt_namespace = old->mnt_namespace;
+   mnt->mnt_pnode = get_pnode(old->mnt_pnode);
+
+   /* stick the duplicate mount on the same expiry list
+* as the original if that was on one */
+   spin_lock(&vfsmount_lock);
+   if (!list_empty(&old->mnt_fslink))
+   list_add(&mnt->mnt_fslink, &old->mnt_fslink);
+   spin_unlock(&vfsmount_lock);
+   }
+   return mnt;
+}
+
 static inline int check_mnt(struct vfsmount *mnt)
 {
return mnt->mnt_namespace == current->namespace;
@@ -128,11 +162,71 @@ static void attach_mnt(struct vfsmount *
 {
mnt->mnt_parent = mntget(nd->mnt);
mnt->mnt_mountpoint = dget(nd->dentry);
-   list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
+   mnt->mnt_namespace = nd->mnt->mnt_namespace;
+   list_add_tail(&mnt->mnt_hash,
+   mount_hashtable+hash(nd->mnt, nd->dentry));
list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
nd->dentry->d_mounted++;
 }
 
+static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
+{
+   mnt->mnt_parent = mntget(nd->mnt);
+   mnt->mnt_mountpoint = dget(nd->dentry);
+   nd->dentry->d_mounted++;
+}
+
+
+void do_attach_commit_mnt(struct vfsmount *mnt)
+{
+   struct vfsmount *parent = mnt->mnt_parent;
+   BUG_ON(parent==mnt);
+   if(list_empty(&mnt->mnt_hash))
+   list_add_tail(&mnt->mnt_hash,
+   m

[no subject]

2005-07-25 Thread Ram Pai
, [EMAIL PROTECTED], Janak Desai <[EMAIL PROTECTED]>, 
linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 5/7] shared subtree
Content-Type: text/x-patch; name=umount.patch
Content-Disposition: inline; filename=umount.patch

Adds ability to unmount a shared/slave/unclone/private tree

RP

Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c|   76 --
 fs/pnode.c|   66 +++
 include/linux/fs.h|3 +
 include/linux/pnode.h |9 -
 4 files changed, 138 insertions(+), 16 deletions(-)

Index: 2.6.12.work2/fs/pnode.c
===
--- 2.6.12.work2.orig/fs/pnode.c
+++ 2.6.12.work2/fs/pnode.c
@@ -666,3 +666,69 @@ int pnode_abort_mount(struct vfspnode *p
NULL, (void *)NULL, NULL, NULL,
vfs_abort_mount_func, exception_mnt);
 }
+
+static int vfs_busy(struct vfsmount *mnt, enum pnode_vfs_type flag,
+   void *indata, va_list args)
+{
+   struct dentry *dentry = va_arg(args, struct dentry *);
+   struct dentry *rootdentry = va_arg(args, struct dentry *);
+   struct vfsmount *origmnt = va_arg(args, struct vfsmount *);
+   struct vfsmount *child_mnt;
+   int ret=0;
+
+   spin_unlock(&vfsmount_lock);
+   child_mnt = __lookup_mnt(mnt, dentry, rootdentry);
+   spin_lock(&vfsmount_lock);
+
+   if (!child_mnt)
+   return 0;
+
+   if (list_empty(&child_mnt->mnt_mounts)) {
+   if (origmnt == child_mnt)
+   ret = do_refcount_check(child_mnt, 3);
+   else
+   ret = do_refcount_check(child_mnt, 2);
+   }
+   mntput(child_mnt);
+   return ret;
+}
+
+int pnode_mount_busy(struct vfspnode *pnode, struct dentry *mntpt,
+   struct dentry *root, struct vfsmount *mnt)
+{
+   return pnode_traverse(pnode, NULL, NULL,
+   NULL, NULL, vfs_busy, mntpt, root, mnt);
+}
+
+
+int vfs_umount(struct vfsmount *mnt, enum pnode_vfs_type flag,
+   void *indata, va_list args)
+{
+   struct vfsmount *child_mnt;
+   struct dentry *dentry, *rootdentry;
+
+
+   dentry = va_arg(args, struct dentry *);
+   rootdentry = va_arg(args, struct dentry *);
+
+   spin_unlock(&vfsmount_lock);
+   child_mnt = __lookup_mnt(mnt, dentry, rootdentry);
+   spin_lock(&vfsmount_lock);
+   mntput(child_mnt);
+   if (child_mnt && list_empty(&child_mnt->mnt_mounts)) {
+   if (IS_MNT_SHARED(child_mnt) ||
+   IS_MNT_SLAVE(child_mnt)) {
+   BUG_ON(!child_mnt->mnt_pnode);
+   pnode_disassociate_mnt(child_mnt);
+   }
+   do_detach_mount(child_mnt);
+   }
+   return 0;
+}
+
+int pnode_umount(struct vfspnode *pnode, struct dentry *dentry,
+   struct dentry *rootdentry)
+{
+   return pnode_traverse(pnode, NULL, (void *)NULL,
+   NULL, NULL, vfs_umount, dentry, rootdentry);
+}
Index: 2.6.12.work2/fs/namespace.c
===
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -395,6 +395,20 @@ resume:
 
 EXPORT_SYMBOL(may_umount_tree);
 
+int mount_busy(struct vfsmount *mnt)
+{
+   struct vfspnode *parent_pnode;
+
+   if (mnt == mnt->mnt_parent || !IS_MNT_SHARED(mnt->mnt_parent))
+   return do_refcount_check(mnt, 2);
+
+   parent_pnode = mnt->mnt_parent->mnt_pnode;
+   BUG_ON(!parent_pnode);
+   return pnode_mount_busy(parent_pnode,
+   mnt->mnt_mountpoint,
+   mnt->mnt_root, mnt);
+}
+
 /**
  * may_umount - check if a mount point is busy
  * @mnt: root of mount
@@ -410,14 +424,28 @@ EXPORT_SYMBOL(may_umount_tree);
  */
 int may_umount(struct vfsmount *mnt)
 {
-   if (atomic_read(&mnt->mnt_count) > 2)
+   if (mount_busy(mnt))
return -EBUSY;
return 0;
 }
 
 EXPORT_SYMBOL(may_umount);
 
-void umount_tree(struct vfsmount *mnt)
+void do_detach_mount(struct vfsmount *mnt)
+{
+   struct nameidata old_nd;
+   if (mnt != mnt->mnt_parent) {
+   detach_mnt(mnt, &old_nd);
+   path_release(&old_nd);
+   }
+   list_del_init(&mnt->mnt_list);
+   list_del_init(&mnt->mnt_fslink);
+   spin_unlock(&vfsmount_lock);
+   mntput(mnt);
+   spin_lock(&vfsmount_lock);
+}
+
+void __umount_tree(struct vfsmount *mnt, int propogate)
 {
struct vfsmount *p;
LIST_HEAD(kill);
@@ -431,20 +459,40 @@ void umount_tree(struct vfsmount *mnt)
mnt = list_entry(kill.next, struct vfsmount, mnt_list);
list_del_init(

[no subject]

2005-07-25 Thread Ram Pai
, [EMAIL PROTECTED], Janak Desai <[EMAIL PROTECTED]>, 
linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 7/7] shared subtree
Content-Type: text/x-patch; name=automount.patch
Content-Disposition: inline; filename=automount.patch

Adds support for mount/umount propagation for autofs-initiated operations.
RP

Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c|  176 +++---
 fs/pnode.c|   12 +--
 include/linux/pnode.h |3 
 3 files changed, 76 insertions(+), 115 deletions(-)

Index: 2.6.12.work2/fs/namespace.c
===
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -202,6 +202,9 @@ struct vfsmount *do_attach_prepare_mnt(s
if(!(child_mnt = clone_mnt(template_mnt,
template_mnt->mnt_root)))
return NULL;
+   spin_lock(&vfsmount_lock);
+   list_del_init(&child_mnt->mnt_fslink);
+   spin_unlock(&vfsmount_lock);
} else
child_mnt = template_mnt;
 
@@ -355,35 +358,14 @@ struct seq_operations mounts_op = {
  */
 int may_umount_tree(struct vfsmount *mnt)
 {
-   struct list_head *next;
-   struct vfsmount *this_parent = mnt;
-   int actual_refs;
-   int minimum_refs;
+   int actual_refs=0;
+   int minimum_refs=0;
+   struct vfsmount *p;
 
spin_lock(&vfsmount_lock);
-   actual_refs = atomic_read(&mnt->mnt_count);
-   minimum_refs = 2;
-repeat:
-   next = this_parent->mnt_mounts.next;
-resume:
-   while (next != &this_parent->mnt_mounts) {
-   struct vfsmount *p = list_entry(next, struct vfsmount, 
mnt_child);
-
-   next = next->next;
-
+   for (p = mnt; p; p = next_mnt(p, mnt)) {
actual_refs += atomic_read(&p->mnt_count);
minimum_refs += 2;
-
-   if (!list_empty(&p->mnt_mounts)) {
-   this_parent = p;
-   goto repeat;
-   }
-   }
-
-   if (this_parent != mnt) {
-   next = this_parent->mnt_child.next;
-   this_parent = this_parent->mnt_parent;
-   goto resume;
}
spin_unlock(&vfsmount_lock);
 
@@ -395,18 +377,18 @@ resume:
 
 EXPORT_SYMBOL(may_umount_tree);
 
-int mount_busy(struct vfsmount *mnt)
+int mount_busy(struct vfsmount *mnt, int refcnt)
 {
struct vfspnode *parent_pnode;
 
if (mnt == mnt->mnt_parent || !IS_MNT_SHARED(mnt->mnt_parent))
-   return do_refcount_check(mnt, 2);
+   return do_refcount_check(mnt, refcnt);
 
parent_pnode = mnt->mnt_parent->mnt_pnode;
BUG_ON(!parent_pnode);
return pnode_mount_busy(parent_pnode,
mnt->mnt_mountpoint,
-   mnt->mnt_root, mnt);
+   mnt->mnt_root, mnt, refcnt);
 }
 
 /**
@@ -424,9 +406,12 @@ int mount_busy(struct vfsmount *mnt)
  */
 int may_umount(struct vfsmount *mnt)
 {
-   if (mount_busy(mnt))
-   return -EBUSY;
-   return 0;
+   int ret=0;
+   spin_lock(&vfsmount_lock);
+   if (mount_busy(mnt, 2))
+   ret = -EBUSY;
+   spin_unlock(&vfsmount_lock);
+   return ret;
 }
 
 EXPORT_SYMBOL(may_umount);
@@ -445,7 +430,26 @@ void do_detach_mount(struct vfsmount *mn
spin_lock(&vfsmount_lock);
 }
 
-void __umount_tree(struct vfsmount *mnt, int propogate)
+void umount_mnt(struct vfsmount *mnt, int propogate)
+{
+   if (propogate && mnt->mnt_parent != mnt &&
+   IS_MNT_SHARED(mnt->mnt_parent)) {
+   struct vfspnode *parent_pnode
+   = mnt->mnt_parent->mnt_pnode;
+   BUG_ON(!parent_pnode);
+   pnode_umount(parent_pnode,
+   mnt->mnt_mountpoint,
+   mnt->mnt_root);
+   } else {
+   if (IS_MNT_SHARED(mnt) || IS_MNT_SLAVE(mnt)) {
+   BUG_ON(!mnt->mnt_pnode);
+   pnode_disassociate_mnt(mnt);
+   }
+   do_detach_mount(mnt);
+   }
+}
+
+static void __umount_tree(struct vfsmount *mnt, int propogate)
 {
struct vfsmount *p;
LIST_HEAD(kill);
@@ -459,21 +463,7 @@ void __umount_tree(struct vfsmount *mnt,
mnt = list_entry(kill.next, struct vfsmount, mnt_list);
list_del_init(&mnt->mnt_list);
list_del_init(&mnt->mnt_fslink);
-   if (propogate && mnt->mnt_parent != mnt &&
-   IS_MNT_SHARED(mnt->mnt_parent)) {
-   struct vfspnode *parent_pnode
-   =

[no subject]

2005-07-25 Thread Ram Pai
, [EMAIL PROTECTED], Janak Desai <[EMAIL PROTECTED]>, 
linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 1/7] shared subtree
Content-Type: text/x-patch; name=shared_private_slave.patch
Content-Disposition: inline; filename=shared_private_slave.patch

This patch adds the shared/private/slave support for VFS trees.

Signed by Ram Pai ([EMAIL PROTECTED])

 fs/Makefile   |2 
 fs/dcache.c   |2 
 fs/namespace.c|   93 ++
 fs/pnode.c|  441 ++
 include/linux/fs.h|5 
 include/linux/mount.h |   44 
 include/linux/pnode.h |   90 ++
 7 files changed, 673 insertions(+), 4 deletions(-)

Index: 2.6.12.work2/fs/namespace.c
===
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include <linux/pnode.h>
 #include 
 #include 
 
@@ -62,6 +63,7 @@ struct vfsmount *alloc_vfsmnt(const char
INIT_LIST_HEAD(&mnt->mnt_mounts);
INIT_LIST_HEAD(&mnt->mnt_list);
INIT_LIST_HEAD(&mnt->mnt_fslink);
+   INIT_LIST_HEAD(&mnt->mnt_pnode_mntlist);
if (name) {
int size = strlen(name)+1;
char *newname = kmalloc(size, GFP_KERNEL);
@@ -615,6 +617,95 @@ out_unlock:
return err;
 }
 
+static int do_make_shared(struct vfsmount *mnt)
+{
+   int err=0;
+   struct vfspnode *old_pnode = NULL;
+   /*
+* if the mount is already a slave mount,
+* allocate a new pnode and make it
+* a slave pnode of the original pnode.
+*/
+   if (IS_MNT_SLAVE(mnt)) {
+   old_pnode = mnt->mnt_pnode;
+   pnode_del_slave_mnt(mnt);
+   }
+   if(!IS_MNT_SHARED(mnt)) {
+   mnt->mnt_pnode = pnode_alloc();
+   if(!mnt->mnt_pnode) {
+   pnode_add_slave_mnt(old_pnode, mnt);
+   err = -ENOMEM;
+   goto out;
+   }
+   pnode_add_member_mnt(mnt->mnt_pnode, mnt);
+   }
+   if(old_pnode)
+   pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
+   set_mnt_shared(mnt);
+out:
+   return err;
+}
+
+static int do_make_slave(struct vfsmount *mnt)
+{
+   int err=0;
+
+   if (IS_MNT_SLAVE(mnt))
+   goto out;
+   /*
+* only shared mounts can
+* be made slave
+*/
+   if (!IS_MNT_SHARED(mnt)) {
+   err = -EINVAL;
+   goto out;
+   }
+   pnode_member_to_slave(mnt);
+out:
+   return err;
+}
+
+static int do_make_private(struct vfsmount *mnt)
+{
+   if(mnt->mnt_pnode)
+   pnode_disassociate_mnt(mnt);
+   set_mnt_private(mnt);
+   return 0;
+}
+
+/*
+ * recursively change the type of the mountpoint.
+ */
+static int do_change_type(struct nameidata *nd, int flag)
+{
+   struct vfsmount *m, *mnt = nd->mnt;
+   int err=0;
+
+   if (!(flag & MS_SHARED) && !(flag & MS_PRIVATE)
+   && !(flag & MS_SLAVE))
+   return -EINVAL;
+
+   if (nd->dentry != nd->mnt->mnt_root)
+   return -EINVAL;
+
+   spin_lock(&vfsmount_lock);
+   for (m = mnt; m; m = next_mnt(m, mnt)) {
+   switch (flag) {
+   case MS_SHARED:
+   err = do_make_shared(m);
+   break;
+   case MS_SLAVE:
+   err = do_make_slave(m);
+   break;
+   case MS_PRIVATE:
+   err = do_make_private(m);
+   break;
+   }
+   }
+   spin_unlock(&vfsmount_lock);
+   return err;
+}
+
 /*
  * do loopback mount.
  */
@@ -1049,6 +1140,8 @@ long do_mount(char * dev_name, char * di
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&nd, dev_name, flags & MS_REC);
+   else if (flags & MS_SHARED || flags & MS_PRIVATE || flags & MS_SLAVE)
+   retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&nd, dev_name);
else
Index: 2.6.12.work2/fs/pnode.c
===
--- /dev/null
+++ 2.6.12.work2/fs/pnode.c
@@ -0,0 +1,441 @@
+/*
+ *  linux/fs/pnode.c
+ *
+ * (C) Copyright IBM Corporation 2005.
+ * Released under GPL v2.
+ * Author : Ram Pai ([EMAIL PROTECTED])
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+
+stati

[no subject]

2005-07-25 Thread Ram Pai
, [EMAIL PROTECTED], Janak Desai <[EMAIL PROTECTED]>, 
linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 6/7] shared subtree
Content-Type: text/x-patch; name=namespace.patch
Content-Disposition: inline; filename=namespace.patch

Adds ability to clone a namespace that has shared/private/slave/unclone
subtrees in it.

RP


Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c |9 +
 1 files changed, 9 insertions(+)

Index: 2.6.12-rc6.work1/fs/namespace.c
===
--- 2.6.12-rc6.work1.orig/fs/namespace.c
+++ 2.6.12-rc6.work1/fs/namespace.c
@@ -1894,6 +1894,13 @@ int copy_namespace(int flags, struct tas
q = new_ns->root;
while (p) {
q->mnt_namespace = new_ns;
+
+   if (IS_MNT_SHARED(q))
+   pnode_add_member_mnt(q->mnt_pnode, q);
+   else if (IS_MNT_SLAVE(q))
+   pnode_add_slave_mnt(q->mnt_pnode, q);
+   put_pnode(q->mnt_pnode);
+
if (fs) {
if (p == fs->rootmnt) {
rootmnt = p;
@@ -2271,6 +2278,8 @@ void __put_namespace(struct namespace *n
spin_lock(&vfsmount_lock);
 
list_for_each_entry(mnt, &namespace->list, mnt_list) {
+   if (mnt->mnt_pnode)
+   pnode_disassociate_mnt(mnt);
mnt->mnt_namespace = NULL;
}
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/



[no subject]

2005-07-25 Thread Ram Pai
, [EMAIL PROTECTED], Janak Desai <[EMAIL PROTECTED]>, 
linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 2/7] shared subtree
Content-Type: text/x-patch; name=unclone.patch
Content-Disposition: inline; filename=unclone.patch

 Adds the ability to unclone a vfs tree. An uncloned vfs tree will not be
 clonable, and hence cannot be bind/rbind'ed to any other mountpoint.

 RP

Signed by Ram Pai ([EMAIL PROTECTED])

 fs/namespace.c|   15 ++-
 include/linux/fs.h|1 +
 include/linux/mount.h |   15 +++
 3 files changed, 30 insertions(+), 1 deletion(-)

Index: 2.6.12.work2/fs/namespace.c
===
--- 2.6.12.work2.orig/fs/namespace.c
+++ 2.6.12.work2/fs/namespace.c
@@ -673,6 +673,14 @@ static int do_make_private(struct vfsmou
return 0;
 }
 
+static int do_make_unclone(struct vfsmount *mnt)
+{
+   if(mnt->mnt_pnode)
+   pnode_disassociate_mnt(mnt);
+   set_mnt_unclone(mnt);
+   return 0;
+}
+
 /*
  * recursively change the type of the mountpoint.
  */
@@ -682,6 +690,7 @@ static int do_change_type(struct nameida
int err=0;
 
if (!(flag & MS_SHARED) && !(flag & MS_PRIVATE)
+   && !(flag & MS_UNCLONE)
&& !(flag & MS_SLAVE))
return -EINVAL;
 
@@ -700,6 +709,9 @@ static int do_change_type(struct nameida
case MS_PRIVATE:
err = do_make_private(m);
break;
+   case MS_UNCLONE:
+   err = do_make_unclone(m);
+   break;
}
}
spin_unlock(&vfsmount_lock);
@@ -1140,7 +1152,8 @@ long do_mount(char * dev_name, char * di
data_page);
else if (flags & MS_BIND)
retval = do_loopback(&nd, dev_name, flags & MS_REC);
-   else if (flags & MS_SHARED || flags & MS_PRIVATE || flags & MS_SLAVE)
+   else if (flags & MS_SHARED || flags & MS_UNCLONE ||
+   flags & MS_PRIVATE || flags & MS_SLAVE)
retval = do_change_type(&nd, flags);
else if (flags & MS_MOVE)
retval = do_move_mount(&nd, dev_name);
Index: 2.6.12.work2/include/linux/fs.h
===
--- 2.6.12.work2.orig/include/linux/fs.h
+++ 2.6.12.work2/include/linux/fs.h
@@ -102,6 +102,7 @@ extern int dir_notify_enable;
 #define MS_MOVE8192
 #define MS_REC 16384
 #define MS_VERBOSE 32768
+#define MS_UNCLONE (1<<17) /* recursively change to unclonable */
 #define MS_PRIVATE (1<<18) /* recursively change to private */
 #define MS_SLAVE   (1<<19) /* recursively change to slave */
 #define MS_SHARED  (1<<20) /* recursively change to shared */
Index: 2.6.12.work2/include/linux/mount.h
===
--- 2.6.12.work2.orig/include/linux/mount.h
+++ 2.6.12.work2/include/linux/mount.h
@@ -22,15 +22,18 @@
 #define MNT_PRIVATE	0x10  /* the vfsmount is private; this is the default */
 #define MNT_SLAVE	0x20  /* the vfsmount is a slave mount of its pnode */
 #define MNT_SHARED	0x40  /* the vfsmount is a shared mount of its pnode */
+#define MNT_UNCLONE	0x80  /* the vfsmount is unclonable */
 #define MNT_PNODE_MASK	0xf0  /* propagation flag mask */
 
 #define IS_MNT_SHARED(mnt) (mnt->mnt_flags & MNT_SHARED)
 #define IS_MNT_SLAVE(mnt) (mnt->mnt_flags & MNT_SLAVE)
 #define IS_MNT_PRIVATE(mnt) (mnt->mnt_flags & MNT_PRIVATE)
+#define IS_MNT_UNCLONE(mnt) (mnt->mnt_flags & MNT_UNCLONE)
 
 #define CLEAR_MNT_SHARED(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SHARED))
 #define CLEAR_MNT_PRIVATE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_PRIVATE))
 #define CLEAR_MNT_SLAVE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_SLAVE))
+#define CLEAR_MNT_UNCLONE(mnt) (mnt->mnt_flags &= ~(MNT_PNODE_MASK & MNT_UNCLONE))
 
 struct vfsmount
 {
@@ -59,6 +62,7 @@ static inline void set_mnt_shared(struct
mnt->mnt_flags |= MNT_PNODE_MASK & MNT_SHARED;
CLEAR_MNT_PRIVATE(mnt);
CLEAR_MNT_SLAVE(mnt);
+   CLEAR_MNT_UNCLONE(mnt);
 }
 
 static inline void set_mnt_private(struct vfsmount *mnt)
@@ -66,6 +70,16 @@ static inline void set_mnt_private(struc
mnt->mnt_flags |= MNT_PNODE_MASK & MNT_PRIVATE;
CLEAR_MNT_SLAVE(mnt);
CLEAR_MNT_SHARED(mnt);
+   CLEAR_MNT_UNCLONE(mnt);
+   mnt->mnt_pnode = NULL;
+}
+
+static inline void set_mnt_unclone(struct vfsmount *mnt)
+{
+   mnt->mnt_flags |= MNT_PNODE_MASK & MNT_UNCLONE;
+   CLEAR_MNT_SLAVE(mnt);
+   CLEAR_MN

[no subject]

2005-07-25 Thread Ram Pai
, [EMAIL PROTECTED], Janak Desai <[EMAIL PROTECTED]>, 
linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH 0/7] shared subtree

Hi Andrew/Al Viro,

Enclosing a final set of well-tested patches that implement
Al Viro's shared subtree proposal.

These patches provide the ability to mark a mount tree as
shared/private/slave/unclone, along with the ability to play with these
trees with operations like bind/rbind/move/pivot_root/namespace-clone
etc.

I believe this powerful feature can help build features like
per-user namespaces.  A couple of projects may benefit from
shared subtrees:
1) automounter, for the ability to automount across namespaces.
2) SELinux, for implementing polyinstantiated trees.
3) MVFS, for providing a versioning file system.
4) FUSE, for per-user namespaces?

Thanks to Avantika for developing 100+ test cases that test
various combinations of private/shared/slave/unclonable trees. All
these tests have passed, so I feel pretty confident about the stability of
the code.

The patches have been broken into 7 units for ease of review.  I
realize that patch 3, 'rbind.patch', is a bit heavier than all the other
patches; the reason is that most of the shared-subtree functionality
manifests itself during the bind/rbind operation.

A couple of work items remain to be done:
1. modify the mount command to support this feature,
eg:  mount --make-shared /tmp
(see the C sketch just below)
2. a tool that can help visualize the propagation tree, maybe
support in /proc?
3. some documentation on how to use all this functionality.
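
For illustration only (not part of the patches): a minimal userspace
sketch of what the proposed "mount --make-shared" in item 1 would boil
down to, assuming the MS_* flag values from patch 2/7.  The fallback
defines are only there in case the libc headers predate these flags.

#include <stdio.h>
#include <sys/mount.h>

#ifndef MS_SHARED
#define MS_SHARED (1 << 20)	/* value from patch 2/7 */
#endif
#ifndef MS_SLAVE
#define MS_SLAVE  (1 << 19)	/* value from patch 2/7 */
#endif

int main(void)
{
	/* equivalent of the proposed "mount --make-shared /tmp" */
	if (mount(NULL, "/tmp", NULL, MS_SHARED, NULL) == -1)
		perror("make-shared /tmp");

	/* and of "mount --make-slave /tmp" */
	if (mount(NULL, "/tmp", NULL, MS_SLAVE, NULL) == -1)
		perror("make-slave /tmp");
	return 0;
}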

Please consider the patches for inclusion in your tree.

The footprint of this code is pretty small in the normal code path
where shared-subtree functionality is not used.

Any suggestions/comments to improve the code is welcome.

Thanks,
RP
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: supposed to be shared subtree patches.

2005-07-25 Thread Ram Pai
On Mon, 2005-07-25 at 15:44, Ram Pai wrote:
> , [EMAIL PROTECTED], Janak Desai <[EMAIL PROTECTED]>, 
> linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
> Subject: [PATCH 0/7] shared subtree
> 
> Hi Andrew/Al Viro,
> 
>   Enclosing a final set of well tested patches that implement

My apologies. I screwed up sending the patches through quilt.

Anyway, I have received the following comments from Andrew Morton, which
I will incorporate before sending out saner-looking patches.
Sorry again,
RP

Andrew's comments follows:


Frankly, I don't even know what these patches _do_, and haven't spent
the time to try to find out.

If these patches are merged, how do we expect end-users to find out how
to use the new capabilities?

A few paragraphs in the patch #1 changelog would help.  A high-level
description of the new capability which explains what it does and why it
would be a useful thing for Linux.

And maybe some deeper information in a Documentation/ file.

Right now, there might well be a lot of people who could use these new
features, but they don't even know that these patches provide them! 
It's all a bit of a mystery, really.
-


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/7] shared subtree

2005-07-27 Thread Ram Pai
On Wed, 2005-07-27 at 12:13, Miklos Szeredi wrote:
> > @@ -54,7 +55,7 @@ static inline unsigned long hash(struct 
> >  
> >  struct vfsmount *alloc_vfsmnt(const char *name)
> >  {
> > -   struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL); 
> > +   struct vfsmount *mnt = kmem_cache_alloc(mnt_cache, GFP_KERNEL);
> > if (mnt) {
> > memset(mnt, 0, sizeof(struct vfsmount));
> > atomic_set(&mnt->mnt_count,1);
> 
> Please make whitespace changes a separate patch.

 I tried to remove trailing whitespace in the current code
wherever I found it. OK, I will make that a separate patch.


> 
> > @@ -128,11 +162,71 @@ static void attach_mnt(struct vfsmount *
> >  {
> > mnt->mnt_parent = mntget(nd->mnt);
> > mnt->mnt_mountpoint = dget(nd->dentry);
> > -   list_add(&mnt->mnt_hash, mount_hashtable+hash(nd->mnt, nd->dentry));
> > +   mnt->mnt_namespace = nd->mnt->mnt_namespace;
> > +   list_add_tail(&mnt->mnt_hash,
> > +   mount_hashtable+hash(nd->mnt, nd->dentry));
> > list_add_tail(&mnt->mnt_child, &nd->mnt->mnt_mounts);
> > nd->dentry->d_mounted++;
> >  }
> 
> Why list_add_tail()?  This changes user visible behavior, and seems
> unnecessary.

Yes. I was about to send out a mail questioning the existing behavior. I
will start a separate thread questioning the current behavior. My plan
was to discuss the current behavior before making this change. I thought
I had reverted this change, but it slipped in.

> 
> > +static void attach_prepare_mnt(struct vfsmount *mnt, struct nameidata *nd)
> > +{
> > +   mnt->mnt_parent = mntget(nd->mnt);
> > +   mnt->mnt_mountpoint = dget(nd->dentry);
> > +   nd->dentry->d_mounted++;
> > +}
> > +
> > +
> 
> You shouldn't add unnecessary newlines.  There are a lot of these,
> please audit all your patches.

ok. sure.

> 
> > +void do_attach_commit_mnt(struct vfsmount *mnt)
> > +{
> > +   struct vfsmount *parent = mnt->mnt_parent;
> > +   BUG_ON(parent==mnt);
> 
>   BUG_ON(parent == mnt);
> 
> > +   if(list_empty(&mnt->mnt_hash))
> 
>   if (list_empty(&mnt->mnt_hash))
> 
> > +   list_add_tail(&mnt->mnt_hash,
> > +   mount_hashtable+hash(parent, mnt->mnt_mountpoint));
> > +   if(list_empty(&mnt->mnt_child))
> > +   list_add_tail(&mnt->mnt_child, &parent->mnt_mounts);
> > +   mnt->mnt_namespace = parent->mnt_namespace;
> > +   list_add_tail(&mnt->mnt_list, &mnt->mnt_namespace->list);
> > +}
> 
> Etc.  Maybe you should run Lindent on your changes, but be careful not
> to change existing code, even if Lindent would do that!

sure :)

> 
> > @@ -191,7 +270,7 @@ static void *m_start(struct seq_file *m,
> > struct list_head *p;
> > loff_t l = *pos;
> >  
> > -   down_read(&n->sem);
> > +   down_read(&namespace_sem);
> > list_for_each(p, &n->list)
> > if (!l--)
> > return list_entry(p, struct vfsmount, mnt_list);
> 
> This should be a separate patch.  You can just take the one from the
> detached trees patch-series.

OK. In fact, these changes were motivated by that patch.

> 
> > +/*
> > + * abort the operations done in attach_recursive_mnt(). run through the 
> > mount
> > + * tree, till vfsmount 'last' and undo the changes.  Ensure that all the 
> > mounts
> > + * in the tree are all back in the mnt_list headed at 'source_mnt'.
> > + * NOTE: This function is closely tied to the logic in
> > + * 'attach_recursive_mnt()'
> > + */
> > +static void abort_attach_recursive_mnt(struct vfsmount *source_mnt, struct
> > +   vfsmount *last, struct list_head *head) { struct vfsmount *p =
> > +   source_mnt, *m; struct vfspnode *src_pnode;
> 
> If you want to do proper error handling, instead of doing rollback, it
> seems better to first do anything that can fail (allocations), then do
> the actual attaching, which cannot fail.  It isn't nice to have
> transient states on failure.

Yes, it does exactly what you said. In the prepare stage it does not
touch any of the existing vfs tree or the pnode tree.

All it does is build a new vfs tree and pnode tree and make the necessary
changes to them. If everything is successful, it glues the new tree
to the existing tree (which is the commit phase); if the prepare
stage fails, on memory allocation or for any other reason, it goes and
destroys the new trees (the abort phase).

Of course, in the prepare stage it does increase the reference count of
the vfsmounts to which the new tree will be attached. This is to ensure
that the vfsmounts have not disappeared by the time we reach the commit
phase.  I think we are talking about the same thing, and the code behaves
exactly as you said.
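
To spell out the three phases just described, here is a purely
illustrative sketch.  prepare_mnt_tree(), commit_mnt_tree() and
abort_mnt_tree() are invented names for the phases; the real logic
lives in attach_recursive_mnt() and abort_attach_recursive_mnt().

static int attach_tree_sketch(struct vfsmount *source_mnt,
			      struct nameidata *nd)
{
	int err;

	/* prepare: build the new vfs and pnode trees; the only touch on
	 * existing state is grabbing references on the mounts the new
	 * tree will later hang off */
	err = prepare_mnt_tree(source_mnt, nd);
	if (err) {
		/* abort: destroy the half-built trees, drop references */
		abort_mnt_tree(source_mnt);
		return err;
	}
	/* commit: glue the new trees onto the existing ones; nothing in
	 * this phase can fail */
	commit_mnt_tree(source_mnt, nd);
	return 0;
}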


> 
> > + /*
> > + * This operation is equivalent of mount --bind dir dir
> > + * create a new mount at the dentry, and unmount all child mounts
> > + * mounted on top of dentries below 'dentry', and mount them
> > + * under the new mount.
> > +  */
> > +struct vfsmount *do_make_mounted(struct vfsmount *mnt, struct dentry 
> > *dentry)
> 
> W

Re: [PATCH 1/7] shared subtree

2005-07-27 Thread Ram Pai
On Wed, 2005-07-27 at 12:54, Miklos Szeredi wrote:
> > +static int do_make_shared(struct vfsmount *mnt)
> > +{
> > +   int err=0;
> > +   struct vfspnode *old_pnode = NULL;
> > +   /*
> > +* if the mount is already a slave mount,
> > +* allocate a new pnode and make it
> > +* a slave pnode of the original pnode.
> > +*/
> > +   if (IS_MNT_SLAVE(mnt)) {
> > +   old_pnode = mnt->mnt_pnode;
> > +   pnode_del_slave_mnt(mnt);
> > +   }
> > +   if(!IS_MNT_SHARED(mnt)) {
> > +   mnt->mnt_pnode = pnode_alloc();
> > +   if(!mnt->mnt_pnode) {
> > +   pnode_add_slave_mnt(old_pnode, mnt);
> > +   err = -ENOMEM;
> > +   goto out;
> > +   }
> > +   pnode_add_member_mnt(mnt->mnt_pnode, mnt);
> > +   }
> > +   if(old_pnode)
> > +   pnode_add_slave_pnode(old_pnode, mnt->mnt_pnode);
> > +   set_mnt_shared(mnt);
> > +out:
> > +   return err;
> > +}
> 
> This is an example, where having struct pnode just complicates things.
> If there was no struct pnode, this function would be just one line:
> setting the shared flag.
So your comment is mostly about getting rid of the pnode and distributing
the pnode functionality into the vfsmount structure.

I know you are thinking of just having the necessary propagation lists in
the vfsmount structure itself.  True, with that implementation the
complication is reduced in this part of the code, but it really complicates
the propagation traversal routines.

 In order to find the slaves of a given mount:
with your proposal:   I have to walk through all the peer mounts of this
mount and check for any slaves there.
in my implementation: I just have to find which pnode it belongs to, and
all the slaves are easily available there.

 In order to find all the shared mounts that are slaves of this
mount:

with your proposal:   not sure how to do it. Maybe you have to have
another field in each of the vfsmounts that points to
the shared mounts that are slaves of this mount?

in my implementation: I just have to find the pnode it belongs to,
and all the slave pnodes are easily available there.
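
As a concrete sketch of the first lookup under the pnode design
(hypothetical: it assumes slave vfsmounts are linked into
pnode_slavevfs through their mnt_pnode_mntlist field, which the
posted code may do differently):

static void for_each_slave_of(struct vfsmount *mnt,
			      void (*fn)(struct vfsmount *))
{
	struct vfspnode *pnode = mnt->mnt_pnode;
	struct vfsmount *slave;

	/* every slave hangs directly off the pnode: one list walk */
	list_for_each_entry(slave, &pnode->pnode_slavevfs,
			    mnt_pnode_mntlist)
		fn(slave);
}

With the lists distributed over the vfsmounts instead, the same walk
has to visit every peer and then every peer's slave list.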


There are complexity tradeoffs in both implementations. But I
personally felt that having a pnode structure keeps the pnode operations
separated out cleanly. It helps to easily visualize the propagation
tree. One more thing influenced my thought process: the following
statements in Al Viro's RFC:
---
How do we set them up? 

* we can mark a subtree sharable. Every vfsmount in the subtree 
that is not already in some p-node gets a single-element p-node of its 
own. 
* we can mark a subtree slave. That removes all vfsmounts in 
the subtree from their p-nodes and makes them owned by said p-nodes. 
p-nodes that became empty will disappear and everything they used to 
own will be repossessed by their owners (if any). 
* we can mark a subtree private. Same as above, but followed 
by taking all vfsmounts in our subtree and making them *not* owned 
by anybody. 

The above statements imply some implementation detail. Not sure if you
will buy this point :)


> 
> > +static kmem_cache_t * pnode_cachep;
> > +
> > +/* spinlock for pnode related operations */
> > + __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfspnode_lock);
> > +
> > +enum pnode_vfs_type {
> > +   PNODE_MEMBER_VFS = 0x01,
> > +   PNODE_SLAVE_VFS = 0x02
> > +};
> > +
> > +void __init pnode_init(unsigned long mempages)
> > +{
> > +   pnode_cachep = kmem_cache_create("pnode_cache",
> > +   sizeof(struct vfspnode), 0,
> > +   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
> > +}
> > +
> > +struct vfspnode * pnode_alloc(void)
> > +{
> > +   struct vfspnode *pnode =  kmem_cache_alloc(pnode_cachep, GFP_KERNEL);
> > +   INIT_LIST_HEAD(&pnode->pnode_vfs);
> > +   INIT_LIST_HEAD(&pnode->pnode_slavevfs);
> > +   INIT_LIST_HEAD(&pnode->pnode_slavepnode);
> > +   INIT_LIST_HEAD(&pnode->pnode_peer_slave);
> > +   pnode->pnode_master = NULL;
> > +   pnode->pnode_flags = 0;
> > +   atomic_set(&pnode->pnode_count,0);
> > +   return pnode;
> > +}
> > +
> > +void inline pnode_free(struct vfspnode *pnode)
> > +{
> > +   kmem_cache_free(pnode_cachep, pnode);
> > +}
> > +
> > +/*
> > + * __put_pnode() should be called with vfspnode_lock held
> > + */
> > +void __put_pnode(struct vfspnode *pnode)
> > +{
> > +   struct vfspnode *tmp_pnode;
> > +   do {
> > +   tmp_pnode = pnode->pnode_master;
> > +   list_del_init(&pnode->pnode_peer_slave);
> > +   BUG_ON(!list_empty(&pnode->pnode_vfs));
> > +   BUG_ON(!list_empty(&pnode->pnode_slavevfs));
> > +   BUG_ON(!list_empty(&pnode->pnode_slavepnode));
> > +   pnode_free(pnode);
> > +   pnode = tmp_pnode;
> > +   if (!pnode || !atomic_dec_and_

Re: [PATCH 1/7] shared subtree

2005-07-29 Thread Ram Pai
On Thu, 2005-07-28 at 02:57, Miklos Szeredi wrote:
> > > This is an example, where having struct pnode just complicates things.
> > > If there was no struct pnode, this function would be just one line:
> > > setting the shared flag.
> > So your comment is mostly about getting rid of pnode and distributing
> > the pnode functionality in the vfsmount structure.
> 
> Yes, sorry if I didn't make it clear.
> 
> > I know you are thinking of just having the necessary propogation list in
> > the vfsmount structure itself.  Yes true with that implementation the
> > complication is reduced in this part of the code, but really complicates
> > the propogation traversal routines. 
> 
> On the contrary, I think it will simplify the traversal routines.
> 
> Here's an iterator function I coded up.  Not tested at all (may not
> even compile):

Your suggested code has bugs, but I understand what you are aiming at.

Maybe you are right. I will try out an implementation using your idea.

Hmm.. lots of code to change, and to test.

> 
> struct vfsmount {
>   /* ... */
> 
>   struct list_head mnt_share;  /* circular list of shared mounts */
>   struct list_head mnt_slave_list; /* list of slave mounts */
>   struct list_head mnt_slave;  /* slave list entry */
>   struct vfsmount *master; /* slave is on master->mnt_slave_list 
> */
> };
> 
> static inline struct vfsmount *next_shared(struct vfsmount *p)
> {
>   return list_entry(p->mnt_share.next, struct vfsmount, mnt_share);
> }
> 
> static inline struct vfsmount *first_slave(struct vfsmount *p)
> {
>   return list_entry(p->mnt_slave_list.next, struct vfsmount, mnt_slave);
> }
> 
> static inline struct vfsmount *next_slave(struct vfsmount *p)
> {
>   return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
> }
> 
> static struct vfsmount *propagation_next(struct vfsmount *p,
>struct vfsmount *base)
> {
>   /* first iterate over the slaves */
>   if (!list_empty(&p->mnt_slave_list))
>   return first_slave(p);

I think this code should be
if (!list_empty(&p->mnt_slave))
return next_slave(p);

Right? I think I get the idea. 



RP

> 
>   while (1) {
>   struct vfsmount *q;
> 
>   /* more vfsmounts belong to the pnode? */
>   if (!list_empty(&p->mnt_share)) {
>   p = next_shared(p);
>   if (list_empty(&p->mnt_slave) && p != base)
>   return p;
>   }
>   if (p == base)
>   break;
>   
>   BUG_ON(list_empty(&p->mnt_slave));
> 
>   /* more slaves? */
>   q = next_slave(p);
>   if (p->master != q)
>   return q;
> 
>   /* back at master */
>   p = q;
>   }
> 
>   return NULL;
> }
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 1/7] shared subtree

2005-07-30 Thread Ram Pai
On Fri, 2005-07-29 at 22:39, Miklos Szeredi wrote:
> > > static struct vfsmount *propagation_next(struct vfsmount *p,
> > >struct vfsmount *base)
> > > {
> > >   /* first iterate over the slaves */
> > >   if (!list_empty(&p->mnt_slave_list))
> > >   return first_slave(p);
> > 
> > I think this code should be
> > if (!list_empty(&p->mnt_slave))
> > return next_slave(p);
> > 
> > Right? I think I get the idea. 
> 
> This is a depth-first search, so first_slave() is right.

OK. I have started implementing your idea, but the implementation is not
simple.  It becomes a complex mess. At least in the case of the pnode
data-structure implementation, the propagation logic was all abstracted
and concentrated in the pnode data structure.

Here is a sample implementation of do_make_slave() with your idea.

static int do_make_slave(struct vfsmount *mnt)
{
int err=0;
struct vfsmount *peer_mnt;

spin_lock(&vfspnode_lock);
if (!IS_MNT_SHARED(mnt)) {
spin_unlock(&vfspnode_lock);
err = -EINVAL;
goto out;
}

peer_mnt = list_entry(mnt->mnt_share.next, struct vfsmount,
mnt_share);
if (peer_mnt == mnt)
peer_mnt = NULL;

list_del_init(&mnt->mnt_share);
if (peer_mnt) {
/* move the slave list to the peer_mnt */
list_splice(&mnt->mnt_slave, &peer_mnt->mnt_slave);
list_add(&mnt->mnt_slave_list, &peer_mnt->mnt_slave);
set_mnt_slave(mnt);
} else {
struct vfsmount *slave_mnt, *t_slave_mnt;
list_for_each_entry_safe(slave_mnt, t_slave_mnt,
&mnt->mnt_slave, mnt_slave_list) {
CLEAR_MNT_SLAVE(slave_mnt);
list_del_init(&slave_mnt->mnt_slave_list);
}
}
list_del_init(&mnt->mnt_slave);
mnt->mnt_master = peer_mnt;
spin_unlock(&vfspnode_lock);
out:
return err;
}

Do you still believe that your idea is simpler? 

The most difficult part is attaching a shared vfs tree, which needs to be
attached at some other shared mount point. The problem here is that while
traversing the propagation tree, one has to build another similar
propagation tree for the new child mount. It's a two-dimensional tree walk,
i.e. one walk is along the vfs tree, and the other walk is along the pnode
tree for each mount.  It's much easier to abstract out the pnode
operations and concentrate on them separately than to mix the
functionality of vfs and pnode in a single vfs data structure.

In my code, I have abstracted out the pnode tree traversal
using a single iterator function, pnode_next().
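
Sketch only: the calling pattern such an iterator enables.  The real
pnode_next() in the posted patches may take different arguments, and
pnode_first() is invented here for symmetry:

	struct vfsmount *m;

	for (m = pnode_first(pnode); m; m = pnode_next(pnode, m))
		handle_propagation(m);	/* hypothetical per-mount work */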

Think about it, and let me know if it is worth the effort of changing
the implementation. I sincerely feel your approach just shifts complexity
instead of reducing it. I can eventually come up with a fully tested
implementation using your idea, but I am still not convinced that
it reduces complexity.


RP

> 
> Here's a less buggy (and even more simplified) version of the
> function.  Note: it must be called with 'origin' either on a slave
> list or at the root pnode.  That's because the function checks if the
> all vfsmounts in a pnode have been traversed by looking at the
> emptiness of mnt_slave.  So if origin was in a slave pnode, but is not
> the actual slave link, the algorithm will go off the starting pnode
> and up to it's master.
> 
> So here's a preparation function that finds the right place to start
> the propagation.
> 
> static struct vfsmount *propagation_first(struct vfsmount *p)
> {
>   struct vfsmount *q = p;
> 
>   while (list_empty(&q->mnt_slave)) {
>   q = next_shared(q);
>   if (q == p)
>   break;
>   }
>   return q;
> }
> 
> static struct vfsmount *propagation_next(struct vfsmount *p,
>struct vfsmount *origin)
> {
>   /* are there any slaves of this mount? */
>   if (!list_empty(&p->mnt_slave_list))
>   return first_slave(p);
> 
>   while (1) {
>   /* if p->mnt_share is empty, this is a no-op */
>   p = next_shared(p);
> 
>   /* finished traversing? */
>   if (p == origin)
>   break;
> 
>   /* more vfsmounts belong to the pnode? */
>   if (list_empty(&p->mnt_slave))
>   return p;
>   
>   /* more slaves? */
>   if (p->mnt_slave.next != &p->mnt_master->mnt_slave_list)
>   return next_slave(p);
> 
>   /* back at master */
>   p = p->mnt_master;
>   }
> 
>   return NULL;
> }
> 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  htt

Re: [RFC][PATCH] VFS: create /proc/<pid>/mountinfo

2008-01-21 Thread Ram Pai

Miklos,

You have removed the code that checked whether the peer or
master mount was in the same namespace before reporting their
corresponding mount-ids. One downside of that approach is that the
user will see a mount_id in the output with no corresponding
line to explain the details of that mount_id.

And couldn't reporting the mount-id of a mount in some other namespace
subtly amount to an information leak?


One other comment I had received offline from Steve French was
that the patch did not consider the following case:

"Have you thought about whether this could handle the case in which 
cifs mounts with 
a relative path e.g. currently
mount -t cifs //server/share /mnt

can not be distinguished from
mount -t cifs //server/share/subdirectory /mnt

when you run the mount command (ie the cifs "prefixpath" in this case 
"/subdirectory" is not displayed)"


Thanks for driving this patch further, and sorry; I have not been active on
this work for a while.
RP


On Sat, 2008-01-19 at 12:05 +0100, Miklos Szeredi wrote:
> Seems, most people would be happier with a new file, instead of
> extending /proc/mounts.
> 
> This patch is the first attempt at doing that, as well as fixing the
> issues found in the previous submission.
> 
> Thanks,
> Miklos
> 
> ---
> From: Ram Pai <[EMAIL PROTECTED]>
> 
> /proc/mounts in its current state fails to disambiguate bind mounts, especially
> when the bind mount is subrooted. Also it does not capture the propagation
> state of the mounts (shared subtree). The following patch addresses the problem.
> 
> The patch adds '/proc/<pid>/mountinfo' which contains a superset of
> the fields in '/proc/<pid>/mounts'. The following additional fields
> are added:
> 
> mntid -- is a unique identifier of the mount
> parent -- the id of the parent mount
> major:minor -- value of st_dev for files on that filesystem
> dir -- the subdir in the filesystem which forms the root of this mount
> propagation-type in the form of <propagation-flag>[:<mntid>][,...]
>   note: 'shared' flag is followed by the mntid of its peer mount
> 'slave' flag is followed by the mntid of its master mount
> 'private' flag stands by itself
> 'unbindable' flag stands by itself
> 
> Also mount options are split into two fields, the first containing the
> per-mount flags, the second the per-superblock options.
> 
> Here is a sample cat /proc/mounts after executing the following commands:
> 
> mount --bind /mnt /mnt
> mount --make-shared /mnt
> mount --bind /mnt/1 /var
> mount --make-slave /var
> mount --make-shared /var
> mount --bind /var/abc /tmp
> mount --make-unbindable /proc
> 
> 2 2 0:1 rootfs rootfs / / rw rw private
> 16 2 98:0 ext2 /dev/root / / rw rw private
> 17 16 0:3 proc /proc / /proc rw rw unbindable
> 18 16 0:10 devpts devpts /dev/pts / rw rw private
> 19 16 98:0 ext2 /dev/root /mnt /mnt rw rw shared:19
> 20 16 98:0 ext2 /dev/root /mnt/1 /var rw rw shared:21,slave:19
> 21 16 98:0 ext2 /dev/root /mnt/1/abc /tmp rw rw shared:20,slave:19
> 
> For example, the last line indicates that:
> 
> 1) The mount is a shared mount.
> 2) It is a peer of the mount with id 20
> 3) It is also a slave of the master mount with id 19
> 4) The filesystem on device with major/minor number 98:0 and subdirectory
>   /mnt/1/abc forms the root directory of this mount.
> 5) And finally, the mount with id 16 is its parent.
> 
> 
> [EMAIL PROTECTED]:
> 
> - new file, rearrange fields
> - for mount ID's use IDA (from the IDR library) instead of a 32bit
>   counter, which could overflow
> - print canonical ID's (smallest one within the peer group) for peers
>   and master, this is more useful, than a random ID within the same namespace
> - fix a couple of small bugs
> - remove inlines
> - style fixes
> 
> Signed-off-by: Ram Pai <[EMAIL PROTECTED]>
> Signed-off-by: Miklos Szeredi <[EMAIL PROTECTED]>
> ---
> 
> Index: linux/fs/dcache.c
> ===
> --- linux.orig/fs/dcache.c2008-01-18 19:21:38.0 +0100
> +++ linux/fs/dcache.c 2008-01-18 19:22:27.0 +0100
> @@ -1890,6 +1890,60 @@ char *dynamic_dname(struct dentry *dentr
>   return memcpy(buffer, temp, sz);
>  }
> 
> +static int prepend(char **buffer, int *buflen, const char *str,
> +   int namelen)
> +{
> + *buflen -= namelen;
> + if (*buflen < 0)
> + return 1;
> + *buffer -= namelen;
> + memcpy(*buffer, str, namelen);
> + retur

Re: [RFC][PATCH] VFS: create /proc/<pid>/mountinfo

2008-01-21 Thread Ram Pai
On Mon, 2008-01-21 at 22:25 +0100, Miklos Szeredi wrote:
> > You have removed the code that checked if the peer or
> > master mount was in the same namespace before reporting their
> > corresponding mount-ids. One downside of that approach is the
> > user will see an mount_id in the output with no corresponding
> > line to explain the details of the mount_id.  
> 
> Before the change, the peer and master ID's were basically randomly
> chosen from the peers, which means, it wasn't possible to always
> determine, that two mounts were peers, or that they were slaves to the
> same peer group.
> 
> After the change, this is possible, since the peer ID will be the same
> for all mounts which are peers.  This means, that even though the peer
> ID might be in a different namespace, it is possible to determine all
> peers within the same namespace by comparing their peer ID's.


 I agree with your reasoning on the random id; showing a single
 id avoids clutter. But my point is, why not show an
 id for the master or peer residing in the same namespace?
 Showing an id with no corresponding entry for that id can be
 puzzling.

 
 If no master-mount exists in the same namespace, then print -1,
 meaning "masked".

 There is always at least one peer-mount in a given namespace, so no
 issue there.

 

> > 
> > And reporting the mount-id of a mount is some other namespace
> > could subtly mean information-leak?
> 
> I don't think the mount ID itself can be sensitive, it really doesn't
> contain any information, other than being an identifier.
> 
> > One other comment I had received offline from Steve French was
> > that the patch did not consider the following case:
> > 
> > "Have you thought about whether this could handle the case in which 
> > cifs mounts with 
> > a relative path e.g. currently
> > mount -t cifs //server/share /mnt
> > 
> > can not be distinguished from
> > mount -t cifs //server/share/subdirectory /mnt
> > 
> > when you run the mount command (ie the cifs "prefixpath" in this case 
> > "/subdirectory" is not displayed)"
> 
> Why cifs not displaying '//server/share/subdirectory' as the source of
> the mount?

Don't know; I have not tried it myself.

RP
> 
> Miklos

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH -mm] readahead: partial sendfile fix

2007-02-12 Thread Ram Pai
On Sat, 2007-02-10 at 09:40 +0800, Fengguang Wu wrote:
> Enable readahead to handle partially done read requests, e.g.
> 
> sendfile(188, 1921, [1478592], 19553028) = 37440
> sendfile(188, 1921, [1516032], 19515588) = 28800
> sendfile(188, 1921, [1544832], 19486788) = 37440
> sendfile(188, 1921, [1582272], 19449348) = 14400
> sendfile(188, 1921, [1596672], 19434948) = 37440
> sendfile(188, 1921, [1634112], 19397508) = 37440
> 
> In the above strace log,
> - some lighttpd is doing _sequential_ reading
> - every sendfile() returns with only _partial_ work done
> 
> page_cache_readahead() expects that if it returns @next_index, it will
> be
> called exactly at @next_index next time. That's not true here. So the
> pattern
> will be falsely recognized as a random read trace.
> 
> Also documented in "Linux AIO Performance and Robustness for
> Enterprise
> Workloads" section 3.5:
> 
>   sendfile(fd, 0, 2GB, fd2) = 8192,
> tells readahead about up to 128KB of the read
>   sendfile(fd, 8192, 2GB - 8192, fd2) = 8192,
> tells readahead about 8KB - 132KB of the read
>   sendfile(fd, 16384, 2GB - 16384, fd2) = 8192,
> tells readahead about 16KB-140KB of the read
>...
> This confuses the readahead logic about the I/O pattern which
> appears
> to be 0-128K, 8K-132K, 16K-140K instead of clear sequentiality
> from
> 0-2GB that is really appropriate.
> 
> Retry based AIO shares the same read pattern and readahead problem.
> In this case, simply disabling readahead on restarted aio is not a
> good option:
> we still need to call into readahead in the rare case of (req_size >
> ra_max).

The solution you proposed seems kludgy to me. If you determine that
it's a restarted aio, then start reading from where readahead left
off earlier. To me a simpler fix is:

-   if (unlikely(aio_restarted()))
-   next_index = last_index; /* Avoid repeat readahead */

+   if (unlikely(aio_restarted()))
+   next_index = min(prev_index+1, last_index);


No? 
RP



> 
> Signed-off-by: Fengguang Wu <[EMAIL PROTECTED]>
> ---
>  mm/filemap.c   |3 ---
>  mm/readahead.c |9 +
>  2 files changed, 9 insertions(+), 3 deletions(-)
> 
> --- linux-2.6.20-rc6-mm3.orig/mm/readahead.c
> +++ linux-2.6.20-rc6-mm3/mm/readahead.c
> @@ -581,6 +581,15 @@ page_cache_readahead(struct address_spac
> int sequential;
> 
> /*
> +* A previous read request is partially completed,
> +* causing the retried/continued read calls into us prematurely.
> +*/
> +   if (ra->start < offset &&
> +   offset < ra->prev_page &&
> +   ra->prev_page < ra->ahead_start + ra->ahead_size)
> +   goto out;
> +
> +   /*
>  * We avoid doing extra work and bogusly perturbing the
> readahead
>  * window expansion logic.
>  */
> --- linux-2.6.20-rc6-mm3.orig/mm/filemap.c
> +++ linux-2.6.20-rc6-mm3/mm/filemap.c
> @@ -915,9 +915,6 @@ void do_generic_mapping_read(struct addr
> if (!isize)
> goto out;
> 
> -   if (unlikely(aio_restarted()))
> -   next_index = last_index; /* Avoid repeat readahead */
> -
> end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
> for (;;) {
> struct page *page;
> 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Adding subroot information to /proc/mounts, or obtaining that through other means

2007-06-21 Thread Ram Pai
On Wed, 2007-06-20 at 14:20 -0700, H. Peter Anvin wrote:
> Al Viro wrote:
> > On Wed, Jun 20, 2007 at 01:57:33PM -0700, H. Peter Anvin wrote:
> >> ... or, alternatively, add a subfield to the first field (which would
> >> entail escaping whatever separator we choose):
> >>
> >> /dev/md6 /export ext3 rw,data=ordered 0 0
> >> /dev/md6:/users/foo /home/foo ext3 rw,data=ordered 0 0
> >> /dev/md6:/users/bar /home/bar ext3 rw,data=ordered 0 0
> > 
> > Hell, no.  The first field is in principle impossible to parse unless
> > you know the fs type.
> > 
> > How about making a new file with sane format?  From the very
> > beginning.  E.g. mountpoint + ID + relative path + type + options,
> > where ID uniquely identifies superblock (e.g. numeric st_dev)
> > and backing device (if any) is sitting among the options...
> 
> Okay, I see there has been some discussion on this earlier, based on a
> proposal by Ram Pai, so it pretty much comes down to redesigning this
> right.  I see some issues with his proposal (device numbers exported to
> userspace in text form should be separated into major:minor form, for
> one thing.)  I know the util-linux-ng people have also had issues with
> /proc/mounts that they would like resolved in order to finally nuke
> /etc/mtab.
> 
> Is Ram still working on this?  I'd like to help make this happen so we
> can be done with it.

Peter, I am not working on it currently, but I am interested in getting
it done. I have the seed set of patches which had Al Viro's ideas
incorporated. In fact, those patches were sent on LKML two months back.
Shall we start with those patches?

RP


> 
>   -hpa
> 
> 

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Adding subroot information to /proc/mounts, or obtaining that through other means

2007-06-21 Thread Ram Pai
On Thu, 2007-06-21 at 09:29 -0700, H. Peter Anvin wrote:
> Ram Pai wrote:
> > 
> > Peter, I am not working on it currently. But i am interested in getting
> > it done. I have the seed set of patches which had Al Viro's ideas
> > incorporated. Infact those patches were sent on lkml 2 months back.
> > Shall we start with those patches?
> > 
> 
> Are these the "unprivileged mount syscall" patches?

No, but those patches were sent in the same thread. Karel had provided
suggestions which I have yet to incorporate.

Give me today; I will send out the patches incorporating the comments
later in the evening.

OK?
RP

> 
> Otherwise I don't see any patches in my personal LKML cache (apparently
> my subscription to fsdevel was dropped at some point, so I don't have a
> stash of it.)


> 
>   -hpa

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: Adding subroot information to /proc/mounts, or obtaining that through other means

2007-06-21 Thread Ram Pai
On Thu, 2007-06-21 at 10:31 -0700, H. Peter Anvin wrote:
> Ram Pai wrote:
> > 
> > Peter, I am not working on it currently. But i am interested in getting
> > it done. I have the seed set of patches which had Al Viro's ideas
> > incorporated. Infact those patches were sent on lkml 2 months back.
> > Shall we start with those patches?
> > 
> 
> Okay, so what I see in your patches are:
> 
> > > path-from-root: mount point of the mount from /
> > > path-from-root-of-its-sb: path from its own root dentry.
> > > propagation-flag: SHARED, SLAVE, UNBINDABLE, PRIVATE
> > > peer-mount-id: the mount-id of its peer mount (if this mount is shared)
> > > master-mount-id: the mount-id of its master mount (if this mount is
> slave)
> 
> Other than cosmetic, I don't see anything terribly wrong with this,
> although getting a flag when the directory is overmounted would be nice.
> 
> I guess I suggest a single comma-separated field with flags and optional
> ":argument":
> 
>   private
>   shared:<id>
>   slave:<id>
>   unbindable
>   overmounted
> 
> So we could end up with something like:
> 
> rootfs / rootfs rw 0 0 0:1 / 1 private,overmounted
> 
> ... where 1 is the mnt_id (sequence number).
> 
> [Please see my other comments in this thread... basically I believe we
> should just add fields to /proc/mounts.]

I had two patches. The first patch added a new interface
called /proc/mounts_new and had the following format:

FSID  mntpt  root-dentry  fstype  fs-options

where FSID is a filesystem-unique id,
mntpt is the path to the mountpoint,
root-dentry is the path to the dentry with respect to the root dentry of
the same filesystem,
fstype is the filesystem type, and
fs-options are the mount options used.


The second patch made a /proc/propagation interface which had almost the
same fields, but also added fields to show the propagation type of the
mount as well as pointers to its peers and master, depending on the type
of the mount.

I think the consensus seems to be to have a new interface (/proc/<make-a-name>)
which extends the interface provided by /proc/mounts but also provides the
propagation state of the mounts and disambiguates bind mounts.
Which makes sense.

Why not have something like this?

mnt-id  FSID  backing-dev  mntpt  root-dentry  fstype  comma-separated-fs-options

and one of the fields in the comma-separated-fs-options indicates the
propagation type of the mount.
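
A hypothetical record in that layout (all values below invented for
illustration) might read:

20 6200 98:0 /var /mnt/1 ext2 rw,noatime,shared:21,slave:19

with the propagation type (here shared:21,slave:19) carried inside the
trailing option list.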


BTW: what is the need for the overmounted flag?  Do you mean two vfsmounts
mounted on the same dentry in the ***same vfsmount***?


RP

> 
>   -hpa



Re: Adding subroot information to /proc/mounts, or obtaining that through other means

2007-06-22 Thread Ram Pai
On Fri, 2007-06-22 at 00:06 -0700, H. Peter Anvin wrote:
> Ram Pai wrote:
> > 
> > the second patch made a /proc/propagation interface which had almost the
> > same fields, but also added fields to show the propagation type of the
> > mount as well as pointers to its peers and master depending on the type
> > of the mount. 
> > 
> > I think the consensus seems to have a new interface /proc/make-a-name
> > which extends the interface provided by /proc/mounts but provides the
> > propagation state of the mounts too as well as disambiguate bind mounts.
> > Which makes sense.
> > 
> 
> Why?  It seems a lot cleaner to have all the information in the same
> place.  It is highly unfriendly to userspace to have to gather
> information in a lot of places, plus it adds race conditions.
> 
> It would be another matter if the format that we have now couldn't be
> extended, but we need those fields (well, except the two zeros, but who
> cares) *anyway*, so we might as well stick to the existing file, and
> reduce the total amount of code and clutter.

OK, so you think /proc/mounts can be extended easily without breaking
any userspace commands?

Well, let's see:

1. To disambiguate bind mounts, we have to add a field that displays the
   path to the mount's root dentry from the filesystem's root dentry.
   Agree?

2. For filesystems that do not have a backing store, it becomes hard to
   disambiguate bind mounts in (1). So we need to add a filesystem-id
   field.

3. If we need to add the propagation status of the mount, we need a
   propagation flag added to the output.

4. To be able to construct the propagation tree, we need a way to refer
   to the other mounts, since some mounts are peers and others are
   masters. Which means we need a mount-id field. Agree?

If you agree to the above four new fields, it becomes challenging to
extend /proc/mounts to incorporate them without breaking any existing
applications.


> > 
> > BTW: what is the need for overmounted flag?  Do you mean two vfsmounts
> > mounted on the same dentry on the ***same vfsmount*** ?
> > 
> 
> Maybe I'm not following the uses of your flags well enough to figure out
>  if that information can already been deduced.

With the addition of the four fields mentioned above, I think one
should easily be able to decipher which mnt-id is mounted on which
mnt-id. No? Maybe not. Well, we would have to extend the mountpoint
field to indicate the mnt-id in which the mountpoint resides.

RP

> 
>   -hpa



[RFC PATCH 1/1] VFS: Augment /proc/mount with subroot and shared-subtree

2007-06-25 Thread Ram Pai
Please check if the following modified patch meets the requirements.

It augments /proc/mount with additional information to
(1) disambiguate bind mounts with subroot information.
(2) display shared-subtree information using which one can
determine the propagation trees.


The following additional fields are appended to each record
in /proc/mounts:

mntid=id    ->  The unique id associated with that mount.
fsid=id:dir ->  The filesystem's id and the directory in that filesystem
                which forms the root directory of this mount.
parent=id   ->  The id of the mount's parent, on which it is mounted.

Also, the flags are augmented with new information to indicate the
mount's propagation type.

Here is a sample 'cat /proc/mounts' after executing the following
commands:
mount --bind /mnt /mnt
mount --make-shared /mnt
mount --bind /mnt/1 /var
mount --make-slave /var
mount --make-shared /mnt
mount --make-unbindable /proc

rootfs / rootfs rw PRIVATE mntid=c1708c30 fsid=1:/ parent=c1708c30 0 0
/dev/root / ext2 rw PRIVATE mntid=c1208c08 fsid=6200:/ parent=c1708c30 0 0
/proc /proc proc rw UNBINDABLE mntid=c1108c90 fsid=3:/ parent=c1208c08 0 0
devpts /dev/pts devpts rw PRIVATE mntid=c1108c18 fsid=a:/ parent=c1208c08 0 0
/dev/root /mnt ext2 rw SHARED:peer=c1e08cb0 mntid=c1e08cb0 fsid=6200:/mnt parent=c1208c08 0 0
/dev/root /var ext2 rw SHARED:peer=c1f08c28 SLAVE:master=c1e08cb0 mntid=c1f08c28 fsid=6200:/mnt/1 parent=c1208c08 0 0


For example, the last line indicates that:

1) The mount is a shared mount.
2) Its peer mount is itself (note that peer=c1f08c28 is its own mntid).
3) It is also a slave of the mount with id c1e08cb0.
4) The filesystem with fsid=6200 and subdirectory mnt/1 forms the root
   directory of this mount.
5) And finally, the mount with id c1208c08 is its parent.


Signed-off-by: Ram Pai <[EMAIL PROTECTED]>

---
 fs/dcache.c  |   53 +++
 fs/namespace.c   |   25 ++
 fs/pnode.c   |   22 +
 fs/pnode.h   |2 +
 fs/seq_file.c|   79 ++-
 include/linux/dcache.h   |2 +
 include/linux/seq_file.h |1 
 7 files changed, 162 insertions(+), 22 deletions(-)

Index: linux-2.6.21.5/fs/dcache.c
===
--- linux-2.6.21.5.orig/fs/dcache.c
+++ linux-2.6.21.5/fs/dcache.c
@@ -1835,6 +1835,59 @@ char * d_path(struct dentry *dentry, str
return res;
 }

+static inline int prepend(char **buffer, int *buflen, const char *str,
+   int namelen)
+{
+   if ((*buflen -= namelen) < 0)
+   return 1;
+   *buffer -= namelen;
+   memcpy(*buffer, str, namelen);
+   return 0;
+}
+
+/*
+ * Write the full pathname into the buffer and return the start of the
+ * pathname.  The path is expressed relative to the root dentry of the
+ * dentry's own filesystem.
+ */
+char * dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+   char * end = buf+buflen;
+   char * retval;
+
+   spin_lock(&dcache_lock);
+   prepend(&end, &buflen, "\0", 1);
+   if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
+   if (prepend(&end, &buflen, "//deleted", 10))
+   goto Elong;
+   }
+   /* Get '/' right */
+   retval = end-1;
+   *retval = '/';
+
+   for (;;) {
+   struct dentry * parent;
+   if (IS_ROOT(dentry))
+   break;
+
+   parent = dentry->d_parent;
+   prefetch(parent);
+
+   if (prepend(&end, &buflen, dentry->d_name.name,
+   dentry->d_name.len) ||
+   prepend(&end, &buflen, "/", 1))
+   goto Elong;
+
+   retval = end;
+   dentry = parent;
+   }
+   spin_unlock(&dcache_lock);
+   return retval;
+Elong:
+   spin_unlock(&dcache_lock);
+   return ERR_PTR(-ENAMETOOLONG);
+}
+
 /*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
Index: linux-2.6.21.5/fs/namespace.c
===
--- linux-2.6.21.5.orig/fs/namespace.c
+++ linux-2.6.21.5/fs/namespace.c
@@ -386,8 +386,31 @@ static int show_vfsmnt(struct seq_file *
if (mnt->mnt_flags & fs_infop->flag)
seq_puts(m, fs_infop->str);
}
-   if (mnt->mnt_sb->s_op->show_options)
+   seq_putc(m, ' ');
+   if (mnt->mnt_sb->s_op->show_options) {
err = mnt->mnt_sb->s_op->show_options(m, mnt);
+   seq_putc(m, ' ');
+   }
+   if (IS_MNT_SHA
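
For illustration, a minimal user-space sketch of the backwards-prepend
technique that dentry_path() above relies on (the helper mirrors the
patch, but this is not kernel code):

#include <stdio.h>
#include <string.h>

/* same idea as the patch's prepend(): grow the string backwards
 * from the end of the buffer */
static int prepend(char **buffer, int *buflen, const char *str, int namelen)
{
	if ((*buflen -= namelen) < 0)
		return 1;		/* out of space */
	*buffer -= namelen;
	memcpy(*buffer, str, namelen);
	return 0;
}

int main(void)
{
	char buf[64];
	char *end = buf + sizeof(buf);
	int buflen = sizeof(buf);
	const char *names[] = { "c", "b", "a" };	/* leaf to root */
	char *path;
	int i;

	prepend(&end, &buflen, "\0", 1);	/* NUL terminator goes in first */
	path = end;
	for (i = 0; i < 3; i++) {
		prepend(&end, &buflen, names[i], (int)strlen(names[i]));
		prepend(&end, &buflen, "/", 1);
		path = end;
	}
	printf("%s\n", path);			/* prints "/a/b/c" */
	return 0;
}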

Re: [RFC PATCH 1/1] VFS: Augment /proc/mount with subroot and shared-subtree

2007-07-11 Thread Ram Pai
On Wed, 2007-07-11 at 11:24 +0100, Christoph Hellwig wrote:
> On Sat, Jun 30, 2007 at 08:56:02AM -0400, H. Peter Anvin wrote:
> > Is that conjecture, or do you have evidence to that effect?  Most users 
> > of this file are using it via the glibc interfaces, and there probably 
> > aren't all that many users of it in the first place.
> 
> I have written parsers for personal projects that might not have been
> happy to deal with additional fields myself for example..

I modified the patch to add the fields towards the end of each line,
i.e. after the 'freq' and 'passno' fields, and symlinked /etc/mtab
to /proc/mounts.  mount, df and friends were all perfectly happy.

I imagine your parsers may also be happy with the additional fields
**towards the end**. I would like to avoid one more mount interface if
we can help it.
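
To illustrate why trailing additions tend to be harmless, here is a
minimal sketch (mine, not part of the patch) of the classic six-field
parse, which never looks past the 'passno' field:

#include <stdio.h>

/* parse one /proc/mounts record the way a legacy tool would */
int main(void)
{
	char dev[256], dir[256], type[64], opts[256];
	int freq, passno;
	const char *line =
		"/dev/root /var ext2 rw 0 0 shared:21,slave:19 20 98:0 /mnt/1 16";

	/* sscanf stops after the six conversions; the appended
	 * propagation fields are silently ignored */
	if (sscanf(line, "%255s %255s %63s %255s %d %d",
		   dev, dir, type, opts, &freq, &passno) == 6)
		printf("%s on %s type %s (%s)\n", dev, dir, type, opts);
	return 0;
}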

RP





[RFC2 PATCH 1/1] VFS: Augment /proc/mount with subroot and shared-subtree

2007-07-16 Thread Ram Pai
/proc/mounts in its current state fails to disambiguate bind mounts, especially
when the bind mount is subrooted. It also does not capture the propagation
state of the mounts (shared subtrees). The following patch addresses the problem.

The following additional fields are added to /proc/mounts:

propagation-type -- in the form <flag>[:<mntid>][,<flag>[:<mntid>]...]
note: the 'shared' flag is followed by the mntid of its peer mount,
  the 'slave' flag is followed by the mntid of its master mount,
  the 'private' flag stands by itself,
  the 'unbindable' flag stands by itself

mntid -- a unique identifier of the mount
major:minor -- the major/minor number of the device hosting the filesystem
dir -- the subdirectory in the filesystem which forms the root of this mount
parent -- the id of the parent mount


Here is a sample 'cat /proc/mounts' after executing the following commands:

mount --bind /mnt /mnt
mount --make-shared /mnt
mount --bind /mnt/1 /var
mount --make-slave /var
mount --make-shared /var
mount --bind /var/abc /tmp
mount --make-unbindable /proc

rootfs / rootfs rw 0 0 private 2 0:1 / 2 
/dev/root / ext2 rw  0 0 private 16 98:0 / 2 
/proc /proc proc rw 0 0 unbindable 17 0:3 / 16 
devpts /dev/pts devpts rw 0 0 private 18 0:10 / 16 
/dev/root /mnt ext2 rw  0 0 shared:19 19 98:0 /mnt 16 
/dev/root /var ext2 rw  0 0 shared:21,slave:19 20 98:0 /mnt/1 16 
/dev/root /tmp ext2 rw  0 0 shared:20,slave:19 21 98:0 /mnt/1/abc 16 

For example, the last line indicates that:

1) The mount is a shared mount.
2) It is a peer of the mount with id 20.
3) It is also a slave of the master mount with id 19.
4) The filesystem on the device with major/minor number 98:0 and
   subdirectory mnt/1/abc forms the root directory of this mount.
5) And finally, the mount with id 16 is its parent.


Testing: symlinked /etc/mtab to /proc/mounts and did some mount and df 
commands. They worked normally.
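
Roughly how to repeat that sanity check (commands illustrative):

ln -sf /proc/mounts /etc/mtab
mount | tail -3		# mount(8) renders the extended records
df -h			# df walks /etc/mtab without complaint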



Signed-off-by: Ram Pai <[EMAIL PROTECTED]>

---
 fs/dcache.c  |   53 +++
 fs/namespace.c   |   35 +++-
 fs/pnode.c   |   22 +
 fs/pnode.h   |2 +
 fs/seq_file.c|   79 ++-
 include/linux/dcache.h   |2 +
 include/linux/mount.h|1 
 include/linux/seq_file.h |1 
 8 files changed, 172 insertions(+), 23 deletions(-)

Index: linux-2.6.21.5/fs/dcache.c
===
--- linux-2.6.21.5.orig/fs/dcache.c
+++ linux-2.6.21.5/fs/dcache.c
@@ -1835,6 +1835,59 @@ char * d_path(struct dentry *dentry, str
return res;
 }
 
+static inline int prepend(char **buffer, int *buflen, const char *str,
+   int namelen)
+{
+   if ((*buflen -= namelen) < 0)
+   return 1;
+   *buffer -= namelen;
+   memcpy(*buffer, str, namelen);
+   return 0;
+}
+
+/*
+ * Write the full pathname into the buffer and return the start of the
+ * pathname.  The path is expressed relative to the root dentry of the
+ * dentry's own filesystem.
+ */
+char * dentry_path(struct dentry *dentry, char *buf, int buflen)
+{
+   char * end = buf+buflen;
+   char * retval;
+
+   spin_lock(&dcache_lock);
+   prepend(&end, &buflen, "\0", 1);
+   if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
+   if (prepend(&end, &buflen, "//deleted", 10))
+   goto Elong;
+   }
+   /* Get '/' right */
+   retval = end-1;
+   *retval = '/';
+
+   for (;;) {
+   struct dentry * parent;
+   if (IS_ROOT(dentry))
+   break;
+
+   parent = dentry->d_parent;
+   prefetch(parent);
+
+   if (prepend(&end, &buflen, dentry->d_name.name,
+   dentry->d_name.len) ||
+   prepend(&end, &buflen, "/", 1))
+   goto Elong;
+
+   retval = end;
+   dentry = parent;
+   }
+   spin_unlock(&dcache_lock);
+   return retval;
+Elong:
+   spin_unlock(&dcache_lock);
+   return ERR_PTR(-ENAMETOOLONG);
+}
+
 /*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
Index: linux-2.6.21.5/fs/namespace.c
===
--- linux-2.6.21.5.orig/fs/namespace.c
+++ linux-2.6.21.5/fs/namespace.c
@@ -33,6 +33,8 @@
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
 static int event;
+static atomic_t mnt_counter;
+
 
 static struct list_head *mount_hashtable __read_mostly;
 static int hash_mask __read_mostly, hash_bits __read_mostly;
@@ -51,6 +53,7 @@ static inline unsigned long hash(struct 
return tmp & hash_mask;
 }
 
+
 struct v

Re: [patch 0/8] unprivileged mount syscall

2007-04-16 Thread Ram Pai
On Fri, 2007-04-13 at 13:58 +0200, Miklos Szeredi wrote:
> > On Wed, 2007-04-11 at 12:44 +0200, Miklos Szeredi wrote:
> > > > 1. clone the master namespace.
> > > > 
> > > > 2. in the new namespace
> > > > 
> > > > move the tree under /share/$me to /
> > > > for each ($user, $what, $how) {
> > > > move /share/$user/$what to /$what
> > > > if ($how == slave) {
> > > >  make the mount tree under /$what as slave
> > > > }
> > > > }
> > > > 
> > > > 3. in the new namespace make the tree under 
> > > >/share as private and unmount /share
> > > 
> > > Thanks.  I get the basic idea now: the namespace itself need not be
> > > shared between the sessions, it is enough if "share" propagation is
> > > set up between the different namespaces of a user.
> > > 
> > > I don't yet see either in your or Viro's description how the trees
> > > under /share/$USER are initialized.  I guess they are recursively
> > > bound from /, and are made slaves.
> > 
> > yes. I suppose, when a userid is created one of the steps would be
> > 
> > mount --rbind / /share/$USER
> > mount --make-rslave /share/$USER
> > mount --make-rshared /share/$USER
> 
> Thinking a bit more about this, I'm quite sure most users wouldn't
> even want private namespaces.  It would be enough to
> 
>   chroot /share/$USER
> 
> and be done with it.
> 
> Private namespaces are only good for keeping a bunch of mounts
> referenced by a group of processes.  But my guess is, that the natural
> behavior for users is to see a persistent set of mounts.
> 
> If for example they mount something on a remote machine, then log out
> from the ssh session and later log back in, they would want to see
> their previous mount still there.

They will continue to see their previous mount tree.
Even if all the namespaces belonging to the different sessions of the
user get dismantled when all the sessions exit, a mirror of those
mount trees continues to exist under /share/$USER in the original
namespace.  So I don't think we have an issue.

NOTE: when I say 'original namespace' I mean the admin namespace; the
first namespace that gets created when the machine boots.

RP


> 
> Miklos



Re: [patch 0/8] unprivileged mount syscall

2007-04-16 Thread Ram Pai
On Fri, 2007-04-13 at 16:05 +0200, Miklos Szeredi wrote:
> > > Thinking a bit more about this, I'm quite sure most users wouldn't
> > > even want private namespaces.  It would be enough to
> > > 
> > >   chroot /share/$USER
> > > 
> > > and be done with it.
> > > 
> > > Private namespaces are only good for keeping a bunch of mounts
> > > referenced by a group of processes.  But my guess is, that the natural
> > > behavior for users is to see a persistent set of mounts.
> > > 
> > > If for example they mount something on a remote machine, then log out
> > > from the ssh session and later log back in, they would want to see
> > > their previous mount still there.
> > > 
> > > Miklos
> > 
> > Agreed on desired behavior, but not on chroot sufficing.  It actually
> > sounds like you want exactly what was outlined in the OLS paper.
> > 
> > Users still need to be in a different mounts namespace from the admin
> > user so long as we consider the deluser and backup problems
> 
> I don't think it matters, because /share/$USER duplicates a part or
> the whole of the user's namespace.
> 
> So backup would have to be taught about /share anyway, and deluser
> operates on /home/$USER and not on /share/*, so there shouldn't be any
> problem.
> 
> There's actually very little difference between rbind+chroot, and
> CLONE_NEWNS.  In a private namespace:
> 
>   1) when no more processes reference the namespace, the tree will be
> disbanded
> 
>   2) the mount tree won't be accessible from outside the namespace
> 
> Wanting a persistent namespace contradicts 1).
> 
> Wanting a per-user (as opposed to per-session) namespace contradicts
> 2).  The namespace _has_ to be accessible from outside, so that a new
> session can access/copy it.

As I mentioned in the previous mail, disbanding all the namespaces of a
user will not disband the user's mount tree, because a mirror of the
mount tree continues to exist in /share/$USER in the admin namespace.

And a new user session can always use this copy to create a namespace
that looks identical to the one that existed earlier.


> 
> So both requirements point to the rbind/chroot solution.

Aren't there ways to escape chroot jails? Serge had pointed me to a URL
which showed that chroots can be escaped. And if that is true, then
having all users' private mount trees in the same namespace could be a
security issue?

RP

> 
> Miklos



Re: [Devel] Re: [patch 05/10] add "permit user mounts in new namespace" clone flag

2007-04-16 Thread Ram Pai

> 
> "Serge E. Hallyn" <[EMAIL PROTECTED]> writes:
> 
> > Quoting Miklos Szeredi ([EMAIL PROTECTED]):
> >> From: Miklos Szeredi <[EMAIL PROTECTED]>
> >> 
> >> If CLONE_NEWNS and CLONE_NEWNS_USERMNT are given to clone(2) or
> >> unshare(2), then allow user mounts within the new namespace.
> >> 
> >> This is not flexible enough, because user mounts can't be enabled
> for
> >> the initial namespace.
> >> 
> >> The remaining clone bits also getting dangerously few...
> >> 
> >> Alternatives are:
> >> 
> >>   - prctl() flag
> >>   - setting through the containers filesystem
> >
> > Sorry, I know I had mentioned it, but this is definately my least
> > favorite approach.
> >
> > Curious whether are any other suggestions/opinions from the
> containers
> > list?
> 
> Given the existence of shared subtrees allowing/denying this at the
> mount
> namespace level is silly and wrong.
> 
> If we need more than just the filesystem permission checks can we
> make it a mount flag settable with mount and remount that allows
> non-privileged users the ability to create mount points under it
> in directories they have full read/write access to.

Also, for bind-mount and remount operations the flag has to be propagated
down the propagation tree.  Otherwise an unprivileged mount in a shared
mount won't get reflected in its peers and slaves, leading to
non-identical shared subtrees.

RP


> 
> I don't like the use of clone flags for this purpose but in this
> case the shared subtress are a much more fundamental reasons for not
> doing this at the namespace level.
> 
> Eric
> ___
> Containers mailing list
> [EMAIL PROTECTED]
> https://lists.linux-foundation.org/mailman/listinfo/containers 



Re: [Devel] Re: [patch 05/10] add "permit user mounts in new namespace" clone flag

2007-04-16 Thread Ram Pai
On Mon, 2007-04-16 at 11:32 +0200, Miklos Szeredi wrote:
> > > Given the existence of shared subtrees allowing/denying this at the
> > > mount
> > > namespace level is silly and wrong.
> > > 
> > > If we need more than just the filesystem permission checks can we
> > > make it a mount flag settable with mount and remount that allows
> > > non-privileged users the ability to create mount points under it
> > > in directories they have full read/write access to.
> > 
> > Also for bind-mount and remount operations the flag has to be propagated
> > down its propagation tree.  Otherwise a unpriviledged mount in a shared
> > mount wont get reflected in its peers and slaves, leading to unidentical
> > shared-subtrees.
> 
> That's an interesting question.  Do we want shared mounts to be
> totally identical, including mnt_flags?  It doesn't look as if
> do_remount() guarantees that currently.

Depends on the semantics of each of the flags. Some flags, like the
read/write flag, would not interfere with the propagation semantics
AFAICT.  But this one certainly seems to interfere.

RP

> Miklos



Re: [Devel] Re: [patch 05/10] add "permit user mounts in new namespace" clone flag

2007-04-16 Thread Ram Pai
On Mon, 2007-04-16 at 11:56 +0200, Miklos Szeredi wrote:
> > > > Also for bind-mount and remount operations the flag has to be propagated
> > > > down its propagation tree.  Otherwise a unpriviledged mount in a shared
> > > > mount wont get reflected in its peers and slaves, leading to unidentical
> > > > shared-subtrees.
> > > 
> > > That's an interesting question.  Do we want shared mounts to be
> > > totally identical, including mnt_flags?  It doesn't look as if
> > > do_remount() guarantees that currently.
> > 
> > Depends on the semantics of each of the flags. Some flags like of the
> > read/write flag, would not interfere with the propagation semantics
> > AFAICT.  But this one certainly seems to interfere.
> 
> That depends.  Current patches check the "unprivileged submounts
> allowed under this mount" flag only on the requested mount and not on
> the propagated mounts.  Do you see a problem with this?

I don't see a problem if the flag is propagated to all peers and slave
mounts.

If not, I see a problem. What if the propagated mount has its flag set
to disallow unprivileged mounts, whereas the requested mount has them
allowed?

RP



> 
> Miklos



Re: [Devel] Re: [patch 05/10] add "permit user mounts in new namespace" clone flag

2007-04-17 Thread Ram Pai
On Tue, 2007-04-17 at 19:44 +0200, Miklos Szeredi wrote:
> > I'm a bit lost about what is currently done and who advocates for what.
> > 
> > It seems to me the MNT_ALLOWUSERMNT (or whatever :) flag should be
> > propagated.  In the /share rbind+chroot example, I assume the admin
> > would start by doing
> > 
> > mount --bind /share /share
> > mount --make-slave /share
> > mount --bind -o allow_user_mounts /share (or whatever)
> > mount --make-shared /share
> > 
> > then on login, pam does
> > 
> > chroot /share/$USER
> > 
> > or some sort of
> > 
> > mount --bind /share /home/$USER/root
> > chroot /home/$USER/root
> > 
> > or whatever.  In any case, the user cannot make user mounts except under
> > /share, and any cloned namespaces will still allow user mounts.
> 
> I don't quite understand your method.  This is how I think of it:
> 
> mount --make-rshared /
> mkdir -p /mnt/ns/$USER
> mount --rbind / /mnt/ns/$USER
> mount --make-rslave /mnt/ns/$USER
> mount --set-flags --recursive -oallowusermnt /mnt/ns/$USER
> chroot /mnt/ns/$USER
> su - $USER
> 
> I did actually try something equivalent (without the fancy mount
> commands though), and it worked fine.  The only "problem" is the
> proliferation of mounts in /proc/mounts.  There was a recently posted
> patch in AppArmor, that at least hides unreachable mounts from
> /proc/mounts, so the user wouldn't see all those.  But it could still
> be pretty confusing to the sysadmin.

Unbindable mounts were designed to overcome the proliferation problem.

Your steps should be something like this:

mount --make-rshared /
mkdir -p /mnt/ns
mount --bind /mnt/ns /mnt/ns
mount --make-unbindable /mnt/ns
mkdir -p /mnt/ns/$USER
mount --rbind / /mnt/ns/$USER
mount --make-rslave /mnt/ns/$USER
mount --set-flags --recursive -oallowusermnt /mnt/ns/$USER
chroot /mnt/ns/$USER
su - $USER

Try this and your proliferation problem will disappear. :-)

> 
> So in that sense doing it the complicated way, by first cloning the
> namespace, and then copying and sharing mounts individually which need
> to be shared could relieve this somewhat.

The unbindable mount will just provide you permanent relief.
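
A sketch of what the unbindable mark buys (illustrative, assuming two
users A and B):

mount --rbind / /mnt/ns/A	# without the mark, this also copies /mnt/ns
mount --rbind / /mnt/ns/B	# ...including the mirror under /mnt/ns/A,
				# so the mount count snowballs per user

mount --bind /mnt/ns /mnt/ns
mount --make-unbindable /mnt/ns
mount --rbind / /mnt/ns/A	# now the rbind prunes /mnt/ns entirely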

> 
> Another point: user mounts under /proc and /sys shouldn't be allowed.
> There are files there (at least in /proc) that are seemingly writable
> by the user, but they are still not writable in the sense, that
> "normal" files are.
> 
> Anyway, there are lots of userspace policy issues, but those don't
> impact the kernel part.
> 
> As for the original question of propagating the "allowusermnt" flag, I
> think it doesn't matter, as long as it's consistent and documented.
> 
> Propagating some mount flags and not propagating others is
> inconsistent and confusing, so I wouldn't want that.  Currently
> remount doesn't propagate mount flags, that may be a bug, 

For consistency reasons, one could propagate all the flags. But
propagating only those flags that interfere with shared-subtree
semantics should suffice.

Wait... Dave's read-only bind mounts in fact need the ability to
selectively make some mounts read-only. In such cases propagating
the read-only flag would just step on Dave's feature. Won't it?

RP



> 
> Miklos



Re: [Devel] Re: [patch 05/10] add "permit user mounts in new namespace" clone flag

2007-04-17 Thread Ram Pai
On Tue, 2007-04-17 at 21:43 +0200, Miklos Szeredi wrote:
> > > > I'm a bit lost about what is currently done and who advocates for what.
> > > > 
> > > > It seems to me the MNT_ALLOWUSERMNT (or whatever :) flag should be
> > > > propagated.  In the /share rbind+chroot example, I assume the admin
> > > > would start by doing
> > > > 
> > > > mount --bind /share /share
> > > > mount --make-slave /share
> > > > mount --bind -o allow_user_mounts /share (or whatever)
> > > > mount --make-shared /share
> > > > 
> > > > then on login, pam does
> > > > 
> > > > chroot /share/$USER
> > > > 
> > > > or some sort of
> > > > 
> > > > mount --bind /share /home/$USER/root
> > > > chroot /home/$USER/root
> > > > 
> > > > or whatever.  In any case, the user cannot make user mounts except under
> > > > /share, and any cloned namespaces will still allow user mounts.
> > > 
> > > I don't quite understand your method.  This is how I think of it:
> > > 
> > > mount --make-rshared /
> > > mkdir -p /mnt/ns/$USER
> > > mount --rbind / /mnt/ns/$USER
> > > mount --make-rslave /mnt/ns/$USER
> > > mount --set-flags --recursive -oallowusermnt /mnt/ns/$USER
> > > chroot /mnt/ns/$USER
> > > su - $USER
> > > 
> > > I did actually try something equivalent (without the fancy mount
> > > commands though), and it worked fine.  The only "problem" is the
> > > proliferation of mounts in /proc/mounts.  There was a recently posted
> > > patch in AppArmor, that at least hides unreachable mounts from
> > > /proc/mounts, so the user wouldn't see all those.  But it could still
> > > be pretty confusing to the sysadmin.
> > 
> > unbindable mounts were designed to overcome the proliferation problem.
> > 
> > Your steps should be something like this:
> > 
> > mount --make-rshared /
> > mkdir -p /mnt/ns
> > mount --bind /mnt/ns /mnt/ns
> > mount --make-unbindable /mnt/ns
> > mkdir -p /mnt/ns/$USER
> > mount --rbind / /mnt/ns/$USER
> > mount --make-rslave /mnt/ns/$USER
> > mount --set-flags --recursive -oallowusermnt /mnt/ns/$USER
> > chroot /mnt/ns/$USER
> > su - $USER
> > 
> > try this and your proliferation problem will disappear. :-)
> 
> Right, this is needed.
> 
> My problem wasn't actually this (which would only have hit, if I tried
> with more than one user), just that the number of mounts in
> /proc/mounts grows linearly with the number of users.
> 
> That can't be helped in such an easy way unfortunately.
> 
> > > Propagating some mount flags and not propagating others is
> > > inconsistent and confusing, so I wouldn't want that.  Currently
> > > remount doesn't propagate mount flags, that may be a bug, 
> > 
> > For consistency reason, one can propagate all the flags. But
> > propagating only those flags that interfere with shared-subtree
> > semantics should suffice.
> 
> I still don't believe not propagating "allowusermnt" interferes with
> mount propagation.  In my posted patches the mount (including
> propagations) is allowed based on the "allowusermnt" flag on the
> parent of the requested mount.  The flag is _not_ checked during
> propagation.
> 
> Allowing this and other flags to NOT be propagated just makes it
> possible to have a set of shared mounts with asymmetric properties,
> which may actually be desirable.

The shared mount feature was designed to ensure that the mount remains
identical at all locations. Designing features that make it
non-identical while still calling it shared breaks the original
purpose.  Slave mounts were designed to provide the asymmetry.

Whatever feature is desired to be exploited -- can it be exploited with
the current set of semantics that we have? Is there a real need to make
the mounts asymmetric but at the same time call them shared? Maybe I
don't understand what the desired application is?

RP

> 
> Miklos



Re: [Devel] Re: [patch 05/10] add "permit user mounts in new namespace" clone flag

2007-04-18 Thread Ram Pai
On Wed, 2007-04-18 at 11:19 +0200, Miklos Szeredi wrote:
> > > Allowing this and other flags to NOT be propagated just makes it
> > > possible to have a set of shared mounts with asymmetric properties,
> > > which may actually be desirable.
> > 
> > The shared mount feature was designed to ensure that the mount remained
> > identical at all the locations.
> 
> OK, so remount not propagating mount flags is a bug then?

As I said earlier, are there any flags currently that, if not propagated,
can lead to conflicts with the shared-subtree semantics? I am not aware
of any.  If you did notice a case, then according to me it's a bug.

But the newly proposed 'allow unprivileged mounts' flag, if not
propagated among the peers (and slaves) of a shared mount, can lead to
conflicts with shared-subtree semantics: a mount in one shared mount,
when propagated to a peer, fails to mount there, leading to
non-identical peers.



> 
> > Now designing features to make it un-identical but still naming it
> > shared, will break its original purpose.  Slave mounts were designed
> > to make it asymmetric.
> 
> What if I want to modify flags in a master mount, but not the slave
> mount?  Would I be screwed?  For example: mount is read-only in both
> master and slave.  I want to mark it read-write in master but not in
> slave.  What do I do?

Making mounts read-only or read-write -- will that affect mount
propagation in such a way that future mounts in any one of the
peers will not be able to propagate to its peers or slaves?

I don't think it will. Hence it's OK to selectively mark some mounts
read-only and some mounts read-write.

However, with the introduction of unprivileged mount semantics, there
can be cases where a user has privileges to mount at one location but
not at a different location. If these two locations happen to share
a peer relationship, then I see a case of the read-write flag semantics
interfering with the shared-subtree semantics. Hence we would end up
propagating the read-write flag too, or have to craft different
semantics that stay consistent.



> 
> > Whatever feature that is desired to be exploited; can that be exploited
> > with the current set of semantics that we have? Is there a real need to
> > make the mounts asymmetric but at the same time name them as shared?
> > Maybe I dont understand what the desired application is? 
> 
> I do think this question of propagating mount flags is totally
> independent of user mounts.
> 
> As it stands, currently remount doesn't propagate mount flags, and I
> don't see any compelling reasons why it should.
> 
> The patchset introduces a new mount flag "allowusermnt", but I don't
> see any compelling reason to propagate this flag _either_.
> 
> Please say so if you do have such a reason.  As I've explained, having
> this flag set differently in parts of a propagation tree does not
> interfere with or break propagation in any way.

As I said earlier, I see a case where two mounts that are peers of each
other can become non-identical if we don't propagate "allowusermnt".

As a practical example:

/tmp and /mnt are peers of each other.
/tmp has its "allowusermnt" flag set, which has not been propagated
to /mnt.

Now a normal user mounts an ext2 filesystem under /tmp at /tmp/1.

Unfortunately, the mount won't appear under /mnt/1,

and this breaks the shared-subtree semantics, which promise: whatever is
mounted under /tmp will also be visible under /mnt.

And if you do allow the mount to appear under /mnt/1, you break the
unprivileged-mount semantics, which promise: a normal user will not be
able to mount at a location that does not allow user mounts.
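
For anyone reproducing the scenario, a rough setup (commands
illustrative):

mount --bind /tmp /tmp		# make /tmp a mount of its own
mount --make-shared /tmp
mount --bind /tmp /mnt		# /tmp and /mnt are now peers: mounts made
				# under one propagate to the other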



RP


> 
> Miklos
> 



Re: [Devel] Re: [patch 05/10] add "permit user mounts in new namespace" clone flag

2007-04-18 Thread Ram Pai
On Wed, 2007-04-18 at 21:14 +0200, Miklos Szeredi wrote:
> > As I said earlier, I see a case where two mounts that are peers of each
> > other can become un-identical if we dont propagate the "allowusermnt".
> > 
> > As a practical example.
> > 
> > /tmp and /mnt are peers of each other.
> > /tmp has its "allowusermnt" flag set, which has not been propagated
> > to /mnt.
> > 
> > now a normal-user mounts an ext2 file system under /tmp at /tmp/1
> > 
> > unfortunately the mount wont appear under /mnt/1 
> 
> Argh, that is not true.  That's what I've been trying to explain to
> you all along.

I now realize you did, but I failed to catch it. Sorry. :-(

> 
> The propagation will be done _regardless_ of the flag.  The flag is
> only checked for the parent of the _requested_ mount.  If it is
> allowed there, the mount, including any propagations are allowed.  If
> it's denied, then obviously it's denied everywhere.
> 
> > and in case if you allow the mount to appear under /mnt/1, you will
> > break unpriviledge mounts semantics which promises: a normal user will
> > not be able to mount at a location that does not allow user-mounts.
> 
> No, it does not promise that.  The flag just promises, that the user
> cannot _request_ a mount on the parent mount.

OK. If the ability of a normal user to mount something *indirectly*
under a mount that has its 'allowusermnt' flag unset is acceptable
under the definition of 'allowusermnt', I guess my only choice is to
accept it. :-)

RP

> 
> Miklos



Re: [patch 0/8] unprivileged mount syscall

2007-04-09 Thread Ram Pai
On Mon, 2007-04-09 at 12:07 -0500, Serge E. Hallyn wrote:
> Quoting Miklos Szeredi ([EMAIL PROTECTED]):

> >  - need to set up mount propagation from global namespace to private
> >ones, mount(8) does not yet have options to configure propagation
> 
> Hmm, I guess I get lost using my own little systems, and just assumed
> that shared subtree functionality was making its way up into mount(8).
> Ram, have you been working on that?

It is in FC6. I don't know the status of upstream util-linux. I did
submit the patch several times to Adrian Bunk (the then util-linux
maintainer) and got no response. I have not pushed the patches to the
new maintainer (Karel Zak?) though.

RP



Re: [patch 0/8] unprivileged mount syscall

2007-04-10 Thread Ram Pai
On Mon, 2007-04-09 at 22:10 +0200, Miklos Szeredi wrote:
> > > The one in pam-0.99.6.3-29.1 in opensuse-10.2 is totally broken.  Are
> > > you interested in the details?  I can reproduce it, but forgot to note
> > > down the details of the brokenness.
> > 
> > I don't know how far removed that is from the one being used by redhat,
> > but assuming it's the same, then redhat-lspp@redhat.com will be
> > very interested.
> 
> OK.
> 
> > >  - user namespace setup: what if user has multiple sessions?
> > > 
> > >1) namespaces are shared?  That's tricky because the session needs to
> > >be a child of a namespace server, not of login.  I'm not sure PAM
> > >can handle this
> > > 
> > >2) or mounts are copied on login?  That's not possible currently,
> > >as there's no way to send a mount between namespaces.  Also it's
> > >tricky to make sure that new mounts are also shared
> > 
> > See toward the end of the 'shared subtrees' OLS paper from last year for
> > a suggestion on how to let users effectively 'log in to' an existing
> > private mounts ns.
> 
> This?
> 
>   1. create a new namespace
>   2. bind /share/$USER to /share
>   3. for each pair ($who, $what) such that
>  /share/$USER/$who/$what exists, look
>  in /share/$who/allowed for "peer $what
>  $USER" or "slave $what $USER". If the
>  former is found, rbind /share/$who/$what
>  on /share/$USER/$who/$what; if the
>  latter is found, do the same and
>  follow with marking subtree under
>  /share/$USER/$who/$what as slave.
>   4. rbind /share/$USER to /share
>   5. mark subtree under /share as private.
>   6. umount -l /share
> 
> Well, someone please explain using short words, because I don't
> understand at all.

I am trying to reconstruct Viro's thoughts.  I think the steps outlined
above, though not accurate, are still insightful.

The idea is: there is one master namespace which has, under /share, a
replica of the mount tree of the namespaces belonging to all users.

For example, if there are two users A and B, then in the master
namespace under /share you will find /share/A and /share/B, each
reflecting the mount tree for the namespaces belonging to user A and
user B respectively.

Note: /share is a shared mount tree, which means it can propagate mount
events.

Every time a user logs on to the machine, a new namespace is created
which is a clone of the master namespace. In this new namespace,
/share/$USER is made the root of the namespace. Also, if other users
have made parts of their namespaces available to this user, those
mounts are brought into this namespace as well. And finally the entire
tree under /share is unmounted.

Note that though multiple namespaces can exist simultaneously for the
same user, the user is given the illusion of a single persistent
namespace, since all the namespaces look identical.

I am trying to rewrite the steps outlined above; they may or may not
reflect Viro's thoughts, but they certainly reflect my reconstruction
of them.

1. clone the master namespace.

2. in the new namespace

move the tree under /share/$me to /
for each ($user, $what, $how) {
move /share/$user/$what to /$what
if ($how == slave) {
 make the mount tree under /$what as slave
}
}

3. in the new namespace, make the tree under
   /share private and unmount /share
RP


> 
> Thanks,
> Miklos



Re: [patch 0/8] unprivileged mount syscall

2007-04-11 Thread Ram Pai
On Wed, 2007-04-11 at 12:44 +0200, Miklos Szeredi wrote:
> > 1. clone the master namespace.
> > 
> > 2. in the new namespace
> > 
> > move the tree under /share/$me to /
> > for each ($user, $what, $how) {
> > move /share/$user/$what to /$what
> > if ($how == slave) {
> >  make the mount tree under /$what as slave
> > }
> > }
> > 
> > 3. in the new namespace make the tree under 
> >/share as private and unmount /share
> 
> Thanks.  I get the basic idea now: the namespace itself need not be
> shared between the sessions, it is enough if "share" propagation is
> set up between the different namespaces of a user.
> 
> I don't yet see either in your or Viro's description how the trees
> under /share/$USER are initialized.  I guess they are recursively
> bound from /, and are made slaves.

Yes. I suppose when a userid is created, one of the steps would be:

mount --rbind / /share/$USER
mount --make-rslave /share/$USER
mount --make-rshared /share/$USER
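
A rough annotation of what each step buys (my gloss, not from the
original discussion):

mount --rbind / /share/$USER		# mirror the admin mount tree
mount --make-rslave /share/$USER	# events in / flow into the mirror,
					# but nothing leaks back out
mount --make-rshared /share/$USER	# namespaces cloned from the mirror
					# stay in sync with it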

RP

> Miklos



Re: [RFC v6 21/62] powerpc: introduce execute-only pkey

2017-08-17 Thread Ram Pai
On Thu, Aug 17, 2017 at 04:35:55PM -0700, Ram Pai wrote:
> On Wed, Aug 02, 2017 at 07:40:46PM +1000, Michael Ellerman wrote:
> > Thiago Jung Bauermann  writes:
> > 
> > > Michael Ellerman  writes:
> > >
> > >> Thiago Jung Bauermann  writes:
> > >>> Ram Pai  writes:
> > >> ...
> > >>>> +
> > >>>> +  /* We got one, store it and use it from here on out */
> > >>>> +  if (need_to_set_mm_pkey)
> > >>>> +  mm->context.execute_only_pkey = execute_only_pkey;
> > >>>> +  return execute_only_pkey;
> > >>>> +}
> > >>>
> > >>> If you follow the code flow in __execute_only_pkey, the AMR and UAMOR
> > >>> are read 3 times in total, and AMR is written twice. IAMR is read and
> > >>> written twice. Since they are SPRs and access to them is slow (or isn't
> > >>> it?),
> > >>
> > >> SPRs read/writes are slow, but they're not *that* slow in comparison to
> > >> a system call (which I think is where this code is being called?).
> > >
> > > Yes, this code runs on mprotect and mmap syscalls if the memory is
> > > requested to have execute but not read nor write permissions.
> > 
> > Yep. That's not in the fast path for key usage, ie. the fast path is
> > userspace changing the AMR itself, and the overhead of a syscall is
> > already hundreds of cycles.
> > 
> > >> So we should try to avoid too many SPR read/writes, but at the same time
> > >> we can accept more than the minimum if it makes the code much easier to
> > >> follow.
> > >
> > > Ok. Ram had asked me to suggest a way to optimize the SPR reads and
> > > writes and I came up with the patch below. Do you think it's worth it?
> > 
> > At a glance no I don't think it is. Sorry you spent that much time on it.
> > 
> > I think we can probably reduce the number of SPR accesses without
> > needing to go to that level of complexity.
> > 
> > But don't throw the patch away, I may eat my words once I have the full
> > series applied and am looking at it hard - at the moment I'm just
> > reviewing the patches piecemeal as I get time.
> 

Thiago's patch does save some cycles; I don't feel like throwing his
work away. I agree it should be considered after all the patches are
applied.
RP

-- 
Ram Pai



Re: [RFC v2 03/12] powerpc: Implement sys_pkey_alloc and sys_pkey_free system call.

2017-06-20 Thread Ram Pai
On Mon, Jun 19, 2017 at 10:18:01PM +1000, Michael Ellerman wrote:
> Hi Ram,
> 
> Ram Pai  writes:
> > Sys_pkey_alloc() allocates and returns available pkey
> > Sys_pkey_free()  frees up the pkey.
> >
> > Total 32 keys are supported on powerpc. However pkey 0,1 and 31
> > are reserved. So effectively we have 29 pkeys.
> >
> > Signed-off-by: Ram Pai 
> > ---
> >  include/linux/mm.h   |  31 ---
> >  include/uapi/asm-generic/mman-common.h   |   2 +-
> 
> Those changes need to be split out and acked by mm folks.
> 
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 7cb17c6..34ddac7 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -204,26 +204,35 @@ extern int overcommit_kbytes_handler(struct ctl_table 
> > *, int, void __user *,
> >  #define VM_MERGEABLE   0x8000  /* KSM may merge identical 
> > pages */
> >  
> >  #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
> > -#define VM_HIGH_ARCH_BIT_0 32  /* bit only usable on 64-bit 
> > architectures */
> > -#define VM_HIGH_ARCH_BIT_1 33  /* bit only usable on 64-bit 
> > architectures */
> > -#define VM_HIGH_ARCH_BIT_2 34  /* bit only usable on 64-bit 
> > architectures */
> > -#define VM_HIGH_ARCH_BIT_3 35  /* bit only usable on 64-bit 
> > architectures */
> > +#define VM_HIGH_ARCH_BIT_0 32  /* bit only usable on 64-bit arch */
> > +#define VM_HIGH_ARCH_BIT_1 33  /* bit only usable on 64-bit arch */
> > +#define VM_HIGH_ARCH_BIT_2 34  /* bit only usable on 64-bit arch */
> > +#define VM_HIGH_ARCH_BIT_3 35  /* bit only usable on 64-bit arch */
> 
> Please don't change the comments, it makes the diff harder to read.

The lines were exceeding 80 columns. I tried to compress the comments
without losing meaning. Will restore.

> 
> You're actually just adding this AFAICS:
> 
> > +#define VM_HIGH_ARCH_BIT_4 36  /* bit only usable on 64-bit arch */
> 
> >  #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
> >  #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
> >  #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
> >  #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
> > +#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
> >  #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
> >  
> >  #if defined(CONFIG_X86)
>^
> >  # define VM_PATVM_ARCH_1   /* PAT reserves whole VMA at 
> > once (x86) */
> > -#if defined (CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS)
> > -# define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
> > -# define VM_PKEY_BIT0  VM_HIGH_ARCH_0  /* A protection key is a 4-bit 
> > value */
> > -# define VM_PKEY_BIT1  VM_HIGH_ARCH_1
> > -# define VM_PKEY_BIT2  VM_HIGH_ARCH_2
> > -# define VM_PKEY_BIT3  VM_HIGH_ARCH_3
> > -#endif
> > +#if defined(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) \
> > +   || defined(CONFIG_PPC64_MEMORY_PROTECTION_KEYS)
> > +#define VM_PKEY_SHIFT  VM_HIGH_ARCH_BIT_0
> > +#define VM_PKEY_BIT0   VM_HIGH_ARCH_0  /* A protection key is a 5-bit 
> > value */
>  ^ 4?
> > +#define VM_PKEY_BIT1   VM_HIGH_ARCH_1
> > +#define VM_PKEY_BIT2   VM_HIGH_ARCH_2
> > +#define VM_PKEY_BIT3   VM_HIGH_ARCH_3
> > +#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> 
> That appears to be inside an #if defined(CONFIG_X86) ?
> 
> >  #elif defined(CONFIG_PPC)
>  ^
> Should be CONFIG_PPC64_MEMORY_PROTECTION_KEYS no?

It's a little garbled. Will fix it.
> 
> > +#define VM_PKEY_BIT0   VM_HIGH_ARCH_0  /* A protection key is a 5-bit 
> > value */
> > +#define VM_PKEY_BIT1   VM_HIGH_ARCH_1
> > +#define VM_PKEY_BIT2   VM_HIGH_ARCH_2
> > +#define VM_PKEY_BIT3   VM_HIGH_ARCH_3
> > +#define VM_PKEY_BIT4   VM_HIGH_ARCH_4  /* intel does not use this bit 
> > */
> > +   /* but reserved for future expansion */
> 
> But this hunk is for PPC ?
> 
> Is it OK for the other arches & generic code to add another VM_PKEY_BIT4 ?

No, it has to be PPC-specific.

> 
> Do you need to update show_smap_vma_flags() ?
> 
> >  # define VM_SAOVM_ARCH_1   /* Strong Access Ordering 
> > (powerpc) */
> >  #elif defined(CONFIG_PARISC)
> >  # define VM_GROWSUPVM_ARCH_1
> 
> > diff --git a/include/uapi/asm-generic/mman-common.h 
> > b/include/uapi/asm-generic/mman-common.h
> > index 8c27db0..b13ecc6 100644
> > --- a/include/uapi/

Re: [RFC v2 01/12] powerpc: Free up four 64K PTE bits in 4K backed hpte pages.

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 03:50:25PM +0530, Anshuman Khandual wrote:
> On 06/17/2017 09:22 AM, Ram Pai wrote:
> > Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
> > in the 4K backed hpte pages. These bits continue to be used
> > for 64K backed hpte pages in this patch, but will be freed
> > up in the next patch.
> 
> The counting 3, 4, 5 and 6 are in BE format I believe, I was
> initially trying to see that from right to left as we normally
> do in the kernel and was getting confused. So basically these
> bits (which are only applicable for 64K mapping IIUC) are going
> to be freed up from the PTE format.
> 
> #define _RPAGE_RSV1   0x1000UL
> #define _RPAGE_RSV2   0x0800UL
> #define _RPAGE_RSV3   0x0400UL
> #define _RPAGE_RSV4   0x0200UL
> 
> As you have mentioned before this feature is available for 64K
> page size only and not for 4K mappings. So I assume we support
> both the combinations.
> 
> * 64K mapping on 64K
> * 64K mapping on 4K

yes.

> 
> These are the current users of the above bits
> 
> #define H_PAGE_BUSY   _RPAGE_RSV1 /* software: PTE & hash are busy */
> #define H_PAGE_F_SECOND   _RPAGE_RSV2 /* HPTE is in 2ndary 
> HPTEG */
> #define H_PAGE_F_GIX  (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
> #define H_PAGE_HASHPTE_RPAGE_RPN43/* PTE has associated 
> HPTE */
> 
> > 
> > The patch does the following change to the 64K PTE format
> > 
> > H_PAGE_BUSY moves from bit 3 to bit 9
> 
> and what is in there on bit 9 now ? This ?
> 
> #define _RPAGE_SW20x00400
> 
> which is used as 
> 
> #define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */
> 
> which will not be required any more ?

I think you are reading bit 9 from right to left. The bit 9 I refer to
is counted from left to right, using the same numbering convention that
ISA 3.0 uses. I know it is confusing; I will add a note in the comment
of this patch to read the bits the big-endian way.

BTW: bit 9 is not used currently, so this patch uses it. But this is a
temporary move; H_PAGE_BUSY will move to bit 7 in the next patch.

I had to keep it at bit 9 because bit 7 is not yet entirely freed up;
it is still used by 64K PTEs backed by 64K hptes.
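
A tiny illustration of that convention, for anyone counting bits the
other way (the helper is mine, purely illustrative):

#include <stdio.h>

/* IBM/ISA bit numbering counts bit 0 as the most-significant bit of the
 * doubleword, so "bit n" of a 64-bit PTE maps to mask 1UL << (63 - n) */
#define PTE_BIT_BE(n)	(1UL << (63 - (n)))

int main(void)
{
	printf("bit 9 (BE) = 0x%016lx\n", PTE_BIT_BE(9));
	/* prints 0x0040000000000000 */
	return 0;
}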

> 
> > H_PAGE_F_SECOND which occupied bit 4 moves to the second part
> > of the pte.
> > H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
> > second part of the pte.
> > 
> > the four  bits((H_PAGE_F_SECOND|H_PAGE_F_GIX) that represent a slot
> > is  initialized  to  0xF  indicating  an invalid  slot.  If  a hpte
> > gets cached in a 0xF  slot(i.e  7th  slot  of  secondary),  it   is
> > released immediately. In  other  words, even  though   0xF   is   a
> 
> Release immediately means we attempt again for a new hash slot ?

yes.

> 
> > valid slot we discard  and consider it as an invalid
> > slot;i.e hpte_soft_invalid(). This  gives  us  an opportunity to not
> > depend on a bit in the primary PTE in order to determine the
> > validity of a slot.
> 
> So we have to see the slot number in the second half for each PTE to
> figure out if it has got a valid slot in the hash page table.

yes.

> 
> > 
> > When  we  release  ahpte   in the 0xF   slot we also   release a
> > legitimate primary   slot  andunmapthat  entry. This  is  to
> > ensure  that we do get a   legimate   non-0xF  slot the next time we
> > retry for a slot.
> 
> Okay.
> 
> > 
> > Though treating 0xF slot as invalid reduces the number of available
> > slots  and  may  have an effect  on the performance, the probabilty
> > of hitting a 0xF is extermely low.
> 
> Why you say that ? I thought every slot number has the same probability
> of hit from the hash function.

Every hash bucket has the same probability. But the slots within a
hash bucket are filled sequentially, so it takes 15 hptes hashing to
the same bucket before we get to the 0xF slot (the 7th slot of the
secondary).
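
A quick sketch of the slot arithmetic as I read it (my rendering, not
the patch's code):

/* one "secondary" bit above a 3-bit group index gives 16 encodings */
#define SLOT(secondary, gix)	(((secondary) << 3) | (gix))
/* SLOT(1, 7) == 0xF is the single encoding sacrificed as "invalid" */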

> 
> > 
> > Compared  to the current scheme, the above described scheme reduces
> > the number of false hash table updates  significantly  and  has the
> 
> How it reduces false hash table updates ?

Earlier, we had one bit allocated in the first part of the 64K PTE
for four consecutive 4K hptes. If any one 4K hpte got hashed in,
the bit got set. Which means that on a fault on any of the remaining
three 4K hptes, we saw the bit already set and erroneously tried to
update that hpte. So we had a 75% update error rate. Functionally
not bad, but bad from a performance point of view.

With the current scheme, we decide if a 4K slot is valid

Re: [RFC v2 02/12] powerpc: Free up four 64K PTE bits in 64K backed hpte pages.

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 04:21:45PM +0530, Anshuman Khandual wrote:
> On 06/17/2017 09:22 AM, Ram Pai wrote:
> > Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
> > in the 64K backed hpte pages. This along with the earlier
> > patch will entirely free up the four bits from 64K PTE.
> > 
> > This patch does the following change to 64K PTE that is
> > backed by 64K hpte.
> > 
> > H_PAGE_F_SECOND which occupied bit 4 moves to the second part
> > of the pte.
> > H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
> > second part of the pte.
> > 
> > since bit 7 is now freed up, we move H_PAGE_BUSY from bit 9
> > to bit 7. Trying to minimize gaps so that contiguous bits
> > can be allocated if needed in the future.
> > 
> > The second part of the PTE will hold
> > (H_PAGE_F_SECOND|H_PAGE_F_GIX) at bit 60,61,62,63.
> 
> I still dont understand how we freed up the 5th bit which is
> used in the 5th patch. Was that bit never used for any thing
> on 64K page size (64K and 4K mappings) ?

Yes, it was not used. So I gladly used it. :-)


RP



Re: [RFC v2 06/12] powerpc: Program HPTE key protection bits.

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 01:51:45PM +0530, Anshuman Khandual wrote:
> On 06/17/2017 09:22 AM, Ram Pai wrote:
> > Map the PTE protection key bits to the HPTE key protection bits,
> > while creatiing HPTE  entries.
> > 
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/include/asm/book3s/64/mmu-hash.h | 5 +
> >  arch/powerpc/include/asm/pkeys.h  | 7 +++
> >  arch/powerpc/mm/hash_utils_64.c   | 5 +
> >  3 files changed, 17 insertions(+)
> > 
> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
> > b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> > index cfb8169..3d7872c 100644
> > --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> > +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
> > @@ -90,6 +90,8 @@
> >  #define HPTE_R_PP0 ASM_CONST(0x8000)
> >  #define HPTE_R_TS  ASM_CONST(0x4000)
> >  #define HPTE_R_KEY_HI  ASM_CONST(0x3000)
> > +#define HPTE_R_KEY_BIT0ASM_CONST(0x2000)
> > +#define HPTE_R_KEY_BIT1ASM_CONST(0x1000)
> >  #define HPTE_R_RPN_SHIFT   12
> >  #define HPTE_R_RPN ASM_CONST(0x0000)
> >  #define HPTE_R_RPN_3_0 ASM_CONST(0x01fff000)
> > @@ -104,6 +106,9 @@
> >  #define HPTE_R_C   ASM_CONST(0x0080)
> >  #define HPTE_R_R   ASM_CONST(0x0100)
> >  #define HPTE_R_KEY_LO  ASM_CONST(0x0e00)
> > +#define HPTE_R_KEY_BIT2ASM_CONST(0x0800)
> > +#define HPTE_R_KEY_BIT3ASM_CONST(0x0400)
> > +#define HPTE_R_KEY_BIT4ASM_CONST(0x0200)
> > 
> 
> Should we indicate/document how these 5 bits are not contiguous
> in the HPTE format for any given real page ?

I can, but it's all well documented in the ISA. In fact, all the bits
and the macros are a one-to-one translation from the ISA.

> 
> >  #define HPTE_V_1TB_SEG ASM_CONST(0x4000)
> >  #define HPTE_V_VRMA_MASK   ASM_CONST(0x4001ff00)
> > diff --git a/arch/powerpc/include/asm/pkeys.h 
> > b/arch/powerpc/include/asm/pkeys.h
> > index 0f3dca8..9b6820d 100644
> > --- a/arch/powerpc/include/asm/pkeys.h
> > +++ b/arch/powerpc/include/asm/pkeys.h
> > @@ -27,6 +27,13 @@
> > ((vm_flags & VM_PKEY_BIT3) ? H_PAGE_PKEY_BIT1 : 0x0UL) | \
> > ((vm_flags & VM_PKEY_BIT4) ? H_PAGE_PKEY_BIT0 : 0x0UL))
> > 
> > +#define calc_pte_to_hpte_pkey_bits(pteflags)   \
> > +   (((pteflags & H_PAGE_PKEY_BIT0) ? HPTE_R_KEY_BIT0 : 0x0UL) |\
> > +   ((pteflags & H_PAGE_PKEY_BIT1) ? HPTE_R_KEY_BIT1 : 0x0UL) | \
> > +   ((pteflags & H_PAGE_PKEY_BIT2) ? HPTE_R_KEY_BIT2 : 0x0UL) | \
> > +   ((pteflags & H_PAGE_PKEY_BIT3) ? HPTE_R_KEY_BIT3 : 0x0UL) | \
> > +   ((pteflags & H_PAGE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL))
> > +
> 
> We can drop calc_ in here. pte_to_hpte_pkey_bits should be
> sufficient.

ok. will do.

thanks for your comments,
RP



Re: [RFC v2 07/12] powerpc: Macro the mask used for checking DSI exception

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 01:44:25PM +0530, Anshuman Khandual wrote:
> On 06/17/2017 09:22 AM, Ram Pai wrote:
> > Replace the magic number used to check for DSI exception
> > with a meaningful value.
> > 
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/include/asm/reg.h   | 9 -
> >  arch/powerpc/kernel/exceptions-64s.S | 2 +-
> >  2 files changed, 9 insertions(+), 2 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> > index 7e50e47..2dcb8a1 100644
> > --- a/arch/powerpc/include/asm/reg.h
> > +++ b/arch/powerpc/include/asm/reg.h
> > @@ -272,16 +272,23 @@
> >  #define SPRN_DAR   0x013   /* Data Address Register */
> >  #define SPRN_DBCR  0x136   /* e300 Data Breakpoint Control Reg */
> >  #define SPRN_DSISR 0x012   /* Data Storage Interrupt Status Register */
> > +#define   DSISR_BIT32  0x8000  /* not defined */
> >  #define   DSISR_NOHPTE 0x4000  /* no translation found 
> > */
> > +#define   DSISR_PAGEATTR_CONFLT0x2000  /* page attribute 
> > conflict */
> > +#define   DSISR_BIT35  0x1000  /* not defined */
> >  #define   DSISR_PROTFAULT  0x0800  /* protection fault */
> >  #define   DSISR_BADACCESS  0x0400  /* bad access to CI or G */
> >  #define   DSISR_ISSTORE0x0200  /* access was a store */
> >  #define   DSISR_DABRMATCH  0x0040  /* hit data breakpoint */
> > -#define   DSISR_NOSEGMENT  0x0020  /* SLB miss */
> >  #define   DSISR_KEYFAULT   0x0020  /* Key fault */
> > +#define   DSISR_BIT43  0x0010  /* not defined */
> >  #define   DSISR_UNSUPP_MMU 0x0008  /* Unsupported MMU config */
> >  #define   DSISR_SET_RC 0x0004  /* Failed setting of 
> > R/C bits */
> >  #define   DSISR_PGDIRFAULT  0x0002  /* Fault on page directory 
> > */
> > +#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
> > +   DSISR_PAGEATTR_CONFLT | \
> > +   DSISR_BADACCESS |   \
> > +   DSISR_BIT43)
> 
> Sorry, missed this one. Seems like there are a couple of unnecessary
> line additions in the subsequent patch which adds the new PKEY
> reason code.
> 
> -#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
> - DSISR_PAGEATTR_CONFLT | \
> - DSISR_BADACCESS |   \
> +#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 |   \
> + DSISR_PAGEATTR_CONFLT | \
> + DSISR_BADACCESS |   \
> + DSISR_KEYFAULT |\
>   DSISR_BIT43)

I like to see them separately, one per line. But then you are right,
that is not the convention in this file. So I will change it accordingly.

thanks,
RP
> 
> 

-- 
Ram Pai



Re: [RFC v2 08/12] powerpc: Handle exceptions caused by violation of pkey protection.

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 12:54:45PM +0530, Anshuman Khandual wrote:
> On 06/17/2017 09:22 AM, Ram Pai wrote:
> > Handle Data and Instruction exceptions caused by memory
> > protection-key.
> > 
> > Signed-off-by: Ram Pai 
> > (cherry picked from commit a5e5217619a0c475fe0cacc3b0cf1d3d33c79a09)

Sorry. It was residue of a bad cleanup. It got cherry-picked from my own
internal branch, but then I forgot to delete that line.

> 
> Which tree does this commit belong to?
> 
> > 
> > Conflicts:
> > arch/powerpc/include/asm/reg.h
> > arch/powerpc/kernel/exceptions-64s.S

Same here. These two lines are some residues of patching-up my tree with
commits from other internal branches.

> > ---
> >  arch/powerpc/include/asm/mmu_context.h | 12 +
> >  arch/powerpc/include/asm/pkeys.h   |  9 
> >  arch/powerpc/include/asm/reg.h |  7 +--
> >  arch/powerpc/mm/fault.c| 21 +++-
> >  arch/powerpc/mm/pkeys.c| 90 
> > ++
> >  5 files changed, 134 insertions(+), 5 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/mmu_context.h 
> > b/arch/powerpc/include/asm/mmu_context.h
> > index da7e943..71fffe0 100644
> > --- a/arch/powerpc/include/asm/mmu_context.h
> > +++ b/arch/powerpc/include/asm/mmu_context.h
> > @@ -175,11 +175,23 @@ static inline void arch_bprm_mm_init(struct mm_struct 
> > *mm,
> >  {
> >  }
> > 
> > +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> > +bool arch_pte_access_permitted(pte_t pte, bool write);
> > +bool arch_vma_access_permitted(struct vm_area_struct *vma,
> > +   bool write, bool execute, bool foreign);
> > +#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> > +static inline bool arch_pte_access_permitted(pte_t pte, bool write)
> > +{
> > +   /* by default, allow everything */
> > +   return true;
> > +}
> >  static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
> > bool write, bool execute, bool foreign)
> >  {
> > /* by default, allow everything */
> > return true;
> >  }
> 
> Right, these are the two functions the core VM expects the
> arch to provide.
> 
> > +#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> > +
> >  #endif /* __KERNEL__ */
> >  #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
> > diff --git a/arch/powerpc/include/asm/pkeys.h 
> > b/arch/powerpc/include/asm/pkeys.h
> > index 9b6820d..405e7db 100644
> > --- a/arch/powerpc/include/asm/pkeys.h
> > +++ b/arch/powerpc/include/asm/pkeys.h
> > @@ -14,6 +14,15 @@
> > VM_PKEY_BIT3 | \
> > VM_PKEY_BIT4)
> > 
> > +static inline u16 pte_flags_to_pkey(unsigned long pte_flags)
> > +{
> > +   return ((pte_flags & H_PAGE_PKEY_BIT4) ? 0x1 : 0x0) |
> > +   ((pte_flags & H_PAGE_PKEY_BIT3) ? 0x2 : 0x0) |
> > +   ((pte_flags & H_PAGE_PKEY_BIT2) ? 0x4 : 0x0) |
> > +   ((pte_flags & H_PAGE_PKEY_BIT1) ? 0x8 : 0x0) |
> > +   ((pte_flags & H_PAGE_PKEY_BIT0) ? 0x10 : 0x0);
> > +}
> 
> Add defines for the above 0x1, 0x2, 0x4, 0x8 etc ?

Hmm... not sure if it will make the code any better.
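
For the record, if I understand the suggestion, it would be something
like this (sketch only; the names are made up):

	#define PKEY_BIT0_VAL	0x10
	#define PKEY_BIT1_VAL	0x8
	#define PKEY_BIT2_VAL	0x4
	#define PKEY_BIT3_VAL	0x2
	#define PKEY_BIT4_VAL	0x1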

> 
> > +
> >  #define pkey_to_vmflag_bits(key) (((key & 0x1UL) ? VM_PKEY_BIT0 : 0x0UL) | 
> > \
> > ((key & 0x2UL) ? VM_PKEY_BIT1 : 0x0UL) |\
> > ((key & 0x4UL) ? VM_PKEY_BIT2 : 0x0UL) |\
> > diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> > index 2dcb8a1..a11977f 100644
> > --- a/arch/powerpc/include/asm/reg.h
> > +++ b/arch/powerpc/include/asm/reg.h
> > @@ -285,9 +285,10 @@
> >  #define   DSISR_UNSUPP_MMU 0x0008  /* Unsupported MMU config */
> >  #define   DSISR_SET_RC 0x0004  /* Failed setting of 
> > R/C bits */
> >  #define   DSISR_PGDIRFAULT  0x0002  /* Fault on page directory 
> > */
> > -#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
> > -   DSISR_PAGEATTR_CONFLT | \
> > -   DSISR_BADACCESS |   \
> > +#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
> > +   DSISR_PAGEATTR_CONFLT | \
> > +   DSISR_BADACCESS |   \
> > +   DSISR_KEYFAULT |\
> > DSISR_BIT43)
> 
> This should have been cleaned up before adding new
> DSISR_KEYFAULT reason code into it. But I guess its
> okay.
> 

Re: [RFC v2 09/12] powerpc: Deliver SEGV signal on pkey violation.

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 12:24:53PM +0530, Anshuman Khandual wrote:
> On 06/17/2017 09:22 AM, Ram Pai wrote:
> > The value of the AMR register at the time of exception
> > is made available in gp_regs[PT_AMR] of the siginfo.
> > 
> > This field can be used to reprogram the permission bits of
> > any valid pkey.
> > 
> > Similarly the value of the pkey, whose protection got violated,
> > is made available at si_pkey field of the siginfo structure.
> > 
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/include/asm/paca.h|  1 +
> >  arch/powerpc/include/uapi/asm/ptrace.h |  3 ++-
> >  arch/powerpc/kernel/asm-offsets.c  |  5 
> >  arch/powerpc/kernel/exceptions-64s.S   |  8 ++
> >  arch/powerpc/kernel/signal_32.c| 14 ++
> >  arch/powerpc/kernel/signal_64.c| 14 ++
> >  arch/powerpc/kernel/traps.c| 49 
> > ++
> >  arch/powerpc/mm/fault.c|  4 +++
> >  8 files changed, 97 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/paca.h 
> > b/arch/powerpc/include/asm/paca.h
> > index 1c09f8f..a41afd3 100644
> > --- a/arch/powerpc/include/asm/paca.h
> > +++ b/arch/powerpc/include/asm/paca.h
> > @@ -92,6 +92,7 @@ struct paca_struct {
> > struct dtl_entry *dispatch_log_end;
> >  #endif /* CONFIG_PPC_STD_MMU_64 */
> > u64 dscr_default;   /* per-CPU default DSCR */
> > +   u64 paca_amr;   /* value of amr at exception */
> > 
> >  #ifdef CONFIG_PPC_STD_MMU_64
> > /*
> > diff --git a/arch/powerpc/include/uapi/asm/ptrace.h 
> > b/arch/powerpc/include/uapi/asm/ptrace.h
> > index 8036b38..7ec2428 100644
> > --- a/arch/powerpc/include/uapi/asm/ptrace.h
> > +++ b/arch/powerpc/include/uapi/asm/ptrace.h
> > @@ -108,8 +108,9 @@ struct pt_regs {
> >  #define PT_DAR 41
> >  #define PT_DSISR 42
> >  #define PT_RESULT 43
> > -#define PT_DSCR 44
> >  #define PT_REGS_COUNT 44
> > +#define PT_DSCR 44
> > +#define PT_AMR 45
> 
> PT_REGS_COUNT is not getting incremented even after adding
> one more element into the pack ?

Correct. There are 48 entries in the gp_regs table AFAICT; only the first 45
are exposed through pt_regs and through gp_regs. The remaining
are exposed through gp_regs only.
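
So a signal handler can reach the saved AMR roughly like this
(sketch; assumes the PT_AMR define from this patch):

	void handler(int sig, siginfo_t *info, void *ctx)
	{
		ucontext_t *uc = ctx;
		unsigned long amr = uc->uc_mcontext.gp_regs[PT_AMR];
		/* reprogram the permission bits for the faulting key ... */
	}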

> 
> > 
> >  #define PT_FPR048  /* each FP reg occupies 2 slots in this space */
> > 
> > diff --git a/arch/powerpc/kernel/asm-offsets.c 
> > b/arch/powerpc/kernel/asm-offsets.c
> > index 709e234..17f5d8a 100644
> > --- a/arch/powerpc/kernel/asm-offsets.c
> > +++ b/arch/powerpc/kernel/asm-offsets.c
> > @@ -241,6 +241,11 @@ int main(void)
> > OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
> > OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
> > OFFSET(PACA_DSCR_DEFAULT, paca_struct, dscr_default);
> > +
> > +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> > +   OFFSET(PACA_AMR, paca_struct, paca_amr);
> > +#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> > +
> 
> So we now have a place in PACA for AMR.

yes.

> 
> > OFFSET(ACCOUNT_STARTTIME, paca_struct, accounting.starttime);
> > OFFSET(ACCOUNT_STARTTIME_USER, paca_struct, accounting.starttime_user);
> > OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime);
> > diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> > b/arch/powerpc/kernel/exceptions-64s.S
> > index 3fd0528..8db9ef8 100644
> > --- a/arch/powerpc/kernel/exceptions-64s.S
> > +++ b/arch/powerpc/kernel/exceptions-64s.S
> > @@ -493,6 +493,10 @@ EXC_COMMON_BEGIN(data_access_common)
> > ld  r12,_MSR(r1)
> > ld  r3,PACA_EXGEN+EX_DAR(r13)
> > lwz r4,PACA_EXGEN+EX_DSISR(r13)
> > +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> > +   mfspr   r5,SPRN_AMR
> > +   std r5,PACA_AMR(r13)
> > +#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> > li  r5,0x300
> > std r3,_DAR(r1)
> > std r4,_DSISR(r1)
> > @@ -561,6 +565,10 @@ EXC_COMMON_BEGIN(instruction_access_common)
> > ld  r12,_MSR(r1)
> > ld  r3,_NIP(r1)
> > andis.  r4,r12,0x5820
> > +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> > +   mfspr   r5,SPRN_AMR
> > +   std r5,PACA_AMR(r13)
> > +#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> 
> Saving the AMR context on page faults -- this seems to be
> changing in the next patch again, based on whether any
> key was active at that point and whether the fault happened due to
> key enforcement?

Yes.

Re: [RFC v2 10/12] powerpc: Read AMR only if pkey-violation caused the exception.

2017-06-20 Thread Ram Pai
On Mon, Jun 19, 2017 at 09:06:13PM +1000, Michael Ellerman wrote:
> Ram Pai  writes:
> 
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/kernel/exceptions-64s.S | 16 ++--
> >  1 file changed, 10 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> > b/arch/powerpc/kernel/exceptions-64s.S
> > index 8db9ef8..a4de1b4 100644
> > --- a/arch/powerpc/kernel/exceptions-64s.S
> > +++ b/arch/powerpc/kernel/exceptions-64s.S
> > @@ -493,13 +493,15 @@ EXC_COMMON_BEGIN(data_access_common)
> > ld  r12,_MSR(r1)
> > ld  r3,PACA_EXGEN+EX_DAR(r13)
> > lwz r4,PACA_EXGEN+EX_DSISR(r13)
> > +   std r3,_DAR(r1)
> > +   std r4,_DSISR(r1)
> >  #ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> > +   andis.  r0,r4,DSISR_KEYFAULT@h /* save AMR only if its a key fault */
> > +   beq+1f
> 
> This seems to be incremental on top of one of your other patches.
> 
> But I don't see why, can you please just squash this into whatever patch
> adds this code in the first place.

Yes, squash it is.

The next version of my patch will have it squashed.
RP



Re: [RFC v2 10/12] powerpc: Read AMR only if pkey-violation caused the exception.

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 12:16:40PM +0530, Anshuman Khandual wrote:
> On 06/19/2017 11:29 PM, Ram Pai wrote:
> > On Mon, Jun 19, 2017 at 09:06:13PM +1000, Michael Ellerman wrote:
> >> Ram Pai  writes:
> >>
> >>> Signed-off-by: Ram Pai 
> >>> ---
> >>>  arch/powerpc/kernel/exceptions-64s.S | 16 ++--
> >>>  1 file changed, 10 insertions(+), 6 deletions(-)
> >>>
> >>> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> >>> b/arch/powerpc/kernel/exceptions-64s.S
> >>> index 8db9ef8..a4de1b4 100644
> >>> --- a/arch/powerpc/kernel/exceptions-64s.S
> >>> +++ b/arch/powerpc/kernel/exceptions-64s.S
> >>> @@ -493,13 +493,15 @@ EXC_COMMON_BEGIN(data_access_common)
> >>>   ld  r12,_MSR(r1)
> >>>   ld  r3,PACA_EXGEN+EX_DAR(r13)
> >>>   lwz r4,PACA_EXGEN+EX_DSISR(r13)
> >>> + std r3,_DAR(r1)
> >>> + std r4,_DSISR(r1)
> >>>  #ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> >>> + andis.  r0,r4,DSISR_KEYFAULT@h /* save AMR only if its a key fault */
> >>> + beq+1f
> >>
> >> This seems to be incremental on top of one of your other patches.
> >>
> >> But I don't see why, can you please just squash this into whatever patch
> >> adds this code in the first place.
> > 
> > It was an optimization added later. But yes it can be squashed into an
> > earlier patch.
> 
> Could you please explain what optimization this achieves?

Don't want to read the AMR if it is not a key protection fault. This is a
hot-hot-path.
A few cycles saved can accumulate into significant savings overall.
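
In C terms the fast path amounts to something like this (illustration
only; the real code is the assembly above):

	/* read the AMR SPR only when DSISR flags a key fault */
	if (dsisr & DSISR_KEYFAULT)
		get_paca()->paca_amr = mfspr(SPRN_AMR);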

RP



Re: [RFC v2 11/12]Documentation: Documentation updates.

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 11:48:23AM +0530, Anshuman Khandual wrote:
> On 06/17/2017 09:22 AM, Ram Pai wrote:
> > The Documentation file is moved from x86 into the generic area,
> > since this feature is now supported by more than one arch.
> > 
> > Signed-off-by: Ram Pai 
> > ---
> >  Documentation/vm/protection-keys.txt  | 110 
> > ++
> >  Documentation/x86/protection-keys.txt |  85 --
> 
> I am not sure whether this is a good idea. There might be
> specifics for each architecture which need to be detailed
> again in this new generic one.
> 
> >  2 files changed, 110 insertions(+), 85 deletions(-)
> >  create mode 100644 Documentation/vm/protection-keys.txt
> >  delete mode 100644 Documentation/x86/protection-keys.txt
> > 
> > diff --git a/Documentation/vm/protection-keys.txt 
> > b/Documentation/vm/protection-keys.txt
> > new file mode 100644
> > index 000..b49e6bb
> > --- /dev/null
> > +++ b/Documentation/vm/protection-keys.txt
> > @@ -0,0 +1,110 @@
> > +Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
> > +found in new generation of Intel CPUs and on PowerPC CPUs.
> > +
> > +Memory Protection Keys provides a mechanism for enforcing page-based
> > +protections, but without requiring modification of the page tables
> > +when an application changes protection domains.
> 
> Should the resultant access through protection keys be a
> subset of the protection bits enabled through the original PTE
> PROT format? Are the semantics exactly the same on x86
> and powerpc?

The protection key takes precedence over protection done through
mprotect.
Yes, both on x86 and powerpc we maintain the same semantics.
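
For example (userspace sketch, error handling omitted):

	int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
	char *ptr = mmap(NULL, 4096, PROT_READ|PROT_WRITE,
			 MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
	pkey_mprotect(ptr, 4096, PROT_READ|PROT_WRITE, pkey);
	*ptr = 1;	/* faults: the key's write-disable wins over PROT_WRITE */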
> 
> > +
> > +
> > +On Intel:
> > +
> > +It works by dedicating 4 previously ignored bits in each page table
> > +entry to a "protection key", giving 16 possible keys.
> > +
> > +There is also a new user-accessible register (PKRU) with two separate
> > +bits (Access Disable and Write Disable) for each key.  Being a CPU
> > +register, PKRU is inherently thread-local, potentially giving each
> > +thread a different set of protections from every other thread.
> > +
> > +There are two new instructions (RDPKRU/WRPKRU) for reading and writing
> > +to the new register.  The feature is only available in 64-bit mode,
> > +even though there is theoretically space in the PAE PTEs.  These
> > +permissions are enforced on data access only and have no effect on
> > +instruction fetches.
> > +
> > +
> > +On PowerPC:
> > +
> > +It works by dedicating 5 page table entry bits to a "protection key",
> > +giving 32 possible keys.
> > +
> > +There is a user-accessible register (AMR) with two separate bits
> > +(Access Disable and Write Disable) for each key.  Being a CPU
> > +register, AMR is inherently thread-local, potentially giving each
> > +thread a different set of protections from every other thread.
> 
> Small nit. Space needed here.
> 
> > +NOTE: Disabling read permission does not disable
> > +write and vice-versa.
> > +
> > +The feature is available on 64-bit HPTE mode only.
> > +
> > +'mtspr 0xd, mem' writes into the AMR register.
> > +'mfspr mem, 0xd' reads the AMR register.
> > +
> > +Permissions are enforced on data access only and have no effect on
> > +instruction fetches.
> > +
> > +=== Syscalls ===
> > +
> > +There are 3 system calls which directly interact with pkeys:
> > +
> > +   int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
> > +   int pkey_free(int pkey);
> > +   int pkey_mprotect(unsigned long start, size_t len,
> > + unsigned long prot, int pkey);
> > +
> > +Before a pkey can be used, it must first be allocated with
> > +pkey_alloc().  An application calls the WRPKRU instruction
> > +directly in order to change access permissions to memory covered
> > +with a key.  In this example WRPKRU is wrapped by a C function
> > +called pkey_set().
> > +
> > +   int real_prot = PROT_READ|PROT_WRITE;
> > +   pkey = pkey_alloc(0, PKEY_DENY_WRITE);
> > +   ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 
> > 0);
> > +   ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
> > +   ... application runs here
> > +
> > +Now, if the application needs to update the data at 'ptr', it can
> > +gain access, do the update, then remove it

Re: [RFC v2 12/12]selftest: Updated protection key selftest

2017-06-20 Thread Ram Pai
On Tue, Jun 20, 2017 at 11:56:04AM +0530, Anshuman Khandual wrote:
> On 06/17/2017 09:22 AM, Ram Pai wrote:
> > Added test support for PowerPC implementation of protection keys.
> > 
> > Signed-off-by: Ram Pai 
> 
> First of all, there are a lot of instances where we use *pkru*
> named functions on power even though the real implementations have
> taken care of doing appropriate things. That looks pretty
> hacky. We need to change them to generic names first before
> adding both x86 and powerpc procedures inside it.

I have abstracted out the arch-specific code. References to
pkru should now be confined to x86 code only.

The patch, I acknowledge, is not easily reviewable.
As Michael Ellerman mentioned, I will break it into two patches.
One moves the file and the second does the code changes. That way
it will be easy to review.

RP



Re: [RFC v2 09/12] powerpc: Deliver SEGV signal on pkey violation.

2017-06-20 Thread Ram Pai
On Wed, Jun 21, 2017 at 08:48:20AM +0530, Anshuman Khandual wrote:
> On 06/21/2017 05:26 AM, Ram Pai wrote:
> > On Tue, Jun 20, 2017 at 12:24:53PM +0530, Anshuman Khandual wrote:
> >> On 06/17/2017 09:22 AM, Ram Pai wrote:
> >>> The value of the AMR register at the time of exception
> >>> is made available in gp_regs[PT_AMR] of the siginfo.
> >>>
> >>> This field can be used to reprogram the permission bits of
> >>> any valid pkey.
> >>>
> >>> Similarly the value of the pkey, whose protection got violated,
> >>> is made available at si_pkey field of the siginfo structure.
> >>>
> >>> Signed-off-by: Ram Pai 
> >>> ---
> >>>  arch/powerpc/include/asm/paca.h|  1 +
> >>>  arch/powerpc/include/uapi/asm/ptrace.h |  3 ++-
> >>>  arch/powerpc/kernel/asm-offsets.c  |  5 
> >>>  arch/powerpc/kernel/exceptions-64s.S   |  8 ++
> >>>  arch/powerpc/kernel/signal_32.c| 14 ++
> >>>  arch/powerpc/kernel/signal_64.c| 14 ++
> >>>  arch/powerpc/kernel/traps.c| 49 
> >>> ++
> >>>  arch/powerpc/mm/fault.c|  4 +++
> >>>  8 files changed, 97 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/arch/powerpc/include/asm/paca.h 
> >>> b/arch/powerpc/include/asm/paca.h
> >>> index 1c09f8f..a41afd3 100644
> >>> --- a/arch/powerpc/include/asm/paca.h
> >>> +++ b/arch/powerpc/include/asm/paca.h
> >>> @@ -92,6 +92,7 @@ struct paca_struct {
> >>>   struct dtl_entry *dispatch_log_end;
> >>>  #endif /* CONFIG_PPC_STD_MMU_64 */
> >>>   u64 dscr_default;   /* per-CPU default DSCR */
> >>> + u64 paca_amr;   /* value of amr at exception */
> >>>
> >>>  #ifdef CONFIG_PPC_STD_MMU_64
> >>>   /*
> >>> diff --git a/arch/powerpc/include/uapi/asm/ptrace.h 
> >>> b/arch/powerpc/include/uapi/asm/ptrace.h
> >>> index 8036b38..7ec2428 100644
> >>> --- a/arch/powerpc/include/uapi/asm/ptrace.h
> >>> +++ b/arch/powerpc/include/uapi/asm/ptrace.h
> >>> @@ -108,8 +108,9 @@ struct pt_regs {
> >>>  #define PT_DAR   41
> >>>  #define PT_DSISR 42
> >>>  #define PT_RESULT 43
> >>> -#define PT_DSCR 44
> >>>  #define PT_REGS_COUNT 44
> >>> +#define PT_DSCR 44
> >>> +#define PT_AMR   45
> >>
> >> PT_REGS_COUNT is not getting incremented even after adding
> >> one more element into the pack ?
> > 
> > Correct. There are 48 entries in the gp_regs table AFAICT; only the first 45
> > are exposed through pt_regs and through gp_regs. the remaining
> > are exposed through gp_regs only.
> > 
> >>
> >>>
> >>>  #define PT_FPR0  48  /* each FP reg occupies 2 slots in this space */
> >>>
> >>> diff --git a/arch/powerpc/kernel/asm-offsets.c 
> >>> b/arch/powerpc/kernel/asm-offsets.c
> >>> index 709e234..17f5d8a 100644
> >>> --- a/arch/powerpc/kernel/asm-offsets.c
> >>> +++ b/arch/powerpc/kernel/asm-offsets.c
> >>> @@ -241,6 +241,11 @@ int main(void)
> >>>   OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
> >>>   OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
> >>>   OFFSET(PACA_DSCR_DEFAULT, paca_struct, dscr_default);
> >>> +
> >>> +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> >>> + OFFSET(PACA_AMR, paca_struct, paca_amr);
> >>> +#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> >>> +
> >>
> >> So we now have a place in PACA for AMR.
> > 
> > yes.
> > 
> >>
> >>>   OFFSET(ACCOUNT_STARTTIME, paca_struct, accounting.starttime);
> >>>   OFFSET(ACCOUNT_STARTTIME_USER, paca_struct, accounting.starttime_user);
> >>>   OFFSET(ACCOUNT_USER_TIME, paca_struct, accounting.utime);
> >>> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> >>> b/arch/powerpc/kernel/exceptions-64s.S
> >>> index 3fd0528..8db9ef8 100644
> >>> --- a/arch/powerpc/kernel/exceptions-64s.S
> >>> +++ b/arch/powerpc/kernel/exceptions-64s.S
> >>> @@ -493,6 +493,10 @@ EXC_COMMON_BEGIN(data_access_common)
> >>>   ld  r12,_MSR(r1)
> >>>   ld  r3,PACA_EXGEN+EX_DAR(r13)
> >>>   lwz r4,PACA_EXGEN+EX_DSISR(r13)
> >>> +#ifde

Re: [RFC v2 08/12] powerpc: Handle exceptions caused by violation of pkey protection.

2017-06-20 Thread Ram Pai
On Wed, Jun 21, 2017 at 09:24:36AM +0530, Anshuman Khandual wrote:
> On 06/21/2017 05:13 AM, Ram Pai wrote:
> > On Tue, Jun 20, 2017 at 12:54:45PM +0530, Anshuman Khandual wrote:
> >> On 06/17/2017 09:22 AM, Ram Pai wrote:
> >>> Handle Data and Instruction exceptions caused by memory
> >>> protection-key.
> >>>
> >>> Signed-off-by: Ram Pai 
> >>> (cherry picked from commit a5e5217619a0c475fe0cacc3b0cf1d3d33c79a09)
> > 
> > Sorry. It was residue of a bad cleanup. It got cherry-picked from my own
> > internal branch, but then I forgot to delete that line.
> > 
> >>
> >> Which tree does this commit belong to?
> >>
> >>>
> >>> Conflicts:
> >>>   arch/powerpc/include/asm/reg.h
> >>>   arch/powerpc/kernel/exceptions-64s.S
> > 
> > Same here. These two lines are some residues of patching-up my tree with
> > commits from other internal branches.
> > 
> >>> ---
> >>>  arch/powerpc/include/asm/mmu_context.h | 12 +
> >>>  arch/powerpc/include/asm/pkeys.h   |  9 
> >>>  arch/powerpc/include/asm/reg.h |  7 +--
> >>>  arch/powerpc/mm/fault.c| 21 +++-
> >>>  arch/powerpc/mm/pkeys.c| 90 
> >>> ++
> >>>  5 files changed, 134 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/arch/powerpc/include/asm/mmu_context.h 
> >>> b/arch/powerpc/include/asm/mmu_context.h
> >>> index da7e943..71fffe0 100644
> >>> --- a/arch/powerpc/include/asm/mmu_context.h
> >>> +++ b/arch/powerpc/include/asm/mmu_context.h
> >>> @@ -175,11 +175,23 @@ static inline void arch_bprm_mm_init(struct 
> >>> mm_struct *mm,
> >>>  {
> >>>  }
> >>>
> >>> +#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
> >>> +bool arch_pte_access_permitted(pte_t pte, bool write);
> >>> +bool arch_vma_access_permitted(struct vm_area_struct *vma,
> >>> + bool write, bool execute, bool foreign);
> >>> +#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> >>> +static inline bool arch_pte_access_permitted(pte_t pte, bool write)
> >>> +{
> >>> + /* by default, allow everything */
> >>> + return true;
> >>> +}
> >>>  static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
> >>>   bool write, bool execute, bool foreign)
> >>>  {
> >>>   /* by default, allow everything */
> >>>   return true;
> >>>  }
> >>
> >> Right, these are the two functions the core VM expects the
> >> arch to provide.
> >>
> >>> +#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
> >>> +
> >>>  #endif /* __KERNEL__ */
> >>>  #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
> >>> diff --git a/arch/powerpc/include/asm/pkeys.h 
> >>> b/arch/powerpc/include/asm/pkeys.h
> >>> index 9b6820d..405e7db 100644
> >>> --- a/arch/powerpc/include/asm/pkeys.h
> >>> +++ b/arch/powerpc/include/asm/pkeys.h
> >>> @@ -14,6 +14,15 @@
> >>>   VM_PKEY_BIT3 | \
> >>>   VM_PKEY_BIT4)
> >>>
> >>> +static inline u16 pte_flags_to_pkey(unsigned long pte_flags)
> >>> +{
> >>> + return ((pte_flags & H_PAGE_PKEY_BIT4) ? 0x1 : 0x0) |
> >>> + ((pte_flags & H_PAGE_PKEY_BIT3) ? 0x2 : 0x0) |
> >>> + ((pte_flags & H_PAGE_PKEY_BIT2) ? 0x4 : 0x0) |
> >>> + ((pte_flags & H_PAGE_PKEY_BIT1) ? 0x8 : 0x0) |
> >>> + ((pte_flags & H_PAGE_PKEY_BIT0) ? 0x10 : 0x0);
> >>> +}
> >>
> >> Add defines for the above 0x1, 0x2, 0x4, 0x8 etc ?
> > 
> > hmm...not sure if it will make the code any better.
> > 
> >>
> >>> +
> >>>  #define pkey_to_vmflag_bits(key) (((key & 0x1UL) ? VM_PKEY_BIT0 : 0x0UL) 
> >>> | \
> >>>   ((key & 0x2UL) ? VM_PKEY_BIT1 : 0x0UL) |\
> >>>   ((key & 0x4UL) ? VM_PKEY_BIT2 : 0x0UL) |\
> >>> diff --git a/arch/powerpc/include/asm/reg.h 
> >>> b/arch/powerpc/include/asm/reg.h
> >>> index 2dcb8a1..a11977f 100644
> >>> --- a/arch/powerpc/include/asm/reg.h
> >>> +++ b/arch/powerpc/include/asm/reg.h
> >>> @@ -285,9 +285,10 @@

Re: [RFC v2 01/12] powerpc: Free up four 64K PTE bits in 4K backed hpte pages.

2017-06-20 Thread Ram Pai
On Wed, Jun 21, 2017 at 11:05:33AM +0530, Anshuman Khandual wrote:
> On 06/21/2017 04:53 AM, Ram Pai wrote:
> > On Tue, Jun 20, 2017 at 03:50:25PM +0530, Anshuman Khandual wrote:
> >> On 06/17/2017 09:22 AM, Ram Pai wrote:
> >>> Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
> >>> in the 4K backed hpte pages. These bits continue to be used
> >>> for 64K backed hpte pages in this patch, but will be freed
> >>> up in the next patch.
> >>
> >> The counting 3, 4, 5 and 6 is in BE format I believe; I was
> >> initially trying to read that from right to left as we normally
> >> do in the kernel and was getting confused. So basically these
> >> bits (which are only applicable for 64K mapping IIUC) are going
> >> to be freed up from the PTE format.
> >>
> >> #define _RPAGE_RSV10x1000UL
> >> #define _RPAGE_RSV20x0800UL
> >> #define _RPAGE_RSV30x0400UL
> >> #define _RPAGE_RSV40x0200UL
> >>
> >> As you have mentioned before this feature is available for 64K
> >> page size only and not for 4K mappings. So I assume we support
> >> both the combinations.
> >>
> >> * 64K mapping on 64K
> >> * 64K mapping on 4K
> > 
> > yes.
> > 
> >>
> >> These are the current users of the above bits
> >>
> >> #define H_PAGE_BUSY_RPAGE_RSV1 /* software: PTE & hash are 
> >> busy */
> >> #define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary 
> >> HPTEG */
> >> #define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | 
> >> _RPAGE_RPN44)
> >> #define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated 
> >> HPTE */
> >>
> >>>
> >>> The patch does the following change to the 64K PTE format
> >>>
> >>> H_PAGE_BUSY moves from bit 3 to bit 9
> >>
> >> and what is in there on bit 9 now ? This ?
> >>
> >> #define _RPAGE_SW2 0x00400
> >>
> >> which is used as 
> >>
> >> #define _PAGE_SPECIAL  _RPAGE_SW2 /* software: special page */
> >>
> >> which will not be required any more ?
> > 
> > I think you are reading bit 9 from right to left. The bit 9 I refer to
> > is from left to right, using the same numbering convention ISA 3.0 uses.
> 
> Right, my bad. Then it would be this one.
> 
> '#define _RPAGE_RPN42 0x0040UL'
> 
> > I know it is confusing, will make a mention in the comment of this
> > patch, to read it the big-endian way.
> 
> Right.
> 
> > 
> > BTW: Bit 9 is not used currently. So using it in this patch. But this is
> > a temporary move. The H_PAGE_BUSY will move to bit 7 in the next patch.
> > 
> > Had to keep it at bit 9, because bit 7 is not yet entirely freed up. It is
> > used by 64K PTE backed by 64k hpte.
> 
> Got it.
> 
> > 
> >>
> >>> H_PAGE_F_SECOND which occupied bit 4 moves to the second part
> >>>   of the pte.
> >>> H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
> >>>   second part of the pte.
> >>>
> >>> the four  bits (H_PAGE_F_SECOND|H_PAGE_F_GIX) that represent a slot
> >>> is  initialized  to  0xF  indicating  an invalid  slot.  If  a hpte
> >>> gets cached in a 0xF  slot (i.e.  7th  slot  of  secondary),  it   is
> >>> released immediately. In  other  words, even  though   0xF   is   a
> >>
> >> Release immediately means we attempt again for a new hash slot ?
> > 
> > yes.
> > 
> >>
> >>> valid slot we discard  and consider it as an invalid
> >>> slot; i.e. hpte_soft_invalid(). This  gives  us  an opportunity to not
> >>> depend on a bit in the primary PTE in order to determine the
> >>> validity of a slot.
> >>
> >> So we have to see the slot number in the second half for each PTE to
> >> figure out if it has got a valid slot in the hash page table.
> > 
> > yes.
> > 
> >>
> >>>
> >>> When  we  release  a hpte  in the 0xF   slot we also   release a
> >>> legitimate primary   slot  and unmap that  entry. This  is  to
> >>> ensure  that we do get a   legitimate   non-0xF  slot the next time we
> >>> retry for a slot.
> >>
> >> Okay.
> >>
>

Re: [RFC v2 07/12] powerpc: Macro the mask used for checking DSI exception

2017-06-21 Thread Ram Pai
On Wed, Jun 21, 2017 at 12:55:42PM +0530, Aneesh Kumar K.V wrote:
> Ram Pai  writes:
> 
> > Replace the magic number used to check for DSI exception
> > with a meaningful value.
> >
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/include/asm/reg.h   | 9 -
> >  arch/powerpc/kernel/exceptions-64s.S | 2 +-
> >  2 files changed, 9 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> > index 7e50e47..2dcb8a1 100644
> > --- a/arch/powerpc/include/asm/reg.h
> > +++ b/arch/powerpc/include/asm/reg.h
> > @@ -272,16 +272,23 @@
> >  #define SPRN_DAR   0x013   /* Data Address Register */
> >  #define SPRN_DBCR  0x136   /* e300 Data Breakpoint Control Reg */
> >  #define SPRN_DSISR 0x012   /* Data Storage Interrupt Status Register */
> > +#define   DSISR_BIT32  0x8000  /* not defined */
> >  #define   DSISR_NOHPTE 0x4000  /* no translation found 
> > */
> > +#define   DSISR_PAGEATTR_CONFLT0x2000  /* page attribute 
> > conflict */
> > +#define   DSISR_BIT35  0x1000  /* not defined */
> >  #define   DSISR_PROTFAULT  0x0800  /* protection fault */
> >  #define   DSISR_BADACCESS  0x0400  /* bad access to CI or G */
> >  #define   DSISR_ISSTORE0x0200  /* access was a store */
> >  #define   DSISR_DABRMATCH  0x0040  /* hit data breakpoint */
> > -#define   DSISR_NOSEGMENT  0x0020  /* SLB miss */
> >  #define   DSISR_KEYFAULT   0x0020  /* Key fault */
> > +#define   DSISR_BIT43  0x0010  /* not defined */
> >  #define   DSISR_UNSUPP_MMU 0x0008  /* Unsupported MMU config */
> >  #define   DSISR_SET_RC 0x0004  /* Failed setting of 
> > R/C bits */
> >  #define   DSISR_PGDIRFAULT  0x0002  /* Fault on page directory 
> > */
> > +#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | \
> > +   DSISR_PAGEATTR_CONFLT | \
> > +   DSISR_BADACCESS |   \
> > +   DSISR_BIT43)
> >  #define SPRN_TBRL  0x10C   /* Time Base Read Lower Register (user, R/O) */
> >  #define SPRN_TBRU  0x10D   /* Time Base Read Upper Register (user, R/O) */
> >  #define SPRN_CIR   0x11B   /* Chip Information Register (hyper, R/0) */
> > diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> > b/arch/powerpc/kernel/exceptions-64s.S
> > index ae418b8..3fd0528 100644
> > --- a/arch/powerpc/kernel/exceptions-64s.S
> > +++ b/arch/powerpc/kernel/exceptions-64s.S
> > @@ -1411,7 +1411,7 @@ USE_TEXT_SECTION()
> > .balign IFETCH_ALIGN_BYTES
> >  do_hash_page:
> >  #ifdef CONFIG_PPC_STD_MMU_64
> > -   andis.  r0,r4,0xa410/* weird error? */
> > +   andis.  r0,r4,DSISR_PAGE_FAULT_MASK@h
> > bne-handle_page_fault   /* if not, try to insert a HPTE */
> > andis.  r0,r4,DSISR_DABRMATCH@h
> > bne-handle_dabr_fault
> 
> 
> Thanks for doing this. I always wondered what that 0xa410 indicates. Now
> that it is documented, I am wondering: are those the only DSISR values
> that we want to check early? You also added a few bit positions that are
> expected to carry value 0? But then excluded BIT35. Any reason?

I did not look deeply into why the exact number 0xa410 was used in the
past.  I built the macro DSISR_PAGE_FAULT_MASK using whatever bits make
up 0xa410.  BIT35, if added to DSISR_PAGE_FAULT_MASK, would make it
0xb410. So I did not consider it.
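
Spelling out the arithmetic (my own breakdown, with the DSISR
constants written out in full):

	DSISR_BIT32		0x80000000
	DSISR_PAGEATTR_CONFLT	0x20000000
	DSISR_BADACCESS		0x04000000
	DSISR_BIT43		0x00100000
	---------------------------------
	OR'ed together		0xa4100000

whose high halfword -- the @h operand of the andis. -- is 0xa410.
Adding DSISR_BIT35 (0x10000000) would give 0xb4100000, i.e. 0xb410.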

However, the macro for BIT35 is already defined in this patch, if that is
what you were looking for.
+#define   DSISR_BIT35  0x1000  /* not defined */

RP



Re: [RFC v2 01/12] powerpc: Free up four 64K PTE bits in 4K backed hpte pages.

2017-06-21 Thread Ram Pai
On Wed, Jun 21, 2017 at 12:11:32PM +0530, Aneesh Kumar K.V wrote:
> Ram Pai  writes:
> 
> > Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6
> > in the 4K backed hpte pages. These bits continue to be used
> > for 64K backed hpte pages in this patch, but will be freed
> > up in the next patch.
> >
> > The patch does the following change to the 64K PTE format
> >
> > H_PAGE_BUSY moves from bit 3 to bit 9
> > H_PAGE_F_SECOND which occupied bit 4 moves to the second part
> > of the pte.
> > H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
> > second part of the pte.
> >
> > the four  bits (H_PAGE_F_SECOND|H_PAGE_F_GIX) that represent a slot
> > is  initialized  to  0xF  indicating  an invalid  slot.  If  a hpte
> > gets cached in a 0xF  slot (i.e.  7th  slot  of  secondary),  it   is
> > released immediately. In  other  words, even  though   0xF   is   a
> > valid slot we discard  and consider it as an invalid
> > slot; i.e. hpte_soft_invalid(). This  gives  us  an opportunity to not
> > depend on a bit in the primary PTE in order to determine the
> > validity of a slot.
> >
> > When  we  release  a hpte  in the 0xF   slot we also   release a
> > legitimate primary   slot  and unmap that  entry. This  is  to
> > ensure  that we do get a   legitimate   non-0xF  slot the next time we
> > retry for a slot.
> >
> > Though treating the 0xF slot as invalid reduces the number of available
> > slots  and  may  have an effect  on the performance, the probability
> > of hitting a 0xF is extremely low.
> >
> > Compared  to the current scheme, the above described scheme reduces
> > the number of false hash table updates  significantly  and  has the
> > added  advantage  of  releasing  four  valuable  PTE bits for other
> > purpose.
> >
> > This idea was jointly developed by Paul Mackerras, Aneesh, Michael
> > Ellerman and myself.
> >
> > 4K PTE format remains unchanged currently.
> >
> > Signed-off-by: Ram Pai 
> > ---
> >  arch/powerpc/include/asm/book3s/64/hash-4k.h  | 20 +++
> >  arch/powerpc/include/asm/book3s/64/hash-64k.h | 32 +++
> >  arch/powerpc/include/asm/book3s/64/hash.h | 15 +++--
> >  arch/powerpc/include/asm/book3s/64/mmu-hash.h |  5 ++
> >  arch/powerpc/mm/dump_linuxpagetables.c|  3 +-
> >  arch/powerpc/mm/hash64_4k.c   | 14 ++---
> >  arch/powerpc/mm/hash64_64k.c  | 81 
> > ---
> >  arch/powerpc/mm/hash_utils_64.c   | 30 +++---
> >  8 files changed, 122 insertions(+), 78 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
> > b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > index b4b5e6b..5ef1d81 100644
> > --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
> > @@ -16,6 +16,18 @@
> >  #define H_PUD_TABLE_SIZE   (sizeof(pud_t) << H_PUD_INDEX_SIZE)
> >  #define H_PGD_TABLE_SIZE   (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
> >
> > +
> > +/*
> > + * Only supported by 4k linux page size
> > + */
> > +#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG 
> > */
> > +#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
> > +#define H_PAGE_F_GIX_SHIFT 56
> > +
> > +#define H_PAGE_BUSY_RPAGE_RSV1 /* software: PTE & hash are 
> > busy */
> > +#define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
> > +
> > +
> >  /* PTE flags to conserve for HPTE identification */
> >  #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
> >  H_PAGE_F_SECOND | H_PAGE_F_GIX)
> > @@ -48,6 +60,14 @@ static inline int hash__hugepd_ok(hugepd_t hpd)
> >  }
> >  #endif
> >
> > +static inline unsigned long set_hidx_slot(pte_t *ptep, real_pte_t rpte,
> > +   unsigned int subpg_index, unsigned long slot)
> > +{
> > +   return (slot << H_PAGE_F_GIX_SHIFT) &
> > +   (H_PAGE_F_SECOND | H_PAGE_F_GIX);
> > +}
> > +
> > +
> >  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> >
> >  static inline char *get_hpte_slot_array(pmd_t *pmdp)
> > diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
> > b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > index 9732837..0eb3c89 100644
> > --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
> > @@ -10,23 +10,25 

Re: [RFC v2 02/12] powerpc: Free up four 64K PTE bits in 64K backed hpte pages.

2017-06-21 Thread Ram Pai
On Wed, Jun 21, 2017 at 12:24:34PM +0530, Aneesh Kumar K.V wrote:
> Ram Pai  writes:
> 
> 
> 
> > diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c 
> > b/arch/powerpc/mm/hugetlbpage-hash64.c
> > index a84bb44..239ca86 100644
> > --- a/arch/powerpc/mm/hugetlbpage-hash64.c
> > +++ b/arch/powerpc/mm/hugetlbpage-hash64.c
> > @@ -22,6 +22,7 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> > access, unsigned long vsid,
> >  pte_t *ptep, unsigned long trap, unsigned long flags,
> >  int ssize, unsigned int shift, unsigned int mmu_psize)
> >  {
> > +   real_pte_t rpte;
> > unsigned long vpn;
> > unsigned long old_pte, new_pte;
> > unsigned long rflags, pa, sz;
> > @@ -61,6 +62,7 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> > access, unsigned long vsid,
> > } while(!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
> >
> > rflags = htab_convert_pte_flags(new_pte);
> > +   rpte = __real_pte(__pte(old_pte), ptep);
> >
> > sz = ((1UL) << shift);
> > if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
> > @@ -71,15 +73,10 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> > access, unsigned long vsid,
> > /* Check if pte already has an hpte (case 2) */
> > if (unlikely(old_pte & H_PAGE_HASHPTE)) {
> > /* There MIGHT be an HPTE for this pte */
> > -   unsigned long hash, slot;
> > +   unsigned long gslot;
> >
> > -   hash = hpt_hash(vpn, shift, ssize);
> > -   if (old_pte & H_PAGE_F_SECOND)
> > -   hash = ~hash;
> > -   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
> > -   slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
> > -
> > -   if (mmu_hash_ops.hpte_updatepp(slot, rflags, vpn, mmu_psize,
> > +   gslot = get_hidx_gslot(vpn, shift, ssize, rpte, 0);
> > +   if (mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn, mmu_psize,
> >mmu_psize, ssize, flags) == -1)
> > old_pte &= ~_PAGE_HPTEFLAGS;
> > }
> > @@ -106,8 +103,7 @@ int __hash_page_huge(unsigned long ea, unsigned long 
> > access, unsigned long vsid,
> > return -1;
> > }
> >
> > -   new_pte |= (slot << H_PAGE_F_GIX_SHIFT) &
> > -   (H_PAGE_F_SECOND | H_PAGE_F_GIX);
> > +   new_pte |= set_hidx_slot(ptep, rpte, 0, slot);
> 
> We don't really need rpte here. We are just need to track one entry
> here. May be it becomes simpler if use different helpers for 4k hpte and
> others ?

Actually we need rpte here. The hidx for these 64K-hpte backed PTEs is
now stored in the second half of the pte.
I have abstracted the helpers, so that the caller need not
know the location of the hidx. It comes in really handy.
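
Concretely, a sketch matching the helper implementation: the second
half sits PTRS_PER_PTE entries past the PTE, holding 4 bits of hidx
per subpage:

	unsigned long *hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
	unsigned long hidx = (*hidxp >> (subpg_index << 2)) & 0xfUL;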

RP



[RFC v3 03/23] powerpc: introduce get_hidx_gslot helper

2017-06-21 Thread Ram Pai
Introduce get_hidx_gslot() which returns the slot number of the HPTE
in the global hash table.

This function will come in handy as we work towards re-arranging the
PTE bits in the later patches.
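
A typical call site, as it appears later in the series, looks like:

	gslot = get_hidx_gslot(vpn, shift, ssize, rpte, subpg_index);
	ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn,
					 MMU_PAGE_4K, MMU_PAGE_4K,
					 ssize, flags);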

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/hash.h |  3 +++
 arch/powerpc/mm/hash_utils_64.c   | 14 ++
 2 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index ac049de..e7cf03a 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -162,6 +162,9 @@ static inline bool hpte_soft_invalid(unsigned long slot)
return ((slot & 0xfUL) == 0xfUL);
 }
 
+unsigned long get_hidx_gslot(unsigned long vpn, unsigned long shift,
+   int ssize, real_pte_t rpte, unsigned int subpg_index);
+
 /* This low level function performs the actual PTE insertion
  * Setting the PTE depends on the MMU type and other factors. It's
  * an horrible mess that I'm not going to try to clean up now but
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1b494d0..99f97754c 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1591,6 +1591,20 @@ static inline void tm_flush_hash_page(int local)
 }
 #endif
 
+unsigned long get_hidx_gslot(unsigned long vpn, unsigned long shift,
+   int ssize, real_pte_t rpte, unsigned int subpg_index)
+{
+   unsigned long hash, slot, hidx;
+
+   hash = hpt_hash(vpn, shift, ssize);
+   hidx = __rpte_to_hidx(rpte, subpg_index);
+   if (hidx & _PTEIDX_SECONDARY)
+   hash = ~hash;
+   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
+   slot += hidx & _PTEIDX_GROUP_IX;
+   return slot;
+}
+
 /* WARNING: This is called from hash_low_64.S, if you change this prototype,
  *  do not forget to update the assembly call site !
  */
-- 
1.8.3.1



[RFC v3 01/23] powerpc: Free up four 64K PTE bits in 4K backed HPTE pages

2017-06-21 Thread Ram Pai
Rearrange 64K PTE bits to  free  up  bits 3, 4, 5  and  6,
in the 4K backed HPTE pages. These bits continue to be used
for 64K backed HPTE pages in this patch,  but will be freed
up in the next patch. The  bit  numbers  are big-endian  as
defined in ISA 3.0.

The patch does the following change to the 64K PTE format

H_PAGE_BUSY moves from bit 3 to bit 9
H_PAGE_F_SECOND which occupied bit 4 moves to the second part
of the pte.
H_PAGE_F_GIX which  occupied bit 5, 6 and 7 also moves to the
second part of the pte.

the four  bits (H_PAGE_F_SECOND|H_PAGE_F_GIX) that represent a slot
is  initialized  to  0xF  indicating  an invalid  slot.  If  a HPTE
gets cached in a 0xF  slot (i.e.  7th  slot  of  secondary),  it   is
released immediately. In  other  words, even  though   0xF   is   a
valid slot we discard  and consider it as an invalid
slot; i.e. hpte_soft_invalid(). This  gives  us  an opportunity to not
depend on a bit in the primary PTE in order to determine the
validity of a slot.

When  we  release  a HPTE  in the 0xF   slot we also   release a
legitimate primary   slot  and unmap that  entry. This  is  to
ensure  that we do get a   legitimate   non-0xF  slot the next time we
retry for a slot.

Though treating the 0xF slot as invalid reduces the number of available
slots  and  may  have an effect  on the performance, the probability
of hitting a 0xF is extremely low.

Compared  to the current scheme, the above described scheme reduces
the number of false hash table updates  significantly  and  has the
added  advantage  of  releasing  four  valuable  PTE bits for other
purpose.

This idea was jointly developed by Paul Mackerras, Aneesh, Michael
Ellerman and myself.

4K PTE format remains unchanged currently.

Signed-off-by: Ram Pai 

Conflicts:
arch/powerpc/include/asm/book3s/64/hash.h
---
 arch/powerpc/include/asm/book3s/64/hash-4k.h  |  7 +++
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 17 ---
 arch/powerpc/include/asm/book3s/64/hash.h | 12 +++--
 arch/powerpc/mm/hash64_64k.c  | 70 +++
 arch/powerpc/mm/hash_utils_64.c   |  4 +-
 5 files changed, 66 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h 
b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index b4b5e6b..9c2c8f1 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -16,6 +16,13 @@
 #define H_PUD_TABLE_SIZE   (sizeof(pud_t) << H_PUD_INDEX_SIZE)
 #define H_PGD_TABLE_SIZE   (sizeof(pgd_t) << H_PGD_INDEX_SIZE)
 
+#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_F_GIX_SHIFT 56
+
+#define H_PAGE_BUSY_RPAGE_RSV1 /* software: PTE & hash are busy */
+#define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
+
 /* PTE flags to conserve for HPTE identification */
 #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | \
 H_PAGE_F_SECOND | H_PAGE_F_GIX)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h 
b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 9732837..3f49941 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -10,20 +10,21 @@
  * 64k aligned address free up few of the lower bits of RPN for us
  * We steal that here. For more deatils look at pte_pfn/pfn_pte()
  */
-#define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
-#define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
+#define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
+#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_F_GIX_SHIFT 56
+
+#define H_PAGE_BUSY_RPAGE_RPN42 /* software: PTE & hash are busy */
+#define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
+
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
  */
 #define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
 
-/*
- * Used to track subpage group valid if H_PAGE_COMBO is set
- * This overloads H_PAGE_F_GIX and H_PAGE_F_SECOND
- */
-#define H_PAGE_COMBO_VALID (H_PAGE_F_GIX | H_PAGE_F_SECOND)
-
 /* PTE flags to conserve for HPTE identification */
 #define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
 H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 4e957b0..ac049de 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -8,11 +8,8 @@
  *
  */
 #define H_PTE_NONE_MASK_PAG

[RFC v3 15/23] powerpc: Program HPTE key protection bits

2017-06-21 Thread Ram Pai
Map the PTE protection key bits to the HPTE key protection bits,
while creating HPTE  entries.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 5 +
 arch/powerpc/include/asm/pkeys.h  | 7 +++
 arch/powerpc/mm/hash_utils_64.c   | 5 +
 3 files changed, 17 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 6981a52..f7a6ed3 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -90,6 +90,8 @@
 #define HPTE_R_PP0 ASM_CONST(0x8000)
 #define HPTE_R_TS  ASM_CONST(0x4000)
 #define HPTE_R_KEY_HI  ASM_CONST(0x3000)
+#define HPTE_R_KEY_BIT0ASM_CONST(0x2000)
+#define HPTE_R_KEY_BIT1ASM_CONST(0x1000)
 #define HPTE_R_RPN_SHIFT   12
 #define HPTE_R_RPN ASM_CONST(0x0000)
 #define HPTE_R_RPN_3_0 ASM_CONST(0x01fff000)
@@ -104,6 +106,9 @@
 #define HPTE_R_C   ASM_CONST(0x0080)
 #define HPTE_R_R   ASM_CONST(0x0100)
 #define HPTE_R_KEY_LO  ASM_CONST(0x0e00)
+#define HPTE_R_KEY_BIT2ASM_CONST(0x0800)
+#define HPTE_R_KEY_BIT3ASM_CONST(0x0400)
+#define HPTE_R_KEY_BIT4ASM_CONST(0x0200)
 
 #define HPTE_V_1TB_SEG ASM_CONST(0x4000)
 #define HPTE_V_VRMA_MASK   ASM_CONST(0x4001ff00)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 0f3dca8..af3882f 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -27,6 +27,13 @@
((vm_flags & VM_PKEY_BIT3) ? H_PAGE_PKEY_BIT1 : 0x0UL) | \
((vm_flags & VM_PKEY_BIT4) ? H_PAGE_PKEY_BIT0 : 0x0UL))
 
+#define pte_to_hpte_pkey_bits(pteflags)\
+   (((pteflags & H_PAGE_PKEY_BIT0) ? HPTE_R_KEY_BIT0 : 0x0UL) |\
+   ((pteflags & H_PAGE_PKEY_BIT1) ? HPTE_R_KEY_BIT1 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT2) ? HPTE_R_KEY_BIT2 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT3) ? HPTE_R_KEY_BIT3 : 0x0UL) | \
+   ((pteflags & H_PAGE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL))
+
 /*
  * Bits are in BE format.
  * NOTE: key 31, 1, 0 are not used.
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index b3bc5d6..34bc94c 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -230,6 +231,10 @@ unsigned long htab_convert_pte_flags(unsigned long 
pteflags)
 */
rflags |= HPTE_R_M;
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   rflags |= pte_to_hpte_pkey_bits(pteflags);
+#endif
+
return rflags;
 }
 
-- 
1.8.3.1



[RFC v3 12/23] powerpc: Implement sys_pkey_alloc and sys_pkey_free system call

2017-06-21 Thread Ram Pai
sys_pkey_alloc() allocates and returns an available pkey.
sys_pkey_free()  frees up the pkey.

A total of 32 keys are supported on powerpc. However, pkeys 0, 1 and 31
are reserved. So effectively we have 29 pkeys.

Each key  can  be  initialized  to disable read, write and execute
permissions. On powerpc a key can be initialized to disable execute.
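
A minimal userspace usage sketch (error handling omitted):

	int pkey = pkey_alloc(0, PKEY_DISABLE_EXECUTE);
	if (pkey >= 0) {
		/* hand the key to pkey_mprotect() on some range, then */
		pkey_free(pkey);
	}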

Signed-off-by: Ram Pai 
---
 arch/powerpc/Kconfig |  15 
 arch/powerpc/include/asm/book3s/64/mmu.h |  10 +++
 arch/powerpc/include/asm/book3s/64/pgtable.h |  62 ++
 arch/powerpc/include/asm/pkeys.h | 124 +++
 arch/powerpc/include/asm/systbl.h|   2 +
 arch/powerpc/include/asm/unistd.h|   4 +-
 arch/powerpc/include/uapi/asm/unistd.h   |   2 +
 arch/powerpc/mm/Makefile |   1 +
 arch/powerpc/mm/mmu_context_book3s64.c   |   5 ++
 arch/powerpc/mm/pkeys.c  |  88 +++
 10 files changed, 310 insertions(+), 3 deletions(-)
 create mode 100644 arch/powerpc/include/asm/pkeys.h
 create mode 100644 arch/powerpc/mm/pkeys.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index f7c8f99..b6960617 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -871,6 +871,21 @@ config SECCOMP
 
  If unsure, say Y. Only embedded should say N here.
 
+config PPC64_MEMORY_PROTECTION_KEYS
+   prompt "PowerPC Memory Protection Keys"
+   def_bool y
+   # Note: only available in 64-bit mode
+   depends on PPC64 && PPC_64K_PAGES
+   select ARCH_USES_HIGH_VMA_FLAGS
+   select ARCH_HAS_PKEYS
+   ---help---
+ Memory Protection Keys provides a mechanism for enforcing
+ page-based protections, but without requiring modification of the
+ page tables when an application changes protection domains.
+
+ For details, see Documentation/powerpc/protection-keys.txt
+
+ If unsure, say y.
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 77529a3..0c0a2a8 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -108,6 +108,16 @@ struct patb_entry {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
struct list_head iommu_group_mem_list;
 #endif
+
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   /*
+* Each bit represents one protection key.
+* bit set   -> key allocated
+* bit unset -> key available for allocation
+*/
+   u32 pkey_allocation_map;
+   s16 execute_only_pkey; /* key holding execute-only protection */
+#endif
 } mm_context_t;
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 85bc987..87e9a89 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -428,6 +428,68 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1);
 }
 
+
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+
+#include 
+static inline u64 read_amr(void)
+{
+   return mfspr(SPRN_AMR);
+}
+static inline void write_amr(u64 value)
+{
+   mtspr(SPRN_AMR, value);
+}
+static inline u64 read_iamr(void)
+{
+   return mfspr(SPRN_IAMR);
+}
+static inline void write_iamr(u64 value)
+{
+   mtspr(SPRN_IAMR, value);
+}
+static inline u64 read_uamor(void)
+{
+   return mfspr(SPRN_UAMOR);
+}
+static inline void write_uamor(u64 value)
+{
+   mtspr(SPRN_UAMOR, value);
+}
+
+#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+static inline u64 read_amr(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_amr(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+static inline u64 read_uamor(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_uamor(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+static inline u64 read_iamr(void)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+   return -1;
+}
+static inline void write_iamr(u64 value)
+{
+   WARN(1, "%s called with MEMORY PROTECTION KEYS disabled\n", __func__);
+}
+
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
new file mode 100644
index 000..7bc8746
--- /dev/null
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -0,0 +1,124 @@
+#ifndef _ASM_PPC64_PKEYS_

[RFC v3 06/23] powerpc: use helper functions in __hash_page_4K() for 64K PTE

2017-06-21 Thread Ram Pai
Replace redundant code in __hash_page_4K() with the helper
functions get_hidx_gslot() and set_hidx_slot().

Signed-off-by: Ram Pai 
---
 arch/powerpc/mm/hash64_64k.c | 24 ++--
 1 file changed, 6 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 5cbdaa9..cb48a60 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -103,18 +103,12 @@ int __hash_page_4K(unsigned long ea, unsigned long 
access, unsigned long vsid,
if (__rpte_sub_valid(rpte, subpg_index)) {
int ret;
 
-   hash = hpt_hash(vpn, shift, ssize);
-   hidx = __rpte_to_hidx(rpte, subpg_index);
-   if (hidx & _PTEIDX_SECONDARY)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += hidx & _PTEIDX_GROUP_IX;
-
-   ret = mmu_hash_ops.hpte_updatepp(slot, rflags, vpn,
+   gslot = get_hidx_gslot(vpn, shift, ssize, rpte, subpg_index);
+   ret = mmu_hash_ops.hpte_updatepp(gslot, rflags, vpn,
 MMU_PAGE_4K, MMU_PAGE_4K,
 ssize, flags);
/*
-*if we failed because typically the HPTE wasn't really here
+* if we failed because typically the HPTE wasn't really here
 * we try an insertion.
 */
if (ret == -1)
@@ -214,15 +208,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, 
unsigned long vsid,
 * Since we have H_PAGE_BUSY set on ptep, we can be sure
 * nobody is undating hidx.
 */
-   hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
-   rpte.hidx &= ~(0xfUL << (subpg_index << 2));
-   *hidxp = rpte.hidx  | (slot << (subpg_index << 2));
-   new_pte = mark_subptegroup_valid(new_pte, subpg_index);
-   new_pte |=  H_PAGE_HASHPTE;
-   /*
-* check __real_pte for details on matching smp_rmb()
-*/
-   smp_wmb();
+   new_pte |= H_PAGE_HASHPTE;
+   new_pte |= set_hidx_slot(ptep, rpte, subpg_index, slot);
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
 }
-- 
1.8.3.1



[RFC v3 13/23] powerpc: store and restore the pkey state across context switches

2017-06-21 Thread Ram Pai
Store and restore the AMR, IAMR and UAMOR register state of the task
before scheduling out and after scheduling in, respectively.
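
The interesting detail is that restore_sprs() writes an SPR only when the
values differ between the outgoing and incoming threads. Extracted as a
standalone sketch (not part of the patch) to show the pattern:

    /*
     * Illustrative only: skip the mtspr when old and new values match,
     * avoiding a needless SPR write on every context switch.
     */
    static inline void restore_amr_if_changed(struct thread_struct *old,
                                              struct thread_struct *new)
    {
        if (old->amr != new->amr)
            mtspr(SPRN_AMR, new->amr);
    }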

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/processor.h |  5 +
 arch/powerpc/kernel/process.c| 18 ++
 2 files changed, 23 insertions(+)

diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index a2123f2..1f714df 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -310,6 +310,11 @@ struct thread_struct {
struct thread_vr_state ckvr_state; /* Checkpointed VR state */
unsigned long   ckvrsave; /* Checkpointed VRSAVE */
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   unsigned long   amr;
+   unsigned long   iamr;
+   unsigned long   uamor;
+#endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
void*   kvm_shadow_vcpu; /* KVM internal data */
 #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index baae104..37d001a 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1096,6 +1096,11 @@ static inline void save_sprs(struct thread_struct *t)
t->tar = mfspr(SPRN_TAR);
}
 #endif
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   t->amr = mfspr(SPRN_AMR);
+   t->iamr = mfspr(SPRN_IAMR);
+   t->uamor = mfspr(SPRN_UAMOR);
+#endif
 }
 
 static inline void restore_sprs(struct thread_struct *old_thread,
@@ -1131,6 +1136,14 @@ static inline void restore_sprs(struct thread_struct *old_thread,
mtspr(SPRN_TAR, new_thread->tar);
}
 #endif
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (old_thread->amr != new_thread->amr)
+   mtspr(SPRN_AMR, new_thread->amr);
+   if (old_thread->iamr != new_thread->iamr)
+   mtspr(SPRN_IAMR, new_thread->iamr);
+   if (old_thread->uamor != new_thread->uamor)
+   mtspr(SPRN_UAMOR, new_thread->uamor);
+#endif
 }
 
 struct task_struct *__switch_to(struct task_struct *prev,
@@ -1686,6 +1699,11 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
current->thread.tm_texasr = 0;
current->thread.tm_tfiar = 0;
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   current->thread.amr   = 0x0ul;
+   current->thread.iamr  = 0x0ul;
+   current->thread.uamor = 0x0ul;
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
 }
 EXPORT_SYMBOL(start_thread);
 
-- 
1.8.3.1



[RFC v3 11/23] x86: key creation with PKEY_DISABLE_EXECUTE is disallowed

2017-06-21 Thread Ram Pai
x86 does not support disabling execute permissions on a pkey.
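
From userspace this means pkey_alloc() rejects the flag; a hypothetical
check (PKEY_DISABLE_EXECUTE's value and the SYS_pkey_alloc syscall number
are assumptions taken from this series and your libc headers, and the test
itself is not part of the patch):

    #include <errno.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef PKEY_DISABLE_EXECUTE
    #define PKEY_DISABLE_EXECUTE 0x4  /* assumed value, from the powerpc series */
    #endif

    int main(void)
    {
        long pkey = syscall(SYS_pkey_alloc, 0, PKEY_DISABLE_EXECUTE);

        if (pkey < 0 && errno == EINVAL)
            printf("PKEY_DISABLE_EXECUTE rejected, as expected on x86\n");
        return 0;
    }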

Signed-off-by: Ram Pai 
---
 arch/x86/kernel/fpu/xstate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c24ac1e..d582631 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -900,6 +900,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return -EINVAL;
 
+   if (init_val & PKEY_DISABLE_EXECUTE)
+   return -EINVAL;
+
/* Set the bits we need in PKRU:  */
if (init_val & PKEY_DISABLE_ACCESS)
new_pkru_bits |= PKRU_AD_BIT;
-- 
1.8.3.1



[RFC v3 04/23] powerpc: Free up four 64K PTE bits in 64K backed HPTE pages

2017-06-21 Thread Ram Pai
Rearrange 64K PTE bits to free up bits 3, 4, 5 and 6 in the
64K-backed HPTE pages. Together with the earlier patch, this
entirely frees up those four bits in the 64K PTE. The bit
numbers are big-endian, as defined in ISA 3.0.

This patch makes the following changes to a 64K PTE that is
backed by a 64K HPTE:

H_PAGE_F_SECOND, which occupied bit 4, moves to the second
part of the pte.
H_PAGE_F_GIX, which occupied bits 5, 6 and 7, also moves to
the second part of the pte.

Since bit 7 is now freed up, H_PAGE_BUSY moves from bit 9 to
bit 7, minimizing gaps so that contiguous bits can be
allocated if needed in the future.

The second part of the PTE will hold
(H_PAGE_F_SECOND | H_PAGE_F_GIX) at bits 60, 61, 62 and 63.

The above PTE changes apply to hugetlb pages as well.
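
Concretely (a sketch based on the 4K-subpage path elsewhere in this series,
not an authoritative recipe): with the four bits relocated, recording which
hash slot backs a page becomes a plain shift-and-or into the second half of
the PTE, one 4-bit nibble per subpage:

    /* Sketch: install a 4-bit slot value in the second part of the PTE. */
    unsigned long *hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);

    rpte.hidx &= ~(0xfUL << (subpg_index << 2));       /* clear the old nibble */
    *hidxp = rpte.hidx | (slot << (subpg_index << 2)); /* write the new one */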

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/hash-64k.h | 28 +--
 arch/powerpc/mm/hash64_64k.c  | 17 
 arch/powerpc/mm/hugetlbpage-hash64.c  | 16 ++-
 3 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 4bac70a..7b5dbf3 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -12,11 +12,8 @@
  */
 #define H_PAGE_COMBO   _RPAGE_RPN0 /* this is a combo 4k page */
 #define H_PAGE_4K_PFN  _RPAGE_RPN1 /* PFN is for a single 4k page */
-#define H_PAGE_F_SECOND_RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
-#define H_PAGE_F_GIX   (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
-#define H_PAGE_F_GIX_SHIFT 56
 
-#define H_PAGE_BUSY_RPAGE_RPN42 /* software: PTE & hash are busy */
+#define H_PAGE_BUSY_RPAGE_RPN44 /* software: PTE & hash are busy */
 #define H_PAGE_HASHPTE _RPAGE_RPN43/* PTE has associated HPTE */
 
 /*
@@ -26,8 +23,7 @@
 #define H_PAGE_THP_HUGE  H_PAGE_4K_PFN
 
 /* PTE flags to conserve for HPTE identification */
-#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_F_SECOND | \
-H_PAGE_F_GIX | H_PAGE_HASHPTE | H_PAGE_COMBO)
+#define _PAGE_HPTEFLAGS (H_PAGE_BUSY | H_PAGE_HASHPTE | H_PAGE_COMBO)
 /*
  * we support 16 fragments per PTE page of 64K size.
  */
@@ -55,24 +51,18 @@ static inline real_pte_t __real_pte(pte_t pte, pte_t *ptep)
unsigned long *hidxp;
 
rpte.pte = pte;
-   rpte.hidx = 0;
-   if (pte_val(pte) & H_PAGE_COMBO) {
-   /*
-* Make sure we order the hidx load against the H_PAGE_COMBO
-* check. The store side ordering is done in __hash_page_4K
-*/
-   smp_rmb();
-   hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
-   rpte.hidx = *hidxp;
-   }
+   /*
+* The store side ordering is done in set_hidx_slot()
+*/
+   smp_rmb();
+   hidxp = (unsigned long *)(ptep + PTRS_PER_PTE);
+   rpte.hidx = *hidxp;
return rpte;
 }
 
static inline unsigned long __rpte_to_hidx(real_pte_t rpte, unsigned long index)
 {
-   if ((pte_val(rpte.pte) & H_PAGE_COMBO))
-   return (rpte.hidx >> (index<<2)) & 0xf;
-   return (pte_val(rpte.pte) >> H_PAGE_F_GIX_SHIFT) & 0xf;
+   return ((rpte.hidx >> (index<<2)) & 0xfUL);
 }
 
 static inline unsigned long set_hidx_slot(pte_t *ptep, real_pte_t rpte,
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index a16cd28..5cbdaa9 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -231,6 +231,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
unsigned long vsid, pte_t *ptep, unsigned long trap,
unsigned long flags, int ssize)
 {
+   real_pte_t rpte;
unsigned long hpte_group;
unsigned long rflags, pa;
unsigned long old_pte, new_pte;
@@ -267,6 +268,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
rflags = htab_convert_pte_flags(new_pte);
+   rpte = __real_pte(__pte(old_pte), ptep);
 
if (cpu_has_feature(CPU_FTR_NOEXECUTE) &&
!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
@@ -274,16 +276,13 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 
vpn  = hpt_vpn(ea, vsid, ssize);
if (unlikely(old_pte & H_PAGE_HASHPTE)) {
+   unsigned long gslot;
+
/*
 * There MIGHT be an HPTE for this pte
 */
-   hash = hpt_hash(vpn, shift, ssize);
-   if (old_pte & H_PAGE_F_SECOND)
-   hash = ~hash;
-   slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-   slot += (old_pte & H_PAGE_F_GIX) >> H_PAGE_F_GIX_SHIFT;
-
-   if (mmu_hash_ops.hpte_upd

[RFC v3 17/23] powerpc: Handle exceptions caused by violation of pkey protection

2017-06-21 Thread Ram Pai
Handle Data and Instruction exceptions caused by memory
protection keys.
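
From userspace, such a violation surfaces as SIGSEGV with si_code set to
SEGV_PKUERR rather than SEGV_ACCERR. A minimal illustration (not part of
the patch; assumes the libc headers expose SEGV_PKUERR, and uses printf in
a signal handler only for brevity):

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>

    static void handler(int sig, siginfo_t *si, void *ctx)
    {
        if (si->si_code == SEGV_PKUERR)
            printf("pkey fault at %p\n", si->si_addr);
        _exit(1);
    }

    int main(void)
    {
        struct sigaction sa = { .sa_sigaction = handler,
                                .sa_flags = SA_SIGINFO };

        sigaction(SIGSEGV, &sa, NULL);
        /* ... access memory whose pkey denies it here ... */
        return 0;
    }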

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/mmu_context.h | 12 +
 arch/powerpc/include/asm/pkeys.h   |  9 
 arch/powerpc/include/asm/reg.h |  2 +-
 arch/powerpc/mm/fault.c| 20 
 arch/powerpc/mm/pkeys.c| 90 ++
 5 files changed, 132 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index da7e943..71fffe0 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -175,11 +175,23 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 {
 }
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+bool arch_pte_access_permitted(pte_t pte, bool write);
+bool arch_vma_access_permitted(struct vm_area_struct *vma,
+   bool write, bool execute, bool foreign);
+#else /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+static inline bool arch_pte_access_permitted(pte_t pte, bool write)
+{
+   /* by default, allow everything */
+   return true;
+}
 static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
 {
/* by default, allow everything */
return true;
 }
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index af3882f..a83722e 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -14,6 +14,15 @@
VM_PKEY_BIT3 | \
VM_PKEY_BIT4)
 
+static inline u16 pte_flags_to_pkey(unsigned long pte_flags)
+{
+   return ((pte_flags & H_PAGE_PKEY_BIT4) ? 0x1 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT3) ? 0x2 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT2) ? 0x4 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT1) ? 0x8 : 0x0) |
+   ((pte_flags & H_PAGE_PKEY_BIT0) ? 0x10 : 0x0);
+}
+
 #define pkey_to_vmflag_bits(key) (((key & 0x1UL) ? VM_PKEY_BIT0 : 0x0UL) | \
((key & 0x2UL) ? VM_PKEY_BIT1 : 0x0UL) |\
((key & 0x4UL) ? VM_PKEY_BIT2 : 0x0UL) |\
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index ba110dd..6e2a860 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -286,7 +286,7 @@
#define   DSISR_SET_RC		0x00040000	/* Failed setting of R/C bits */
#define   DSISR_PGDIRFAULT	0x00020000	/* Fault on page directory */
 #define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | DSISR_PAGEATTR_CONFLT | \
-   DSISR_BADACCESS | DSISR_BIT43)
+   DSISR_BADACCESS | DSISR_KEYFAULT | DSISR_BIT43)
 #define SPRN_TBRL  0x10C   /* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU  0x10D   /* Time Base Read Upper Register (user, R/O) */
 #define SPRN_CIR   0x11B   /* Chip Information Register (hyper, R/0) */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 3a7d580..3d71984 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -261,6 +261,13 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
}
 #endif
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (error_code & DSISR_KEYFAULT) {
+   code = SEGV_PKUERR;
+   goto bad_area_nosemaphore;
+   }
+#endif /*  CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
/* We restore the interrupt state now */
if (!arch_irq_disabled_regs(regs))
local_irq_enable();
@@ -441,6 +448,19 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
 #endif /* CONFIG_PPC_STD_MMU */
 
+#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
+   if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+   is_exec, 0)) {
+   code = SEGV_PKUERR;
+   goto bad_area;
+   }
+#endif /* CONFIG_PPC64_MEMORY_PROTECTION_KEYS */
+
+   /* handle_mm_fault() needs to know if it's an instruction
+    * access fault.
+    */
+   if (is_exec)
+   flags |= FAULT_FLAG_INSTRUCTION;
/*
 * If for any reason at all we couldn't handle the fault,
 * make sure we exit gracefully rather than endlessly redo
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 11a32b3..439241a 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -27,6 +27,37 @@ static inline bool pkey_allows_readwrite(int pkey)
return !(read_amr() & ((AMR_AD_BIT|AMR_WD_BIT) << pkey_shift));
 }
 
+static inline bool pkey_allows_read(int pkey)
+{
+   in

[RFC v3 23/23] procfs: display the protection-key number associated with a vma

2017-06-21 Thread Ram Pai
Display the pkey number associated with the vma in smaps of a task.
The key will be seen as below:

VmFlags: rd wr mr mw me dw ac key=0
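
A consumer can pick the key out of the VmFlags line; an illustrative
(hypothetical) scan of the current process's smaps:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char line[512];
        FILE *f = fopen("/proc/self/smaps", "r");

        if (!f)
            return 1;
        while (fgets(line, sizeof(line), f)) {
            char *key;

            if (!strncmp(line, "VmFlags:", 8) &&
                (key = strstr(line, "key=")))
                fputs(key, stdout);   /* e.g. "key=0\n" */
        }
        fclose(f);
        return 0;
    }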

Signed-off-by: Ram Pai 
---
 Documentation/filesystems/proc.txt |  3 ++-
 fs/proc/task_mmu.c | 22 +++---
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 4cddbce..a8c74aa 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -423,7 +423,7 @@ SwapPss:   0 kB
 KernelPageSize:4 kB
 MMUPageSize:   4 kB
 Locked:0 kB
-VmFlags: rd ex mr mw me dw
+VmFlags: rd ex mr mw me dw key=
 
 the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
@@ -491,6 +491,7 @@ manner. The codes are the following:
 hg  - huge page advise flag
 nh  - no-huge page advise flag
 mg  - mergable advise flag
+key= - the memory protection key number
 
 Note that there is no guarantee that every flag and associated mnemonic will
 be present in all further kernel releases. Things get changed, the flags may
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2ddc298..d2eb096 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1,4 +1,6 @@
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -666,22 +668,20 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_MERGEABLE)]   = "mg",
[ilog2(VM_UFFD_MISSING)]= "um",
[ilog2(VM_UFFD_WP)] = "uw",
-#ifdef CONFIG_ARCH_HAS_PKEYS
-   /* These come out via ProtectionKey: */
-   [ilog2(VM_PKEY_BIT0)]   = "",
-   [ilog2(VM_PKEY_BIT1)]   = "",
-   [ilog2(VM_PKEY_BIT2)]   = "",
-   [ilog2(VM_PKEY_BIT3)]   = "",
-#endif /* CONFIG_ARCH_HAS_PKEYS */
-#ifdef CONFIG_PPC64_MEMORY_PROTECTION_KEYS
-   /* Additional bit in ProtectionKey: */
-   [ilog2(VM_PKEY_BIT4)]   = "",
-#endif
};
size_t i;
 
seq_puts(m, "VmFlags: ");
for (i = 0; i < BITS_PER_LONG; i++) {
+#ifdef CONFIG_ARCH_HAS_PKEYS
+   if (i == ilog2(VM_PKEY_BIT0)) {
+   int keyvalue = vma_pkey(vma);
+
+   i += ilog2(arch_max_pkey())-1;
+   seq_printf(m, "key=%d ", keyvalue);
+   continue;
+   }
+#endif /* CONFIG_ARCH_HAS_PKEYS */
if (!mnemonics[i][0])
continue;
if (vma->vm_flags & (1UL << i)) {
-- 
1.8.3.1



[RFC v3 22/23] Documentation: PowerPC specific updates to memory protection keys

2017-06-21 Thread Ram Pai
Add documentation updates that capture PowerPC specific changes.

Signed-off-by: Ram Pai 
---
 Documentation/vm/protection-keys.txt | 65 +---
 1 file changed, 45 insertions(+), 20 deletions(-)

diff --git a/Documentation/vm/protection-keys.txt b/Documentation/vm/protection-keys.txt
index b643045..965ad75 100644
--- a/Documentation/vm/protection-keys.txt
+++ b/Documentation/vm/protection-keys.txt
@@ -1,21 +1,46 @@
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
-which will be found on future Intel CPUs.
+Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature found in
+newer generations of Intel CPUs, and on POWER7 and higher PowerPC CPUs.
 
 Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains.  It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
-
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key.  Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
-thread a different set of protections from every other thread.
-
-There are two new instructions (RDPKRU/WRPKRU) for reading and writing
-to the new register.  The feature is only available in 64-bit mode,
-even though there is theoretically space in the PAE PTEs.  These
-permissions are enforced on data access only and have no effect on
+protections, but without requiring modification of the page tables when an
+application changes protection domains.
+
+
+On Intel:
+
+   It works by dedicating 4 previously ignored bits in each page table
+   entry to a "protection key", giving 16 possible keys.
+
+   There is also a new user-accessible register (PKRU) with two separate
+   bits (Access Disable and Write Disable) for each key.  Being a CPU
+   register, PKRU is inherently thread-local, potentially giving each
+   thread a different set of protections from every other thread.
+
+   There are two new instructions (RDPKRU/WRPKRU) for reading and writing
+   to the new register.  The feature is only available in 64-bit mode,
+   even though there is theoretically space in the PAE PTEs.  These
+   permissions are enforced on data access only and have no effect on
+   instruction fetches.
+
+
+On PowerPC:
+
+   It works by dedicating 5 page table entry bits to a "protection key",
+   giving 32 possible keys.
+
+   There is a user-accessible register (AMR) with two separate bits,
+   Access Disable and Write Disable, for each key.  Being a CPU
+   register, AMR is inherently thread-local, potentially giving each
+   thread a different set of protections from every other thread.  NOTE:
+   Disabling read permission does not disable write and vice-versa.
+
+   The feature is available in 64-bit HPTE mode only.
+   'mfspr mem, 0xd' reads the AMR register;
+   'mtspr 0xd, mem' writes into the AMR register.
+
+
+
+Permissions are enforced on data access only and have no effect on
 instruction fetches.
 
 === Syscalls ===
@@ -28,9 +53,9 @@ There are 3 system calls which directly interact with pkeys:
  unsigned long prot, int pkey);
 
 Before a pkey can be used, it must first be allocated with
-pkey_alloc().  An application calls the WRPKRU instruction
+pkey_alloc().  An application calls the WRPKRU/AMR instruction
 directly in order to change access permissions to memory covered
-with a key.  In this example WRPKRU is wrapped by a C function
+with a key.  In this example WRPKRU/AMR is wrapped by a C function
 called pkey_set().
 
int real_prot = PROT_READ|PROT_WRITE;
@@ -52,11 +77,11 @@ is no longer in use:
munmap(ptr, PAGE_SIZE);
pkey_free(pkey);
 
-(Note: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
+(Note: pkey_set() is a wrapper for the RDPKRU/WRPKRU instructions on x86, or AMR accesses on PowerPC.
  An example implementation can be found in
  tools/testing/selftests/x86/protection_keys.c)
 
-=== Behavior ===
+=== Behavior =
 
 The kernel attempts to make protection keys consistent with the
 behavior of a plain mprotect().  For instance if you do this:
-- 
1.8.3.1
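
The PowerPC section above says permissions are toggled by writing the AMR;
a minimal userspace pkey_set() sketch under the assumptions stated there
(32 keys, two AMR bits per key, key 0 in the leftmost pair; SPR 13 is the
AMR). This is illustrative, not the selftest's actual implementation:

    #include <stdint.h>

    static inline uint64_t read_amr(void)
    {
        uint64_t amr;

        asm volatile("mfspr %0, 13" : "=r"(amr));
        return amr;
    }

    static inline void write_amr(uint64_t amr)
    {
        asm volatile("mtspr 13, %0" : : "r"(amr));
    }

    /* rights is the 2-bit Access-Disable/Write-Disable pair for the key */
    static inline void pkey_set(int pkey, uint64_t rights)
    {
        int shift = (32 - pkey - 1) * 2;  /* assumed bit layout */
        uint64_t amr = read_amr();

        amr &= ~(3UL << shift);           /* clear the key's two bits */
        amr |= rights << shift;           /* install the new rights */
        write_amr(amr);
    }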



[RFC v3 19/23] selftest: Move protection key selftest to arch-neutral directory

2017-06-21 Thread Ram Pai
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/Makefile   |1 +
 tools/testing/selftests/vm/pkey-helpers.h |  219 
 tools/testing/selftests/vm/protection_keys.c  | 1395 +
 tools/testing/selftests/x86/Makefile  |2 +-
 tools/testing/selftests/x86/pkey-helpers.h|  219 
 tools/testing/selftests/x86/protection_keys.c | 1395 -
 6 files changed, 1616 insertions(+), 1615 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 tools/testing/selftests/vm/protection_keys.c
 delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h
 delete mode 100644 tools/testing/selftests/x86/protection_keys.c

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index cbb29e4..1d32f78 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -17,6 +17,7 @@ TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
 TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += protection_keys
 
 TEST_PROGS := run_vmtests
 
diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
new file mode 100644
index 000..b202939
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -0,0 +1,219 @@
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define NR_PKEYS 16
+#define PKRU_BITS_PER_PKEY 2
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+static inline void sigsafe_printf(const char *format, ...)
+{
+   va_list ap;
+
+   va_start(ap, format);
+   if (!dprint_in_signal) {
+   vprintf(format, ap);
+   } else {
+   int len = vsnprintf(dprint_in_signal_buffer,
+   DPRINT_IN_SIGNAL_BUF_SIZE,
+   format, ap);
+   /*
+* len is amount that would have been printed,
+* but actual write is truncated at BUF_SIZE.
+*/
+   if (len > DPRINT_IN_SIGNAL_BUF_SIZE)
+   len = DPRINT_IN_SIGNAL_BUF_SIZE;
+   write(1, dprint_in_signal_buffer, len);
+   }
+   va_end(ap);
+}
+#define dprintf_level(level, args...) do { \
+   if (level <= DEBUG_LEVEL)   \
+   sigsafe_printf(args);   \
+   fflush(NULL);   \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern unsigned int shadow_pkru;
+static inline unsigned int __rdpkru(void)
+{
+   unsigned int eax, edx;
+   unsigned int ecx = 0;
+   unsigned int pkru;
+
+   asm volatile(".byte 0x0f,0x01,0xee\n\t"
+: "=a" (eax), "=d" (edx)
+: "c" (ecx));
+   pkru = eax;
+   return pkru;
+}
+
+static inline unsigned int _rdpkru(int line)
+{
+   unsigned int pkru = __rdpkru();
+
+   dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
+   line, pkru, shadow_pkru);
+   assert(pkru == shadow_pkru);
+
+   return pkru;
+}
+
+#define rdpkru() _rdpkru(__LINE__)
+
+static inline void __wrpkru(unsigned int pkru)
+{
+   unsigned int eax = pkru;
+   unsigned int ecx = 0;
+   unsigned int edx = 0;
+
+   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   asm volatile(".byte 0x0f,0x01,0xef\n\t"
+: : "a" (eax), "c" (ecx), "d" (edx));
+   assert(pkru == __rdpkru());
+}
+
+static inline void wrpkru(unsigned int pkru)
+{
+   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   /* will do the shadow check for us: */
+   rdpkru();
+   __wrpkru(pkru);
+   shadow_pkru = pkru;
+   dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
+}
+
+/*
+ * These are technically racy. since something could
+ * change PKRU between the read and the write.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+   unsigned int pkru = rdpkru();
+   int bit = pkey * 2;
+
+   if (do_allow)
+   pkru &= ~(1<<bit);
+   else
+   pkru |= (1<<bit);
+
+   dprintf4("pkru now: %08x\n", rdpkru());
+   wrpkru(pkru);
+}
 * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys
 * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel
 * do a plain mprotect() to

[RFC v3 21/23] Documentation: Move protection key documentation to arch-neutral directory

2017-06-21 Thread Ram Pai
Since PowerPC and Intel both support memory protection keys, move
the documentation to an arch-neutral directory.

Signed-off-by: Ram Pai 
---
 Documentation/vm/protection-keys.txt  | 85 +++
 Documentation/x86/protection-keys.txt | 85 ---
 2 files changed, 85 insertions(+), 85 deletions(-)
 create mode 100644 Documentation/vm/protection-keys.txt
 delete mode 100644 Documentation/x86/protection-keys.txt

diff --git a/Documentation/vm/protection-keys.txt b/Documentation/vm/protection-keys.txt
new file mode 100644
index 000..b643045
--- /dev/null
+++ b/Documentation/vm/protection-keys.txt
@@ -0,0 +1,85 @@
+Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
+which will be found on future Intel CPUs.
+
+Memory Protection Keys provides a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables
+when an application changes protection domains.  It works by
+dedicating 4 previously ignored bits in each page table entry to a
+"protection key", giving 16 possible keys.
+
+There is also a new user-accessible register (PKRU) with two separate
+bits (Access Disable and Write Disable) for each key.  Being a CPU
+register, PKRU is inherently thread-local, potentially giving each
+thread a different set of protections from every other thread.
+
+There are two new instructions (RDPKRU/WRPKRU) for reading and writing
+to the new register.  The feature is only available in 64-bit mode,
+even though there is theoretically space in the PAE PTEs.  These
+permissions are enforced on data access only and have no effect on
+instruction fetches.
+
+=== Syscalls ===
+
+There are 3 system calls which directly interact with pkeys:
+
+   int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
+   int pkey_free(int pkey);
+   int pkey_mprotect(unsigned long start, size_t len,
+ unsigned long prot, int pkey);
+
+Before a pkey can be used, it must first be allocated with
+pkey_alloc().  An application calls the WRPKRU instruction
+directly in order to change access permissions to memory covered
+with a key.  In this example WRPKRU is wrapped by a C function
+called pkey_set().
+
+   int real_prot = PROT_READ|PROT_WRITE;
+   pkey = pkey_alloc(0, PKEY_DENY_WRITE);
+   ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+   ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
+   ... application runs here
+
+Now, if the application needs to update the data at 'ptr', it can
+gain access, do the update, then remove its write access:
+
+   pkey_set(pkey, 0); // clear PKEY_DENY_WRITE
+   *ptr = foo; // assign something
+   pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again
+
+Now when it frees the memory, it will also free the pkey since it
+is no longer in use:
+
+   munmap(ptr, PAGE_SIZE);
+   pkey_free(pkey);
+
+(Note: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
+ An example implementation can be found in
+ tools/testing/selftests/x86/protection_keys.c)
+
+=== Behavior ===
+
+The kernel attempts to make protection keys consistent with the
+behavior of a plain mprotect().  For instance if you do this:
+
+   mprotect(ptr, size, PROT_NONE);
+   something(ptr);
+
+you can expect the same effects with protection keys when doing this:
+
+   pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | PKEY_DISABLE_READ);
+   pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey);
+   something(ptr);
+
+That should be true whether something() is a direct access to 'ptr'
+like:
+
+   *ptr = foo;
+
+or when the kernel does the access on the application's behalf like
+with a read():
+
+   read(fd, ptr, 1);
+
+The kernel will send a SIGSEGV in both cases, but si_code will be set
+to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when
+the plain mprotect() permissions are violated.
diff --git a/Documentation/x86/protection-keys.txt b/Documentation/x86/protection-keys.txt
deleted file mode 100644
index b643045..000
--- a/Documentation/x86/protection-keys.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
-which will be found on future Intel CPUs.
-
-Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains.  It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
-
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key.  Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
-thr

[RFC v3 16/23] powerpc: Macro the mask used for checking DSI exception

2017-06-21 Thread Ram Pai
Replace the magic number used to check for a DSI exception
with a meaningful value.
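
A quick sanity check (standalone and illustrative; the full DSISR values
are as restored in the hunk below): the high 16 bits of the new symbolic
mask equal the old magic number 0xa410 tested by `andis.`:

    #define DSISR_BIT32             0x80000000u
    #define DSISR_PAGEATTR_CONFLT   0x20000000u
    #define DSISR_BADACCESS         0x04000000u
    #define DSISR_BIT43             0x00100000u
    #define DSISR_PAGE_FAULT_MASK   (DSISR_BIT32 | DSISR_PAGEATTR_CONFLT | \
                                     DSISR_BADACCESS | DSISR_BIT43)

    _Static_assert((DSISR_PAGE_FAULT_MASK >> 16) == 0xa410,
                   "mask matches the old magic number");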

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/reg.h   | 7 ++-
 arch/powerpc/kernel/exceptions-64s.S | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 7e50e47..ba110dd 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -272,16 +272,21 @@
 #define SPRN_DAR   0x013   /* Data Address Register */
 #define SPRN_DBCR  0x136   /* e300 Data Breakpoint Control Reg */
 #define SPRN_DSISR 0x012   /* Data Storage Interrupt Status Register */
+#define   DSISR_BIT32		0x80000000	/* not defined */
#define   DSISR_NOHPTE		0x40000000	/* no translation found */
+#define   DSISR_PAGEATTR_CONFLT	0x20000000	/* page attribute conflict */
+#define   DSISR_BIT35		0x10000000	/* not defined */
#define   DSISR_PROTFAULT	0x08000000	/* protection fault */
#define   DSISR_BADACCESS	0x04000000	/* bad access to CI or G */
#define   DSISR_ISSTORE		0x02000000	/* access was a store */
#define   DSISR_DABRMATCH	0x00400000	/* hit data breakpoint */
-#define   DSISR_NOSEGMENT	0x00200000	/* SLB miss */
#define   DSISR_KEYFAULT	0x00200000	/* Key fault */
+#define   DSISR_BIT43		0x00100000	/* not defined */
#define   DSISR_UNSUPP_MMU	0x00080000	/* Unsupported MMU config */
#define   DSISR_SET_RC		0x00040000	/* Failed setting of R/C bits */
#define   DSISR_PGDIRFAULT	0x00020000	/* Fault on page directory */
+#define   DSISR_PAGE_FAULT_MASK (DSISR_BIT32 | DSISR_PAGEATTR_CONFLT | \
+   DSISR_BADACCESS | DSISR_BIT43)
 #define SPRN_TBRL  0x10C   /* Time Base Read Lower Register (user, R/O) */
 #define SPRN_TBRU  0x10D   /* Time Base Read Upper Register (user, R/O) */
 #define SPRN_CIR   0x11B   /* Chip Information Register (hyper, R/0) */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ae418b8..3fd0528 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1411,7 +1411,7 @@ USE_TEXT_SECTION()
.balign IFETCH_ALIGN_BYTES
 do_hash_page:
 #ifdef CONFIG_PPC_STD_MMU_64
-   andis.  r0,r4,0xa410/* weird error? */
+   andis.  r0,r4,DSISR_PAGE_FAULT_MASK@h
bne-handle_page_fault   /* if not, try to insert a HPTE */
andis.  r0,r4,DSISR_DABRMATCH@h
bne-handle_dabr_fault
-- 
1.8.3.1



[RFC v3 20/23] selftest: PowerPC specific test updates to memory protection keys

2017-06-21 Thread Ram Pai
Abstract the arch-specific code into the header file, and add
powerpc-specific changes:

a) added a powerpc-specific memory allocator for 4k-backed HPTE pages.
b) added three test cases where the key is associated after the page is
accessed/allocated/mapped.
c) cleaned up the code to make checkpatch.pl happy.
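
The core of the abstraction is pkey_to_shift() in the hunk below: x86
numbers its two PKRU bits per key from the right, while powerpc numbers
its AMR bit pairs from the left. A standalone worked example (illustrative
only, mirroring the header's logic):

    #include <assert.h>

    #define PKRU_BITS_PER_PKEY 2

    static int pkey_to_shift(int nr_pkeys, int pkey, int is_powerpc)
    {
        return is_powerpc ?
            (nr_pkeys - pkey - 1) * PKRU_BITS_PER_PKEY :
            pkey * PKRU_BITS_PER_PKEY;
    }

    int main(void)
    {
        assert(pkey_to_shift(16, 1, 0) == 2);   /* x86: key 1 -> PKRU bits 2-3 */
        assert(pkey_to_shift(32, 1, 1) == 60);  /* powerpc: key 1 -> AMR bits 60-61 */
        return 0;
    }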

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h| 230 +--
 tools/testing/selftests/vm/protection_keys.c | 562 ---
 2 files changed, 513 insertions(+), 279 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h b/tools/testing/selftests/vm/pkey-helpers.h
index b202939..69bfa89 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -12,13 +12,72 @@
 #include 
 #include 
 
-#define NR_PKEYS 16
-#define PKRU_BITS_PER_PKEY 2
+/* Define some kernel-like types */
+#define  u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#ifdef __i386__ /* arch */
+
+#define SYS_mprotect_key 380
+#define SYS_pkey_alloc  381
+#define SYS_pkey_free   382
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x14
+
+#define NR_PKEYS   16
+#define NR_RESERVED_PKEYS  1
+#define PKRU_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x1
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<21)
+
+#define INIT_PRKU 0x0UL
+
+#elif __powerpc64__ /* arch */
+
+#define SYS_mprotect_key 386
+#define SYS_pkey_alloc  384
+#define SYS_pkey_free   385
+#define si_pkey_offset 0x20
+#define REG_IP_IDX PT_NIP
+#define REG_TRAPNO PT_TRAP
+#define REG_AMR45
+#define gregs gp_regs
+#define fpregs fp_regs
+
+#define NR_PKEYS   32
+#define NR_RESERVED_PKEYS  3
+#define PKRU_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x3  /* disable read and write */
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<24)
+
+#define INIT_PRKU 0x3UL
+#else /* arch */
+
+   NOT SUPPORTED
+
+#endif /* arch */
+
 
 #ifndef DEBUG_LEVEL
 #define DEBUG_LEVEL 0
 #endif
 #define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+
+
+static inline u32 pkey_to_shift(int pkey)
+{
+#ifdef __i386__ /* arch */
+   return pkey * PKRU_BITS_PER_PKEY;
+#elif __powerpc64__ /* arch */
+   return (NR_PKEYS - pkey - 1) * PKRU_BITS_PER_PKEY;
+#endif /* arch */
+}
+
+
 extern int dprint_in_signal;
 extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
 static inline void sigsafe_printf(const char *format, ...)
@@ -53,53 +112,76 @@ static inline void sigsafe_printf(const char *format, ...)
 #define dprintf3(args...) dprintf_level(3, args)
 #define dprintf4(args...) dprintf_level(4, args)
 
-extern unsigned int shadow_pkru;
-static inline unsigned int __rdpkru(void)
+extern u64 shadow_pkey_reg;
+
+static inline u64 __rdpkey_reg(void)
 {
+#ifdef __i386__ /* arch */
unsigned int eax, edx;
unsigned int ecx = 0;
-   unsigned int pkru;
+   unsigned int pkey_reg;
 
asm volatile(".byte 0x0f,0x01,0xee\n\t"
 : "=a" (eax), "=d" (edx)
 : "c" (ecx));
-   pkru = eax;
-   return pkru;
+#elif __powerpc64__ /* arch */
+   u64 eax;
+   u64 pkey_reg;
+
+   asm volatile("mfspr %0, 0xd" : "=r" ((u64)(eax)));
+#endif /* arch */
+   pkey_reg = (u64)eax;
+   return pkey_reg;
 }
 
-static inline unsigned int _rdpkru(int line)
+static inline u64 _rdpkey_reg(int line)
 {
-   unsigned int pkru = __rdpkru();
+   u64 pkey_reg = __rdpkey_reg();
 
-   dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
-   line, pkru, shadow_pkru);
-   assert(pkru == shadow_pkru);
+   dprintf4("rdpkey_reg(line=%d) pkey_reg: %lx shadow: %lx\n",
+   line, pkey_reg, shadow_pkey_reg);
+   assert(pkey_reg == shadow_pkey_reg);
 
-   return pkru;
+   return pkey_reg;
 }
 
-#define rdpkru() _rdpkru(__LINE__)
+#define rdpkey_reg() _rdpkey_reg(__LINE__)
 
-static inline void __wrpkru(unsigned int pkru)
+static inline void __wrpkey_reg(u64 pkey_reg)
 {
-   unsigned int eax = pkru;
+#ifdef __i386__ /* arch */
+   unsigned int eax = pkey_reg;
unsigned int ecx = 0;
unsigned int edx = 0;
 
-   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   dprintf4("%s() changing %lx to %lx\n",
+__func__, __rdpkey_reg(), pkey_reg);
asm volatile(".byte 0x0f,0x01,0xef\n\t"
 : : "a" (eax), "c" (ecx), "d" (edx));
-   assert(pkru == __rdpkru());
+   dprintf4("%s() PKRUP after changing %lx to %lx\n",
+   __func__, __rdpkey_reg(), pkey_reg);
+#else /* arch */
+   u64 eax = pkey_reg;
+
+   dprintf4("%s() changing %llx to %llx\n",
+
