[PATCH] x86: Don't clobber top of pt_regs in nested NMI

2012-10-01 Thread Salman Qazi
The nested NMI modifies the place (instruction, flags and stack)
that the first NMI will iret to.  However, the copy of registers
modified is exactly the one that is the part of pt_regs in
the first NMI.  This can change the behaviour of the first NMI.

In particular, Google's arch_trigger_all_cpu_backtrace handler
also prints regions of memory surrounding addresses appearing in
registers.  This results in handled exceptions, after which nested NMIs
start coming in.  These nested NMIs change the value of registers
in pt_regs.  This can cause the original NMI handler to produce
incorrect output.

We solve this problem by interchanging the position of the preserved
copy of the iret registers ("saved") and the copy subject to being
trampled by nested NMI ("copied").
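
As a rough C model of the resulting layout (illustrative only, not
kernel code; lower struct offsets correspond to lower stack addresses,
and the "saved" words sit directly above the general-purpose registers
of pt_regs):

	struct nmi_iret_frames {
		unsigned long saved[5];		/* RIP, CS, RFLAGS, RSP, SS:
						 * preserved; what do_nmi()
						 * sees through pt_regs */
		unsigned long copied[5];	/* a nested NMI may rewrite
						 * these to point at
						 * repeat_nmi; the final iret
						 * pops this frame */
		unsigned long nmi_executing;	/* the "NMI executing" flag */
	};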

Signed-off-by: Salman Qazi 
---
 arch/x86/kernel/entry_64.S |   41 +++--
 1 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 44531ac..b5d6e43 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1739,9 +1739,10 @@ nested_nmi:
 
 1:
/* Set up the interrupted NMIs stack to jump to repeat_nmi */
-   leaq -6*8(%rsp), %rdx
+   leaq -1*8(%rsp), %rdx
movq %rdx, %rsp
-   CFI_ADJUST_CFA_OFFSET 6*8
+   CFI_ADJUST_CFA_OFFSET 1*8
+   leaq -10*8(%rsp), %rdx
pushq_cfi $__KERNEL_DS
pushq_cfi %rdx
pushfq_cfi
@@ -1749,8 +1750,8 @@ nested_nmi:
pushq_cfi $repeat_nmi
 
/* Put stack back */
-   addq $(11*8), %rsp
-   CFI_ADJUST_CFA_OFFSET -11*8
+   addq $(6*8), %rsp
+   CFI_ADJUST_CFA_OFFSET -6*8
 
 nested_nmi_out:
popq_cfi %rdx
@@ -1776,18 +1777,18 @@ first_nmi:
 * +-------------------------+
 * | NMI executing variable  |
 * +-------------------------+
-* | Saved SS                |
-* | Saved Return RSP        |
-* | Saved RFLAGS            |
-* | Saved CS                |
-* | Saved RIP               |
-* +-------------------------+
 * | copied SS               |
 * | copied Return RSP       |
 * | copied RFLAGS           |
 * | copied CS               |
 * | copied RIP              |
 * +-------------------------+
+* | Saved SS                |
+* | Saved Return RSP        |
+* | Saved RFLAGS            |
+* | Saved CS                |
+* | Saved RIP               |
+* +-------------------------+
 * | pt_regs                 |
 * +-------------------------+
 *
@@ -1803,9 +1804,14 @@ first_nmi:
/* Set the NMI executing variable on the stack. */
pushq_cfi $1
 
+   /*
+* Leave room for the "copied" frame
+*/
+   subq $(5*8), %rsp
+
/* Copy the stack frame to the Saved frame */
.rept 5
-   pushq_cfi 6*8(%rsp)
+   pushq_cfi 11*8(%rsp)
.endr
CFI_DEF_CFA_OFFSET SS+8-RIP
 
@@ -1826,12 +1832,15 @@ repeat_nmi:
 * is benign for the non-repeat case, where 1 was pushed just above
 * to this very stack slot).
 */
-   movq $1, 5*8(%rsp)
+   movq $1, 10*8(%rsp)
 
/* Make another copy, this one may be modified by nested NMIs */
+   addq $(10*8), %rsp
.rept 5
-   pushq_cfi 4*8(%rsp)
+   pushq_cfi -6*8(%rsp)
.endr
+   subq $(5*8), %rsp
+
CFI_DEF_CFA_OFFSET SS+8-RIP
 end_repeat_nmi:
 
@@ -1882,8 +1891,12 @@ nmi_swapgs:
SWAPGS_UNSAFE_STACK
 nmi_restore:
RESTORE_ALL 8
+
+   /* Pop the extra iret frame */
+   addq $(5*8), %rsp
+
/* Clear the NMI executing stack variable */
-   movq $0, 10*8(%rsp)
+   movq $0, 5*8(%rsp)
jmp irq_return
CFI_ENDPROC
 END(nmi)



[PATCH] crypto: Make VMAC work when blocks aren't aligned

2012-10-05 Thread Salman Qazi
The VMAC implementation, as it stands, does not work with blocks that
are not multiples of 128 bytes.  In particular, this is a problem when
using the implementation on scatterlists: even when the complete
plaintext is a 128-byte multiple, the pieces that get passed to
vmac_update can be pretty much any size.

I also added test cases for unaligned blocks.

Signed-off-by: Salman Qazi 
---
 crypto/testmgr.h  |   33 -
 crypto/vmac.c |   47 +++
 include/crypto/vmac.h |2 ++
 3 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 76d7f6c..f8365e9 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -1707,7 +1707,7 @@ static struct hash_testvec aes_xcbc128_tv_template[] = {
}
 };
 
-#define VMAC_AES_TEST_VECTORS  8
+#define VMAC_AES_TEST_VECTORS  11
 static char vmac_string1[128] = {'\x01', '\x01', '\x01', '\x01',
'\x02', '\x03', '\x02', '\x02',
'\x02', '\x04', '\x01', '\x07',
@@ -1723,6 +1723,19 @@ static char vmac_string3[128] = {'a', 'b', 'c', 'a', 'b', 'c',
'a', 'b', 'c', 'a', 'b', 'c',
};
 
+static char vmac_string4[17] = {'b', 'c', 'e', 'f',
+   'i', 'j', 'l', 'm',
+   'o', 'p', 'r', 's',
+   't', 'u', 'w', 'x', 'z'};
+
+static char vmac_string5[127] = {'r', 'm', 'b', 't', 'c',
+'o', 'l', 'k', ']', '%',
+'9', '2', '7', '!', 'A'};
+
+static char vmac_string6[129] = {'p', 't', '*', '7', 'l',
+'i', '!', '#', 'w', '0',
+'z', '/', '4', 'A', 'n'};
+
 static struct hash_testvec aes_vmac128_tv_template[] = {
{
.key= "\x00\x01\x02\x03\x04\x05\x06\x07"
@@ -1776,6 +1789,24 @@ static struct hash_testvec aes_vmac128_tv_template[] = {
.digest = "\x8b\x32\x8f\xe1\xed\x8f\xfa\xd4",
.psize  = 128,
.ksize  = 16,
+   }, {
+   .key = "a09b5cd!f#07K\x00\x00\x00",
+   .plaintext = vmac_string4,
+   .digest = "\xab\xa5\x0f\xea\x42\x4e\xa1\x5f",
+   .psize = sizeof(vmac_string4),
+   .ksize = 16,
+   }, {
+   .key = "a09b5cd!f#07K\x00\x00\x00",
+   .plaintext = vmac_string5,
+   .digest = "\x25\x31\x98\xbc\x1d\xe8\x67\x60",
+   .psize = sizeof(vmac_string5),
+   .ksize = 16,
+   }, {
+   .key = "a09b5cd!f#07K\x00\x00\x00",
+   .plaintext = vmac_string6,
+   .digest = "\xc4\xae\x9b\x47\x95\x65\xeb\x41",
+   .psize = sizeof(vmac_string6),
+   .ksize = 16,
},
 };
 
diff --git a/crypto/vmac.c b/crypto/vmac.c
index f2338ca..2eb11a3 100644
--- a/crypto/vmac.c
+++ b/crypto/vmac.c
@@ -375,6 +375,11 @@ static void vhash_update(const unsigned char *m,
u64 pkh = ctx->polykey[0];
u64 pkl = ctx->polykey[1];
 
+   if (!mbytes)
+   return;
+
+   BUG_ON(mbytes % VMAC_NHBYTES);
+
mptr = (u64 *)m;
i = mbytes / VMAC_NHBYTES;  /* Must be non-zero */
 
@@ -454,7 +459,7 @@ do_l3:
 }
 
 static u64 vmac(unsigned char m[], unsigned int mbytes,
-   unsigned char n[16], u64 *tagl,
+   const unsigned char n[16], u64 *tagl,
struct vmac_ctx_t *ctx)
 {
u64 *in_n, *out_p;
@@ -559,8 +564,33 @@ static int vmac_update(struct shash_desc *pdesc, const u8 *p,
 {
struct crypto_shash *parent = pdesc->tfm;
struct vmac_ctx_t *ctx = crypto_shash_ctx(parent);
+   int expand;
+   int min;
+
+   expand = VMAC_NHBYTES - ctx->partial_size > 0 ?
+   VMAC_NHBYTES - ctx->partial_size : 0;
+
+   min = len < expand ? len : expand;
+
+   memcpy(ctx->partial + ctx->partial_size, p, min);
+   ctx->partial_size += min;
+
+   if (len < expand)
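
The archive truncates this hunk mid-function.  A minimal sketch of the
buffer-and-flush pattern the visible lines set up (ctx->partial and
ctx->partial_size come from the code above; everything else, including
the shape of the vhash_update() calls, is an assumption, not the
actual patch):

	static int vmac_update_sketch(struct vmac_ctx_t *ctx, const u8 *p,
				      unsigned int len)
	{
		/* Top up the partial block left over from the last call. */
		unsigned int room = VMAC_NHBYTES - ctx->partial_size;
		unsigned int take = len < room ? len : room;

		memcpy(ctx->partial + ctx->partial_size, p, take);
		ctx->partial_size += take;
		p += take;
		len -= take;

		/* Not a full block yet: just keep buffering. */
		if (ctx->partial_size < VMAC_NHBYTES)
			return 0;

		/* Flush the now-full buffered block. */
		vhash_update(ctx->partial, VMAC_NHBYTES, &ctx->__vmac_ctx);
		ctx->partial_size = 0;

		/* Hash whole blocks straight from the caller's data... */
		if (len >= VMAC_NHBYTES) {
			unsigned int whole = len - (len % VMAC_NHBYTES);

			vhash_update(p, whole, &ctx->__vmac_ctx);
			p += whole;
			len -= whole;
		}

		/* ...and stash the tail for the next call. */
		memcpy(ctx->partial, p, len);
		ctx->partial_size = len;
		return 0;
	}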

Re: [PATCH] x86: Don't clobber top of pt_regs in nested NMI

2012-09-24 Thread Salman Qazi
On Mon, Sep 24, 2012 at 6:27 AM, Steven Rostedt  wrote:
> On Tue, Sep 18, 2012 at 06:29:35PM -0700, Salman Qazi wrote:
>> The nested NMI modifies the place (instruction, flags and stack)
>> that the first NMI will iret to.  However, the copy of registers
>> modified is exactly the one that is the part of pt_regs in
>> the first NMI.  This can change the behaviour of the first NMI.
>>
>> In particular, Google's arch_trigger_all_cpu_backtrace handler
>> also prints regions of memory surrounding addresses appearing in
>> registers.  This results in handled exceptions, after which nested NMIs
>> start coming in.  These nested NMIs change the value of registers
>> in pt_regs.  This can cause the original NMI handler to produce
>> incorrect output.
>
> Hmm, interesting problem.
>
>>
>> We solve this problem by introducing an extra copy of the iret
>> registers that are exclusively a part of pt_regs and are not modified
>> elsewhere.
>
> Yuck, 4 copies of the stack frame?
>
>> The downside is that the do_nmi function can no longer
>> change the control flow, as any values it writes to these five
>> registers will be discarded.
>
> I consider this a feature.
>
>>
>> Signed-off-by: Salman Qazi 
>> ---
>>  arch/x86/kernel/entry_64.S |   20 +++-
>>  1 files changed, 19 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> index 69babd8..40ddb6d 100644
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -1724,6 +1724,18 @@ repeat_nmi:
>>  end_repeat_nmi:
>>
>>   /*
>> +  * We want a running NMI handling routine to have a consistent
>> +  * picture of register state.  This should hold true even if
>> +  * there is a nested NMI.  Therefore, we let the nested NMI
>> +  * play with the previous copy of these registers and leave this
>> +  * new copy unmodified for do_nmi()
>> +  */
>> + .rept 5
>> + pushq_cfi 4*8(%rsp)
>> + .endr
>> + CFI_DEF_CFA_OFFSET SS+8-RIP
>
> Hmm, another solution that can be done without an extra copy, is to swap
> the return stack frame with the copy stack frame. This way, the copy is
> seen by the pt_regs and will always be correct. The end would basically
> be the same as you have below, just skip the copy and return.
>
> Now this breaks the idea that anything below the sp pointer is not safe
> to use. But this is the NMI stack in the controlled part of the NMI
> Handler (no breakpoints allowed here). The NMI stack is special, which
> is why we have all this crap in the first place.
>
> It would be safe to save the copy stack below the stack pointer because
> the only thing that could possible interrupt us here is another NMI,
> which would just reset the stack pointer to the top again, and notice
> that this is a nested NMI and return after modifying the return stack.
> Which is OK.
>
> At least this way, we avoid copying the stack frame a third time (two
> was enough).

Yes.  This is better.  For some reason, I thought about this and
decided against it.  But, I can't remember what crossed my mind.  I'll
try this.

>
> -- Steve
>
>
>> +
>> + /*
>>* Everything below this point can be preempted by a nested
>>* NMI if the first NMI took an exception and reset our iret stack
>>* so that we repeat another NMI.
>> @@ -1771,7 +1783,13 @@ nmi_swapgs:
>>  nmi_restore:
>>   RESTORE_ALL 8
>>   /* Clear the NMI executing stack variable */
>> - movq $0, 10*8(%rsp)
>> + movq $0, 15*8(%rsp)
>> +
>> + /* Pop the extra copy of iret context that was saved above
>> +  * just for do_nmi()
>> +  */
>> + addq $5*8, %rsp
>> +
>>   jmp irq_return
>>   CFI_ENDPROC
>>  END(nmi)
>>


[PATCH] x86: Don't clobber top of pt_regs in nested NMI

2012-09-18 Thread Salman Qazi
The nested NMI modifies the place (instruction, flags and stack)
that the first NMI will iret to.  However, the copy of registers
modified is exactly the one that is the part of pt_regs in
the first NMI.  This can change the behaviour of the first NMI.

In particular, Google's arch_trigger_all_cpu_backtrace handler
also prints regions of memory surrounding addresses appearing in
registers.  This results in handled exceptions, after which nested NMIs
start coming in.  These nested NMIs change the value of registers
in pt_regs.  This can cause the original NMI handler to produce
incorrect output.

We solve this problem by introducing an extra copy of the iret
registers that are exclusively a part of pt_regs and are not modified
elsewhere.  The downside is that the do_nmi function can no longer
change the control flow, as any values it writes to these five
registers will be discarded.

Signed-off-by: Salman Qazi 
---
 arch/x86/kernel/entry_64.S |   20 +++-
 1 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 69babd8..40ddb6d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1724,6 +1724,18 @@ repeat_nmi:
 end_repeat_nmi:
 
/*
+* We want a running NMI handling routine to have a consistent
+* picture of register state.  This should hold true even if
+* there is a nested NMI.  Therefore, we let the nested NMI
+* play with the previous copy of these registers and leave this
+* new copy unmodified for do_nmi()
+*/
+   .rept 5
+   pushq_cfi 4*8(%rsp)
+   .endr
+   CFI_DEF_CFA_OFFSET SS+8-RIP
+
+   /*
 * Everything below this point can be preempted by a nested
 * NMI if the first NMI took an exception and reset our iret stack
 * so that we repeat another NMI.
@@ -1771,7 +1783,13 @@ nmi_swapgs:
 nmi_restore:
RESTORE_ALL 8
/* Clear the NMI executing stack variable */
-   movq $0, 10*8(%rsp)
+   movq $0, 15*8(%rsp)
+
+   /* Pop the extra copy of iret context that was saved above
+* just for do_nmi()
+*/
+   addq $5*8, %rsp
+
jmp irq_return
CFI_ENDPROC
 END(nmi)



Re: [BUG] mm: direct I/O (using GUP) can write to COW anonymous pages

2018-09-17 Thread Salman Qazi
On Mon, Sep 17, 2018 at 5:05 PM Hugh Dickins  wrote:
>
> Hi Jann,
>
> On Mon, 17 Sep 2018, Jann Horn wrote:
>
> > [I'm not sure who the best people to ask about this are, I hope the
> > recipient list resembles something reasonable...]
> >
> > I have noticed that the dup_mmap() logic on fork() doesn't handle
> > pages with active direct I/O properly: dup_mmap() seems to assume that
> > making the PTE referencing a page readonly will always prevent future
> > writes to the page, but if the kernel has acquired a direct reference
> > to the page before (e.g. via get_user_pages_fast()), writes can still
> > happen that way.
> >
> > The worst-case effect of this - as far as I can tell - is that when a
> > multithreaded process forks while one thread is in the middle of
> > sys_read() on a file that uses direct I/O with get_user_pages_fast(),
> > the read data can become visible in the child while the parent's
> > buffer stays uninitialized if the parent writes to a relevant page
> > post-fork before either the I/O completes or the child writes to it.
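
A rough timeline of the race just described (an illustration; thread A
issues the direct-I/O read, thread B calls fork()):

	/*
	 * A: get_user_pages_fast(buf)  - pins page P for the I/O
	 * B: fork()                    - dup_mmap() write-protects the PTE;
	 *                                parent and child both map P
	 * A (parent): writes to buf    - COW fault gives the parent a fresh
	 *                                copy P'; the child keeps P
	 * I/O completes into P         - data lands in the page that only
	 *                                the child still maps
	 * => the child sees the read data, the parent's buffer stays stale
	 */
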
>
> Yes: you're understandably more worried by the one seeing the other's
> data; we've tended in the past to be more worried about the one getting
> corruption, and the other not seeing the data it asked for (and usually
> in the context of RDMA, rather than filesystem direct I/O).
>
> I've added some Cc's: I might be misremembering, but I think both
> Andrea and Konstantin have offered approaches to this in the past,
> and I believe Salman is taking a look at it currently.
>
> But my own interest ended when Michael added MADV_DONTFORK: beyond
> that, we've rated it a "Patient: It hurts when I do this. Doctor:
> Don't do that then" - more complexity and overhead to solve than
> we have had appetite to get into.  But not a shiningly satisfactory
> situation, I'll agree.

The approach that I am currently investigating (at least to use
internally at Google) is to be able to detect (via a heuristic with
some error rate) when the patient might be doing it, and tell them not
to.  Of course, we would have
to tell them nicely rather than killing their process, especially if
we expect a substantial number of false positives.
I am motivated by the belief that a transparent fix is not likely to
be feasible.  Others who have taken that route earlier can correct
me if I am mistaken.
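
For reference, the MADV_DONTFORK escape hatch mentioned above is a
userspace one-liner; a minimal sketch (buf must be page-aligned, as
for any madvise() call):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <sys/mman.h>

	/* Leave this buffer unmapped in any forked child, so COW can
	 * never split it away from an in-flight DMA. */
	static int exclude_from_fork(void *buf, size_t len)
	{
		if (madvise(buf, len, MADV_DONTFORK)) {
			perror("madvise(MADV_DONTFORK)");
			return -1;
		}
		return 0;
	}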

>
> Hugh
>
> >
> > Reproducer code:
> >
> > == START hello.c ==
> > #define FUSE_USE_VERSION 26
> >
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> >
> > static const char *hello_path = "/hello";
> >
> > static int hello_getattr(const char *path, struct stat *stbuf)
> > {
> > int res = 0;
> > memset(stbuf, 0, sizeof(struct stat));
> > if (strcmp(path, "/") == 0) {
> > stbuf->st_mode = S_IFDIR | 0755;
> > stbuf->st_nlink = 2;
> > } else if (strcmp(path, hello_path) == 0) {
> > stbuf->st_mode = S_IFREG | 0666;
> > stbuf->st_nlink = 1;
> > stbuf->st_size = 0x1000;
> > stbuf->st_blocks = 0;
> > } else
> > res = -ENOENT;
> > return res;
> > }
> >
> > static int hello_readdir(const char *path, void *buf, fuse_fill_dir_t
> > filler, off_t offset, struct fuse_file_info *fi) {
> > filler(buf, ".", NULL, 0);
> > filler(buf, "..", NULL, 0);
> > filler(buf, hello_path + 1, NULL, 0);
> > return 0;
> > }
> >
> > static int hello_open(const char *path, struct fuse_file_info *fi) {
> > return 0;
> > }
> >
> > static int hello_read(const char *path, char *buf, size_t size, off_t
> > offset, struct fuse_file_info *fi) {
> > sleep(3);
> > size_t len = 0x1000;
> > if (offset < len) {
> > if (offset + size > len)
> > size = len - offset;
> > memset(buf, 0, size);
> > } else
> > size = 0;
> > return size;
> > }
> >
> > static int hello_write(const char *path, const char *buf, size_t size,
> > off_t offset, struct fuse_file_info *fi) {
> > while(1) pause();
> > }
> >
> > static struct fuse_operations hello_oper = {
> > .getattr= hello_getattr,
> > .readdir= hello_readdir,
> > .open   = hello_open,
> > .read   = hello_read,
> > .write  = hello_write,
> > };
> >
> > int main(int argc, char *argv[]) {
> > return fuse_main(argc, argv, &hello_oper, NULL);
> > }
> > == END hello.c ==
> >
> > == START simple_mmap.c ==
> > #define _GNU_SOURCE
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> >
> > __attribute__((aligned(0x1000))) char data_buffer_[0x1];
> > #define data_buffer (data_buffer_ + 0x8000)
> >
> > void *fuse_thread(void *dummy) {
> > /* s

icache_is_aliasing and big.LITTLE

2019-05-08 Thread Salman Qazi
Hi,

What is the intention behind icache_is_aliasing on big.LITTLE systems
where some icaches are VIPT and others are PIPT? Is it meant to be
conservative in some sense or should it be made per-CPU?

Thanks for your help,

Salman


Re: icache_is_aliasing and big.LITTLE

2019-05-09 Thread Salman Qazi
Thank you.

On Thu, May 9, 2019 at 1:50 AM Catalin Marinas  wrote:
>
> Hi,
>
> On Wed, May 08, 2019 at 11:45:03AM -0700, Salman Qazi wrote:
> > What is the intention behind icache_is_aliasing on big.LITTLE systems
> > where some icaches are VIPT and others are PIPT? Is it meant to be
> > conservative in some sense or should it be made per-CPU?
>
> It needs to cover the worst case scenario across all CPUs, i.e. aliasing
> VIPT if one of the CPUs has this. We can't make it per-CPU because a
> thread performing cache maintenance might be migrated to another CPU
> with different cache policy (e.g. sync_icache_aliases()).
>
> --
> Catalin
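
A minimal sketch of the policy Catalin describes (illustrative only,
not the arm64 implementation): the predicate is a sticky OR over every
CPU that has ever come online, so a migrated thread can never see a
weaker answer than its current CPU requires.

	/* Illustrative sketch; assumes kernel bitops (<linux/bitops.h>). */
	static unsigned long icache_flags;
	#define ICACHEF_ALIASING	0

	/* Called on each CPU as it is brought online. */
	static void update_icache_policy(bool this_cpu_icache_is_vipt)
	{
		if (this_cpu_icache_is_vipt)
			set_bit(ICACHEF_ALIASING, &icache_flags);
		/* Never cleared: the answer only grows more conservative. */
	}

	static bool icache_is_aliasing_sketch(void)
	{
		return test_bit(ICACHEF_ALIASING, &icache_flags);
	}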


Re: [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC

2019-02-13 Thread Salman Qazi
Do you have any additional concerns?

On Thu, Feb 7, 2019 at 10:43 AM Salman Qazi  wrote:
>
> On Wed, Feb 6, 2019 at 8:14 PM Al Viro  wrote:
> >
> > On Wed, Feb 06, 2019 at 11:53:54AM -0800, Salman Qazi wrote:
> >
> > > This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
> > > This is done by implementing an asynchronous version of kern_unmount.
> > >
> > > Since mntput() sleeps, it needs to be deferred to a work queue.
> > >
> > > Additionally, the callers of mq_put_mnt appear to be safe having
> > > it behave asynchronously.  In particular, put_ipc_ns calls
> > > mq_clear_sbinfo which renders the inode inaccessible for the purposes of
> > > mqueue_create by making s_fs_info NULL.  This appears
> > > to be the thing that prevents access while free_ipc_ns is taking place.
> > > So, the unmount should be able to proceed lazily.
> >
> > Ugh...  I really doubt that it's correct.  The caller is
> > mq_put_mnt(ns);
> > free_ipc_ns(ns);
> > and we have
> > static void mqueue_evict_inode(struct inode *inode)
> > {
> >
> > ...
> >
> > ipc_ns = get_ns_from_inode(inode);
> >
> > with
> >
> > static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
> > {
> > struct ipc_namespace *ns;
> >
> > spin_lock(&mq_lock);
> > ns = __get_ns_from_inode(inode);
> > spin_unlock(&mq_lock);
> > return ns;
> > }
> >
> > and
> >
> > static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
> > {
> > return get_ipc_ns(inode->i_sb->s_fs_info);
> > }
> >
> > with ->s_fs_info being the ipc_namespace we are freeing after mq_put_ns()
> >
> > Are you saying that get_ipc_ns() after free_ipc_ns() is safe?  Because
> > ->evict_inode() *IS* called on umount.  What happens to your patch if
> > there was a regular file left on that filesystem?
> >
> > Smells like a memory corruptor...
>
> Actually, the full context in the caller is
>
> if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
> mq_clear_sbinfo(ns);
> spin_unlock(&mq_lock);
> mq_put_mnt(ns);
> free_ipc_ns(ns);
> }
>
> And
>
> void mq_clear_sbinfo(struct ipc_namespace *ns)
> {
> ns->mq_mnt->mnt_sb->s_fs_info = NULL;
> }
>
> Therefore, s_fs_info should be NULL before we proceed to unmount.  So,
> as far as I know, it should not be possible to find the ipc_namespace
> from the mount.


[PATCH RESENT] fs, ipc: Use an asynchronous version of kern_unmount in IPC

2019-03-04 Thread Salman Qazi
Prior to this patch, the kernel can spend a lot of time with
this stack trace:

[] __wait_rcu_gp+0x93/0xe0
[] synchronize_sched+0x48/0x60
[] kern_unmount+0x3a/0x46
[] mq_put_mnt+0x15/0x17
[] put_ipc_ns+0x36/0x8b

This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
This is done by implementing an asynchronous version of kern_unmount.

Since mntput() sleeps, it needs to be deferred to a work queue.

Additionally, the callers of mq_put_mnt appear to be safe having
it behave asynchronously.  In particular, put_ipc_ns calls
mq_clear_sbinfo which renders the inode inaccessible for the purposes of
mqueue_create by making s_fs_info NULL.  This appears
to be the thing that prevents access while free_ipc_ns is taking place.
So, the unmount should be able to proceed lazily.

Tested: Ran the following program:

int main(void)
{
int pid;
int status;
int i;

for (i = 0; i < 1000; i++) {
pid = fork();
if (!pid) {
assert(!unshare(CLONE_NEWUSER|
  CLONE_NEWIPC|CLONE_NEWNS));
return 0;
}

assert(waitpid(pid, &status, 0) == pid);
}
}

Before:

$ time ./unshare2

real	0m9.784s
user	0m0.428s
sys	0m0.000s

After:

$ time ./unshare2

real	0m0.368s
user	0m0.226s
sys	0m0.122s
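
As a sanity check on these numbers: (9.784s - 0.368s) / 1000 iterations
is roughly 9.4 ms saved per namespace teardown, consistent with each
kern_unmount() previously paying for one synchronous RCU grace period
(typically several milliseconds).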

Signed-off-by: Salman Qazi 
Reviewed-by: Eric Dumazet 
---
 fs/namespace.c | 41 +
 include/linux/fs.h |  1 +
 ipc/mqueue.c   |  2 +-
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 678ef175d63a..e60b473c3bbc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3321,6 +3321,47 @@ void kern_unmount(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(kern_unmount);
 
+struct async_unmount_cb {
+   struct vfsmount *mnt;
+   struct work_struct work;
+   struct rcu_head rcu_head;
+};
+
+static void kern_unmount_work(struct work_struct *work)
+{
+   struct async_unmount_cb *cb = container_of(work,
+   struct async_unmount_cb, work);
+
+   mntput(cb->mnt);
+   kfree(cb);
+}
+
+static void kern_unmount_rcu_cb(struct rcu_head *rcu_head)
+{
+   struct async_unmount_cb *cb = container_of(rcu_head,
+   struct async_unmount_cb, rcu_head);
+
+   INIT_WORK(&cb->work, kern_unmount_work);
+   schedule_work(&cb->work);
+
+}
+
+void kern_unmount_async(struct vfsmount *mnt)
+{
+   /* release long term mount so mount point can be released */
+   if (!IS_ERR_OR_NULL(mnt)) {
+   struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+
+   if (cb) {
+   real_mount(mnt)->mnt_ns = NULL;
+   cb->mnt = mnt;
+   call_rcu(&cb->rcu_head, kern_unmount_rcu_cb);
+   } else {
+   kern_unmount(mnt);
+   }
+   }
+}
+
 bool our_mnt(struct vfsmount *mnt)
 {
return check_mnt(real_mount(mnt));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d8e2cfed0e..8865997a8722 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
 extern int unregister_filesystem(struct file_system_type *);
 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 #define kern_mount(type) kern_mount_data(type, NULL)
+extern void kern_unmount_async(struct vfsmount *mnt);
 extern void kern_unmount(struct vfsmount *mnt);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c595bed7bfcb..a8c2465ac0cb 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)
 
 void mq_put_mnt(struct ipc_namespace *ns)
 {
-   kern_unmount(ns->mq_mnt);
+   kern_unmount_async(ns->mq_mnt);
 }
 
 static int __init init_mqueue_fs(void)
-- 
2.21.0.352.gf09ad66450-goog



[tip:x86/asm] x86: Don't clobber top of pt_regs in nested NMI

2012-11-02 Thread tip-bot for Salman Qazi
Commit-ID:  28696f434fef0efa97534b59986ad33b9c4df7f8
Gitweb: http://git.kernel.org/tip/28696f434fef0efa97534b59986ad33b9c4df7f8
Author: Salman Qazi 
AuthorDate: Mon, 1 Oct 2012 17:29:25 -0700
Committer:  Steven Rostedt 
CommitDate: Fri, 2 Nov 2012 11:29:36 -0400

x86: Don't clobber top of pt_regs in nested NMI

The nested NMI modifies the place (instruction, flags and stack)
that the first NMI will iret to.  However, the copy of registers
modified is exactly the one that is the part of pt_regs in
the first NMI.  This can change the behaviour of the first NMI.

In particular, Google's arch_trigger_all_cpu_backtrace handler
also prints regions of memory surrounding addresses appearing in
registers.  This results in handled exceptions, after which nested NMIs
start coming in.  These nested NMIs change the value of registers
in pt_regs.  This can cause the original NMI handler to produce
incorrect output.

We solve this problem by interchanging the position of the preserved
copy of the iret registers ("saved") and the copy subject to being
trampled by nested NMI ("copied").

Link: http://lkml.kernel.org/r/20121002002919.27236.14388.st...@dungbeetle.mtv.corp.google.com

Signed-off-by: Salman Qazi 
[ Added a needed CFI_ADJUST_CFA_OFFSET ]
Signed-off-by: Steven Rostedt 
---
 arch/x86/kernel/entry_64.S |   41 +++--
 1 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b51b2c7..811795d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1699,9 +1699,10 @@ nested_nmi:
 
 1:
/* Set up the interrupted NMIs stack to jump to repeat_nmi */
-   leaq -6*8(%rsp), %rdx
+   leaq -1*8(%rsp), %rdx
movq %rdx, %rsp
-   CFI_ADJUST_CFA_OFFSET 6*8
+   CFI_ADJUST_CFA_OFFSET 1*8
+   leaq -10*8(%rsp), %rdx
pushq_cfi $__KERNEL_DS
pushq_cfi %rdx
pushfq_cfi
@@ -1709,8 +1710,8 @@ nested_nmi:
pushq_cfi $repeat_nmi
 
/* Put stack back */
-   addq $(11*8), %rsp
-   CFI_ADJUST_CFA_OFFSET -11*8
+   addq $(6*8), %rsp
+   CFI_ADJUST_CFA_OFFSET -6*8
 
 nested_nmi_out:
popq_cfi %rdx
@@ -1736,18 +1737,18 @@ first_nmi:
 * +-------------------------+
 * | NMI executing variable  |
 * +-------------------------+
-* | Saved SS                |
-* | Saved Return RSP        |
-* | Saved RFLAGS            |
-* | Saved CS                |
-* | Saved RIP               |
-* +-------------------------+
 * | copied SS               |
 * | copied Return RSP       |
 * | copied RFLAGS           |
 * | copied CS               |
 * | copied RIP              |
 * +-------------------------+
+* | Saved SS                |
+* | Saved Return RSP        |
+* | Saved RFLAGS            |
+* | Saved CS                |
+* | Saved RIP               |
+* +-------------------------+
 * | pt_regs                 |
 * +-------------------------+
 *
@@ -1763,9 +1764,14 @@ first_nmi:
/* Set the NMI executing variable on the stack. */
pushq_cfi $1
 
+   /*
+* Leave room for the "copied" frame
+*/
+   subq $(5*8), %rsp
+
/* Copy the stack frame to the Saved frame */
.rept 5
-   pushq_cfi 6*8(%rsp)
+   pushq_cfi 11*8(%rsp)
.endr
CFI_DEF_CFA_OFFSET SS+8-RIP
 
@@ -1786,12 +1792,15 @@ repeat_nmi:
 * is benign for the non-repeat case, where 1 was pushed just above
 * to this very stack slot).
 */
-   movq $1, 5*8(%rsp)
+   movq $1, 10*8(%rsp)
 
/* Make another copy, this one may be modified by nested NMIs */
+   addq $(10*8), %rsp
+   CFI_ADJUST_CFA_OFFSET -10*8
.rept 5
-   pushq_cfi 4*8(%rsp)
+   pushq_cfi -6*8(%rsp)
.endr
+   subq $(5*8), %rsp
CFI_DEF_CFA_OFFSET SS+8-RIP
 end_repeat_nmi:
 
@@ -1842,8 +1851,12 @@ nmi_swapgs:
SWAPGS_UNSAFE_STACK
 nmi_restore:
RESTORE_ALL 8
+
+   /* Pop the extra iret frame */
+   addq $(5*8), %rsp
+
/* Clear the NMI executing stack variable */
-   movq $0, 10*8(%rsp)
+   movq $0, 5*8(%rsp)
jmp irq_return
CFI_ENDPROC
 END(nmi)