[PATCH] x86: Don't clobber top of pt_regs in nested NMI
The nested NMI modifies the place (instruction, flags and stack)
that the first NMI will iret to.  However, the copy of registers
modified is exactly the one that is the part of pt_regs in
the first NMI.  This can change the behaviour of the first NMI.

In particular, Google's arch_trigger_all_cpu_backtrace handler
also prints regions of memory surrounding addresses appearing in
registers.  This results in handled exceptions, after which nested NMIs
start coming in.  These nested NMIs change the value of registers
in pt_regs.  This can cause the original NMI handler to produce
incorrect output.

We solve this problem by interchanging the position of the preserved
copy of the iret registers ("saved") and the copy subject to being
trampled by nested NMI ("copied").

Signed-off-by: Salman Qazi
---
 arch/x86/kernel/entry_64.S |   41 +++++++++++++++++++++++++++--------------
 1 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 44531ac..b5d6e43 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1739,9 +1739,10 @@ nested_nmi:
 1:
 	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
-	leaq -6*8(%rsp), %rdx
+	leaq -1*8(%rsp), %rdx
 	movq %rdx, %rsp
-	CFI_ADJUST_CFA_OFFSET 6*8
+	CFI_ADJUST_CFA_OFFSET 1*8
+	leaq -10*8(%rsp), %rdx
 	pushq_cfi $__KERNEL_DS
 	pushq_cfi %rdx
 	pushfq_cfi
@@ -1749,8 +1750,8 @@ nested_nmi:
 	pushq_cfi $repeat_nmi

 	/* Put stack back */
-	addq $(11*8), %rsp
-	CFI_ADJUST_CFA_OFFSET -11*8
+	addq $(6*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -6*8

 nested_nmi_out:
 	popq_cfi %rdx
@@ -1776,18 +1777,18 @@ first_nmi:
	 * +-------------------------+
	 * | NMI executing variable  |
	 * +-------------------------+
-	 * | Saved SS                |
-	 * | Saved Return RSP        |
-	 * | Saved RFLAGS            |
-	 * | Saved CS                |
-	 * | Saved RIP               |
-	 * +-------------------------+
	 * | copied SS               |
	 * | copied Return RSP       |
	 * | copied RFLAGS           |
	 * | copied CS               |
	 * | copied RIP              |
	 * +-------------------------+
+	 * | Saved SS                |
+	 * | Saved Return RSP        |
+	 * | Saved RFLAGS            |
+	 * | Saved CS                |
+	 * | Saved RIP               |
+	 * +-------------------------+
	 * | pt_regs                 |
	 * +-------------------------+
	 *
@@ -1803,9 +1804,14 @@ first_nmi:
 	/* Set the NMI executing variable on the stack. */
 	pushq_cfi $1

+	/*
+	 * Leave room for the "copied" frame
+	 */
+	subq $(5*8), %rsp
+
 	/* Copy the stack frame to the Saved frame */
 	.rept 5
-	pushq_cfi 6*8(%rsp)
+	pushq_cfi 11*8(%rsp)
 	.endr
 	CFI_DEF_CFA_OFFSET SS+8-RIP

@@ -1826,12 +1832,15 @@ repeat_nmi:
	 * is benign for the non-repeat case, where 1 was pushed just above
	 * to this very stack slot).
	 */
-	movq $1, 5*8(%rsp)
+	movq $1, 10*8(%rsp)

 	/* Make another copy, this one may be modified by nested NMIs */
+	addq $(10*8), %rsp
 	.rept 5
-	pushq_cfi 4*8(%rsp)
+	pushq_cfi -6*8(%rsp)
 	.endr
+	subq $(5*8), %rsp
+
 	CFI_DEF_CFA_OFFSET SS+8-RIP

 end_repeat_nmi:

@@ -1882,8 +1891,12 @@ nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
+
+	/* Pop the extra iret frame */
+	addq $(5*8), %rsp
+
 	/* Clear the NMI executing stack variable */
-	movq $0, 10*8(%rsp)
+	movq $0, 5*8(%rsp)
 	jmp irq_return
 	CFI_ENDPROC
 END(nmi)
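For readers outside the entry-code weeds, here is a hedged userspace C model of the invariant the patch establishes (the struct layout, function names and addresses are invented for illustration; the real mechanism is the assembly above): a nested NMI may only scribble on the "copied" frame, while the "saved" frame adjacent to pt_regs is the one do_nmi() reads, so its view stays consistent.

#include <assert.h>
#include <stdio.h>

/* Illustrative stand-in for the five-word hardware iret frame. */
struct iret_frame {
	unsigned long rip, cs, rflags, rsp, ss;
};

/* Post-patch layout: the trample-able "copied" frame sits above the
 * preserved "saved" frame, which sits directly above pt_regs. */
struct nmi_stack {
	long nmi_executing;
	struct iret_frame copied;	/* a nested NMI may rewrite this */
	struct iret_frame saved;	/* read via pt_regs; never rewritten */
};

static void first_nmi_entry(struct nmi_stack *s, const struct iret_frame *hw)
{
	s->nmi_executing = 1;
	s->saved = *hw;			/* preserved for do_nmi() */
	s->copied = *hw;		/* working copy used to iret */
}

static void nested_nmi(struct nmi_stack *s, unsigned long repeat_nmi)
{
	/* A nested NMI forces a repeat by rewriting only the copied frame. */
	s->copied.rip = repeat_nmi;
}

int main(void)
{
	struct iret_frame hw = { .rip = 0x1000, .cs = 0x10, .rflags = 0x2,
				 .rsp = 0x7000, .ss = 0x18 };
	struct nmi_stack s;

	first_nmi_entry(&s, &hw);
	nested_nmi(&s, 0xdead);

	/* The backtrace handler's view (saved frame) is still consistent. */
	assert(s.saved.rip == 0x1000);
	printf("saved rip=%#lx copied rip=%#lx\n", s.saved.rip, s.copied.rip);
	return 0;
}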
[PATCH] crypto: Make VMAC work when blocks aren't aligned
The VMAC implementation, as it stands, does not work with blocks that
are not multiples of 128 bytes.  Furthermore, this is a problem when
using the implementation on scatterlists, even when the complete
plaintext is a 128-byte multiple, as the pieces that get passed to
vmac_update can be pretty much any size.

I also added test cases for unaligned blocks.

Signed-off-by: Salman Qazi
---
 crypto/testmgr.h      |   33 ++++++++++++++++++++++++++++-
 crypto/vmac.c         |   47 +++++++++++++++++++++++++++++++++++++++---
 include/crypto/vmac.h |    2 ++
 3 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 76d7f6c..f8365e9 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -1707,7 +1707,7 @@ static struct hash_testvec aes_xcbc128_tv_template[] = {
 	}
 };

-#define VMAC_AES_TEST_VECTORS	8
+#define VMAC_AES_TEST_VECTORS	11
 static char vmac_string1[128] = {'\x01', '\x01', '\x01', '\x01',
				'\x02', '\x03', '\x02', '\x02',
				'\x02', '\x04', '\x01', '\x07',
@@ -1723,6 +1723,19 @@ static char vmac_string3[128] = {'a', 'b', 'c', 'a', 'b', 'c',
				'a', 'b', 'c', 'a', 'b', 'c',
				};

+static char vmac_string4[17] = {'b', 'c', 'e', 'f',
+				'i', 'j', 'l', 'm',
+				'o', 'p', 'r', 's',
+				't', 'u', 'w', 'x', 'z'};
+
+static char vmac_string5[127] = {'r', 'm', 'b', 't', 'c',
+				 'o', 'l', 'k', ']', '%',
+				 '9', '2', '7', '!', 'A'};
+
+static char vmac_string6[129] = {'p', 't', '*', '7', 'l',
+				 'i', '!', '#', 'w', '0',
+				 'z', '/', '4', 'A', 'n'};
+
 static struct hash_testvec aes_vmac128_tv_template[] = {
 	{
 		.key	= "\x00\x01\x02\x03\x04\x05\x06\x07"
@@ -1776,6 +1789,24 @@ static struct hash_testvec aes_vmac128_tv_template[] = {
 		.digest	= "\x8b\x32\x8f\xe1\xed\x8f\xfa\xd4",
 		.psize	= 128,
 		.ksize	= 16,
+	}, {
+		.key	= "a09b5cd!f#07K\x00\x00\x00",
+		.plaintext = vmac_string4,
+		.digest	= "\xab\xa5\x0f\xea\x42\x4e\xa1\x5f",
+		.psize	= sizeof(vmac_string4),
+		.ksize	= 16,
+	}, {
+		.key	= "a09b5cd!f#07K\x00\x00\x00",
+		.plaintext = vmac_string5,
+		.digest	= "\x25\x31\x98\xbc\x1d\xe8\x67\x60",
+		.psize	= sizeof(vmac_string5),
+		.ksize	= 16,
+	}, {
+		.key	= "a09b5cd!f#07K\x00\x00\x00",
+		.plaintext = vmac_string6,
+		.digest	= "\xc4\xae\x9b\x47\x95\x65\xeb\x41",
+		.psize	= sizeof(vmac_string6),
+		.ksize	= 16,
 	},
 };

diff --git a/crypto/vmac.c b/crypto/vmac.c
index f2338ca..2eb11a3 100644
--- a/crypto/vmac.c
+++ b/crypto/vmac.c
@@ -375,6 +375,11 @@ static void vhash_update(const unsigned char *m,
 	u64 pkh = ctx->polykey[0];
 	u64 pkl = ctx->polykey[1];

+	if (!mbytes)
+		return;
+
+	BUG_ON(mbytes % VMAC_NHBYTES);
+
 	mptr = (u64 *)m;
 	i = mbytes / VMAC_NHBYTES;  /* Must be non-zero */

@@ -454,7 +459,7 @@ do_l3:
 }

 static u64 vmac(unsigned char m[], unsigned int mbytes,
-		unsigned char n[16], u64 *tagl,
+		const unsigned char n[16], u64 *tagl,
		struct vmac_ctx_t *ctx)
 {
 	u64 *in_n, *out_p;
@@ -559,8 +564,33 @@ static int vmac_update(struct shash_desc *pdesc, const u8 *p,
 {
 	struct crypto_shash *parent = pdesc->tfm;
 	struct vmac_ctx_t *ctx = crypto_shash_ctx(parent);
+	int expand;
+	int min;
+
+	expand = VMAC_NHBYTES - ctx->partial_size > 0 ?
+			VMAC_NHBYTES - ctx->partial_size : 0;
+
+	min = len < expand ? len : expand;
+
+	memcpy(ctx->partial + ctx->partial_size, p, min);
+	ctx->partial_size += min;
+
+	if (len < expand)
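The shape of the fix is the standard partial-block buffering pattern. Below is a hedged, self-contained C sketch of that pattern (BLOCK_BYTES, buffered_update() and the toy process_blocks() are illustrative stand-ins, not the kernel's code): whole blocks go straight to the core, and any tail is stashed in the context until the next update (or final) completes it.

#include <stddef.h>
#include <string.h>

#define BLOCK_BYTES 128			/* stands in for VMAC_NHBYTES */

struct partial_ctx {
	unsigned char partial[BLOCK_BYTES];	/* tail carried between updates */
	size_t partial_size;
};

static unsigned char digest;	/* toy state standing in for the VMAC hash */

/* Stand-in for vhash_update(): only ever sees whole blocks (len may be 0). */
static void process_blocks(const unsigned char *p, size_t len)
{
	for (size_t i = 0; i < len; i++)
		digest ^= p[i];
}

/* Accept updates of any size, feeding process_blocks() only whole blocks. */
static void buffered_update(struct partial_ctx *ctx,
			    const unsigned char *p, size_t len)
{
	size_t expand = BLOCK_BYTES - ctx->partial_size;
	size_t min = len < expand ? len : expand;
	size_t tail;

	/* Top up the partial block first. */
	memcpy(ctx->partial + ctx->partial_size, p, min);
	ctx->partial_size += min;
	if (ctx->partial_size < BLOCK_BYTES)
		return;			/* still short of one whole block */

	process_blocks(ctx->partial, BLOCK_BYTES);
	ctx->partial_size = 0;
	p += min;
	len -= min;

	/* Pass through as many whole blocks as possible... */
	tail = len % BLOCK_BYTES;
	process_blocks(p, len - tail);

	/* ...and stash the remainder for the next update or for final. */
	memcpy(ctx->partial, p + len - tail, tail);
	ctx->partial_size = tail;
}

int main(void)
{
	struct partial_ctx ctx = {0};
	unsigned char msg[300];

	memset(msg, 0x5a, sizeof(msg));
	/* Split update: 256 bytes get processed as two whole blocks,
	 * the 44-byte tail stays buffered for a final step. */
	buffered_update(&ctx, msg, 17);
	buffered_update(&ctx, msg + 17, 283);
	return 0;
}

The real patch additionally has vmac_final flush whatever remains in the partial buffer, which is why the update path is free to hold bytes back.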
Re: [PATCH] x86: Don't clobber top of pt_regs in nested NMI
On Mon, Sep 24, 2012 at 6:27 AM, Steven Rostedt wrote:
> On Tue, Sep 18, 2012 at 06:29:35PM -0700, Salman Qazi wrote:
>> The nested NMI modifies the place (instruction, flags and stack)
>> that the first NMI will iret to.  However, the copy of registers
>> modified is exactly the one that is the part of pt_regs in
>> the first NMI.  This can change the behaviour of the first NMI.
>>
>> In particular, Google's arch_trigger_all_cpu_backtrace handler
>> also prints regions of memory surrounding addresses appearing in
>> registers.  This results in handled exceptions, after which nested NMIs
>> start coming in.  These nested NMIs change the value of registers
>> in pt_regs.  This can cause the original NMI handler to produce
>> incorrect output.
>
> Hmm, interesting problem.
>
>>
>> We solve this problem by introducing an extra copy of the iret
>> registers that are exclusively a part of pt_regs and are not modified
>> elsewhere.
>
> Yuck, 4 copies of the stack frame?
>
>> The downside is that the do_nmi function can no longer
>> change the control flow, as any values it writes to these five
>> registers will be discarded.
>
> I consider this a feature.
>
>>
>> Signed-off-by: Salman Qazi
>> ---
>>  arch/x86/kernel/entry_64.S |   20 +++++++++++++++++++-
>>  1 files changed, 19 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
>> index 69babd8..40ddb6d 100644
>> --- a/arch/x86/kernel/entry_64.S
>> +++ b/arch/x86/kernel/entry_64.S
>> @@ -1724,6 +1724,18 @@ repeat_nmi:
>>  end_repeat_nmi:
>>
>>  	/*
>> +	 * We want a running NMI handling routine to have a consistent
>> +	 * picture of register state.  This should hold true even if
>> +	 * there is a nested NMI.  Therefore, we let the nested NMI
>> +	 * play with the previous copy of these registers and leave this
>> +	 * new copy unmodified for do_nmi()
>> +	 */
>> +	.rept 5
>> +	pushq_cfi 4*8(%rsp)
>> +	.endr
>> +	CFI_DEF_CFA_OFFSET SS+8-RIP
>
> Hmm, another solution that can be done without an extra copy is to swap
> the return stack frame with the copy stack frame.  This way, the copy is
> seen by the pt_regs and will always be correct.  The end would basically
> be the same as you have below, just skip the copy and return.
>
> Now this breaks the idea that anything below the sp pointer is not safe
> to use.  But this is the NMI stack in the controlled part of the NMI
> handler (no breakpoints allowed here).  The NMI stack is special, which
> is why we have all this crap in the first place.
>
> It would be safe to save the copy stack below the stack pointer because
> the only thing that could possibly interrupt us here is another NMI,
> which would just reset the stack pointer to the top again, and notice
> that this is a nested NMI and return after modifying the return stack.
> Which is OK.
>
> At least this way, we avoid copying the stack frame a third time (two
> was enough).

Yes.  This is better.  For some reason, I thought about this and decided
against it.  But I can't remember what crossed my mind.  I'll try this.

>
> -- Steve
>
>
>> +
>> +	/*
>>  	 * Everything below this point can be preempted by a nested
>>  	 * NMI if the first NMI took an exception and reset our iret stack
>>  	 * so that we repeat another NMI.
>> @@ -1771,7 +1783,13 @@ nmi_swapgs:
>>  nmi_restore:
>>  	RESTORE_ALL 8
>>  	/* Clear the NMI executing stack variable */
>> -	movq $0, 10*8(%rsp)
>> +	movq $0, 15*8(%rsp)
>> +
>> +	/* Pop the extra copy of iret context that was saved above
>> +	 * just for do_nmi()
>> +	 */
>> +	addq $5*8, %rsp
>> +
>>  	jmp irq_return
>>  	CFI_ENDPROC
>>  END(nmi)
[PATCH] x86: Don't clobber top of pt_regs in nested NMI
The nested NMI modifies the place (instruction, flags and stack)
that the first NMI will iret to.  However, the copy of registers
modified is exactly the one that is the part of pt_regs in
the first NMI.  This can change the behaviour of the first NMI.

In particular, Google's arch_trigger_all_cpu_backtrace handler
also prints regions of memory surrounding addresses appearing in
registers.  This results in handled exceptions, after which nested NMIs
start coming in.  These nested NMIs change the value of registers
in pt_regs.  This can cause the original NMI handler to produce
incorrect output.

We solve this problem by introducing an extra copy of the iret
registers that are exclusively a part of pt_regs and are not modified
elsewhere.  The downside is that the do_nmi function can no longer
change the control flow, as any values it writes to these five
registers will be discarded.

Signed-off-by: Salman Qazi
---
 arch/x86/kernel/entry_64.S |   20 +++++++++++++++++++-
 1 files changed, 19 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 69babd8..40ddb6d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1724,6 +1724,18 @@ repeat_nmi:
 end_repeat_nmi:

 	/*
+	 * We want a running NMI handling routine to have a consistent
+	 * picture of register state.  This should hold true even if
+	 * there is a nested NMI.  Therefore, we let the nested NMI
+	 * play with the previous copy of these registers and leave this
+	 * new copy unmodified for do_nmi()
+	 */
+	.rept 5
+	pushq_cfi 4*8(%rsp)
+	.endr
+	CFI_DEF_CFA_OFFSET SS+8-RIP
+
+	/*
 	 * Everything below this point can be preempted by a nested
 	 * NMI if the first NMI took an exception and reset our iret stack
 	 * so that we repeat another NMI.
@@ -1771,7 +1783,13 @@ nmi_swapgs:
 nmi_restore:
 	RESTORE_ALL 8
 	/* Clear the NMI executing stack variable */
-	movq $0, 10*8(%rsp)
+	movq $0, 15*8(%rsp)
+
+	/* Pop the extra copy of iret context that was saved above
+	 * just for do_nmi()
+	 */
+	addq $5*8, %rsp
+
 	jmp irq_return
 	CFI_ENDPROC
 END(nmi)
Re: [BUG] mm: direct I/O (using GUP) can write to COW anonymous pages
On Mon, Sep 17, 2018 at 5:05 PM Hugh Dickins wrote:
>
> Hi Jann,
>
> On Mon, 17 Sep 2018, Jann Horn wrote:
>
> > [I'm not sure who the best people to ask about this are, I hope the
> > recipient list resembles something reasonable...]
> >
> > I have noticed that the dup_mmap() logic on fork() doesn't handle
> > pages with active direct I/O properly: dup_mmap() seems to assume that
> > making the PTE referencing a page readonly will always prevent future
> > writes to the page, but if the kernel has acquired a direct reference
> > to the page before (e.g. via get_user_pages_fast()), writes can still
> > happen that way.
> >
> > The worst-case effect of this - as far as I can tell - is that when a
> > multithreaded process forks while one thread is in the middle of
> > sys_read() on a file that uses direct I/O with get_user_pages_fast(),
> > the read data can become visible in the child while the parent's
> > buffer stays uninitialized if the parent writes to a relevant page
> > post-fork before either the I/O completes or the child writes to it.
>
> Yes: you're understandably more worried by the one seeing the other's
> data; we've tended in the past to be more worried about the one getting
> corruption, and the other not seeing the data it asked for (and usually
> in the context of RDMA, rather than filesystem direct I/O).
>
> I've added some Cc's: I might be misremembering, but I think both
> Andrea and Konstantin have offered approaches to this in the past,
> and I believe Salman is taking a look at it currently.
>
> But my own interest ended when Michael added MADV_DONTFORK: beyond
> that, we've rated it a "Patient: It hurts when I do this.  Doctor:
> Don't do that then" - more complexity and overhead to solve, than
> we have had appetite to get into.  But not a shiningly satisfactory
> situation, I'll agree.

The approach that I am currently investigating (at least to use
internally at Google) is to be able to detect (via a heuristic with
some error rate) when the patient might be doing it, and tell them not
to.  Of course, we would have to tell them nicely rather than killing
their process, especially if we expect a substantial number of false
positives.  I am motivated by the belief that a transparent fix is not
likely to be feasible.  Others, who have taken that route earlier, can
correct me if I am mistaken.
>
> Hugh
>
> >
> > Reproducer code:
> >
> > == START hello.c ==
> > #define FUSE_USE_VERSION 26
> >
> > #include <fuse.h>
> > #include <stdio.h>
> > #include <string.h>
> > #include <errno.h>
> > #include <fcntl.h>
> > #include <stddef.h>
> > #include <sys/stat.h>
> > #include <unistd.h>
> >
> > static const char *hello_path = "/hello";
> >
> > static int hello_getattr(const char *path, struct stat *stbuf)
> > {
> >     int res = 0;
> >     memset(stbuf, 0, sizeof(struct stat));
> >     if (strcmp(path, "/") == 0) {
> >         stbuf->st_mode = S_IFDIR | 0755;
> >         stbuf->st_nlink = 2;
> >     } else if (strcmp(path, hello_path) == 0) {
> >         stbuf->st_mode = S_IFREG | 0666;
> >         stbuf->st_nlink = 1;
> >         stbuf->st_size = 0x1000;
> >         stbuf->st_blocks = 0;
> >     } else
> >         res = -ENOENT;
> >     return res;
> > }
> >
> > static int hello_readdir(const char *path, void *buf, fuse_fill_dir_t
> > filler, off_t offset, struct fuse_file_info *fi) {
> >     filler(buf, ".", NULL, 0);
> >     filler(buf, "..", NULL, 0);
> >     filler(buf, hello_path + 1, NULL, 0);
> >     return 0;
> > }
> >
> > static int hello_open(const char *path, struct fuse_file_info *fi) {
> >     return 0;
> > }
> >
> > static int hello_read(const char *path, char *buf, size_t size, off_t
> > offset, struct fuse_file_info *fi) {
> >     sleep(3);
> >     size_t len = 0x1000;
> >     if (offset < len) {
> >         if (offset + size > len)
> >             size = len - offset;
> >         memset(buf, 0, size);
> >     } else
> >         size = 0;
> >     return size;
> > }
> >
> > static int hello_write(const char *path, const char *buf, size_t size,
> > off_t offset, struct fuse_file_info *fi) {
> >     while(1) pause();
> > }
> >
> > static struct fuse_operations hello_oper = {
> >     .getattr    = hello_getattr,
> >     .readdir    = hello_readdir,
> >     .open       = hello_open,
> >     .read       = hello_read,
> >     .write      = hello_write,
> > };
> >
> > int main(int argc, char *argv[]) {
> >     return fuse_main(argc, argv, &hello_oper, NULL);
> > }
> > == END hello.c ==
> >
> > == START simple_mmap.c ==
> > #define _GNU_SOURCE
> > #include
> > #include
> > #include
> > #include
> > #include
> > #include
> > #include
> > #include
> > #include
> >
> > __attribute__((aligned(0x1000))) char data_buffer_[0x10000];
> > #define data_buffer (data_buffer_ + 0x8000)
> >
> > void *fuse_thread(void *dummy) {
> >     /* s
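Condensed from the (truncated) reproducer above, here is a hedged sketch of the race itself: the file path, the O_DIRECT usage and the sleeps are assumptions standing in for the deliberately slow FUSE read, and whether the mismatch actually reproduces depends on kernel version and timing.

/* Sketch: a direct-I/O read racing with fork() plus a parent write.
 * Build with: cc -pthread race.c; run with a path whose reads stall. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>

static char buf[0x1000] __attribute__((aligned(0x1000)));

static void *reader(void *arg)
{
	int fd = open((const char *)arg, O_RDONLY | O_DIRECT);

	if (fd < 0) { perror("open"); exit(1); }
	/* get_user_pages pins the page backing buf; DMA lands in that page. */
	read(fd, buf, sizeof(buf));
	return NULL;
}

int main(int argc, char **argv)
{
	pthread_t t;
	pid_t pid;

	if (argc < 2)
		return 1;
	memset(buf, 'A', sizeof(buf));
	pthread_create(&t, NULL, reader, argv[1]);
	usleep(100 * 1000);	/* let the read start and block */

	pid = fork();		/* buf is now COW-shared, PTEs read-only */
	if (pid == 0) {
		sleep(5);	/* wait for the in-flight I/O to complete */
		/* The child may see the read data arrive... */
		printf("child sees: %c\n", buf[0]);
		_exit(0);
	}
	buf[1] = 'B';	/* parent write breaks COW: parent gets a fresh copy,
			 * the pinned original page stays with the child */
	pthread_join(t, NULL);
	/* ...while the parent's buffer stays at its pre-I/O contents. */
	printf("parent sees: %c\n", buf[0]);
	waitpid(pid, NULL, 0);
	return 0;
}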
icache_is_aliasing and big.LITTLE
Hi,

What is the intention behind icache_is_aliasing on big.LITTLE systems
where some icaches are VIPT and others are PIPT?  Is it meant to be
conservative in some sense or should it be made per-CPU?

Thanks for your help,

Salman
Re: icache_is_aliasing and big.LITTLE
Thank you.

On Thu, May 9, 2019 at 1:50 AM Catalin Marinas wrote:
>
> Hi,
>
> On Wed, May 08, 2019 at 11:45:03AM -0700, Salman Qazi wrote:
> > What is the intention behind icache_is_aliasing on big.LITTLE systems
> > where some icaches are VIPT and others are PIPT?  Is it meant to be
> > conservative in some sense or should it be made per-CPU?
>
> It needs to cover the worst case scenario across all CPUs, i.e. aliasing
> VIPT if one of the CPUs has this.  We can't make it per-CPU because a
> thread performing cache maintenance might be migrated to another CPU
> with different cache policy (e.g. sync_icache_aliases()).
>
> --
> Catalin
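In other words, the flag is a sticky worst-case accumulated over all CPUs. A hedged sketch of that pattern in plain C (the names mirror, but are not, the arm64 implementation, which keeps such a bit in __icache_flags):

#include <stdatomic.h>
#include <stdbool.h>

#define ICACHEF_ALIASING 0x1u

/* Global flag, in the spirit of arm64's __icache_flags. */
static atomic_uint icache_flags;

/* Run on each CPU as it comes up, with that CPU's I-cache policy. */
static void detect_icache_policy(bool vipt_aliasing)
{
	/* Sticky OR: once any CPU reports aliasing, it is never cleared. */
	if (vipt_aliasing)
		atomic_fetch_or(&icache_flags, ICACHEF_ALIASING);
}

/* System-wide predicate: safe no matter which CPU the caller migrates to. */
static bool icache_is_aliasing_sketch(void)
{
	return atomic_load(&icache_flags) & ICACHEF_ALIASING;
}

int main(void)
{
	detect_icache_policy(false);	/* big core: PIPT */
	detect_icache_policy(true);	/* LITTLE core: aliasing VIPT */
	return !icache_is_aliasing_sketch();	/* exits 0: worst case wins */
}

The design choice follows from the migration argument above: a per-CPU answer could change under a thread between the check and the maintenance operation, so only the conservative global answer is always correct.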
Re: [PATCH] fs, ipc: Use an asynchronous version of kern_unmount in IPC
Do you have any additional concerns?

On Thu, Feb 7, 2019 at 10:43 AM Salman Qazi wrote:
>
> On Wed, Feb 6, 2019 at 8:14 PM Al Viro wrote:
> >
> > On Wed, Feb 06, 2019 at 11:53:54AM -0800, Salman Qazi wrote:
> >
> > > This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
> > > This is done by implementing an asynchronous version of kern_unmount.
> > >
> > > Since mntput() sleeps, it needs to be deferred to a work queue.
> > >
> > > Additionally, the callers of mq_put_mnt appear to be safe having
> > > it behave asynchronously.  In particular, put_ipc_ns calls
> > > mq_clear_sbinfo which renders the inode inaccessible for the purposes
> > > of mqueue_create by making s_fs_info NULL.  This appears
> > > to be the thing that prevents access while free_ipc_ns is taking place.
> > > So, the unmount should be able to proceed lazily.
> >
> > Ugh...  I really doubt that it's correct.  The caller is
> >         mq_put_mnt(ns);
> >         free_ipc_ns(ns);
> > and we have
> >
> > static void mqueue_evict_inode(struct inode *inode)
> > {
> >
> > ...
> >
> >         ipc_ns = get_ns_from_inode(inode);
> >
> > with
> >
> > static struct ipc_namespace *get_ns_from_inode(struct inode *inode)
> > {
> >         struct ipc_namespace *ns;
> >
> >         spin_lock(&mq_lock);
> >         ns = __get_ns_from_inode(inode);
> >         spin_unlock(&mq_lock);
> >         return ns;
> > }
> >
> > and
> >
> > static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode)
> > {
> >         return get_ipc_ns(inode->i_sb->s_fs_info);
> > }
> >
> > with ->s_fs_info being the ipc_namespace we are freeing after mq_put_mnt()
> >
> > Are you saying that get_ipc_ns() after free_ipc_ns() is safe?  Because
> > ->evict_inode() *IS* called on umount.  What happens to your patch if
> > there was a regular file left on that filesystem?
> >
> > Smells like a memory corruptor...
>
> Actually, the full context in the caller is
>
>         if (refcount_dec_and_lock(&ns->count, &mq_lock)) {
>                 mq_clear_sbinfo(ns);
>                 spin_unlock(&mq_lock);
>                 mq_put_mnt(ns);
>                 free_ipc_ns(ns);
>         }
>
> and
>
> void mq_clear_sbinfo(struct ipc_namespace *ns)
> {
>         ns->mq_mnt->mnt_sb->s_fs_info = NULL;
> }
>
> Therefore, s_fs_info should be NULL before we proceed to unmount.  So,
> as far as I know, it should not be possible to find the ipc_namespace
> from the mount.
[PATCH RESENT] fs, ipc: Use an asynchronous version of kern_unmount in IPC
Prior to this patch, the kernel can spend a lot of time with this
stack trace:

[] __wait_rcu_gp+0x93/0xe0
[] synchronize_sched+0x48/0x60
[] kern_unmount+0x3a/0x46
[] mq_put_mnt+0x15/0x17
[] put_ipc_ns+0x36/0x8b

This patch solves the issue by removing synchronize_rcu from mq_put_mnt.
This is done by implementing an asynchronous version of kern_unmount.

Since mntput() sleeps, it needs to be deferred to a work queue.

Additionally, the callers of mq_put_mnt appear to be safe having
it behave asynchronously.  In particular, put_ipc_ns calls
mq_clear_sbinfo which renders the inode inaccessible for the purposes
of mqueue_create by making s_fs_info NULL.  This appears
to be the thing that prevents access while free_ipc_ns is taking place.
So, the unmount should be able to proceed lazily.

Tested: Ran the following program:

int main(void)
{
	int pid;
	int status;
	int i;

	for (i = 0; i < 1000; i++) {
		pid = fork();
		if (!pid) {
			assert(!unshare(CLONE_NEWUSER|CLONE_NEWIPC|CLONE_NEWNS));
			return 0;
		}
		assert(waitpid(pid, &status, 0) == pid);
	}
}

Before:

$ time ./unshare2

real	0m9.784s
user	0m0.428s
sys	0m0.000s

After:

$ time ./unshare2

real	0m0.368s
user	0m0.226s
sys	0m0.122s

Signed-off-by: Salman Qazi
Reviewed-by: Eric Dumazet
---
 fs/namespace.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  1 +
 ipc/mqueue.c       |  2 +-
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 678ef175d63a..e60b473c3bbc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3321,6 +3321,47 @@ void kern_unmount(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL(kern_unmount);

+struct async_unmount_cb {
+	struct vfsmount *mnt;
+	struct work_struct work;
+	struct rcu_head rcu_head;
+};
+
+static void kern_unmount_work(struct work_struct *work)
+{
+	struct async_unmount_cb *cb = container_of(work,
+		struct async_unmount_cb, work);
+
+	mntput(cb->mnt);
+	kfree(cb);
+}
+
+static void kern_unmount_rcu_cb(struct rcu_head *rcu_head)
+{
+	struct async_unmount_cb *cb = container_of(rcu_head,
+		struct async_unmount_cb, rcu_head);
+
+	INIT_WORK(&cb->work, kern_unmount_work);
+	schedule_work(&cb->work);
+
+}
+
+void kern_unmount_async(struct vfsmount *mnt)
+{
+	/* release long term mount so mount point can be released */
+	if (!IS_ERR_OR_NULL(mnt)) {
+		struct async_unmount_cb *cb = kmalloc(sizeof(*cb), GFP_KERNEL);
+
+		if (cb) {
+			real_mount(mnt)->mnt_ns = NULL;
+			cb->mnt = mnt;
+			call_rcu(&cb->rcu_head, kern_unmount_rcu_cb);
+		} else {
+			kern_unmount(mnt);
+		}
+	}
+}
+
 bool our_mnt(struct vfsmount *mnt)
 {
 	return check_mnt(real_mount(mnt));
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29d8e2cfed0e..8865997a8722 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2274,6 +2274,7 @@ extern int register_filesystem(struct file_system_type *);
 extern int unregister_filesystem(struct file_system_type *);
 extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 #define kern_mount(type) kern_mount_data(type, NULL)
+extern void kern_unmount_async(struct vfsmount *mnt);
 extern void kern_unmount(struct vfsmount *mnt);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c595bed7bfcb..a8c2465ac0cb 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1554,7 +1554,7 @@ void mq_clear_sbinfo(struct ipc_namespace *ns)

 void mq_put_mnt(struct ipc_namespace *ns)
 {
-	kern_unmount(ns->mq_mnt);
+	kern_unmount_async(ns->mq_mnt);
 }

 static int __init init_mqueue_fs(void)
--
2.21.0.352.gf09ad66450-goog
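The two-stage bounce in the patch is worth calling out as a pattern: call_rcu() callbacks run in a context that must not sleep, while mntput() can, so the RCU callback only schedules a work item. A hedged, generic kernel-style sketch of that shape (all names here are illustrative, not the patch's):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Illustrative container: wait out a grace period, then run a
 * teardown that is allowed to sleep. */
struct deferred_put {
	struct rcu_head rcu;
	struct work_struct work;
};

static void deferred_put_work(struct work_struct *work)
{
	struct deferred_put *dp = container_of(work, struct deferred_put, work);

	/* Process context: sleeping calls (mntput() in the patch) are OK. */
	kfree(dp);
}

static void deferred_put_rcu(struct rcu_head *rcu)
{
	struct deferred_put *dp = container_of(rcu, struct deferred_put, rcu);

	/* RCU callback context: may not sleep, so bounce to a workqueue. */
	INIT_WORK(&dp->work, deferred_put_work);
	schedule_work(&dp->work);
}

/* Caller returns immediately instead of blocking in synchronize_rcu(). */
static void deferred_put(struct deferred_put *dp)
{
	call_rcu(&dp->rcu, deferred_put_rcu);
}

Note also the fallback in the patch when kmalloc fails: it degrades to the old synchronous kern_unmount() rather than leaking the mount.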
[tip:x86/asm] x86: Don't clobber top of pt_regs in nested NMI
Commit-ID:  28696f434fef0efa97534b59986ad33b9c4df7f8
Gitweb:     http://git.kernel.org/tip/28696f434fef0efa97534b59986ad33b9c4df7f8
Author:     Salman Qazi
AuthorDate: Mon, 1 Oct 2012 17:29:25 -0700
Committer:  Steven Rostedt
CommitDate: Fri, 2 Nov 2012 11:29:36 -0400

x86: Don't clobber top of pt_regs in nested NMI

The nested NMI modifies the place (instruction, flags and stack)
that the first NMI will iret to.  However, the copy of registers
modified is exactly the one that is the part of pt_regs in
the first NMI.  This can change the behaviour of the first NMI.

In particular, Google's arch_trigger_all_cpu_backtrace handler
also prints regions of memory surrounding addresses appearing in
registers.  This results in handled exceptions, after which nested NMIs
start coming in.  These nested NMIs change the value of registers
in pt_regs.  This can cause the original NMI handler to produce
incorrect output.

We solve this problem by interchanging the position of the preserved
copy of the iret registers ("saved") and the copy subject to being
trampled by nested NMI ("copied").

Link: http://lkml.kernel.org/r/20121002002919.27236.14388.st...@dungbeetle.mtv.corp.google.com

Signed-off-by: Salman Qazi
[ Added a needed CFI_ADJUST_CFA_OFFSET ]
Signed-off-by: Steven Rostedt
---
 arch/x86/kernel/entry_64.S |   41 +++++++++++++++++++++++++++--------------
 1 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b51b2c7..811795d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1699,9 +1699,10 @@ nested_nmi:
 1:
 	/* Set up the interrupted NMIs stack to jump to repeat_nmi */
-	leaq -6*8(%rsp), %rdx
+	leaq -1*8(%rsp), %rdx
 	movq %rdx, %rsp
-	CFI_ADJUST_CFA_OFFSET 6*8
+	CFI_ADJUST_CFA_OFFSET 1*8
+	leaq -10*8(%rsp), %rdx
 	pushq_cfi $__KERNEL_DS
 	pushq_cfi %rdx
 	pushfq_cfi
@@ -1709,8 +1710,8 @@ nested_nmi:
 	pushq_cfi $repeat_nmi

 	/* Put stack back */
-	addq $(11*8), %rsp
-	CFI_ADJUST_CFA_OFFSET -11*8
+	addq $(6*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -6*8

 nested_nmi_out:
 	popq_cfi %rdx
@@ -1736,18 +1737,18 @@ first_nmi:
	 * +-------------------------+
	 * | NMI executing variable  |
	 * +-------------------------+
-	 * | Saved SS                |
-	 * | Saved Return RSP        |
-	 * | Saved RFLAGS            |
-	 * | Saved CS                |
-	 * | Saved RIP               |
-	 * +-------------------------+
	 * | copied SS               |
	 * | copied Return RSP       |
	 * | copied RFLAGS           |
	 * | copied CS               |
	 * | copied RIP              |
	 * +-------------------------+
+	 * | Saved SS                |
+	 * | Saved Return RSP        |
+	 * | Saved RFLAGS            |
+	 * | Saved CS                |
+	 * | Saved RIP               |
+	 * +-------------------------+
	 * | pt_regs                 |
	 * +-------------------------+
	 *
@@ -1763,9 +1764,14 @@ first_nmi:
 	/* Set the NMI executing variable on the stack. */
 	pushq_cfi $1

+	/*
+	 * Leave room for the "copied" frame
+	 */
+	subq $(5*8), %rsp
+
 	/* Copy the stack frame to the Saved frame */
 	.rept 5
-	pushq_cfi 6*8(%rsp)
+	pushq_cfi 11*8(%rsp)
 	.endr
 	CFI_DEF_CFA_OFFSET SS+8-RIP

@@ -1786,12 +1792,15 @@ repeat_nmi:
	 * is benign for the non-repeat case, where 1 was pushed just above
	 * to this very stack slot).
	 */
-	movq $1, 5*8(%rsp)
+	movq $1, 10*8(%rsp)

 	/* Make another copy, this one may be modified by nested NMIs */
+	addq $(10*8), %rsp
+	CFI_ADJUST_CFA_OFFSET -10*8
 	.rept 5
-	pushq_cfi 4*8(%rsp)
+	pushq_cfi -6*8(%rsp)
 	.endr
+	subq $(5*8), %rsp
 	CFI_DEF_CFA_OFFSET SS+8-RIP

 end_repeat_nmi:
@@ -1842,8 +1851,12 @@ nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_ALL 8
+
+	/* Pop the extra iret frame */
+	addq $(5*8), %rsp
+
 	/* Clear the NMI executing stack variable */
-	movq $0, 10*8(%rsp)
+	movq $0, 5*8(%rsp)
 	jmp irq_return
 	CFI_ENDPROC
 END(nmi)