On Fri, Apr 28, 2017 at 12:39:12PM -0700, Dan Williams wrote:
> The pmem driver needs to transfer data to a persistent memory
> destination and to rely on the fact that the destination writes
> are not cached. It is sufficient for the writes to be flushed to a
> cpu-store-buffer (non-temporal / "movnt" in x86 terms), as we expect
> userspace to call fsync() to ensure data-writes have reached a
> power-fail-safe zone in the platform. The fsync() triggers a REQ_FUA or
> REQ_FLUSH to the pmem driver which will turn around and fence previous
> writes with an "sfence".
> 
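This is how I picture the driver side of that contract, just restating it for
anyone following along. It is purely a sketch, not the actual
drivers/nvdimm/pmem.c code: the helper names are illustrative, and I am
assuming copy_from_iter_wt() mirrors copy_from_iter_nocache()'s signature and
is declared via linux/uio.h.

#include <linux/blk_types.h>    /* struct bio, REQ_PREFLUSH, REQ_FUA */
#include <linux/uio.h>          /* struct iov_iter, copy_from_iter_wt() (assumed) */

/* Write path: data goes out via non-temporal stores, nothing left dirty. */
static size_t pmem_copy_from_iter(void *pmem_addr, size_t bytes,
                struct iov_iter *i)
{
        return copy_from_iter_wt(pmem_addr, bytes, i);
}

/*
 * Flush path: fsync() arrives as REQ_PREFLUSH/REQ_FUA (REQ_FLUSH in the
 * changelog's terms); a store fence is all that is needed.
 */
static void pmem_make_durable(struct bio *bio)
{
        if (bio->bi_opf & (REQ_PREFLUSH | REQ_FUA))
                wmb();  /* sfence on x86: drains the prior movnt stores */
}
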
> Implement __copy_from_user_inatomic_wt(), memcpy_page_wt(), and
> memcpy_wt(), which guarantee that the destination buffer is not dirty in
> the cpu cache on completion. The new copy_from_iter_wt() and its
> sub-routines will be used to replace the "pmem api"
> (include/linux/pmem.h + arch/x86/include/asm/pmem.h). The availability
> of copy_from_iter_wt() and memcpy_wt() is gated by the
> CONFIG_ARCH_HAS_UACCESS_WT config symbol; they fall back to
> copy_from_iter_nocache() and plain memcpy() otherwise.
> 
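The generic wrapper hunks are trimmed from the quote below, so purely as a
sketch of the gating described above (placement, e.g. linux/uio.h and
linux/string.h, and the exact signatures are my guess, modeled on the
existing nocache variants):

#ifdef CONFIG_ARCH_HAS_UACCESS_WT
/* The arch provides copies that are not left dirty in the cpu cache. */
size_t copy_from_iter_wt(void *addr, size_t bytes, struct iov_iter *i);
void memcpy_wt(void *dst, const void *src, size_t cnt);
#else
/* No write-through primitives: degrade to a nocache copy / plain memcpy. */
static inline size_t copy_from_iter_wt(void *addr, size_t bytes,
                struct iov_iter *i)
{
        return copy_from_iter_nocache(addr, bytes, i);
}

static inline void memcpy_wt(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
}
#endif
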
> This is meant to satisfy the concern from Linus that if a driver wants
> to do something beyond the normal nocache semantics it should be
> something private to that driver [1], and Al's concern that anything
> uaccess related belongs with the rest of the uaccess code [2].
> 
> [1]: https://lists.01.org/pipermail/linux-nvdimm/2017-January/008364.html
> [2]: https://lists.01.org/pipermail/linux-nvdimm/2017-April/009942.html
> 
> Cc: <x...@kernel.org>
> Cc: Jan Kara <j...@suse.cz>
> Cc: Jeff Moyer <jmo...@redhat.com>
> Cc: Ingo Molnar <mi...@redhat.com>
> Cc: Christoph Hellwig <h...@lst.de>
> Cc: "H. Peter Anvin" <h...@zytor.com>
> Cc: Al Viro <v...@zeniv.linux.org.uk>
> Cc: Thomas Gleixner <t...@linutronix.de>
> Cc: Matthew Wilcox <mawil...@microsoft.com>
> Cc: Ross Zwisler <ross.zwis...@linux.intel.com>
> Signed-off-by: Dan Williams <dan.j.willi...@intel.com>
> ---
<>
> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
> index c5504b9a472e..07ded30c7e89 100644
> --- a/arch/x86/include/asm/uaccess_64.h
> +++ b/arch/x86/include/asm/uaccess_64.h
> @@ -171,6 +171,10 @@ unsigned long raw_copy_in_user(void __user *dst, const void __user *src, unsigne
>  extern long __copy_user_nocache(void *dst, const void __user *src,
>                               unsigned size, int zerorest);
>  
> +extern long __copy_user_wt(void *dst, const void __user *src, unsigned size);
> +extern void memcpy_page_wt(char *to, struct page *page, size_t offset,
> +                        size_t len);
> +
>  static inline int
>  __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
>                                 unsigned size)
> @@ -179,6 +183,13 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
>       return __copy_user_nocache(dst, src, size, 0);
>  }
>  
> +static inline int
> +__copy_from_user_inatomic_wt(void *dst, const void __user *src, unsigned size)
> +{
> +     kasan_check_write(dst, size);
> +     return __copy_user_wt(dst, src, size);
> +}
> +
>  unsigned long
>  copy_user_handle_tail(char *to, char *from, unsigned len);
>  
> diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
> index 3b7c40a2e3e1..0aeff66a022f 100644
> --- a/arch/x86/lib/usercopy_64.c
> +++ b/arch/x86/lib/usercopy_64.c
> @@ -7,6 +7,7 @@
>   */
>  #include <linux/export.h>
>  #include <linux/uaccess.h>
> +#include <linux/highmem.h>
>  
>  /*
>   * Zero Userspace
> @@ -73,3 +74,130 @@ copy_user_handle_tail(char *to, char *from, unsigned len)
>       clac();
>       return len;
>  }
> +
> +#ifdef CONFIG_ARCH_HAS_UACCESS_WT
> +/**
> + * clean_cache_range - write back a cache range with CLWB
> + * @addr:    virtual start address
> + * @size:    number of bytes to write back
> + *
> + * Write back a cache range using the CLWB (cache line write back)
> + * instruction. Note that @size is internally rounded up to be cache
> + * line size aligned.
> + */
> +static void clean_cache_range(void *addr, size_t size)
> +{
> +     u16 x86_clflush_size = boot_cpu_data.x86_clflush_size;
> +     unsigned long clflush_mask = x86_clflush_size - 1;
> +     void *vend = addr + size;
> +     void *p;
> +
> +     for (p = (void *)((unsigned long)addr & ~clflush_mask);
> +          p < vend; p += x86_clflush_size)
> +             clwb(p);
> +}
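
A tiny worked example of the rounding, mostly for my own notes: with a 64-byte
line size, clean_cache_range(p, 1) for p == 0x1078 rounds the start down to
0x1040 and issues a single clwb, which is how the callers below write back the
line that holds an unaligned head or tail byte.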
> +
> +long __copy_user_wt(void *dst, const void __user *src, unsigned size)
> +{
> +     unsigned long flushed, dest = (unsigned long) dst;
> +     long rc = __copy_user_nocache(dst, src, size, 0);
> +
> +     /*
> +      * __copy_user_nocache() uses non-temporal stores for the bulk
> +      * of the transfer, but we need to manually flush if the
> +      * transfer is unaligned. A cached memory copy is used when
> +      * destination or size is not naturally aligned. That is:
> +      *   - Require 8-byte alignment when size is 8 bytes or larger.
> +      *   - Require 4-byte alignment when size is 4 bytes.
> +      */
> +     if (size < 8) {
> +             if (!IS_ALIGNED(dest, 4) || size != 4)
> +                     clean_cache_range(dst, size);
> +     } else {
> +             if (!IS_ALIGNED(dest, 8)) {
> +                     dest = ALIGN(dest, boot_cpu_data.x86_clflush_size);
> +                     clean_cache_range(dst, 1);
> +             }
> +
> +             flushed = dest - (unsigned long) dst;
> +             if (size > flushed && !IS_ALIGNED(size - flushed, 8))
> +                     clean_cache_range(dst + size - 1, 1);
> +     }
> +
> +     return rc;
> +}
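
A worked example to convince myself of the flush decisions, assuming I am
reading __copy_user_nocache() correctly: for an 8-byte-aligned dst and
size == 19, the first 16 bytes go out with movnti and the 3-byte tail is
copied with ordinary cached stores; size - flushed is 19, not a multiple of 8,
so clean_cache_range(dst + 18, 1) writes back the line holding that tail. (A
4-byte remainder on a 4-byte-aligned destination actually goes out with a
4-byte movnti, so the extra flush in that case is just a harmless clwb.)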
> +
> +void memcpy_wt(void *_dst, const void *_src, size_t size)
> +{
> +     unsigned long dest = (unsigned long) _dst;
> +     unsigned long source = (unsigned long) _src;
> +
> +     /* cache copy and flush to align dest */
> +     if (!IS_ALIGNED(dest, 8)) {
> +             unsigned len = min_t(unsigned, size, ALIGN(dest, 8) - dest);
> +
> +             memcpy((void *) dest, (void *) source, len);
> +             clean_cache_range((void *) dest, len);
> +             dest += len;
> +             source += len;
> +             size -= len;
> +             if (!size)
> +                     return;
> +     }
> +
> +     /* 4x8 movnti loop */
> +     while (size >= 32) {
> +             asm("movq    (%0), %%r8\n"
> +                 "movq   8(%0), %%r9\n"
> +                 "movq  16(%0), %%r10\n"
> +                 "movq  24(%0), %%r11\n"
> +                 "movnti  %%r8,   (%1)\n"
> +                 "movnti  %%r9,  8(%1)\n"
> +                 "movnti %%r10, 16(%1)\n"
> +                 "movnti %%r11, 24(%1)\n"
> +                 :: "r" (source), "r" (dest)
> +                 : "memory", "r8", "r9", "r10", "r11");
> +             dest += 32;
> +             source += 32;
> +             size -= 32;
> +     }
> +
> +     /* 1x8 movnti loop */
> +     while (size >= 8) {
> +             asm("movq    (%0), %%r8\n"
> +                 "movnti  %%r8,   (%1)\n"
> +                 :: "r" (source), "r" (dest)
> +                 : "memory", "r8");
> +             dest += 8;
> +             source += 8;
> +             size -= 8;
> +     }
> +
> +     /* 1x4 movnti loop */
> +     while (size >= 4) {
> +             asm("movl    (%0), %%r8d\n"
> +                 "movnti  %%r8d,   (%1)\n"
> +                 :: "r" (source), "r" (dest)
> +                 : "memory", "r8");
> +             dest += 4;
> +             source += 4;
> +             size -= 4;
> +     }
> +
> +     /* cache copy for remaining bytes */
> +     if (size) {
> +             memcpy((void *) dest, (void *) source, size);
> +             clean_cache_range((void *) dest, size);
> +     }
> +}
> +EXPORT_SYMBOL_GPL(memcpy_wt);
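
Same exercise for memcpy_wt(): a 45-byte copy to a destination ending in ...3
copies the first 5 bytes through the cache and immediately clwb's them, streams
32 bytes in the 4x8 movnti loop and the final 8 bytes in the 1x8 loop, and
leaves nothing dirty behind. As the changelog notes, ordering is still the
caller's problem: the pmem driver still owes an sfence (e.g. when it sees
REQ_FLUSH) to fence the preceding movnt stores.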

I took a pretty hard look at the changes in arch/x86/lib/usercopy_64.c, and
they look correct to me.  The inline assembly for non-temporal copies mixed
with C for the loop control is IMHO much easier to follow than the pure assembly
of __copy_user_nocache().

Reviewed-by: Ross Zwisler <ross.zwis...@linux.intel.com>
