On Fri, 2019-08-16 at 15:52 +0000, Christophe Leroy wrote:
> Resulting code (8xx with 16 bytes per cacheline and 16k pages)
> 
> 0000016c <__flush_dcache_icache_phys>:
>  16c: 54 63 00 22     rlwinm  r3,r3,0,0,17
>  170: 7d 20 00 a6     mfmsr   r9
>  174: 39 40 04 00     li      r10,1024
>  178: 55 28 07 34     rlwinm  r8,r9,0,28,26
>  17c: 7c 67 1b 78     mr      r7,r3
>  180: 7d 49 03 a6     mtctr   r10
>  184: 7d 00 01 24     mtmsr   r8
>  188: 4c 00 01 2c     isync
>  18c: 7c 00 18 6c     dcbst   0,r3
>  190: 38 63 00 10     addi    r3,r3,16
>  194: 42 00 ff f8     bdnz    18c <__flush_dcache_icache_phys+0x20>
>  198: 7c 00 04 ac     hwsync
>  19c: 7d 49 03 a6     mtctr   r10
>  1a0: 7c 00 3f ac     icbi    0,r7
>  1a4: 38 e7 00 10     addi    r7,r7,16
>  1a8: 42 00 ff f8     bdnz    1a0 <__flush_dcache_icache_phys+0x34>
>  1ac: 7c 00 04 ac     hwsync
>  1b0: 7d 20 01 24     mtmsr   r9
>  1b4: 4c 00 01 2c     isync
>  1b8: 4e 80 00 20     blr
> 
> Signed-off-by: Christophe Leroy <christophe.le...@c-s.fr>
> ---
>  This patch is on top of Alastair's series "powerpc: convert cache
> asm to C"
>  Patch 3 of that series should touch __flush_dcache_icache_phys and
> this
>  patch could come just after patch 3.
> 
>  arch/powerpc/include/asm/cacheflush.h |  8 +++++
>  arch/powerpc/mm/mem.c                 | 55
> ++++++++++++++++++++++++++++-------
>  2 files changed, 53 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/cacheflush.h
> b/arch/powerpc/include/asm/cacheflush.h
> index 1826bf2cc137..bf4f2dc4eb76 100644
> --- a/arch/powerpc/include/asm/cacheflush.h
> +++ b/arch/powerpc/include/asm/cacheflush.h
> @@ -47,6 +47,14 @@ void flush_icache_user_range(struct vm_area_struct
> *vma,
>                                   struct page *page, unsigned long
> addr,
>                                   int len);
>  void flush_dcache_icache_page(struct page *page);
> +#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
> +void __flush_dcache_icache_phys(unsigned long physaddr);
> +#else
> +static inline void __flush_dcache_icache_phys(unsigned long
> physaddr)
> +{
> +     BUG();
> +}
> +#endif
>  
>  /**
>   * flush_dcache_range(): Write any modified data cache blocks out to
> memory and invalidate them.
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 43be99de7c9a..43009f9227c4 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -402,6 +402,50 @@ void flush_dcache_page(struct page *page)
>  }
>  EXPORT_SYMBOL(flush_dcache_page);
>  
> +#if defined(CONFIG_PPC32) && !defined(CONFIG_BOOKE)
> +void __flush_dcache_icache_phys(unsigned long physaddr)
> +{
> +     unsigned long bytes = l1_dcache_bytes();
> +     unsigned long nb = PAGE_SIZE / bytes;
> +     unsigned long addr = physaddr & PAGE_MASK;
> +     unsigned long msr, msr0;
> +     unsigned long loop1 = addr, loop2 = addr;
> +
> +     if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) {
> +             /* For a snooping icache, we still need a dummy icbi to
> purge all the
> +              * prefetched instructions from the ifetch buffers. We
> also need a sync
> +              * before the icbi to order the the actual stores to
> memory that might
> +              * have modified instructions with the icbi.
> +              */
> +             mb(); /* sync */
> +             icbi((void *)addr);
> +             mb(); /* sync */
> +             isync();
> +             return;
> +     }
> +     msr0 = mfmsr();
> +     msr = msr0 & ~MSR_DR;
> +     asm volatile(
> +         "   mtctr %2;"
> +         "   mtmsr %3;"
> +         "   isync;"
> +         "0: dcbst   0, %0;"
> +         "   addi    %0, %0, %4;"
> +         "   bdnz    0b;"
> +         "   sync;"
> +         "   mtctr %2;"
> +         "1: icbi    0, %1;"
> +         "   addi    %1, %1, %4;"
> +         "   bdnz    1b;"
> +         "   sync;"
> +         "   mtmsr %5;"
> +         "   isync;"
> +         : "+r" (loop1), "+r" (loop2)
> +         : "r" (nb), "r" (msr), "i" (bytes), "r" (msr0)
> +         : "ctr", "memory");
> +}
> +#endif
> +
>  void flush_dcache_icache_page(struct page *page)
>  {
>  #ifdef CONFIG_HUGETLB_PAGE
> @@ -419,16 +463,7 @@ void flush_dcache_icache_page(struct page *page)
>               __flush_dcache_icache(start);
>               kunmap_atomic(start);
>       } else {
> -             unsigned long msr = mfmsr();
> -
> -             /* Clear the DR bit so that we operate on physical
> -              * rather than virtual addresses
> -              */
> -             mtmsr(msr & ~(MSR_DR));
> -
> -             __flush_dcache_icache((void *)physaddr);
> -
> -             mtmsr(msr);
> +             __flush_dcache_icache_phys(page_to_pfn(page) <<
> PAGE_SHIFT);
>       }
>  #endif
>  }


Thanks Christophe,

I'm trying a somewhat different approach that requires less knowledge
of assembler. Handling of CPU_FTR_COHERENT_ICACHE is outside this
function. The code below is not a patch as my tree is a bit messy,
sorry:

/**
 * flush_dcache_icache_phys() - Flush a page by it's physical address
 * @addr: the physical address of the page
 */
static void flush_dcache_icache_phys(unsigned long addr)
{
        register unsigned long msr;
        register unsigned long dlines = PAGE_SIZE >> l1_dcache_shift();
        register unsigned long dbytes = l1_dcache_bytes();
        register unsigned long ilines = PAGE_SIZE >> l1_icache_shift();
        register unsigned long ibytes = l1_icache_bytes();
        register unsigned long i;
        register unsigned long address = addr;

        /*
         * Clear the DR bit so that we operate on physical
         * rather than virtual addresses
         */
        msr = mfmsr();
        mtmsr(msr & ~(MSR_DR));

        /* Write out the data cache */
        for (i = 0; i < dlines; i++, address += dbytes)
                dcbst((void *)address);

        /* Invalidate the instruction cache */
        address = addr;
        for (i = 0; i < ilines; i++, address += ibytes)
                icbi((void *)address);

        mtmsr(msr);
}

void test_flush_phys(unsigned long addr)
{
        flush_dcache_icache_phys(addr);
}


This gives the following assembler (using pmac32_defconfig):
000003cc <test_flush_phys>:
 3cc:   94 21 ff f0     stwu    r1,-16(r1)
 3d0:   7d 00 00 a6     mfmsr   r8
 3d4:   55 09 07 34     rlwinm  r9,r8,0,28,26
 3d8:   7d 20 01 24     mtmsr   r9
 3dc:   39 20 00 80     li      r9,128
 3e0:   7d 29 03 a6     mtctr   r9
 3e4:   39 43 10 00     addi    r10,r3,4096
 3e8:   7c 69 1b 78     mr      r9,r3
 3ec:   7c 00 48 6c     dcbst   0,r9
 3f0:   39 29 00 20     addi    r9,r9,32
 3f4:   42 00 ff f8     bdnz    3ec <test_flush_phys+0x20>
 3f8:   7c 00 1f ac     icbi    0,r3
 3fc:   38 63 00 20     addi    r3,r3,32
 400:   7f 8a 18 40     cmplw   cr7,r10,r3
 404:   40 9e ff f4     bne     cr7,3f8 <test_flush_phys+0x2c>
 408:   7d 00 01 24     mtmsr   r8
 40c:   38 21 00 10     addi    r1,r1,16
 410:   4e 80 00 20     blr


-- 
Alastair D'Silva
Open Source Developer
Linux Technology Centre, IBM Australia
mob: 0423 762 819

Reply via email to