One thing that occurred to me was that since the __kernel_datapage_offset
is located immediately before the function, the offset is small and you
could get rid of the addi and just fold it into the lwz.


Aaron Sawdey, Ph.D.  saw...@us.ibm.com
050-2/C113  (507) 253-7520 TL: 553-7520 home: 507/263-0782
IBM Linux Technology Center - PPC Toolchain



From:   Denis Kirjanov <k...@linux-powerpc.org>
To:     Michael Neuling <mi...@neuling.org>
Cc:     Michael Ellerman <mich...@ellerman.id.au>, benh
            <b...@kernel.crashing.org>, Aaron Sawdey/Rochester/IBM@IBMUS,
            linuxppc-dev <linuxppc-dev@lists.ozlabs.org>, Anton Blanchard
            <an...@samba.org>, Steve Munroe/Rochester/IBM@IBMUS
Date:   09/23/2015 01:22 PM
Subject:        Re: [PATCH] powerpc/vdso: Avoid link stack corruption in
            __get_datapage()



On 9/23/15, Michael Neuling <mi...@neuling.org> wrote:
> The 32 and 64 bit variants of __get_datapage() use a "bcl; mflr" to
> determine the loaded address of the VDSO. The current version of these
> attempt to use the special bcl variant which avoids pushing to the
> link stack.
>
> Unfortunately it uses bcl+8 rather than the required bcl+4. Hence the
> current code results in link stack corruption and the resulting
> performance degradation (due to branch mis-prediction).
>
> This patch moves us to bcl+4 by moving __kernel_datapage_offset
> out of __get_datapage().
>
> With this patch, running the below benchmark we get a bump in
> performance on POWER8 for gettimeofday() (which uses
> __get_datapage()).
>
> 64bit gets ~4% improvement:
>   Without patch:
>     # ./tb
>     time = 0.180321
>   With patch:
>     # ./tb
>     time = 0.187408
>
> 32bit gets ~9% improvement:
>   Without patch:
>     # ./tb
>     time = 0.276551
>   With patch:
>     # ./tb
>     time = 0.252767

I've got the following results on POWER7 64bit
without the patch:
# ./tb
time = 0.263337
# ./tb
time = 0.251273
# ./tb
time = 0.258453
# ./tb
time = 0.260189

with the patch:
# ./tb
time = 0.241517
# ./tb
time = 0.241973
# ./tb
time = 0.239365
# ./tb
time = 0.240109



>
> Testcase tb.c (stolen from Anton)
>   /* gcc -O2 tb.c -o tb */
>   #include <sys/time.h>
>   #include <stdio.h>
>
>   int main()
>   {
>                  int i;
>
>                  struct timeval tv_start, tv_end;
>
>                  gettimeofday(&tv_start, NULL);
>
>                  for(i = 0; i < 10000000; i++) {
>                                  gettimeofday(&tv_end, NULL);
>                  }
>
>                  printf("time = %.6f\n", tv_end.tv_sec - tv_start.tv_sec +
(tv_end.tv_usec
> - tv_start.tv_usec) * 1e-6);
>
>                  return 0;
>   }
>
> Signed-off-by: Michael Neuling <mi...@neuling.org>
> Reported-by: Aaron Sawdey <saw...@us.ibm.com>
>
> diff --git a/arch/powerpc/kernel/vdso32/datapage.S
> b/arch/powerpc/kernel/vdso32/datapage.S
> index dc21e89..59cf5f4 100644
> --- a/arch/powerpc/kernel/vdso32/datapage.S
> +++ b/arch/powerpc/kernel/vdso32/datapage.S
> @@ -16,6 +16,10 @@
>  #include <asm/vdso.h>
>
>                .text
> +              .global                 __kernel_datapage_offset;
> +__kernel_datapage_offset:
> +              .long           0
> +
>  V_FUNCTION_BEGIN(__get_datapage)
>    .cfi_startproc
>                /* We don't want that exposed or overridable as we want other
objects
> @@ -27,13 +31,11 @@ V_FUNCTION_BEGIN(__get_datapage)
>                mflr            r0
>    .cfi_register lr,r0
>
> -              bcl             20,31,1f
> -              .global                 __kernel_datapage_offset;
> -__kernel_datapage_offset:
> -              .long           0
> -1:
> +              bcl             20,31,data_page_branch
> +data_page_branch:
>                mflr            r3
>                mtlr            r0
> +              addi            r3, r3, 
> __kernel_datapage_offset-data_page_branch
>                lwz             r0,0(r3)
>                add             r3,r0,r3
>                blr
> diff --git a/arch/powerpc/kernel/vdso64/datapage.S
> b/arch/powerpc/kernel/vdso64/datapage.S
> index 79796de..2f01c4a 100644
> --- a/arch/powerpc/kernel/vdso64/datapage.S
> +++ b/arch/powerpc/kernel/vdso64/datapage.S
> @@ -16,6 +16,10 @@
>  #include <asm/vdso.h>
>
>                .text
> +.global               __kernel_datapage_offset;
> +__kernel_datapage_offset:
> +              .long           0
> +
>  V_FUNCTION_BEGIN(__get_datapage)
>    .cfi_startproc
>                /* We don't want that exposed or overridable as we want other
objects
> @@ -27,13 +31,11 @@ V_FUNCTION_BEGIN(__get_datapage)
>                mflr            r0
>    .cfi_register lr,r0
>
> -              bcl             20,31,1f
> -              .global                 __kernel_datapage_offset;
> -__kernel_datapage_offset:
> -              .long           0
> -1:
> +              bcl             20,31,data_page_branch
> +data_page_branch:
>                mflr            r3
>                mtlr            r0
> +              addi            r3, r3, 
> __kernel_datapage_offset-data_page_branch
>                lwz             r0,0(r3)
>                add             r3,r0,r3
>                blr
>
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev@lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev



_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to