On Thu, Apr 07, 2016 at 11:43:33AM +0200, Borislav Petkov wrote:
> I guess we can do something like this:
> 
>        if (likely(static_cpu_has(X86_FEATURE_POPCNT)))
>                asm volatile(POPCNT32
>                             : "="REG_OUT (res)
>                             : REG_IN (w));
>        else
>                res = __sw_hweight32(w);
> 
> and get rid of the custom calling convention.
> 
> Along with some numbers showing that the change doesn't cause any
> noticeable slowdown...

Ok, here's something which seems to build and boot in kvm.

I like how we don't need the special calling conventions anymore and we
can actually say "popcnt .." and gcc selects registers.

The include files hackery is kinda nasty but I had to do it because I
needed to be able to use static_cpu_has() in a header and including
asm/cpufeature.h pulls in all kinds of nasty dependencies. I'm certainly
open for better ideas...

---
From: Borislav Petkov <b...@suse.de>
Date: Wed, 4 May 2016 18:52:09 +0200
Subject: [PATCH] x86/hweight: Get rid of the special calling convention

People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench
into kcov, lto, etc, experimentation.

And it's not like we absolutely need it so let's get rid of it and
streamline it a bit. I had to do some carving out of facilities so
that the include hell doesn't swallow me but other than that, the new
__arch_hweight*() versions look much more palatable and gcc is more free
to select registers than us hardcoding them in the insn bytes.

Signed-off-by: Borislav Petkov <b...@suse.de>
---
 arch/x86/Kconfig                      |   5 --
 arch/x86/include/asm/arch_hweight.h   |  43 ++++---------
 arch/x86/include/asm/cpufeature.h     | 112 +-------------------------------
 arch/x86/include/asm/cpuinfo.h        |  65 +++++++++++++++++++
 arch/x86/include/asm/processor.h      |  63 +-----------------
 arch/x86/include/asm/static_cpu_has.h | 116 ++++++++++++++++++++++++++++++++++
 lib/Makefile                          |   5 --
 7 files changed, 197 insertions(+), 212 deletions(-)
 create mode 100644 arch/x86/include/asm/cpuinfo.h
 create mode 100644 arch/x86/include/asm/static_cpu_has.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7bb15747fea2..79e0bcd61cb1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -292,11 +292,6 @@ config X86_32_LAZY_GS
        def_bool y
        depends on X86_32 && !CC_STACKPROTECTOR
 
-config ARCH_HWEIGHT_CFLAGS
-       string
-       default "-fcall-saved-ecx -fcall-saved-edx" if X86_32
-       default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64
-
 config ARCH_SUPPORTS_UPROBES
        def_bool y
 
diff --git a/arch/x86/include/asm/arch_hweight.h 
b/arch/x86/include/asm/arch_hweight.h
index 02e799fa43d1..6c1a2d500c4c 100644
--- a/arch/x86/include/asm/arch_hweight.h
+++ b/arch/x86/include/asm/arch_hweight.h
@@ -2,36 +2,18 @@
 #define _ASM_X86_HWEIGHT_H
 
 #include <asm/cpufeatures.h>
+#include <asm/static_cpu_has.h>
 
-#ifdef CONFIG_64BIT
-/* popcnt %edi, %eax -- redundant REX prefix for alignment */
-#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7"
-/* popcnt %rdi, %rax */
-#define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7"
-#define REG_IN "D"
-#define REG_OUT "a"
-#else
-/* popcnt %eax, %eax */
-#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc0"
-#define REG_IN "a"
-#define REG_OUT "a"
-#endif
-
-/*
- * __sw_hweightXX are called from within the alternatives below
- * and callee-clobbered registers need to be taken care of. See
- * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective
- * compiler switches.
- */
 static __always_inline unsigned int __arch_hweight32(unsigned int w)
 {
-       unsigned int res = 0;
+       unsigned int res;
 
-       asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT)
-                    : "="REG_OUT (res)
-                    : REG_IN (w));
+       if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
+               asm volatile("popcnt %[w], %[res]" : [res] "=r" (res) : [w] "r" (w));
 
-       return res;
+               return res;
+       }
+       return __sw_hweight32(w);
 }
 
 static inline unsigned int __arch_hweight16(unsigned int w)
@@ -53,13 +35,14 @@ static inline unsigned long __arch_hweight64(__u64 w)
 #else
 static __always_inline unsigned long __arch_hweight64(__u64 w)
 {
-       unsigned long res = 0;
+       unsigned long res;
 
-       asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT)
-                    : "="REG_OUT (res)
-                    : REG_IN (w));
+       if (likely(static_cpu_has(X86_FEATURE_POPCNT))) {
+               asm volatile("popcnt %[w], %[res]" : [res] "=r" (res) : [w] "r" (w));
 
-       return res;
+               return res;
+       }
+       return __sw_hweight64(w);
 }
 #endif /* CONFIG_X86_32 */
 
diff --git a/arch/x86/include/asm/cpufeature.h 
b/arch/x86/include/asm/cpufeature.h
index 07c942d84662..9a70b12ae8df 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -6,6 +6,8 @@
 #if defined(__KERNEL__) && !defined(__ASSEMBLY__)
 
 #include <asm/asm.h>
+#include <asm/static_cpu_has.h>
+
 #include <linux/bitops.h>
 
 enum cpuid_leafs
@@ -45,51 +47,6 @@ extern const char * const x86_power_flags[32];
  */
 extern const char * const x86_bug_flags[NBUGINTS*32];
 
-#define test_cpu_cap(c, bit)                                           \
-        test_bit(bit, (unsigned long *)((c)->x86_capability))
-
-#define REQUIRED_MASK_BIT_SET(bit)                                     \
-        ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||   \
-          (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||   \
-          (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||   \
-          (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||   \
-          (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||   \
-          (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||   \
-          (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||   \
-          (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||   \
-          (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||   \
-          (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||   \
-          (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||   \
-          (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||   \
-          (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||   \
-          (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
-
-#define DISABLED_MASK_BIT_SET(bit)                                     \
-        ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||   \
-          (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||   \
-          (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||   \
-          (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||   \
-          (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||   \
-          (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||   \
-          (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||   \
-          (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||   \
-          (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||   \
-          (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||   \
-          (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||   \
-          (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||   \
-          (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||   \
-          (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||   \
-          (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
-
-#define cpu_has(c, bit)                                                        
\
-       (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
-        test_cpu_cap(c, bit))
-
 #define this_cpu_has(bit)                                              \
        (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
         x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
@@ -105,8 +62,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define cpu_feature_enabled(bit)       \
        (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : 
static_cpu_has(bit))
 
-#define boot_cpu_has(bit)      cpu_has(&boot_cpu_data, bit)
-
 #define set_cpu_cap(c, bit)    set_bit(bit, (unsigned long 
*)((c)->x86_capability))
 #define clear_cpu_cap(c, bit)  clear_bit(bit, (unsigned long 
*)((c)->x86_capability))
 #define setup_clear_cpu_cap(bit) do { \
@@ -118,69 +73,6 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
        set_bit(bit, (unsigned long *)cpu_caps_set);    \
 } while (0)
 
-#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
-/*
- * Static testing of CPU features.  Used the same as boot_cpu_has().
- * These will statically patch the target code for additional
- * performance.
- */
-static __always_inline __pure bool _static_cpu_has(u16 bit)
-{
-               asm_volatile_goto("1: jmp 6f\n"
-                        "2:\n"
-                        ".skip -(((5f-4f) - (2b-1b)) > 0) * "
-                                "((5f-4f) - (2b-1b)),0x90\n"
-                        "3:\n"
-                        ".section .altinstructions,\"a\"\n"
-                        " .long 1b - .\n"              /* src offset */
-                        " .long 4f - .\n"              /* repl offset */
-                        " .word %P1\n"                 /* always replace */
-                        " .byte 3b - 1b\n"             /* src len */
-                        " .byte 5f - 4f\n"             /* repl len */
-                        " .byte 3b - 2b\n"             /* pad len */
-                        ".previous\n"
-                        ".section .altinstr_replacement,\"ax\"\n"
-                        "4: jmp %l[t_no]\n"
-                        "5:\n"
-                        ".previous\n"
-                        ".section .altinstructions,\"a\"\n"
-                        " .long 1b - .\n"              /* src offset */
-                        " .long 0\n"                   /* no replacement */
-                        " .word %P0\n"                 /* feature bit */
-                        " .byte 3b - 1b\n"             /* src len */
-                        " .byte 0\n"                   /* repl len */
-                        " .byte 0\n"                   /* pad len */
-                        ".previous\n"
-                        ".section .altinstr_aux,\"ax\"\n"
-                        "6:\n"
-                        " testb %[bitnum],%[cap_byte]\n"
-                        " jnz %l[t_yes]\n"
-                        " jmp %l[t_no]\n"
-                        ".previous\n"
-                        : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
-                            [bitnum] "i" (1 << (bit & 7)),
-                            [cap_byte] "m" (((const char 
*)boot_cpu_data.x86_capability)[bit >> 3])
-                        : : t_yes, t_no);
-       t_yes:
-               return true;
-       t_no:
-               return false;
-}
-
-#define static_cpu_has(bit)                                    \
-(                                                              \
-       __builtin_constant_p(boot_cpu_has(bit)) ?               \
-               boot_cpu_has(bit) :                             \
-               _static_cpu_has(bit)                            \
-)
-#else
-/*
- * Fall back to dynamic for gcc versions which don't support asm goto. Should 
be
- * a minority now anyway.
- */
-#define static_cpu_has(bit)            boot_cpu_has(bit)
-#endif
-
 #define cpu_has_bug(c, bit)            cpu_has(c, (bit))
 #define set_cpu_bug(c, bit)            set_cpu_cap(c, (bit))
 #define clear_cpu_bug(c, bit)          clear_cpu_cap(c, (bit))
diff --git a/arch/x86/include/asm/cpuinfo.h b/arch/x86/include/asm/cpuinfo.h
new file mode 100644
index 000000000000..a6632044f199
--- /dev/null
+++ b/arch/x86/include/asm/cpuinfo.h
@@ -0,0 +1,65 @@
+#ifndef _ASM_X86_CPUINFO_H_
+#define _ASM_X86_CPUINFO_H_
+
+/*
+ *  CPU type and hardware bug flags. Kept separately for each CPU.
+ *  Members of this structure are referenced in head.S, so think twice
+ *  before touching them. [mj]
+ */
+struct cpuinfo_x86 {
+       __u8                    x86;            /* CPU family */
+       __u8                    x86_vendor;     /* CPU vendor */
+       __u8                    x86_model;
+       __u8                    x86_mask;
+#ifdef CONFIG_X86_32
+       char                    wp_works_ok;    /* It doesn't on 386's */
+
+       /* Problems on some 486Dx4's and old 386's: */
+       char                    rfu;
+       char                    pad0;
+       char                    pad1;
+#else
+       /* Number of 4K pages in DTLB/ITLB combined(in pages): */
+       int                     x86_tlbsize;
+#endif
+       __u8                    x86_virt_bits;
+       __u8                    x86_phys_bits;
+       /* CPUID returned core id bits: */
+       __u8                    x86_coreid_bits;
+       /* Max extended CPUID function supported: */
+       __u32                   extended_cpuid_level;
+       /* Maximum supported CPUID level, -1=no CPUID: */
+       int                     cpuid_level;
+       __u32                   x86_capability[NCAPINTS + NBUGINTS];
+       char                    x86_vendor_id[16];
+       char                    x86_model_id[64];
+       /* in KB - valid for CPUS which support this call: */
+       int                     x86_cache_size;
+       int                     x86_cache_alignment;    /* In bytes */
+       /* Cache QoS architectural values: */
+       int                     x86_cache_max_rmid;     /* max index */
+       int                     x86_cache_occ_scale;    /* scale to bytes */
+       int                     x86_power;
+       unsigned long           loops_per_jiffy;
+       /* cpuid returned max cores value: */
+       u16                      x86_max_cores;
+       u16                     apicid;
+       u16                     initial_apicid;
+       u16                     x86_clflush_size;
+       /* number of cores as seen by the OS: */
+       u16                     booted_cores;
+       /* Physical processor id: */
+       u16                     phys_proc_id;
+       /* Logical processor id: */
+       u16                     logical_proc_id;
+       /* Core id: */
+       u16                     cpu_core_id;
+       /* Index into per_cpu list: */
+       u16                     cpu_index;
+       u32                     microcode;
+};
+
+extern struct cpuinfo_x86      boot_cpu_data;
+extern struct cpuinfo_x86      new_cpu_data;
+
+#endif /* _ASM_X86_CPUINFO_H_ */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 62c6cc3cc5d3..6f6555b20e3d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@ struct vm86;
 #include <asm/nops.h>
 #include <asm/special_insns.h>
 #include <asm/fpu/types.h>
+#include <asm/cpuinfo.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -78,65 +79,6 @@ extern u16 __read_mostly tlb_lld_2m[NR_INFO];
 extern u16 __read_mostly tlb_lld_4m[NR_INFO];
 extern u16 __read_mostly tlb_lld_1g[NR_INFO];
 
-/*
- *  CPU type and hardware bug flags. Kept separately for each CPU.
- *  Members of this structure are referenced in head.S, so think twice
- *  before touching them. [mj]
- */
-
-struct cpuinfo_x86 {
-       __u8                    x86;            /* CPU family */
-       __u8                    x86_vendor;     /* CPU vendor */
-       __u8                    x86_model;
-       __u8                    x86_mask;
-#ifdef CONFIG_X86_32
-       char                    wp_works_ok;    /* It doesn't on 386's */
-
-       /* Problems on some 486Dx4's and old 386's: */
-       char                    rfu;
-       char                    pad0;
-       char                    pad1;
-#else
-       /* Number of 4K pages in DTLB/ITLB combined(in pages): */
-       int                     x86_tlbsize;
-#endif
-       __u8                    x86_virt_bits;
-       __u8                    x86_phys_bits;
-       /* CPUID returned core id bits: */
-       __u8                    x86_coreid_bits;
-       /* Max extended CPUID function supported: */
-       __u32                   extended_cpuid_level;
-       /* Maximum supported CPUID level, -1=no CPUID: */
-       int                     cpuid_level;
-       __u32                   x86_capability[NCAPINTS + NBUGINTS];
-       char                    x86_vendor_id[16];
-       char                    x86_model_id[64];
-       /* in KB - valid for CPUS which support this call: */
-       int                     x86_cache_size;
-       int                     x86_cache_alignment;    /* In bytes */
-       /* Cache QoS architectural values: */
-       int                     x86_cache_max_rmid;     /* max index */
-       int                     x86_cache_occ_scale;    /* scale to bytes */
-       int                     x86_power;
-       unsigned long           loops_per_jiffy;
-       /* cpuid returned max cores value: */
-       u16                      x86_max_cores;
-       u16                     apicid;
-       u16                     initial_apicid;
-       u16                     x86_clflush_size;
-       /* number of cores as seen by the OS: */
-       u16                     booted_cores;
-       /* Physical processor id: */
-       u16                     phys_proc_id;
-       /* Logical processor id: */
-       u16                     logical_proc_id;
-       /* Core id: */
-       u16                     cpu_core_id;
-       /* Index into per_cpu list: */
-       u16                     cpu_index;
-       u32                     microcode;
-};
-
 #define X86_VENDOR_INTEL       0
 #define X86_VENDOR_CYRIX       1
 #define X86_VENDOR_AMD         2
@@ -151,9 +93,6 @@ struct cpuinfo_x86 {
 /*
  * capabilities of CPUs
  */
-extern struct cpuinfo_x86      boot_cpu_data;
-extern struct cpuinfo_x86      new_cpu_data;
-
 extern struct tss_struct       doublefault_tss;
 extern __u32                   cpu_caps_cleared[NCAPINTS];
 extern __u32                   cpu_caps_set[NCAPINTS];
diff --git a/arch/x86/include/asm/static_cpu_has.h 
b/arch/x86/include/asm/static_cpu_has.h
new file mode 100644
index 000000000000..648ada0c7ffe
--- /dev/null
+++ b/arch/x86/include/asm/static_cpu_has.h
@@ -0,0 +1,116 @@
+#ifndef _ASM_X86_STATIC_CPU_HAS_H
+#define _ASM_X86_STATIC_CPU_HAS_H
+
+#include <asm/cpuinfo.h>
+
+#define test_cpu_cap(c, bit)                                           \
+        test_bit(bit, (unsigned long *)((c)->x86_capability))
+
+#define REQUIRED_MASK_BIT_SET(bit)                                     \
+        ( (((bit)>>5)==0  && (1UL<<((bit)&31) & REQUIRED_MASK0 )) ||   \
+          (((bit)>>5)==1  && (1UL<<((bit)&31) & REQUIRED_MASK1 )) ||   \
+          (((bit)>>5)==2  && (1UL<<((bit)&31) & REQUIRED_MASK2 )) ||   \
+          (((bit)>>5)==3  && (1UL<<((bit)&31) & REQUIRED_MASK3 )) ||   \
+          (((bit)>>5)==4  && (1UL<<((bit)&31) & REQUIRED_MASK4 )) ||   \
+          (((bit)>>5)==5  && (1UL<<((bit)&31) & REQUIRED_MASK5 )) ||   \
+          (((bit)>>5)==6  && (1UL<<((bit)&31) & REQUIRED_MASK6 )) ||   \
+          (((bit)>>5)==7  && (1UL<<((bit)&31) & REQUIRED_MASK7 )) ||   \
+          (((bit)>>5)==8  && (1UL<<((bit)&31) & REQUIRED_MASK8 )) ||   \
+          (((bit)>>5)==9  && (1UL<<((bit)&31) & REQUIRED_MASK9 )) ||   \
+          (((bit)>>5)==10 && (1UL<<((bit)&31) & REQUIRED_MASK10)) ||   \
+          (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) ||   \
+          (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) ||   \
+          (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) ||   \
+          (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK14)) ||   \
+          (((bit)>>5)==15 && (1UL<<((bit)&31) & REQUIRED_MASK15)) ||   \
+          (((bit)>>5)==16 && (1UL<<((bit)&31) & REQUIRED_MASK16)) )
+
+#define DISABLED_MASK_BIT_SET(bit)                                     \
+        ( (((bit)>>5)==0  && (1UL<<((bit)&31) & DISABLED_MASK0 )) ||   \
+          (((bit)>>5)==1  && (1UL<<((bit)&31) & DISABLED_MASK1 )) ||   \
+          (((bit)>>5)==2  && (1UL<<((bit)&31) & DISABLED_MASK2 )) ||   \
+          (((bit)>>5)==3  && (1UL<<((bit)&31) & DISABLED_MASK3 )) ||   \
+          (((bit)>>5)==4  && (1UL<<((bit)&31) & DISABLED_MASK4 )) ||   \
+          (((bit)>>5)==5  && (1UL<<((bit)&31) & DISABLED_MASK5 )) ||   \
+          (((bit)>>5)==6  && (1UL<<((bit)&31) & DISABLED_MASK6 )) ||   \
+          (((bit)>>5)==7  && (1UL<<((bit)&31) & DISABLED_MASK7 )) ||   \
+          (((bit)>>5)==8  && (1UL<<((bit)&31) & DISABLED_MASK8 )) ||   \
+          (((bit)>>5)==9  && (1UL<<((bit)&31) & DISABLED_MASK9 )) ||   \
+          (((bit)>>5)==10 && (1UL<<((bit)&31) & DISABLED_MASK10)) ||   \
+          (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) ||   \
+          (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) ||   \
+          (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) ||   \
+          (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK14)) ||   \
+          (((bit)>>5)==15 && (1UL<<((bit)&31) & DISABLED_MASK15)) ||   \
+          (((bit)>>5)==16 && (1UL<<((bit)&31) & DISABLED_MASK16)) )
+
+#define cpu_has(c, bit)                                                \
+       (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :  \
+        test_cpu_cap(c, bit))
+
+#define boot_cpu_has(bit)      cpu_has(&boot_cpu_data, bit)
+
+#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
+/*
+ * Static testing of CPU features.  Used the same as boot_cpu_has().
+ * These will statically patch the target code for additional
+ * performance.
+ */
+static __always_inline __pure bool _static_cpu_has(u16 bit)
+{
+               asm_volatile_goto("1: jmp 6f\n"
+                        "2:\n"
+                        ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+                                "((5f-4f) - (2b-1b)),0x90\n"
+                        "3:\n"
+                        ".section .altinstructions,\"a\"\n"
+                        " .long 1b - .\n"              /* src offset */
+                        " .long 4f - .\n"              /* repl offset */
+                        " .word %P1\n"                 /* always replace */
+                        " .byte 3b - 1b\n"             /* src len */
+                        " .byte 5f - 4f\n"             /* repl len */
+                        " .byte 3b - 2b\n"             /* pad len */
+                        ".previous\n"
+                        ".section .altinstr_replacement,\"ax\"\n"
+                        "4: jmp %l[t_no]\n"
+                        "5:\n"
+                        ".previous\n"
+                        ".section .altinstructions,\"a\"\n"
+                        " .long 1b - .\n"              /* src offset */
+                        " .long 0\n"                   /* no replacement */
+                        " .word %P0\n"                 /* feature bit */
+                        " .byte 3b - 1b\n"             /* src len */
+                        " .byte 0\n"                   /* repl len */
+                        " .byte 0\n"                   /* pad len */
+                        ".previous\n"
+                        ".section .altinstr_aux,\"ax\"\n"
+                        "6:\n"
+                        " testb %[bitnum],%[cap_byte]\n"
+                        " jnz %l[t_yes]\n"
+                        " jmp %l[t_no]\n"
+                        ".previous\n"
+                        : : "i" (bit), "i" (X86_FEATURE_ALWAYS),
+                            [bitnum] "i" (1 << (bit & 7)),
+                            [cap_byte] "m" (((const char *)boot_cpu_data.x86_capability)[bit >> 3])
+                        : : t_yes, t_no);
+       t_yes:
+               return true;
+       t_no:
+               return false;
+}
+
+#define static_cpu_has(bit)                                    \
+(                                                              \
+       __builtin_constant_p(boot_cpu_has(bit)) ?               \
+               boot_cpu_has(bit) :                             \
+               _static_cpu_has(bit)                            \
+)
+#else
+/*
+ * Fall back to dynamic for gcc versions which don't support asm goto. Should be
+ * a minority now anyway.
+ */
+#define static_cpu_has(bit)            boot_cpu_has(bit)
+#endif
+
+#endif /* _ASM_X86_STATIC_CPU_HAS_H */
diff --git a/lib/Makefile b/lib/Makefile
index a65e9a861535..55ad20701dc0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n
 KCOV_INSTRUMENT_list_debug.o := n
 KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
-# Kernel does not boot if we instrument this file as it uses custom calling
-# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
-KCOV_INSTRUMENT_hweight.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
         rbtree.o radix-tree.o dump_stack.o timerqueue.o\
@@ -72,8 +69,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o
 obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o
 obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o
 
-GCOV_PROFILE_hweight.o := n
-CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS))
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 
 obj-$(CONFIG_BTREE) += btree.o

-- 
2.7.3

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 
(AG Nürnberg)
-- 

Reply via email to