[tip:x86/urgent] x86/fsgsbase/64: Fix the base write helper functions

2018-11-21 Thread tip-bot for Chang S. Bae
Commit-ID:  8b791a31e730a652537635a53b2ac02db8e6da1d
Gitweb: https://git.kernel.org/tip/8b791a31e730a652537635a53b2ac02db8e6da1d
Author: Chang S. Bae 
AuthorDate: Fri, 16 Nov 2018 15:27:28 -0800
Committer:  Thomas Gleixner 
CommitDate: Wed, 21 Nov 2018 22:23:51 +0100

x86/fsgsbase/64: Fix the base write helper functions

The helper functions that purport to write the base should write the base
and nothing else. They shouldn't have magic side effects that change the
index.

Change the index explicitly in the caller, instead of embedding that code
in the helpers.

As a result, the task write helpers no longer handle the current task.
The range check for a base value is also factored out, to minimize code
redundancy in the callers.

Fixes: b1378a561fd1 ("x86/fsgsbase/64: Introduce FS/GS base helper functions")
Suggested-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Andy Lutomirski 
Cc: "H . Peter Anvin" 
Cc: Andi Kleen 
Cc: Dave Hansen 
Cc: Ravi Shankar 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/20181116232728.23538-1-chang.seok@intel.com

---
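
Note (illustrative, not part of the commit): with this change, the
caller-side pattern in a user of the helpers looks roughly like the sketch
below. The function name is made up; the point is the new split of
responsibilities: range check, explicit index reset and "current task"
handling belong to the caller, while the helper merely stores the base.

static long sketch_arch_set_fs(struct task_struct *task, unsigned long fsbase)
{
	/* The range check is the caller's job now. */
	if (unlikely(fsbase >= TASK_SIZE_MAX))
		return -EPERM;

	preempt_disable();
	if (task == current) {
		/* Explicit index change, no longer a helper side effect. */
		loadseg(FS, 0);
		x86_fsbase_write_cpu(fsbase);
		task->thread.fsbase = fsbase;
	} else {
		task->thread.fsindex = 0;
		x86_fsbase_write_task(task, fsbase);	/* stores the base only */
	}
	preempt_enable();

	return 0;
}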
 arch/x86/include/asm/fsgsbase.h | 15 +--
 arch/x86/kernel/process_64.c| 86 -
 arch/x86/kernel/ptrace.c|  9 +++--
 3 files changed, 58 insertions(+), 52 deletions(-)

diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index eb377b6e9eed..bca4c743de77 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -16,8 +16,8 @@
  */
 extern unsigned long x86_fsbase_read_task(struct task_struct *task);
 extern unsigned long x86_gsbase_read_task(struct task_struct *task);
-extern int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
-extern int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
+extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
+extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
 
 /* Helper functions for reading/writing FS/GS base */
 
@@ -39,8 +39,15 @@ static inline unsigned long x86_gsbase_read_cpu_inactive(void)
return gsbase;
 }
 
-extern void x86_fsbase_write_cpu(unsigned long fsbase);
-extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+{
+   wrmsrl(MSR_FS_BASE, fsbase);
+}
+
+static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+   wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+}
 
 #endif /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 0e0b4288a4b2..74035c2a85b3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -337,24 +337,6 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
return base;
 }
 
-void x86_fsbase_write_cpu(unsigned long fsbase)
-{
-   /*
-* Set the selector to 0 as a notion, that the segment base is
-* overwritten, which will be checked for skipping the segment load
-* during context switch.
-*/
-   loadseg(FS, 0);
-   wrmsrl(MSR_FS_BASE, fsbase);
-}
-
-void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
-{
-   /* Set the selector to 0 for the same reason as %fs above. */
-   loadseg(GS, 0);
-   wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
-}
-
 unsigned long x86_fsbase_read_task(struct task_struct *task)
 {
unsigned long fsbase;
@@ -383,38 +365,18 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
return gsbase;
 }
 
-int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
+void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 {
-   /*
-* Not strictly needed for %fs, but do it for symmetry
-* with %gs
-*/
-   if (unlikely(fsbase >= TASK_SIZE_MAX))
-   return -EPERM;
+   WARN_ON_ONCE(task == current);
 
-   preempt_disable();
task->thread.fsbase = fsbase;
-   if (task == current)
-   x86_fsbase_write_cpu(fsbase);
-   task->thread.fsindex = 0;
-   preempt_enable();
-
-   return 0;
 }
 
-int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
+void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 {
-   if (unlikely(gsbase >= TASK_SIZE_MAX))
-   return -EPERM;
+   WARN_ON_ONCE(task == current);
 
-   preempt_disable();
task->thread.gsbase = gsbase;
-   if (task == current)
-   x86_gsbase_write_cpu_inactive(gsbase);
-   task->thread.gsindex = 0;
-   preempt_enable();
-
-   return 0;
 }
 
 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
@@ -758,11 +720,47 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 
switch (option) {
case ARCH_SET_GS: {
-   ret = x86

[tip:x86/urgent] x86/fsgsbase/64: Fix the base write helper functions

2018-12-18 Thread tip-bot for Chang S. Bae
Commit-ID:  4771d4f5de52b4e7aa62334d7ba1e5bd22a5f900
Gitweb: https://git.kernel.org/tip/4771d4f5de52b4e7aa62334d7ba1e5bd22a5f900
Author: Chang S. Bae 
AuthorDate: Mon, 26 Nov 2018 11:55:24 -0800
Committer:  Thomas Gleixner 
CommitDate: Tue, 18 Dec 2018 12:58:21 +0100

x86/fsgsbase/64: Fix the base write helper functions

The helper functions which write the FS/GS base do not just write the
base; they also change the index. That's wrong and the two operations
need to be separated.

Change the index explicitly from the caller, instead of embedding that
code in the helpers.

Subsequently, the task write helpers no longer handle the current task.
The range check for a base value is also factored out, to minimize code
redundancy in the callers.

Fixes: b1378a561fd1 ("x86/fsgsbase/64: Introduce FS/GS base helper functions")
Suggested-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Andy Lutomirski 
Cc: "H . Peter Anvin" 
Cc: Andi Kleen 
Cc: Dave Hansen 
Cc: Ravi Shankar 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/20181126195524.32179-1-chang.seok@intel.com
---
 arch/x86/include/asm/fsgsbase.h | 15 +--
 arch/x86/kernel/process_64.c| 99 +++--
 arch/x86/kernel/ptrace.c|  9 ++--
 3 files changed, 71 insertions(+), 52 deletions(-)

diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index eb377b6e9eed..bca4c743de77 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -16,8 +16,8 @@
  */
 extern unsigned long x86_fsbase_read_task(struct task_struct *task);
 extern unsigned long x86_gsbase_read_task(struct task_struct *task);
-extern int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
-extern int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
+extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
+extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
 
 /* Helper functions for reading/writing FS/GS base */
 
@@ -39,8 +39,15 @@ static inline unsigned long x86_gsbase_read_cpu_inactive(void)
return gsbase;
 }
 
-extern void x86_fsbase_write_cpu(unsigned long fsbase);
-extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+{
+   wrmsrl(MSR_FS_BASE, fsbase);
+}
+
+static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+   wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+}
 
 #endif /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bbfbf017065c..ddd4fa718c43 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -339,24 +339,6 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
return base;
 }
 
-void x86_fsbase_write_cpu(unsigned long fsbase)
-{
-   /*
-* Set the selector to 0 as a notion, that the segment base is
-* overwritten, which will be checked for skipping the segment load
-* during context switch.
-*/
-   loadseg(FS, 0);
-   wrmsrl(MSR_FS_BASE, fsbase);
-}
-
-void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
-{
-   /* Set the selector to 0 for the same reason as %fs above. */
-   loadseg(GS, 0);
-   wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
-}
-
 unsigned long x86_fsbase_read_task(struct task_struct *task)
 {
unsigned long fsbase;
@@ -385,38 +367,18 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
return gsbase;
 }
 
-int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
+void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 {
-   /*
-* Not strictly needed for %fs, but do it for symmetry
-* with %gs
-*/
-   if (unlikely(fsbase >= TASK_SIZE_MAX))
-   return -EPERM;
+   WARN_ON_ONCE(task == current);
 
-   preempt_disable();
task->thread.fsbase = fsbase;
-   if (task == current)
-   x86_fsbase_write_cpu(fsbase);
-   task->thread.fsindex = 0;
-   preempt_enable();
-
-   return 0;
 }
 
-int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
+void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 {
-   if (unlikely(gsbase >= TASK_SIZE_MAX))
-   return -EPERM;
+   WARN_ON_ONCE(task == current);
 
-   preempt_disable();
task->thread.gsbase = gsbase;
-   if (task == current)
-   x86_gsbase_write_cpu_inactive(gsbase);
-   task->thread.gsindex = 0;
-   preempt_enable();
-
-   return 0;
 }
 
 int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
@@ -754,11 +716,60 @@ long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 
switch (option) {
case ARCH_SET_GS: {
-   

[tip:x86/urgent] x86/fsgsbase/64: Fix the base write helper functions

2018-12-18 Thread tip-bot for Chang S. Bae
Commit-ID:  87ab4689ca6526079ab6f5150219ee88b42000ae
Gitweb: https://git.kernel.org/tip/87ab4689ca6526079ab6f5150219ee88b42000ae
Author: Chang S. Bae 
AuthorDate: Mon, 26 Nov 2018 11:55:24 -0800
Committer:  Thomas Gleixner 
CommitDate: Tue, 18 Dec 2018 14:26:09 +0100

x86/fsgsbase/64: Fix the base write helper functions

Andy spotted a regression in the fs/gs base helpers after the patch series
was committed. The helper functions which write the FS/GS base do not just
write the base; they also change the index. That's wrong and needs to be
separated, because writing the base must not modify the index.

While the regression is not causing any harm right now, because the only
caller depends on that behaviour, it's a guarantee for subtle breakage down
the road.

Change the index explicitly from the caller, instead of embedding that
code in the helpers.

Subsequently, the task write helpers no longer handle the current task.
The range check for a base value is also factored out, to minimize code
redundancy in the callers.

Fixes: b1378a561fd1 ("x86/fsgsbase/64: Introduce FS/GS base helper functions")
Suggested-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Andy Lutomirski 
Cc: "H . Peter Anvin" 
Cc: Andi Kleen 
Cc: Dave Hansen 
Cc: Ravi Shankar 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/20181126195524.32179-1-chang.seok@intel.com
---
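
Note (illustrative, not part of the commit): the new contract of the task
write helpers can be summarized with the sketch below. Both function names
are made up.

/* A stopped tracee: only the saved thread state is touched. */
static void set_tracee_gsbase(struct task_struct *t, unsigned long gsbase)
{
	t->thread.gsindex = 0;			/* explicit index change by the caller */
	x86_gsbase_write_task(t, gsbase);	/* stores t->thread.gsbase, nothing else */
}

/*
 * The current task must not go through the task helper; that would trip
 * the WARN_ON_ONCE(task == current) added below. It updates the live CPU
 * state instead.
 */
static void set_own_gsbase(unsigned long gsbase)
{
	preempt_disable();
	loadseg(GS, 0);
	x86_gsbase_write_cpu_inactive(gsbase);
	current->thread.gsindex = 0;
	current->thread.gsbase = gsbase;
	preempt_enable();
}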
 arch/x86/include/asm/fsgsbase.h | 15 +--
 arch/x86/kernel/process_64.c| 99 +++--
 arch/x86/kernel/ptrace.c|  9 ++--
 3 files changed, 71 insertions(+), 52 deletions(-)

diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index eb377b6e9eed..bca4c743de77 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -16,8 +16,8 @@
  */
 extern unsigned long x86_fsbase_read_task(struct task_struct *task);
 extern unsigned long x86_gsbase_read_task(struct task_struct *task);
-extern int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
-extern int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
+extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
+extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
 
 /* Helper functions for reading/writing FS/GS base */
 
@@ -39,8 +39,15 @@ static inline unsigned long x86_gsbase_read_cpu_inactive(void)
return gsbase;
 }
 
-extern void x86_fsbase_write_cpu(unsigned long fsbase);
-extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+{
+   wrmsrl(MSR_FS_BASE, fsbase);
+}
+
+static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+   wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
+}
 
 #endif /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index bbfbf017065c..ddd4fa718c43 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -339,24 +339,6 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
return base;
 }
 
-void x86_fsbase_write_cpu(unsigned long fsbase)
-{
-   /*
-* Set the selector to 0 as a notion, that the segment base is
-* overwritten, which will be checked for skipping the segment load
-* during context switch.
-*/
-   loadseg(FS, 0);
-   wrmsrl(MSR_FS_BASE, fsbase);
-}
-
-void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
-{
-   /* Set the selector to 0 for the same reason as %fs above. */
-   loadseg(GS, 0);
-   wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
-}
-
 unsigned long x86_fsbase_read_task(struct task_struct *task)
 {
unsigned long fsbase;
@@ -385,38 +367,18 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
return gsbase;
 }
 
-int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
+void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
 {
-   /*
-* Not strictly needed for %fs, but do it for symmetry
-* with %gs
-*/
-   if (unlikely(fsbase >= TASK_SIZE_MAX))
-   return -EPERM;
+   WARN_ON_ONCE(task == current);
 
-   preempt_disable();
task->thread.fsbase = fsbase;
-   if (task == current)
-   x86_fsbase_write_cpu(fsbase);
-   task->thread.fsindex = 0;
-   preempt_enable();
-
-   return 0;
 }
 
-int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
+void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
 {
-   if (unlikely(gsbase >= TASK_SIZE_MAX))
-   return -EPERM;
+   WARN_ON_ONCE(task == current);
 
-   preempt_disable();
task->thread.gsbase = gsbase;
-   if (task == current)
-   x86_gsbase_write_cpu_inactive(gsbase);
-   task->thread.gsi

[tip:x86/asm] x86/fsgsbase/64: Introduce FS/GS base helper functions

2018-10-08 Thread tip-bot for Chang S. Bae
Commit-ID:  b1378a561fd16afdd96ef0bc912b1bcd2b85a68e
Gitweb: https://git.kernel.org/tip/b1378a561fd16afdd96ef0bc912b1bcd2b85a68e
Author: Chang S. Bae 
AuthorDate: Tue, 18 Sep 2018 16:08:53 -0700
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Oct 2018 10:41:08 +0200

x86/fsgsbase/64: Introduce FS/GS base helper functions

Introduce FS/GS base access functionality via <asm/fsgsbase.h>,
not yet used by anything directly.

Factor out task_seg_base() from x86/ptrace.c and rename it to
x86_fsgsbase_read_task() to make it part of the new helpers.

This will allow us to enhance FSGSBASE support and eventually enable
the FSGSBASE instructions.

An "inactive" GS base refers to a base saved at kernel entry
and being part of an inactive, non-running/stopped user-task.
(The typical ptrace model.)

Here are the new functions:

  x86_fsbase_read_task()
  x86_gsbase_read_task()
  x86_fsbase_write_task()
  x86_gsbase_write_task()
  x86_fsbase_read_cpu()
  x86_fsbase_write_cpu()
  x86_gsbase_read_cpu_inactive()
  x86_gsbase_write_cpu_inactive()

As an advantage of the unified namespace we can now see all FS/GSBASE
API use in the kernel via the following 'git grep' pattern:

  $ git grep x86_.*sbase

[ mingo: Wrote new changelog. ]

Based-on-code-from: Andy Lutomirski 
Suggested-by: Ingo Molnar 
Signed-off-by: Chang S. Bae 
Cc: Andy Lutomirski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Markus T Metzger 
Cc: Peter Zijlstra 
Cc: Ravi Shankar 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/1537312139-5580-3-git-send-email-chang.seok@intel.com
Signed-off-by: Ingo Molnar 
---
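
Note (illustrative, not part of the commit): a rough usage sketch of the
new API. The example function is made up; at this stage of the series the
task write helpers still range-check and return an error.

static int fsgsbase_api_example(struct task_struct *stopped)
{
	unsigned long base;

	/* Saved state of a non-running task (the typical ptrace model). */
	base = x86_gsbase_read_task(stopped);

	/* Returns -EPERM if the base is >= TASK_SIZE_MAX. */
	if (x86_gsbase_write_task(stopped, base))
		return -EPERM;

	/* Live per-CPU state: FS base and the inactive (user) GS base. */
	x86_fsbase_write_cpu(x86_fsbase_read_cpu());
	x86_gsbase_write_cpu_inactive(x86_gsbase_read_cpu_inactive());

	return 0;
}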
 arch/x86/include/asm/fsgsbase.h |  50 
 arch/x86/kernel/process_64.c| 124 
 arch/x86/kernel/ptrace.c|  51 ++---
 3 files changed, 179 insertions(+), 46 deletions(-)

diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
new file mode 100644
index ..1ab465ee23fe
--- /dev/null
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_FSGSBASE_H
+#define _ASM_FSGSBASE_H 1
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_X86_64
+
+#include 
+
+unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+unsigned short selector);
+
+/*
+ * Read/write a task's fsbase or gsbase. This returns the value that
+ * the FS/GS base would have (if the task were to be resumed). These
+ * work on current or on a different non-running task.
+ */
+unsigned long x86_fsbase_read_task(struct task_struct *task);
+unsigned long x86_gsbase_read_task(struct task_struct *task);
+int x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
+int x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
+
+/* Helper functions for reading/writing FS/GS base */
+
+static inline unsigned long x86_fsbase_read_cpu(void)
+{
+   unsigned long fsbase;
+
+   rdmsrl(MSR_FS_BASE, fsbase);
+   return fsbase;
+}
+
+void x86_fsbase_write_cpu(unsigned long fsbase);
+
+static inline unsigned long x86_gsbase_read_cpu_inactive(void)
+{
+   unsigned long gsbase;
+
+   rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+   return gsbase;
+}
+
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
+
+#endif /* CONFIG_X86_64 */
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_FSGSBASE_H */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ea5ea850348d..2a53ff8d1baf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -54,6 +54,7 @@
 #include 
 #include 
 #include 
+#include 
 #ifdef CONFIG_IA32_EMULATION
 /* Not included via unistd.h */
 #include 
@@ -286,6 +287,129 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
 }
 
+unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+unsigned short selector)
+{
+   unsigned short idx = selector >> 3;
+   unsigned long base;
+
+   if (likely((selector & SEGMENT_TI_MASK) == 0)) {
+   if (unlikely(idx >= GDT_ENTRIES))
+   return 0;
+
+   /*
+* There are no user segments in the GDT with nonzero bases
+* other than the TLS segments.
+*/
+   if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+   return 0;
+
+   idx -= GDT_ENTRY_TLS_MIN;
+   base = get_desc_base(&task->thread.tls_array[idx]);
+   } else {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+   struct ldt_struct *ldt;
+
+   /*
+* If performance here mattered, we could protect the LDT
+* with RCU.  This is a slow path, though, so we can just
+* take the mutex.
+*/
+   mutex_lock(

[tip:x86/asm] x86/fsgsbase/64: Make ptrace use the new FS/GS base helpers

2018-10-08 Thread tip-bot for Chang S. Bae
Commit-ID:  e696c231bebf5f17fe0c5e465c01511320668054
Gitweb: https://git.kernel.org/tip/e696c231bebf5f17fe0c5e465c01511320668054
Author: Chang S. Bae 
AuthorDate: Tue, 18 Sep 2018 16:08:54 -0700
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Oct 2018 10:41:08 +0200

x86/fsgsbase/64: Make ptrace use the new FS/GS base helpers

Use the new FS/GS base helper functions in <asm/fsgsbase.h> in the platform
specific ptrace implementation of the following APIs:

  PTRACE_ARCH_PRCTL,
  PTRACE_SETREG,
  PTRACE_GETREG,
  etc.

The fsgsbase code is more abstracted out this way and the FS/GS-update
mechanism will be easier to change this way.

[ mingo: Wrote new changelog. ]

Based-on-code-from: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Cc: Andy Lutomirski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Markus T Metzger 
Cc: Peter Zijlstra 
Cc: Ravi Shankar 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/1537312139-5580-4-git-send-email-chang.seok@intel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/fsgsbase.h |  3 ---
 arch/x86/kernel/process_64.c| 49 +
 arch/x86/kernel/ptrace.c| 27 +++
 3 files changed, 18 insertions(+), 61 deletions(-)

diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index 1ab465ee23fe..5e9cbcce318a 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -8,9 +8,6 @@
 
 #include 
 
-unsigned long x86_fsgsbase_read_task(struct task_struct *task,
-unsigned short selector);
-
 /*
  * Read/write a task's fsbase or gsbase. This returns the value that
  * the FS/GS base would have (if the task were to be resumed). These
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 2a53ff8d1baf..e5fb0c3dee4d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -287,8 +287,8 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
 }
 
-unsigned long x86_fsgsbase_read_task(struct task_struct *task,
-unsigned short selector)
+static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
+   unsigned short selector)
 {
unsigned short idx = selector >> 3;
unsigned long base;
@@ -751,54 +751,25 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
 {
int ret = 0;
-   int doit = task == current;
-   int cpu;
 
switch (option) {
-   case ARCH_SET_GS:
-   if (arg2 >= TASK_SIZE_MAX)
-   return -EPERM;
-   cpu = get_cpu();
-   task->thread.gsindex = 0;
-   task->thread.gsbase = arg2;
-   if (doit) {
-   load_gs_index(0);
-   ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, arg2);
-   }
-   put_cpu();
+   case ARCH_SET_GS: {
+   ret = x86_gsbase_write_task(task, arg2);
break;
-   case ARCH_SET_FS:
-   /* Not strictly needed for fs, but do it for symmetry
-  with gs */
-   if (arg2 >= TASK_SIZE_MAX)
-   return -EPERM;
-   cpu = get_cpu();
-   task->thread.fsindex = 0;
-   task->thread.fsbase = arg2;
-   if (doit) {
-   /* set the selector to 0 to not confuse __switch_to */
-   loadsegment(fs, 0);
-   ret = wrmsrl_safe(MSR_FS_BASE, arg2);
-   }
-   put_cpu();
+   }
+   case ARCH_SET_FS: {
+   ret = x86_fsbase_write_task(task, arg2);
break;
+   }
case ARCH_GET_FS: {
-   unsigned long base;
+   unsigned long base = x86_fsbase_read_task(task);
 
-   if (doit)
-   rdmsrl(MSR_FS_BASE, base);
-   else
-   base = task->thread.fsbase;
ret = put_user(base, (unsigned long __user *)arg2);
break;
}
case ARCH_GET_GS: {
-   unsigned long base;
+   unsigned long base = x86_gsbase_read_task(task);
 
-   if (doit)
-   rdmsrl(MSR_KERNEL_GS_BASE, base);
-   else
-   base = task->thread.gsbase;
ret = put_user(base, (unsigned long __user *)arg2);
break;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index fbde2a7ce377..d8f49c7384a3 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -397,12 +397,11 @@ static int putreg(struct task_struct *child,

[tip:x86/asm] x86/fsgsbase/64: Convert the ELF core dump code to the new FSGSBASE helpers

2018-10-08 Thread tip-bot for Chang S. Bae
Commit-ID:  824eea38d239fb2a6027e65e18a5daef23019b00
Gitweb: https://git.kernel.org/tip/824eea38d239fb2a6027e65e18a5daef23019b00
Author: Chang S. Bae 
AuthorDate: Tue, 18 Sep 2018 16:08:55 -0700
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Oct 2018 10:41:09 +0200

x86/fsgsbase/64: Convert the ELF core dump code to the new FSGSBASE helpers

Replace open-coded rdmsr()'s with their <asm/fsgsbase.h> API
counterparts.

No change in functionality intended.

[ mingo: Wrote new changelog. ]

Based-on-code-from: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Reviewed-by: Andi Kleen 
Reviewed-by: Andy Lutomirski 
Reviewed-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Markus T Metzger 
Cc: Peter Zijlstra 
Cc: Ravi Shankar 
Cc: Rik van Riel 
Link: http://lkml.kernel.org/r/1537312139-5580-5-git-send-email-chang.seok@intel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/elf.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 0d157d2a1e2a..1527ec351036 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 typedef unsigned long elf_greg_t;
 
@@ -205,7 +206,6 @@ void set_personality_ia32(bool);
 
 #define ELF_CORE_COPY_REGS(pr_reg, regs)   \
 do {   \
-   unsigned long base; \
unsigned v; \
(pr_reg)[0] = (regs)->r15;  \
(pr_reg)[1] = (regs)->r14;  \
@@ -228,8 +228,8 @@ do {   \
(pr_reg)[18] = (regs)->flags;   \
(pr_reg)[19] = (regs)->sp;  \
(pr_reg)[20] = (regs)->ss;  \
-   rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \
-   rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base;  \
+   (pr_reg)[21] = x86_fsbase_read_cpu();   \
+   (pr_reg)[22] = x86_gsbase_read_cpu_inactive();  \
asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;   \
asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;   \
asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;   \


[tip:x86/asm] x86/fsgsbase/64: Factor out FS/GS segment loading from __switch_to()

2018-10-08 Thread tip-bot for Chang S. Bae
Commit-ID:  f4550b52e495e1b634d1f2c1004bcea5dc3321ea
Gitweb: https://git.kernel.org/tip/f4550b52e495e1b634d1f2c1004bcea5dc3321ea
Author: Chang S. Bae 
AuthorDate: Tue, 18 Sep 2018 16:08:56 -0700
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Oct 2018 10:41:09 +0200

x86/fsgsbase/64: Factor out FS/GS segment loading from __switch_to()

Instead of open coding the calls to load_seg_legacy(), introduce
x86_fsgsbase_load() to load FS/GS segments.

This makes it more explicit that this is part of FSGSBASE functionality,
and the new helper can be updated when FSGSBASE instructions are enabled.

[ mingo: Wrote new changelog. ]

Signed-off-by: Chang S. Bae 
Reviewed-by: Andi Kleen 
Reviewed-by: Andy Lutomirski 
Reviewed-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Markus T Metzger 
Cc: Peter Zijlstra 
Cc: Ravi Shankar 
Cc: Rik van Riel 
Link: http://lkml.kernel.org/r/1537312139-5580-6-git-send-email-chang.seok@intel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/kernel/process_64.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e5fb0c3dee4d..d6674a425714 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -287,6 +287,15 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
}
 }
 
+static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
+ struct thread_struct *next)
+{
+   load_seg_legacy(prev->fsindex, prev->fsbase,
+   next->fsindex, next->fsbase, FS);
+   load_seg_legacy(prev->gsindex, prev->gsbase,
+   next->gsindex, next->gsbase, GS);
+}
+
 static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
unsigned short selector)
 {
@@ -597,10 +606,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (unlikely(next->ds | prev->ds))
loadsegment(ds, next->ds);
 
-   load_seg_legacy(prev->fsindex, prev->fsbase,
-   next->fsindex, next->fsbase, FS);
-   load_seg_legacy(prev->gsindex, prev->gsbase,
-   next->gsindex, next->gsbase, GS);
+   x86_fsgsbase_load(prev, next);
 
switch_fpu_finish(next_fpu, cpu);
 


[tip:x86/asm] x86/segments/64: Rename the GDT PER_CPU entry to CPU_NUMBER

2018-10-08 Thread tip-bot for Chang S. Bae
Commit-ID:  c4755613a1339ea77dbb15de75c9f74217209265
Gitweb: https://git.kernel.org/tip/c4755613a1339ea77dbb15de75c9f74217209265
Author: Chang S. Bae 
AuthorDate: Tue, 18 Sep 2018 16:08:57 -0700
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Oct 2018 10:41:10 +0200

x86/segments/64: Rename the GDT PER_CPU entry to CPU_NUMBER

The old 'per CPU' naming was misleading: 64-bit kernels don't use this
GDT entry for per CPU data; they use it to store the CPU (and node) ID.

[ mingo: Wrote new changelog. ]

Suggested-by: H. Peter Anvin 
Signed-off-by: Chang S. Bae 
Reviewed-by: Thomas Gleixner 
Acked-by: Andy Lutomirski 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Linus Torvalds 
Cc: Markus T Metzger 
Cc: Peter Zijlstra 
Cc: Ravi Shankar 
Cc: Rik van Riel 
Link: http://lkml.kernel.org/r/1537312139-5580-7-git-send-email-chang.seok@intel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/entry/vdso/vma.c  | 2 +-
 arch/x86/include/asm/segment.h | 5 ++---
 arch/x86/include/asm/vgtod.h   | 8 
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 5b8b556dbb12..0b114aafcedc 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -359,7 +359,7 @@ static void vgetcpu_cpu_init(void *arg)
d.p = 1;/* Present */
d.d = 1;/* 32-bit */
 
-   write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
+   write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPU_NUMBER, &d, DESCTYPE_S);
 }
 
 static int vgetcpu_online(unsigned int cpu)
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index e293c122d0d5..e3e788ea52e5 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -186,8 +186,7 @@
 #define GDT_ENTRY_TLS_MIN  12
 #define GDT_ENTRY_TLS_MAX  14
 
-/* Abused to load per CPU data from limit */
-#define GDT_ENTRY_PER_CPU  15
+#define GDT_ENTRY_CPU_NUMBER   15
 
 /*
  * Number of entries in the GDT table:
@@ -207,7 +206,7 @@
 #define __USER_DS  (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
 #define __USER32_DS__USER_DS
 #define __USER_CS  (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
-#define __PER_CPU_SEG  (GDT_ENTRY_PER_CPU*8 + 3)
+#define __CPU_NUMBER_SEG   (GDT_ENTRY_CPU_NUMBER*8 + 3)
 
 #endif
 
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 53748541c487..4e81ea920722 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -86,9 +86,9 @@ static inline unsigned int __getcpu(void)
unsigned int p;
 
/*
-* Load per CPU data from GDT.  LSL is faster than RDTSCP and
-* works on all CPUs.  This is volatile so that it orders
-* correctly wrt barrier() and to keep gcc from cleverly
+* Load CPU (and node) number from GDT.  LSL is faster than RDTSCP
+* and works on all CPUs.  This is volatile so that it orders
+* correctly with respect to barrier() and to keep GCC from cleverly
 * hoisting it out of the calling function.
 *
 * If RDPID is available, use it.
@@ -96,7 +96,7 @@ static inline unsigned int __getcpu(void)
alternative_io ("lsl %[seg],%[p]",
".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
X86_FEATURE_RDPID,
-   [p] "=a" (p), [seg] "r" (__PER_CPU_SEG));
+   [p] "=a" (p), [seg] "r" (__CPU_NUMBER_SEG));
 
return p;
 }


[tip:x86/asm] x86/vdso: Introduce helper functions for CPU and node number

2018-10-08 Thread tip-bot for Chang S. Bae
Commit-ID:  ffebbaedc8616cffe648202e364dce6a045d65a2
Gitweb: https://git.kernel.org/tip/ffebbaedc8616cffe648202e364dce6a045d65a2
Author: Chang S. Bae 
AuthorDate: Tue, 18 Sep 2018 16:08:58 -0700
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Oct 2018 10:41:10 +0200

x86/vdso: Introduce helper functions for CPU and node number

Clean up the CPU/node number related code a bit, to make it more apparent
how we are encoding/extracting the CPU and node fields from the
segment limit.

No change in functionality intended.

[ mingo: Wrote new changelog. ]

Suggested-by: Andy Lutomirski 
Suggested-by: Thomas Gleixner 
Signed-off-by: Chang S. Bae 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Markus T Metzger 
Cc: Peter Zijlstra 
Cc: Ravi Shankar 
Cc: Rik van Riel 
Link: http://lkml.kernel.org/r/1537312139-5580-8-git-send-email-chang.seok@intel.com
Signed-off-by: Ingo Molnar 
---
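
Note (illustrative): the packing scheme is simple -- the low 12 bits carry
the CPU number, the bits above carry the node number. A minimal standalone
sketch mirroring the helpers added below:

#define VDSO_CPU_SIZE	12	/* bits reserved for the CPU number */
#define VDSO_CPU_MASK	0xfff

static inline unsigned long encode_cpu_node(int cpu, unsigned long node)
{
	return (node << VDSO_CPU_SIZE) | cpu;	/* cpu=5, node=1 -> 0x1005 */
}

static inline void decode_cpu_node(unsigned long p, unsigned *cpu, unsigned *node)
{
	*cpu  = p & VDSO_CPU_MASK;	/* 0x1005 -> cpu 5 */
	*node = p >> VDSO_CPU_SIZE;	/* 0x1005 -> node 1 */
}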
 arch/x86/entry/vdso/vgetcpu.c  |  9 +
 arch/x86/entry/vdso/vma.c  | 19 +++
 arch/x86/include/asm/segment.h | 41 +
 arch/x86/include/asm/vgtod.h   | 26 --
 4 files changed, 49 insertions(+), 46 deletions(-)

diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c
index 8ec3d1f4ce9a..de78fc9cd963 100644
--- a/arch/x86/entry/vdso/vgetcpu.c
+++ b/arch/x86/entry/vdso/vgetcpu.c
@@ -13,14 +13,7 @@
 notrace long
 __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
-   unsigned int p;
-
-   p = __getcpu();
-
-   if (cpu)
-   *cpu = p & VGETCPU_CPU_MASK;
-   if (node)
-   *node = p >> 12;
+   vdso_read_cpu_node(cpu, node);
return 0;
 }
 
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 0b114aafcedc..39b5584c5808 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -339,20 +339,15 @@ static void vgetcpu_cpu_init(void *arg)
 {
int cpu = smp_processor_id();
struct desc_struct d = { };
-   unsigned long node = 0;
-#ifdef CONFIG_NUMA
-   node = cpu_to_node(cpu);
-#endif
+   unsigned long cpudata = vdso_encode_cpu_node(cpu, cpu_to_node(cpu));
+
if (static_cpu_has(X86_FEATURE_RDTSCP))
-   write_rdtscp_aux((node << 12) | cpu);
+   write_rdtscp_aux(cpudata);
+
+   /* Store CPU and node number in limit */
+   d.limit0 = cpudata;
+   d.limit1 = cpudata >> 16;
 
-   /*
-* Store cpu number in limit so that it can be loaded
-* quickly in user space in vgetcpu. (12 bits for the CPU
-* and 8 bits for the node)
-*/
-   d.limit0 = cpu | ((node & 0xf) << 12);
-   d.limit1 = node >> 4;
d.type = 5; /* RO data, expand down, accessed */
d.dpl = 3;  /* Visible to user code */
d.s = 1;/* Not a system segment */
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index e3e788ea52e5..4d1f6cc62e13 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -224,6 +224,47 @@
 #define GDT_ENTRY_TLS_ENTRIES  3
 #define TLS_SIZE   (GDT_ENTRY_TLS_ENTRIES* 8)
 
+#ifdef CONFIG_X86_64
+
+/* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */
+#define VDSO_CPU_SIZE  12
+#define VDSO_CPU_MASK  0xfff
+
+#ifndef __ASSEMBLY__
+
+/* Helper functions to store/load CPU and node numbers */
+
+static inline unsigned long vdso_encode_cpu_node(int cpu, unsigned long node)
+{
+   return ((node << VDSO_CPU_SIZE) | cpu);
+}
+
+static inline void vdso_read_cpu_node(unsigned *cpu, unsigned *node)
+{
+   unsigned int p;
+
+   /*
+* Load CPU and node number from GDT.  LSL is faster than RDTSCP
+* and works on all CPUs.  This is volatile so that it orders
+* correctly with respect to barrier() and to keep GCC from cleverly
+* hoisting it out of the calling function.
+*
+* If RDPID is available, use it.
+*/
+   alternative_io ("lsl %[seg],%[p]",
+   ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */
+   X86_FEATURE_RDPID,
+   [p] "=a" (p), [seg] "r" (__CPU_NUMBER_SEG));
+
+   if (cpu)
+   *cpu = (p & VDSO_CPU_MASK);
+   if (node)
+   *node = (p >> VDSO_CPU_SIZE);
+}
+
+#endif /* !__ASSEMBLY__ */
+#endif /* CONFIG_X86_64 */
+
 #ifdef __KERNEL__
 
 /*
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 4e81ea920722..056a61c8c5c7 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -77,30 +77,4 @@ static inline void gtod_write_end(struct vsyscall_gtod_data *s)
++s->seq;
 }
 
-#ifdef CONFIG_X86_64
-
-#define VGETCPU_CPU_MASK 0xfff
-
-static inline uns

[tip:x86/asm] x86/vdso: Initialize the CPU/node NR segment descriptor earlier

2018-10-08 Thread tip-bot for Chang S. Bae
Commit-ID:  b2e2ba578e016a091eb31565849990fe68c7c599
Gitweb: https://git.kernel.org/tip/b2e2ba578e016a091eb31565849990fe68c7c599
Author: Chang S. Bae 
AuthorDate: Tue, 18 Sep 2018 16:08:59 -0700
Committer:  Ingo Molnar 
CommitDate: Mon, 8 Oct 2018 10:41:10 +0200

x86/vdso: Initialize the CPU/node NR segment descriptor earlier

Currently the CPU/node NR segment descriptor (GDT_ENTRY_CPU_NUMBER) is
initialized relatively late during CPU init, from the vCPU code, which
has a number of disadvantages, such as requiring hotplug CPU notifiers
and SMP cross-calls.

Instead just initialize it much earlier, directly in cpu_init().

This reduces complexity and increases robustness.

[ mingo: Wrote new changelog. ]

Suggested-by: H. Peter Anvin 
Suggested-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Dave Hansen 
Cc: Denys Vlasenko 
Cc: Linus Torvalds 
Cc: Markus T Metzger 
Cc: Peter Zijlstra 
Cc: Ravi Shankar 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: http://lkml.kernel.org/r/1537312139-5580-9-git-send-email-chang.seok@intel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/entry/vdso/vma.c| 33 +
 arch/x86/kernel/cpu/common.c | 24 
 2 files changed, 25 insertions(+), 32 deletions(-)

diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 39b5584c5808..3f9d43f26f63 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -332,35 +332,6 @@ static __init int vdso_setup(char *s)
return 0;
 }
 __setup("vdso=", vdso_setup);
-#endif
-
-#ifdef CONFIG_X86_64
-static void vgetcpu_cpu_init(void *arg)
-{
-   int cpu = smp_processor_id();
-   struct desc_struct d = { };
-   unsigned long cpudata = vdso_encode_cpu_node(cpu, cpu_to_node(cpu));
-
-   if (static_cpu_has(X86_FEATURE_RDTSCP))
-   write_rdtscp_aux(cpudata);
-
-   /* Store CPU and node number in limit */
-   d.limit0 = cpudata;
-   d.limit1 = cpudata >> 16;
-
-   d.type = 5; /* RO data, expand down, accessed */
-   d.dpl = 3;  /* Visible to user code */
-   d.s = 1;/* Not a system segment */
-   d.p = 1;/* Present */
-   d.d = 1;/* 32-bit */
-
-   write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPU_NUMBER, &d, DESCTYPE_S);
-}
-
-static int vgetcpu_online(unsigned int cpu)
-{
-   return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
-}
 
 static int __init init_vdso(void)
 {
@@ -370,9 +341,7 @@ static int __init init_vdso(void)
init_vdso_image(&vdso_image_x32);
 #endif
 
-   /* notifier priority > KVM */
-   return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
-"x86/vdso/vma:online", vgetcpu_online, NULL);
+   return 0;
 }
 subsys_initcall(init_vdso);
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 44c4ef3d989b..a148d18a1ef0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1669,6 +1669,29 @@ static void wait_for_master_cpu(int cpu)
 #endif
 }
 
+#ifdef CONFIG_X86_64
+static void setup_getcpu(int cpu)
+{
+   unsigned long cpudata = vdso_encode_cpu_node(cpu, early_cpu_to_node(cpu));
+   struct desc_struct d = { };
+
+   if (static_cpu_has(X86_FEATURE_RDTSCP))
+   write_rdtscp_aux(cpudata);
+
+   /* Store CPU and node number in limit. */
+   d.limit0 = cpudata;
+   d.limit1 = cpudata >> 16;
+
+   d.type = 5; /* RO data, expand down, accessed */
+   d.dpl = 3;  /* Visible to user code */
+   d.s = 1;/* Not a system segment */
+   d.p = 1;/* Present */
+   d.d = 1;/* 32-bit */
+
+   write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_CPU_NUMBER, &d, DESCTYPE_S);
+}
+#endif
+
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
@@ -1706,6 +1729,7 @@ void cpu_init(void)
early_cpu_to_node(cpu) != NUMA_NO_NODE)
set_numa_node(early_cpu_to_node(cpu));
 #endif
+   setup_getcpu(cpu);
 
me = current;
 


[tip:x86/cpu] x86/ptrace: Prevent ptrace from clearing the FS/GS selector

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  48f5e52e916b55fb73754833efbacc7f8081a159
Gitweb: https://git.kernel.org/tip/48f5e52e916b55fb73754833efbacc7f8081a159
Author: Chang S. Bae 
AuthorDate: Sun, 16 Jun 2019 15:44:11 +
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:50 +0200

x86/ptrace: Prevent ptrace from clearing the FS/GS selector

When a ptracer writes a ptracee's FS/GSBASE with a different value, the
selector is also cleared. This behavior is not correct as the selector
should be preserved.

Update only the base value and leave the selector intact. To simplify the
code further, remove the conditional check for the same value, as this
code is not performance critical.

The only recognizable downside of this change is when the selector is
already nonzero on write. The base will be reloaded according to the
selector. But that case is highly unexpected in real usage.

[ tglx: Massage changelog ]

Suggested-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: "H . Peter Anvin" 
Cc: Andi Kleen 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/9040cfcd-74bd-4c17-9a01-b9b713cf6...@intel.com

---
 arch/x86/kernel/ptrace.c | 14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index a166c960bc9e..3108cdc00b29 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -397,22 +397,12 @@ static int putreg(struct task_struct *child,
case offsetof(struct user_regs_struct,fs_base):
if (value >= TASK_SIZE_MAX)
return -EIO;
-   /*
-* When changing the FS base, use do_arch_prctl_64()
-* to set the index to zero and to set the base
-* as requested.
-*/
-   if (child->thread.fsbase != value)
-   return do_arch_prctl_64(child, ARCH_SET_FS, value);
+   x86_fsbase_write_task(child, value);
return 0;
case offsetof(struct user_regs_struct,gs_base):
-   /*
-* Exactly the same here as the %fs handling above.
-*/
if (value >= TASK_SIZE_MAX)
return -EIO;
-   if (child->thread.gsbase != value)
-   return do_arch_prctl_64(child, ARCH_SET_GS, value);
+   x86_gsbase_write_task(child, value);
return 0;
 #endif
}


[tip:x86/cpu] selftests/x86/fsgsbase: Test ptracer-induced GSBASE write

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  1b6858d5a2eb2485761f06bd48055ed5bed08464
Gitweb: https://git.kernel.org/tip/1b6858d5a2eb2485761f06bd48055ed5bed08464
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:17 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:51 +0200

selftests/x86/fsgsbase: Test ptracer-induced GSBASE write

The test validates that the selector is not changed when a ptracer writes
the ptracee's GSBASE.

Suggested-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: "H . Peter Anvin" 
Cc: Andi Kleen 
Cc: Ravi Shankar 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/1557309753-24073-3-git-send-email-chang.seok@intel.com

---
 tools/testing/selftests/x86/fsgsbase.c | 70 ++
 1 file changed, 70 insertions(+)

diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
index af85bd4752a5..b02ddce49bbb 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -23,6 +23,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #ifndef __x86_64__
 # error This test is 64-bit only
@@ -367,6 +370,71 @@ static void test_unexpected_base(void)
}
 }
 
+#define USER_REGS_OFFSET(r) offsetof(struct user_regs_struct, r)
+
+static void test_ptrace_write_gsbase(void)
+{
+   int status;
+   pid_t child = fork();
+
+   if (child < 0)
+   err(1, "fork");
+
+   if (child == 0) {
+   printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n");
+
+   /*
+* Use the LDT setup and fetch the GSBASE from the LDT
+* by switching to the (nonzero) selector (again)
+*/
+   do_unexpected_base();
+   asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+
+   if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
+   err(1, "PTRACE_TRACEME");
+
+   raise(SIGTRAP);
+   _exit(0);
+   }
+
+   wait(&status);
+
+   if (WSTOPSIG(status) == SIGTRAP) {
+   unsigned long gs;
+   unsigned long gs_offset = USER_REGS_OFFSET(gs);
+   unsigned long base_offset = USER_REGS_OFFSET(gs_base);
+
+   gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
+
+   if (gs != 0x7) {
+   nerrs++;
+   printf("[FAIL]\tGS is not prepared with nonzero\n");
+   goto END;
+   }
+
+   if (ptrace(PTRACE_POKEUSER, child, base_offset, 0xFF) != 0)
+   err(1, "PTRACE_POKEUSER");
+
+   gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
+
+   /*
+* In a non-FSGSBASE system, the nonzero selector will load
+* GSBASE (again). But what is tested here is whether the
+* selector value is changed or not by the GSBASE write in
+* a ptracer.
+*/
+   if (gs != 0x7) {
+   nerrs++;
+   printf("[FAIL]\tGS changed to %lx\n", gs);
+   } else {
+   printf("[OK]\tGS remained 0x7\n");
+   }
+   }
+
+END:
+   ptrace(PTRACE_CONT, child, NULL, NULL);
+}
+
 int main()
 {
pthread_t thread;
@@ -423,5 +491,7 @@ int main()
if (pthread_join(thread, NULL) != 0)
err(1, "pthread_join");
 
+   test_ptrace_write_gsbase();
+
return nerrs == 0 ? 0 : 1;
 }


[tip:x86/cpu] kbuild: Raise the minimum required binutils version to 2.21

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  1fb12b35e5ffe379d109b22cb3069830d0136d9a
Gitweb: https://git.kernel.org/tip/1fb12b35e5ffe379d109b22cb3069830d0136d9a
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:19 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:51 +0200

kbuild: Raise the minimum required binutils version to 2.21

It helps to use some new instructions directly in assembly code.

Suggested-by: Andi Kleen 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Andi Kleen 
Acked-by: Andrew Morton 
Cc: Andy Lutomirski 
Cc: Ravi Shankar 
Cc: Linus Torvalds 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/1557309753-24073-5-git-send-email-chang.seok@intel.com

---
 Documentation/process/changes.rst | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst
index 18735dc460a0..0a18075c485e 100644
--- a/Documentation/process/changes.rst
+++ b/Documentation/process/changes.rst
@@ -31,7 +31,7 @@ you probably needn't concern yourself with isdn4k-utils.
 == ===  

 GNU C  4.6  gcc --version
 GNU make   3.81 make --version
-binutils   2.20 ld -v
+binutils   2.21 ld -v
 flex   2.5.35   flex --version
 bison  2.0  bison --version
util-linux 2.10o fdformat --version
@@ -77,9 +77,7 @@ You will need GNU make 3.81 or later to build the kernel.
 Binutils
 
 
-The build system has, as of 4.13, switched to using thin archives (`ar T`)
-rather than incremental linking (`ld -r`) for built-in.a intermediate steps.
-This requires binutils 2.20 or newer.
+Binutils 2.21 or newer is needed to build the kernel.
 
 pkg-config
 --


[tip:x86/cpu] x86/fsgsbase/64: Enable FSGSBASE instructions in helper functions

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  a86b4625138d39e97b4cc254fc9c4bb9e1dc4542
Gitweb: https://git.kernel.org/tip/a86b4625138d39e97b4cc254fc9c4bb9e1dc4542
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:21 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:52 +0200

x86/fsgsbase/64: Enable FSGSBASE instructions in helper functions

Add CPU feature conditional FSGSBASE access to the relevant helper
functions. That allows accelerating certain FS/GS base operations in
subsequent changes.

Note that, while possible, the user space entry/exit GSBASE operations are
not going to use the new FSGSBASE instructions. The reason is that it would
require additional storage for the user space value which adds more
complexity to the low level code and experiments have shown marginal
benefit. This may be revisited later but for now the SWAPGS based handling
in the entry code is preserved except for the paranoid entry/exit code.

To preserve the SWAPGS entry mechanism introduce __[rd|wr]gsbase_inactive()
helpers. Note, for Xen PV, paravirt hooks can be added later as they might
allow a very efficient but different implementation.

[ tglx: Massaged changelog ]

Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Andi Kleen 
Cc: Ravi Shankar 
Cc: Andrew Cooper 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/1557309753-24073-7-git-send-email-chang.seok@intel.com

---
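
Note (illustrative, not the verbatim commit): the diff below shows the
read side in full; the matching write side presumably mirrors the same
FSGSBASE-conditional, IRQ-disabled pattern, along these lines:

void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
		unsigned long flags;

		/* __wrgsbase_inactive() requires interrupts to be disabled. */
		local_irq_save(flags);
		__wrgsbase_inactive(gsbase);	/* SWAPGS; WRGSBASE; SWAPGS */
		local_irq_restore(flags);
	} else {
		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
	}
}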
 arch/x86/include/asm/fsgsbase.h | 27 -
 arch/x86/kernel/process_64.c| 66 +
 2 files changed, 78 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index fdd1177499b4..aefd53767a5d 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -49,35 +49,32 @@ static __always_inline void wrgsbase(unsigned long gsbase)
asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
 }
 
+#include 
+
 /* Helper functions for reading/writing FS/GS base */
 
 static inline unsigned long x86_fsbase_read_cpu(void)
 {
unsigned long fsbase;
 
-   rdmsrl(MSR_FS_BASE, fsbase);
+   if (static_cpu_has(X86_FEATURE_FSGSBASE))
+   fsbase = rdfsbase();
+   else
+   rdmsrl(MSR_FS_BASE, fsbase);
 
return fsbase;
 }
 
-static inline unsigned long x86_gsbase_read_cpu_inactive(void)
-{
-   unsigned long gsbase;
-
-   rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
-
-   return gsbase;
-}
-
 static inline void x86_fsbase_write_cpu(unsigned long fsbase)
 {
-   wrmsrl(MSR_FS_BASE, fsbase);
+   if (static_cpu_has(X86_FEATURE_FSGSBASE))
+   wrfsbase(fsbase);
+   else
+   wrmsrl(MSR_FS_BASE, fsbase);
 }
 
-static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
-{
-   wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
-}
+extern unsigned long x86_gsbase_read_cpu_inactive(void);
+extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
 
 #endif /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 250e4c4ac6d9..c34ee0f72378 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -161,6 +161,40 @@ enum which_selector {
GS
 };
 
+/*
+ * Out of line to be protected from kprobes. It is not used on Xen
+ * paravirt. When paravirt support is needed, it needs to be renamed
+ * with native_ prefix.
+ */
+static noinline unsigned long __rdgsbase_inactive(void)
+{
+   unsigned long gsbase;
+
+   lockdep_assert_irqs_disabled();
+
+   native_swapgs();
+   gsbase = rdgsbase();
+   native_swapgs();
+
+   return gsbase;
+}
+NOKPROBE_SYMBOL(__rdgsbase_inactive);
+
+/*
+ * Out of line to be protected from kprobes. It is not used on Xen
+ * paravirt. When paravirt support is needed, it needs to be renamed
+ * with native_ prefix.
+ */
+static noinline void __wrgsbase_inactive(unsigned long gsbase)
+{
+   lockdep_assert_irqs_disabled();
+
+   native_swapgs();
+   wrgsbase(gsbase);
+   native_swapgs();
+}
+NOKPROBE_SYMBOL(__wrgsbase_inactive);
+
 /*
  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
@@ -339,6 +373,38 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
return base;
 }
 
+unsigned long x86_gsbase_read_cpu_inactive(void)
+{
+   unsigned long gsbase;
+
+   if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+   unsigned long flags;
+
+   /* Interrupts are disabled here. */
+   local_irq_save(flags);
+   gsbase = __rdgsbase_inactive();
+   local_irq_restore(flags);
+   } else {
+   rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
+   }
+
+   return gsbase;
+}
+
+void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
+{
+   if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+ 

[tip:x86/cpu] x86/entry/64: Switch CR3 before SWAPGS in paranoid entry

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  1d07316b1363a004ed548c3759584f8e8b1e24e3
Gitweb: https://git.kernel.org/tip/1d07316b1363a004ed548c3759584f8e8b1e24e3
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:25 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:53 +0200

x86/entry/64: Switch CR3 before SWAPGS in paranoid entry

When FSGSBASE is enabled, the GSBASE handling in paranoid entry will need
to retrieve the kernel GSBASE which requires that the kernel page table is
active.

As the CR3 switch to the kernel page tables (PTI is active) does not depend
on kernel GSBASE, move the CR3 switch in front of the GSBASE handling.

Comment the EBX content while at it.

No functional change.

[ tglx: Rewrote changelog and comments ]

Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: "H . Peter Anvin" 
Cc: Andi Kleen 
Cc: Ravi Shankar 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/1557309753-24073-11-git-send-email-chang.seok@intel.com

---
 arch/x86/entry/entry_64.S | 31 +++
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 11aa3b2afa4d..aaa846f8850a 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1173,13 +1173,6 @@ ENTRY(paranoid_entry)
cld
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
-   movl$1, %ebx
-   movl$MSR_GS_BASE, %ecx
-   rdmsr
-   testl   %edx, %edx
-   js  1f  /* negative -> in kernel */
-   SWAPGS
-   xorl%ebx, %ebx
 
 1:
/*
@@ -1191,9 +1184,30 @@ ENTRY(paranoid_entry)
 * This is also why CS (stashed in the "iret frame" by the
 * hardware at entry) can not be used: this may be a return
 * to kernel code, but with a user CR3 value.
+*
+* Switching CR3 does not depend on kernel GSBASE so it can
+* be done before switching to the kernel GSBASE. This is
+* required for FSGSBASE because the kernel GSBASE has to
+* be retrieved from a kernel internal table.
 */
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
 
+   /* EBX = 1 -> kernel GSBASE active, no restore required */
+   movl$1, %ebx
+   /*
+* The kernel-enforced convention is a negative GSBASE indicates
+* a kernel value. No SWAPGS needed on entry and exit.
+*/
+   movl$MSR_GS_BASE, %ecx
+   rdmsr
+   testl   %edx, %edx
+   jns .Lparanoid_entry_swapgs
+   ret
+
+.Lparanoid_entry_swapgs:
+   SWAPGS
+   /* EBX = 0 -> SWAPGS required on exit */
+   xorl%ebx, %ebx
ret
 END(paranoid_entry)
 
@@ -1213,7 +1227,8 @@ ENTRY(paranoid_exit)
UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
-   testl   %ebx, %ebx  /* swapgs needed? */
+   /* If EBX is 0, SWAPGS is required */
+   testl   %ebx, %ebx
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
/* Always restore stashed CR3 value (see paranoid_entry) */


[tip:x86/cpu] x86/entry/64: Introduce the FIND_PERCPU_BASE macro

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  79e1932fa3cedd731ddbd6af111fe4db8ca109ae
Gitweb: https://git.kernel.org/tip/79e1932fa3cedd731ddbd6af111fe4db8ca109ae
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:26 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:54 +0200

x86/entry/64: Introduce the FIND_PERCPU_BASE macro

GSBASE is used to find per-CPU data in the kernel. But when GSBASE is
unknown, the per-CPU base can be found from the per_cpu_offset table with a
CPU NR.  The CPU NR is extracted from the limit field of the CPUNODE entry
in GDT, or by the RDPID instruction. This is a prerequisite for using
FSGSBASE in the low level entry code.

Also, add the GAS-compatible RDPID macro, as binutils 2.21 does not
support it. Support was added in version 2.27.

[ tglx: Massaged changelog ]

Suggested-by: H. Peter Anvin 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Andi Kleen 
Cc: Ravi Shankar 
Cc: Dave Hansen 
Link: https://lkml.kernel.org/r/1557309753-24073-12-git-send-email-chang.seok@intel.com

---
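
Note (illustrative): in C terms, GET_PERCPU_BASE computes roughly the
following. rdpid_sketch() and seg_limit_sketch() are made-up stand-ins
for the RDPID and LSL instructions used in the asm macros below.

static unsigned long get_percpu_base_sketch(void)
{
	unsigned long cpunr;

	if (static_cpu_has(X86_FEATURE_RDPID))
		cpunr = rdpid_sketch();				/* CPU NR from TSC_AUX */
	else
		cpunr = seg_limit_sketch(__CPUNODE_SEG);	/* CPU NR from GDT limit */

	cpunr &= VDSO_CPUNODE_MASK;
	return __per_cpu_offset[cpunr];	/* per-CPU base for this CPU */
}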
 arch/x86/entry/calling.h| 34 ++
 arch/x86/include/asm/inst.h | 15 +++
 2 files changed, 49 insertions(+)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index efb0d1b1f15f..9a524360ae2e 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
 
@@ -345,6 +346,39 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
+#ifdef CONFIG_SMP
+
+/*
+ * CPU/node NR is loaded from the limit (size) field of a special segment
+ * descriptor entry in GDT.
+ */
+.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
+   movq$__CPUNODE_SEG, \reg
+   lsl \reg, \reg
+.endm
+
+/*
+ * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
+ * We normally use %gs for accessing per-CPU data, but we are setting up
+ * %gs here and obviously can not use %gs itself to access per-CPU data.
+ */
+.macro GET_PERCPU_BASE reg:req
+   ALTERNATIVE \
+   "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
+   "RDPID  \reg", \
+   X86_FEATURE_RDPID
+   andq$VDSO_CPUNODE_MASK, \reg
+   movq__per_cpu_offset(, \reg, 8), \reg
+.endm
+
+#else
+
+.macro GET_PERCPU_BASE reg:req
+   movqpcpu_unit_offsets(%rip), \reg
+.endm
+
+#endif /* CONFIG_SMP */
+
 /*
  * This does 'call enter_from_user_mode' unless we can avoid it based on
  * kernel config or using the static jump infrastructure.
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index f5a796da07f8..d063841a17e3 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -306,6 +306,21 @@
.endif
MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
.endm
+
+.macro RDPID opd
+   REG_TYPE rdpid_opd_type \opd
+   .if rdpid_opd_type == REG_TYPE_R64
+   R64_NUM rdpid_opd \opd
+   .else
+   R32_NUM rdpid_opd \opd
+   .endif
+   .byte 0xf3
+   .if rdpid_opd > 7
+   PFX_REX rdpid_opd 0
+   .endif
+   .byte 0x0f, 0xc7
+   MODRM 0xc0 rdpid_opd 0x7
+.endm
 #endif
 
 #endif
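
As a sanity check on the hand-assembled encoding: RDPID is F3 0F C7 /7, so
"rdpid %rax" encodes as F3 0F C7 F8 (ModRM = 0xC0 | 7 << 3 | rax). A short
C snippet that emits the same bytes, assuming it only runs on a CPU with
RDPID support:

static unsigned long rdpid_rax_sketch(void)
{
        unsigned long id;

        /* rdpid %rax: F3 0F C7 F8; reads the IA32_TSC_AUX value. */
        asm volatile (".byte 0xf3, 0x0f, 0xc7, 0xf8" : "=a" (id));

        return id;
}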


[tip:x86/cpu] x86/process/64: Use FSGSBASE instructions on thread copy and ptrace

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  f60a83df4593c5e03e746ded66d8b436c4ad6e41
Gitweb: https://git.kernel.org/tip/f60a83df4593c5e03e746ded66d8b436c4ad6e41
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:23 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:53 +0200

x86/process/64: Use FSGSBASE instructions on thread copy and ptrace

When FSGSBASE is enabled, copying threads and reading fsbase and gsbase
using ptrace must read the actual values.

When copying a thread, use save_fsgs() and copy the saved values.  For
ptrace, the bases must be read from memory regardless of the selector if
FSGSBASE is enabled.

[ tglx: Invoke __rdgsbase_inactive() with interrupts disabled ]
[ luto: Massage changelog ]
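
The tglx note above matters because reading the inactive GSBASE plausibly
involves swapping it in and back out again. A sketch under that assumption
(not the literal implementation of __rdgsbase_inactive()):

static unsigned long rdgsbase_inactive_sketch(void)
{
        unsigned long gsbase;

        /*
         * An interrupt between the two SWAPGS instructions would run
         * kernel code on the user GSBASE, hence the local_irq_save()
         * around the call in save_fsgs() below.
         */
        native_swapgs();
        gsbase = rdgsbase();
        native_swapgs();

        return gsbase;
}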

Suggested-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: "H . Peter Anvin" 
Cc: Andi Kleen 
Cc: Ravi Shankar 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/1557309753-24073-9-git-send-email-chang.seok@intel.com

---
 arch/x86/kernel/process_64.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 59013f480b86..8f239091c15d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -245,13 +245,17 @@ static __always_inline void save_fsgs(struct task_struct *task)
savesegment(fs, task->thread.fsindex);
savesegment(gs, task->thread.gsindex);
if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+   unsigned long flags;
+
/*
 * If FSGSBASE is enabled, we can't make any useful guesses
 * about the base, and user code expects us to save the current
 * value.  Fortunately, reading the base directly is efficient.
 */
task->thread.fsbase = rdfsbase();
+   local_irq_save(flags);
task->thread.gsbase = __rdgsbase_inactive();
+   local_irq_restore(flags);
} else {
save_base_legacy(task, task->thread.fsindex, FS);
save_base_legacy(task, task->thread.gsindex, GS);
@@ -433,7 +437,8 @@ unsigned long x86_fsbase_read_task(struct task_struct *task)
 
if (task == current)
fsbase = x86_fsbase_read_cpu();
-   else if (task->thread.fsindex == 0)
+   else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
+            (task->thread.fsindex == 0))
fsbase = task->thread.fsbase;
else
fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
@@ -447,7 +452,8 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
 
if (task == current)
gsbase = x86_gsbase_read_cpu_inactive();
-   else if (task->thread.gsindex == 0)
+   else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
+            (task->thread.gsindex == 0))
gsbase = task->thread.gsbase;
else
gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
@@ -487,10 +493,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap_ptr = NULL;
 
-   savesegment(gs, p->thread.gsindex);
-   p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
-   savesegment(fs, p->thread.fsindex);
-   p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
+   save_fsgs(me);
+   p->thread.fsindex = me->thread.fsindex;
+   p->thread.fsbase = me->thread.fsbase;
+   p->thread.gsindex = me->thread.gsindex;
+   p->thread.gsbase = me->thread.gsbase;
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));


[tip:x86/cpu] x86/entry/64: Handle FSGSBASE enabled paranoid entry/exit

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  708078f65721b46d82d9934a3f0b36a2b8ad0656
Gitweb: https://git.kernel.org/tip/708078f65721b46d82d9934a3f0b36a2b8ad0656
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:27 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:54 +0200

x86/entry/64: Handle FSGSBASE enabled paranoid entry/exit

Without FSGSBASE, user space cannot change GSBASE other than through a
PRCTL. The kernel enforces that the user space GSBASE value is positive, as
negative values are used for detecting the kernel space GSBASE value in the
paranoid entry code.

If FSGSBASE is enabled, user space can set arbitrary GSBASE values without
kernel intervention, including negative ones, which breaks the paranoid
entry assumptions.

To avoid this, paranoid entry needs to unconditionally save the current
GSBASE value independent of the interrupted context, retrieve and write the
kernel GSBASE and unconditionally restore the saved value on exit. The
restore happens either in paranoid_exit or in the special exit path of the
NMI low level code.

All other entry code paths which use unconditional SWAPGS are not affected,
as they do not depend on the actual GSBASE content.
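
In rough C pseudocode, the resulting entry-side logic is (a sketch;
this_cpu_gsbase() is a placeholder for the FIND_PERCPU_BASE lookup, not a
real kernel helper):

        unsigned long saved_gsbase = 0;

        if (cpu_feature_enabled(X86_FEATURE_FSGSBASE)) {
                /* Save the live GSBASE unconditionally ... */
                saved_gsbase = rdgsbase();
                /* ... and switch to this CPU's kernel GSBASE. */
                wrgsbase(this_cpu_gsbase());
        } else if ((long)__rdmsr(MSR_GS_BASE) >= 0) {
                /* Non-negative GSBASE: a user value was live, swap it out. */
                native_swapgs();
        }

On exit, the FSGSBASE path then executes wrgsbase(saved_gsbase) instead of
a conditional SWAPGS.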

[ tglx: Massaged changelogs and comments ]

Suggested-by: H. Peter Anvin 
Suggested-by: Andy Lutomirski 
Suggested-by: Thomas Gleixner 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: Andi Kleen 
Cc: Ravi Shankar 
Cc: Dave Hansen 
Link: https://lkml.kernel.org/r/1557309753-24073-13-git-send-email-chang.seok@intel.com

---
 arch/x86/entry/calling.h  |  6 
 arch/x86/entry/entry_64.S | 80 ---
 2 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9a524360ae2e..d3fbe2dc03ea 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -338,6 +338,12 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm
 
+.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
+   rdgsbase \save_reg
+   GET_PERCPU_BASE \scratch_reg
+   wrgsbase \scratch_reg
+.endm
+
 #endif /* CONFIG_X86_64 */
 
 .macro STACKLEAK_ERASE
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index aaa846f8850a..7f9f5119d6b1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -38,6 +38,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "calling.h"
@@ -947,7 +948,6 @@ ENTRY(\sym)
addq    $\ist_offset, CPU_TSS_IST(\shift_ist)
.endif
 
-   /* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
jmp paranoid_exit
.else
@@ -1164,9 +1164,14 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
 #endif
 
 /*
- * Save all registers in pt_regs, and switch gs if needed.
- * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ * Save all registers in pt_regs. Return GSBASE related information
+ * in EBX depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE    R/EBX
+ *     N       0 -> SWAPGS on exit
+ *             1 -> no SWAPGS on exit
+ *
+ *     Y       GSBASE value at entry, must be restored in paranoid_exit
  */
 ENTRY(paranoid_entry)
UNWIND_HINT_FUNC
@@ -1174,7 +1179,6 @@ ENTRY(paranoid_entry)
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
 
-1:
/*
 * Always stash CR3 in %r14.  This value will be restored,
 * verbatim, at exit.  Needed if paranoid_entry interrupted
@@ -1192,6 +1196,25 @@ ENTRY(paranoid_entry)
 */
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
 
+   /*
+    * Handling GSBASE depends on the availability of FSGSBASE.
+    *
+    * Without FSGSBASE the kernel enforces that negative GSBASE
+    * values indicate kernel GSBASE. With FSGSBASE no assumptions
+    * can be made about the GSBASE value when entering from user
+    * space.
+    */
+   ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
+
+   /*
+    * Read the current GSBASE and store it in %rbx unconditionally,
+    * retrieve and set the current CPU's kernel GSBASE. The stored value
+    * has to be restored in paranoid_exit unconditionally.
+    */
+   SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
+   ret
+
+.Lparanoid_entry_checkgs:
/* EBX = 1 -> kernel GSBASE active, no restore required */
movl    $1, %ebx
/*
@@ -1218,16 +1241,32 @@ END(paranoid_entry)
  *
  * We may be returning to very strange contexts (e.g. very early
  * in syscall entry), so checking for preemption here would
- * be complicated.  Fortunately, we there's no good reason
- * to try to handle preemption here.
+ * be complicated.  Fortunately, there's no good reason to try
+ * to handle preemption here.
  *
- * On entry, ebx is 

[tip:x86/cpu] x86/entry/64: Document GSBASE handling in the paranoid path

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  5bf0cab60ee2c730ec91ae0aabc3146bcfed138b
Gitweb: https://git.kernel.org/tip/5bf0cab60ee2c730ec91ae0aabc3146bcfed138b
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:28 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:55 +0200

x86/entry/64: Document GSBASE handling in the paranoid path

On an FSGSBASE system, the way GSBASE is handled in the paranoid path
differs from the existing SWAPGS-based entry/exit path handling. Document
the reason and what has to be done for FSGSBASE enabled systems.

[ tglx: Massaged doc and changelog ]

Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Andi Kleen 
Cc: Ravi Shankar 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/1557309753-24073-14-git-send-email-chang.seok@intel.com

---
 Documentation/x86/entry_64.rst | 9 +
 1 file changed, 9 insertions(+)

diff --git a/Documentation/x86/entry_64.rst b/Documentation/x86/entry_64.rst
index a48b3f6ebbe8..b87c1d816aea 100644
--- a/Documentation/x86/entry_64.rst
+++ b/Documentation/x86/entry_64.rst
@@ -108,3 +108,12 @@ We try to only use IST entries and the paranoid entry code for vectors
 that absolutely need the more expensive check for the GS base - and we
 generate all 'normal' entry points with the regular (faster) paranoid=0
 variant.
+
+On an FSGSBASE system, however, user space can set GS without kernel
+interaction, so the GS base value by itself says nothing about whether it
+is a kernel or a user space value. There is therefore no longer a safe way
+for the paranoid entry code path to check whether the exception is
+entering from user mode or kernel mode. Instead, the GSBASE value has to
+be read out and saved, and the kernel GSBASE value written. On exit the
+saved GSBASE value has to be restored unconditionally. The non-paranoid
+entry/exit code still uses SWAPGS unconditionally as the state is known.


[tip:x86/cpu] selftests/x86/fsgsbase: Test ptracer-induced GSBASE write with FSGSBASE

2019-06-22 Thread tip-bot for Chang S. Bae
Commit-ID:  a87730cc3acc475eff12ddde3f7d5687371b5c76
Gitweb: https://git.kernel.org/tip/a87730cc3acc475eff12ddde3f7d5687371b5c76
Author: Chang S. Bae 
AuthorDate: Wed, 8 May 2019 03:02:30 -0700
Committer:  Thomas Gleixner 
CommitDate: Sat, 22 Jun 2019 11:38:56 +0200

selftests/x86/fsgsbase: Test ptracer-induced GSBASE write with FSGSBASE

This validates that GS and GSBASE are independently preserved in
ptracer commands.
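
The ptracer-side sequence being validated looks roughly like the following
sketch, which assumes the test's USER_REGS_OFFSET() helper (an offsetof()
into struct user_regs_struct) and elides error handling:

        /*
         * Write the selector and the base independently; on a
         * FSGSBASE-enabled kernel neither poke may clobber the other.
         */
        ptrace(PTRACE_POKEUSER, child, USER_REGS_OFFSET(gs), 0x7);
        ptrace(PTRACE_POKEUSER, child, USER_REGS_OFFSET(gs_base), 0xFF);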

Suggested-by: Andy Lutomirski 
Signed-off-by: Chang S. Bae 
Signed-off-by: Thomas Gleixner 
Cc: Andi Kleen 
Cc: Ravi Shankar 
Cc: H. Peter Anvin 
Link: https://lkml.kernel.org/r/1557309753-24073-16-git-send-email-chang.seok@intel.com

---
 tools/testing/selftests/x86/fsgsbase.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
index afd029897c79..21fd4f94b5b0 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -470,7 +470,7 @@ static void test_ptrace_write_gsbase(void)
wait(&status);
 
if (WSTOPSIG(status) == SIGTRAP) {
-   unsigned long gs;
+   unsigned long gs, base;
unsigned long gs_offset = USER_REGS_OFFSET(gs);
unsigned long base_offset = USER_REGS_OFFSET(gs_base);
 
@@ -486,6 +486,7 @@ static void test_ptrace_write_gsbase(void)
err(1, "PTRACE_POKEUSER");
 
gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
+   base = ptrace(PTRACE_PEEKUSER, child, base_offset, NULL);
 
/*
 * In a non-FSGSBASE system, the nonzero selector will load
@@ -496,8 +497,14 @@ static void test_ptrace_write_gsbase(void)
if (gs != 0x7) {
nerrs++;
printf("[FAIL]\tGS changed to %lx\n", gs);
+   } else if (have_fsgsbase && (base != 0xFF)) {
+   nerrs++;
+   printf("[FAIL]\tGSBASE changed to %lx\n", base);
} else {
-   printf("[OK]\tGS remained 0x7\n");
+   printf("[OK]\tGS remained 0x7 %s");
+   if (have_fsgsbase)
+   printf("and GSBASE changed to 0xFF");
+   printf("\n");
}
}