[PATCH] powerpc/platforms/85xx: fix t1042rdb_diu.c build errors & warning

2018-07-15 Thread Randy Dunlap
From: Randy Dunlap 

Fix build errors and warnings in t1042rdb_diu.c by adding header files
and MODULE_LICENSE().

../arch/powerpc/platforms/85xx/t1042rdb_diu.c:152:1: warning: data definition 
has no type or storage class
 early_initcall(t1042rdb_diu_init);
../arch/powerpc/platforms/85xx/t1042rdb_diu.c:152:1: error: type defaults to 
'int' in declaration of 'early_initcall' [-Werror=implicit-int]
../arch/powerpc/platforms/85xx/t1042rdb_diu.c:152:1: warning: parameter names 
(without types) in function declaration

and
WARNING: modpost: missing MODULE_LICENSE() in 
arch/powerpc/platforms/85xx/t1042rdb_diu.o

Signed-off-by: Randy Dunlap 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Scott Wood 
Cc: Kumar Gala 
Cc: linuxppc-dev@lists.ozlabs.org
---
Found when using Michael's patch for ppc64_book3e_allmodconfig.

 arch/powerpc/platforms/85xx/t1042rdb_diu.c |4 
 1 file changed, 4 insertions(+)

--- lnx-418-rc4.orig/arch/powerpc/platforms/85xx/t1042rdb_diu.c
+++ lnx-418-rc4/arch/powerpc/platforms/85xx/t1042rdb_diu.c
@@ -9,8 +9,10 @@
  * option) any later version.
  */
 
+#include <linux/init.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
+#include <linux/module.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 
@@ -150,3 +152,5 @@ static int __init t1042rdb_diu_init(void
 }
 
 early_initcall(t1042rdb_diu_init);
+
+MODULE_LICENSE("GPL");




[PATCH] usb/phy: fix PPC64 build errors in phy-fsl-usb.c

2018-07-15 Thread Randy Dunlap
From: Randy Dunlap 

Fix build errors when built for PPC64:
These variables are only used on PPC32 so they don't need to be
initialized for PPC64.

../drivers/usb/phy/phy-fsl-usb.c: In function 'usb_otg_start':
../drivers/usb/phy/phy-fsl-usb.c:865:3: error: '_fsl_readl' undeclared (first 
use in this function); did you mean 'fsl_readl'?
   _fsl_readl = _fsl_readl_be;
../drivers/usb/phy/phy-fsl-usb.c:865:16: error: '_fsl_readl_be' undeclared 
(first use in this function); did you mean 'fsl_readl'?
   _fsl_readl = _fsl_readl_be;
../drivers/usb/phy/phy-fsl-usb.c:866:3: error: '_fsl_writel' undeclared (first 
use in this function); did you mean 'fsl_writel'?
   _fsl_writel = _fsl_writel_be;
../drivers/usb/phy/phy-fsl-usb.c:866:17: error: '_fsl_writel_be' undeclared 
(first use in this function); did you mean 'fsl_writel'?
   _fsl_writel = _fsl_writel_be;
../drivers/usb/phy/phy-fsl-usb.c:868:16: error: '_fsl_readl_le' undeclared 
(first use in this function); did you mean 'fsl_readl'?
   _fsl_readl = _fsl_readl_le;
../drivers/usb/phy/phy-fsl-usb.c:869:17: error: '_fsl_writel_le' undeclared 
(first use in this function); did you mean 'fsl_writel'?
   _fsl_writel = _fsl_writel_le;

and the sysfs "show" function return type should be ssize_t, not int:

../drivers/usb/phy/phy-fsl-usb.c:1042:49: error: initialization of 'ssize_t 
(*)(struct device *, struct device_attribute *, char *)' {aka 'long int 
(*)(struct device *, struct device_attribute *, char *)'} from incompatible 
pointer type 'int (*)(struct device *, struct device_attribute *, char *)' 
[-Werror=incompatible-pointer-types]
 static DEVICE_ATTR(fsl_usb2_otg_state, S_IRUGO, show_fsl_usb2_otg_state, NULL);

Signed-off-by: Randy Dunlap 
Cc: Felipe Balbi 
Cc: linux-...@vger.kernel.org
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
---
Found when using Michael's patch for ppc64_book3e_allmodconfig.

 drivers/usb/phy/phy-fsl-usb.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

--- lnx-418-rc4.orig/drivers/usb/phy/phy-fsl-usb.c
+++ lnx-418-rc4/drivers/usb/phy/phy-fsl-usb.c
@@ -861,6 +861,7 @@ int usb_otg_start(struct platform_device
if (pdata->init && pdata->init(pdev) != 0)
return -EINVAL;
 
+#ifdef CONFIG_PPC32
if (pdata->big_endian_mmio) {
_fsl_readl = _fsl_readl_be;
_fsl_writel = _fsl_writel_be;
@@ -868,6 +869,7 @@ int usb_otg_start(struct platform_device
_fsl_readl = _fsl_readl_le;
_fsl_writel = _fsl_writel_le;
}
+#endif
 
/* request irq */
p_otg->irq = platform_get_irq(pdev, 0);
@@ -958,7 +960,7 @@ int usb_otg_start(struct platform_device
 /*
  * state file in sysfs
  */
-static int show_fsl_usb2_otg_state(struct device *dev,
+static ssize_t show_fsl_usb2_otg_state(struct device *dev,
   struct device_attribute *attr, char *buf)
 {
struct otg_fsm *fsm = &fsl_otg_dev->fsm;




Re: [PATCH 2/2] powerpc: Add ppc64le and ppc64_book3e allmodconfig targets

2018-07-15 Thread Randy Dunlap
On 07/09/18 07:24, Michael Ellerman wrote:
> Similarly as we just did for 32-bit, add phony targets for generating
> a little endian and Book3E allmodconfig. These aren't covered by the
> regular allmodconfig, which is big endian and Book3S due to the way
> the Kconfig symbols are structured.

[adding Felipe Balbi]


Is book3e allmodconfig not seen/used very much?

Besides the patches that I have already sent, I am seeing a build problem
with ppc64_book3e_allmodconfig, where we have:

CONFIG_USB_PHY=y
CONFIG_FSL_USB2_OTG=y
but
CONFIG_USB_OTG_FSM=m

In drivers/usb/phy/Kconfig, FSL_USB2_OTG depends on USB_OTG_FSM (among
other things), but!  FSL_USB2_OTG is a bool symbol, depending on a
tristate symbol.  This often causes problems.  In this case it causes errors
with a builtin driver trying to use symbols that are built in a loadable module:

drivers/usb/phy/phy-fsl-usb.o: In function `.fsl_otg_ioctl':
phy-fsl-usb.c:(.text.fsl_otg_ioctl+0xb4): undefined reference to 
`.otg_statemachine'
drivers/usb/phy/phy-fsl-usb.o: In function `.fsl_otg_start_srp':
phy-fsl-usb.c:(.text.fsl_otg_start_srp+0x4c): undefined reference to 
`.otg_statemachine'
drivers/usb/phy/phy-fsl-usb.o: In function `.fsl_otg_set_host':
phy-fsl-usb.c:(.text.fsl_otg_set_host+0xd0): undefined reference to 
`.otg_statemachine'
drivers/usb/phy/phy-fsl-usb.o: In function `.fsl_otg_start_hnp':
phy-fsl-usb.c:(.text.fsl_otg_start_hnp+0x68): undefined reference to 
`.otg_statemachine'
drivers/usb/phy/phy-fsl-usb.o: In function `.show_fsl_usb2_otg_state':
phy-fsl-usb.c:(.text.show_fsl_usb2_otg_state+0x154): undefined reference to 
`.usb_otg_state_string'
drivers/usb/phy/phy-fsl-usb.o: In function `.a_wait_enum':
(.text.a_wait_enum+0x4c): undefined reference to `.otg_statemachine'
drivers/usb/phy/phy-fsl-usb.o: In function `.fsl_otg_set_peripheral':
phy-fsl-usb.c:(.text.fsl_otg_set_peripheral+0x84): undefined reference to 
`.usb_gadget_vbus_disconnect'
phy-fsl-usb.c:(.text.fsl_otg_set_peripheral+0x9c): undefined reference to 
`.otg_statemachine'



> Signed-off-by: Michael Ellerman 
> ---
>  arch/powerpc/Makefile | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
> index 2556c2182789..48e887f03a6c 100644
> --- a/arch/powerpc/Makefile
> +++ b/arch/powerpc/Makefile
> @@ -359,6 +359,16 @@ ppc32_allmodconfig:
>   $(Q)$(MAKE) 
> KCONFIG_ALLCONFIG=$(srctree)/arch/powerpc/configs/book3s_32.config \
>   -f $(srctree)/Makefile allmodconfig
>  
> +PHONY += ppc64le_allmodconfig
> +ppc64le_allmodconfig:
> + $(Q)$(MAKE) KCONFIG_ALLCONFIG=$(srctree)/arch/powerpc/configs/le.config 
> \
> + -f $(srctree)/Makefile allmodconfig
> +
> +PHONY += ppc64_book3e_allmodconfig
> +ppc64_book3e_allmodconfig:
> + $(Q)$(MAKE) 
> KCONFIG_ALLCONFIG=$(srctree)/arch/powerpc/configs/85xx-64bit.config \
> + -f $(srctree)/Makefile allmodconfig
> +
>  define archhelp
>@echo '* zImage  - Build default images selected by kernel config'
>@echo '  zImage.*- Compressed kernel image 
> (arch/$(ARCH)/boot/zImage.*)'
> 

thanks,
-- 
~Randy


Re: [PATCH kernel v3 3/6] KVM: PPC: Make iommu_table::it_userspace big endian

2018-07-15 Thread Paul Mackerras
On Wed, Jul 04, 2018 at 04:13:46PM +1000, Alexey Kardashevskiy wrote:
> We are going to reuse multilevel TCE code for the userspace copy of
> the TCE table and since it is big endian, let's make the copy big endian
> too.
> 
> Reviewed-by: David Gibson 
> Signed-off-by: Alexey Kardashevskiy 

Acked-by: Paul Mackerras 


Re: [PATCH kernel v6 2/2] KVM: PPC: Check if IOMMU page is contained in the pinned physical page

2018-07-15 Thread David Gibson
On Wed, Jul 11, 2018 at 09:00:44PM +1000, Alexey Kardashevskiy wrote:
> A VM which has:
>  - a DMA capable device passed through to it (eg. network card);
>  - running a malicious kernel that ignores H_PUT_TCE failure;
>  - capability of using IOMMU pages bigger than physical pages
> can create an IOMMU mapping that exposes (for example) 16MB of
> the host physical memory to the device when only 64K was allocated to the VM.
> 
> The remaining 16MB - 64K will be some other content of host memory, possibly
> including pages of the VM, but also pages of host kernel memory, host
> programs or other VMs.
> 
> The attacking VM does not control the location of the page it can map,
> and is only allowed to map as many pages as it has pages of RAM.
> 
> We already have a check in drivers/vfio/vfio_iommu_spapr_tce.c that
> an IOMMU page is contained in the physical page so the PCI hardware won't
> get access to unassigned host memory; however this check is missing in
> the KVM fastpath (H_PUT_TCE accelerated code). We were lucky so far and
> did not hit this yet as the very first time when the mapping happens
> we do not have tbl::it_userspace allocated yet and fall back to
> the userspace which in turn calls VFIO IOMMU driver, this fails and
> the guest does not retry.
> 
> This stores the smallest preregistered page size in the preregistered
> region descriptor and changes the mm_iommu_xxx API to check this against
> the IOMMU page size.
> 
> This calculates maximum page size as a minimum of the natural region
> alignment and compound page size. For the page shift this uses the shift
> returned by find_linux_pte() which indicates how the page is mapped to
> the current userspace - if the page is huge and this is not a zero, then
> it is a leaf pte and the page is mapped within the range.
> 
> Signed-off-by: Alexey Kardashevskiy 

Reviewed-by: David Gibson 

> ---
> Changes:
> v6:
> * replaced hugetlbfs with pageshift from find_linux_pte()
> 
> v5:
> * only consider compound pages from hugetlbfs
> 
> v4:
> * reimplemented max pageshift calculation
> 
> v3:
> * fixed upper limit for the page size
> * added checks that we don't register parts of a huge page
> 
> v2:
> * explicitly check for compound pages before calling compound_order()
> 
> ---
> The bug is: run QEMU _without_ hugepages (no -mem-path) and tell it to
> advertise 16MB pages to the guest; a typical pseries guest will use 16MB
> for IOMMU pages without checking the mmu pagesize and this will fail
> at 
> https://git.qemu.org/?p=qemu.git;a=blob;f=hw/vfio/common.c;h=fb396cf00ac40eb35967a04c9cc798ca896eed57;hb=refs/heads/master#l256
> 
> With the change, mapping will fail in KVM and the guest will print:
> 
> mlx5_core :00:00.0: ibm,create-pe-dma-window(2027) 0 800 2000 18 
> 1f returned 0 (liobn = 0x8001 starting addr = 800 0)
> mlx5_core :00:00.0: created tce table LIOBN 0x8001 for 
> /pci@8002000/ethernet@0
> mlx5_core :00:00.0: failed to map direct window for 
> /pci@8002000/ethernet@0: -1
> ---
>  arch/powerpc/include/asm/mmu_context.h |  4 ++--
>  arch/powerpc/kvm/book3s_64_vio.c   |  2 +-
>  arch/powerpc/kvm/book3s_64_vio_hv.c|  6 --
>  arch/powerpc/mm/mmu_context_iommu.c| 39 
> --
>  drivers/vfio/vfio_iommu_spapr_tce.c|  2 +-
>  5 files changed, 45 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mmu_context.h 
> b/arch/powerpc/include/asm/mmu_context.h
> index 896efa5..79d570c 100644
> --- a/arch/powerpc/include/asm/mmu_context.h
> +++ b/arch/powerpc/include/asm/mmu_context.h
> @@ -35,9 +35,9 @@ extern struct mm_iommu_table_group_mem_t 
> *mm_iommu_lookup_rm(
>  extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
>   unsigned long ua, unsigned long entries);
>  extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
> - unsigned long ua, unsigned long *hpa);
> + unsigned long ua, unsigned int pageshift, unsigned long *hpa);
>  extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
> - unsigned long ua, unsigned long *hpa);
> + unsigned long ua, unsigned int pageshift, unsigned long *hpa);
>  extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
>  extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
>  #endif
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
> b/arch/powerpc/kvm/book3s_64_vio.c
> index d066e37..8c456fa 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -449,7 +449,7 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct 
> iommu_table *tbl,
>   /* This only handles v2 IOMMU type, v1 is handled via ioctl() */
>   return H_TOO_HARD;
>  
> - if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa)))
> + if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
>

Re: [PATCH v3 2/2] powerpc: Enable CPU_FTR_ASYM_SMT for interleaved big-cores

2018-07-15 Thread Gautham R Shenoy


On Wed, Jul 11, 2018 at 04:32:30PM +0800, kbuild test robot wrote:
> Hi Gautham,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on powerpc/next]
> [also build test ERROR on v4.18-rc4 next-20180710]
> [if your patch is applied to the wrong git tree, please drop us a note to 
> help improve the system]
> 
> url:
> https://github.com/0day-ci/linux/commits/Gautham-R-Shenoy/powerpc-Detect-the-presence-of-big-cores-via-ibm-thread-groups/20180706-174756
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
> config: powerpc-g5_defconfig (attached as .config)
> compiler: powerpc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
> reproduce:
> wget 
> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
> ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # save the attached .config to linux build tree
> GCC_VERSION=7.2.0 make.cross ARCH=powerpc 
> 
> All errors (new ones prefixed by >>):
> 
>In file included from include/linux/static_key.h:1:0,
> from include/linux/context_tracking_state.h:6,
> from include/linux/vtime.h:5,
> from include/linux/hardirq.h:8,
> from include/linux/interrupt.h:11,
> from include/linux/serial_core.h:25,
> from include/linux/serial_8250.h:14,
> from arch/powerpc/kernel/setup-common.c:33:
>arch/powerpc/kernel/setup-common.c: In function 'smp_setup_cpu_maps':
> >> arch/powerpc/kernel/setup-common.c:777:25: error: 'cpu_feature_keys' 
> >> undeclared (first use in this function); did you mean 'setup_feature_keys'?
>   static_branch_enable(&cpu_feature_keys[key]);

Ok, so this needs to be enabled only on CONFIG_PPC_PSERIES and
CONFIG_PPC_POWERNV. Will fix this.


--
Thanks and Regards
gautham.



[RFC PATCH v6 0/4] powerpc/fadump: Improvements and fixes for firmware-assisted dump.

2018-07-15 Thread Mahesh J Salgaonkar
One of the primary issues with Firmware Assisted Dump (fadump) on Power
is that it needs a large amount of memory to be reserved. This reserved
memory is used for saving the contents of old crashed kernel's memory before
fadump capture kernel uses old kernel's memory area to boot. However, this
reserved memory area stays unused until system crash and isn't available
for production kernel to use.

Instead of setting aside a significant chunk of memory that nobody can use,
take advantage of ZONE_MOVABLE to mark a significant chunk of reserved memory
as ZONE_MOVABLE, so that the kernel is prevented from using it, but
applications are free to use it.

Patch 1 introduces an interface to mark reserved memory as ZONE_MOVABLE.
Patch 2 uses the above interface to mark reserved memory movable so that
it can be used for applications usage, making fadump reservationless.
Patches 3 and 4 fix minor issues.

Changes in V6:
- Introduce an interface to mark reserved memory as ZONE_MOVABLE. Hence
  sending this series as RFC again.
- Mark reserved area as ZONE_MOVABLE instead of CMA.
- Add fadump=nonmovable parameter for users who don't want to use ZONE_MOVABLE.

Changes in V5:
- Drop the patch that does metadata movement.
- Move the kexec fix patch to top (patch 1)
- Fold CMA documentation patch into patch 2
- Fix the compilation issues when CONFIG_CMA is not set reported by Hari.
- Use the approach of using boot memory size for CMA as suggested by Hari
  except the movement of sections. Thanks to Hari.

Changes in V4:
- patch 1: Make fadump compatible irrespective of kernel versions.
- patch 4: moved out of the series and posted separately at
  http://patchwork.ozlabs.org/patch/896716/
- Documentation update about CMA reservation.

Changes in V3:
- patch 1 & 2: move metadata region and documentation update.
- patch 7: Un-register the fadump on kexec path


---

Mahesh Salgaonkar (4):
  mm/page_alloc: Introduce an interface to mark reserved memory as 
ZONE_MOVABLE
  powerpc/fadump: Reservationless firmware assisted dump
  powerpc/fadump: throw proper error message on fadump registration failure.
  powerpc/fadump: Do not allow hot-remove memory from fadump reserved area.


 Documentation/powerpc/firmware-assisted-dump.txt |   18 +++
 arch/powerpc/include/asm/fadump.h|7 +
 arch/powerpc/kernel/fadump.c |  123 +--
 arch/powerpc/platforms/pseries/hotplug-memory.c  |7 +
 include/linux/mmzone.h   |2 
 mm/page_alloc.c  |  146 ++
 6 files changed, 290 insertions(+), 13 deletions(-)

--
Signature



[RFC PATCH v6 1/4] mm/page_alloc: Introduce an interface to mark reserved memory as ZONE_MOVABLE

2018-07-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

Add an interface to allow a custom reserved memory to be marked as
ZONE_MOVABLE. This will help some subsystems convert their reserved
memory region into ZONE_MOVABLE so that the memory can still be available
to user applications.

The approach is based on Joonsoo Kim's commit bad8c6c0
(https://github.com/torvalds/linux/commit/bad8c6c0) that
uses ZONE_MOVABLE to manage CMA area. Majority of the code has been taken
from the Joonsoo Kim's commit mentioned above. But I see above commit
has been reverted due to some issues reported on i386. I believe that
commit is being reworked and will be re-posted soon.

Like CMA, another user of ZONE_MOVABLE can be fadump on powerpc, which
reserves a significant chunk of memory that is used only after the system
has crashed. Until then the reserved memory is unused. By marking that
memory as ZONE_MOVABLE, it can at least be utilized by user applications.

This patch proposes a RFC implementation of an interface to mark
specified reserved area as ZONE_MOVABLE. Comments are welcome.

Signed-off-by: Mahesh Salgaonkar 
---
 include/linux/mmzone.h |2 +
 mm/page_alloc.c|  146 
 2 files changed, 148 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 32699b2dc52a..2519dd690572 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1288,6 +1288,8 @@ struct mminit_pfnnid_cache {
 #endif
 
 void memory_present(int nid, unsigned long start, unsigned long end);
+extern int __init zone_movable_init_reserved_mem(phys_addr_t base,
+   phys_addr_t size);
 
 /*
  * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1521100f1e63..0817ed8843cb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7687,6 +7687,152 @@ bool has_unmovable_pages(struct zone *zone, struct page 
*page, int count,
return true;
 }
 
+static __init void mark_zone_movable(struct page *page)
+{
+   unsigned i = pageblock_nr_pages;
+   struct page *p = page;
+   struct zone *zone;
+   unsigned long pfn = page_to_pfn(page);
+   int nid = page_to_nid(page);
+
+   zone = page_zone(page);
+   zone->present_pages -= pageblock_nr_pages;
+
+   do {
+   __ClearPageReserved(p);
+   set_page_count(p, 0);
+
+   /* Steal pages from other zones */
+   set_page_links(p, ZONE_MOVABLE, nid, pfn);
+   } while (++p, ++pfn, --i);
+
+   zone = page_zone(page);
+   zone->present_pages += pageblock_nr_pages;
+
+   set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+
+   if (pageblock_order >= MAX_ORDER) {
+   i = pageblock_nr_pages;
+   p = page;
+   do {
+   set_page_refcounted(p);
+   __free_pages(p, MAX_ORDER - 1);
+   p += MAX_ORDER_NR_PAGES;
+   } while (i -= MAX_ORDER_NR_PAGES);
+   } else {
+   set_page_refcounted(page);
+   __free_pages(page, pageblock_order);
+   }
+
+   adjust_managed_page_count(page, pageblock_nr_pages);
+}
+
+static int __init zone_movable_activate_area(unsigned long start_pfn,
+   unsigned long end_pfn)
+{
+   unsigned long base_pfn = start_pfn, pfn = start_pfn;
+   struct zone *zone;
+   unsigned i = (end_pfn - start_pfn) >> pageblock_order;
+
+   zone = page_zone(pfn_to_page(base_pfn));
+   while (pfn < end_pfn) {
+   if (!pfn_valid(pfn))
+   goto err;
+
+   if (page_zone(pfn_to_page(pfn)) != zone)
+   goto err;
+   pfn++;
+   }
+
+   do {
+   mark_zone_movable(pfn_to_page(base_pfn));
+   base_pfn += pageblock_nr_pages;
+   } while (--i);
+
+   return 0;
+err:
+   pr_err("Zone movable could not be activated\n");
+   return -EINVAL;
+}
+
+/**
+ * zone_movable_init_reserved_mem() - create custom zone movable area from
+ *   reserved memory
+ * @base: Base address of the reserved area
+ * @size: Size of the reserved area (in bytes),
+ *
+ * This function creates custom zone movable area from already reserved memory.
+ */
+int __init zone_movable_init_reserved_mem(phys_addr_t base, phys_addr_t size)
+{
+   struct zone *zone;
+   pg_data_t *pgdat;
+   unsigned long start_pfn = PHYS_PFN(base);
+   unsigned long end_pfn = PHYS_PFN(base + size);
+   phys_addr_t alignment;
+   int ret;
+
+   if (!size || !memblock_is_region_reserved(base, size))
+   return -EINVAL;
+
+   /* ensure minimal alignment required by mm core */
+   alignment = PAGE_SIZE <<
+   max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
+
+   if (ALIGN(base, alignment) != base || ALIGN

[RFC PATCH v6 2/4] powerpc/fadump: Reservationless firmware assisted dump

2018-07-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

One of the primary issues with Firmware Assisted Dump (fadump) on Power
is that it needs a large amount of memory to be reserved. On large
systems with TeraBytes of memory, this reservation can be quite
significant.

In some cases, fadump fails if the memory reserved is insufficient, or
if the reserved memory was DLPAR hot-removed.

In the normal case, post reboot, the preserved memory is filtered to
extract only relevant areas of interest using the makedumpfile tool.
While the tool provides flexibility to determine what needs to be part
of the dump and what memory to filter out, all supported distributions
default this to "Capture only kernel data and nothing else".

We take advantage of this default and the Linux kernel's zone movable
feature to fundamentally change the memory reservation model for fadump.

Instead of setting aside a significant chunk of memory nobody can use,
this patch marks a significant chunk of reserved memory as ZONE_MOVABLE
that the kernel is prevented from using (due to MIGRATE_MOVABLE),
but applications are free to use it. With this fadump will still be able
to capture all of the kernel memory and most of the user space memory
except the user pages that were present in ZONE_MOVABLE zone. But if
someone wants to capture all of user space memory and is ok with reserved
memory not available to production system, then 'fadump=nonmovable' kernel
parameter can be used to fallback to old behaviour.

Essentially, on a P9 LPAR with 2 cores, 8GB RAM and current upstream:
[root@zzxx-yy10 ~]# free -m
  totalusedfree  shared  buff/cache   available
Mem:   7557 1936822  12 5416725
Swap:  4095   04095

With this patch:
[root@zzxx-yy10 ~]# free -m
  totalusedfree  shared  buff/cache   available
Mem:   8133 1947464  12 4757338
Swap:  4095   04095

Changes made here are completely transparent to how fadump has
traditionally worked.

Signed-off-by: Ananth N Mavinakayanahalli 
Signed-off-by: Mahesh Salgaonkar 
Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   18 +
 arch/powerpc/include/asm/fadump.h|5 +
 arch/powerpc/kernel/fadump.c |   80 --
 3 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index bdd344aa18d9..f8a6343a1dcf 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -113,7 +113,16 @@ header, is usually reserved at an offset greater than boot 
memory
 size (see Fig. 1). This area is *not* released: this region will
 be kept permanently reserved, so that it can act as a receptacle
 for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur.
+and HPTE region, in the case a crash does occur. Since this reserved
+memory area is used only after the system crash, there is no point in
+blocking this significant chunk of memory from production kernel.
+Hence, the implementation marks the memory reserved for fadump as
+ZONE_MOVABLE. With ZONE_MOVABLE this memory will be available for
+applications to use it, while kernel is prevented from using it. With
+this fadump will still be able to capture all of the kernel memory and
+most of the user space memory except the user pages that were present
+in ZONE_MOVABLE region.
+
 
   o Memory Reservation during first kernel
 
@@ -162,6 +171,9 @@ How to enable firmware-assisted dump (fadump):
 
 1. Set config option CONFIG_FA_DUMP=y and build kernel.
 2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
+   By default, the reserved memory will be marked as zone movable.
+   Alternatively, user can boot linux kernel with 'fadump=nonmovable' to
+   prevent fadump to mark reserved memory as zone movable.
 3. Optionally, user can also set 'crashkernel=' kernel cmdline
to specify size of the memory to reserve for boot memory dump
preservation.
@@ -172,6 +184,10 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been 
deprecated. Instead
   2. If firmware-assisted dump fails to reserve memory then it
  will fallback to existing kdump mechanism if 'crashkernel='
  option is set at kernel cmdline.
+  3. if user wants to capture all of user space memory and ok with
+ reserved memory not available to production system, then
+ 'fadump=nonmovable' kernel parameter can be used to fallback to
+ old behaviour.
 
 Sysfs/debugfs files:
 
diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 5a23010af600..5c0de4508aab 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -

[RFC PATCH v6 3/4] powerpc/fadump: throw proper error message on fadump registration failure.

2018-07-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

fadump fails to register when there are holes in reserved memory area.
This can happen if user has hot-removed a memory that falls in the fadump
reserved memory area. Throw a meaningful error message to the user in
such case.

Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/kernel/fadump.c |   33 +
 1 file changed, 33 insertions(+)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index ce333c1d4cb8..d1375f3f48c3 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -170,6 +170,36 @@ static int is_boot_memory_area_contiguous(void)
return ret;
 }
 
+/*
+ * Returns 1, if there are no holes in reserved memory area,
+ * 0 otherwise.
+ */
+static int is_reserved_memory_area_contiguous(void)
+{
+   struct memblock_region *reg;
+   unsigned long start, end;
+   unsigned long d_start = fw_dump.reserve_dump_area_start;
+   unsigned long d_end = d_start + fw_dump.reserve_dump_area_size;
+   int ret = 0;
+
+   for_each_memblock(memory, reg) {
+   start = max(d_start, (unsigned long)reg->base);
+   end = min(d_end, (unsigned long)(reg->base + reg->size));
+   if (d_start < end) {
+   /* Memory hole from d_start to start */
+   if (start > d_start)
+   break;
+
+   if (end == d_end) {
+   ret = 1;
+   break;
+   }
+   d_start = end + 1;
+   }
+   }
+   return ret;
+}
+
 /* Print firmware assisted dump configurations for debugging purpose. */
 static void fadump_show_config(void)
 {
@@ -531,6 +561,9 @@ static int register_fw_dump(struct fadump_mem_struct *fdm)
if (!is_boot_memory_area_contiguous())
pr_err("Can't have holes in boot memory area while "
   "registering fadump\n");
+   else if (!is_reserved_memory_area_contiguous())
+   pr_err("Can't have holes in reserved memory area while"
+  " registering fadump\n");
 
printk(KERN_ERR "Failed to register firmware-assisted kernel"
" dump. Parameter Error(%d).\n", rc);



[RFC PATCH v6 4/4] powerpc/fadump: Do not allow hot-remove memory from fadump reserved area.

2018-07-15 Thread Mahesh J Salgaonkar
From: Mahesh Salgaonkar 

For fadump to work successfully there should not be any holes in reserved
memory ranges where kernel has asked firmware to move the content of old
kernel memory in event of crash. Now that fadump reserved memory is marked
as movable zone, this memory area is now not protected from hot-remove
operations. Hence, fadump service can fail to re-register after the
hot-remove operation, if hot-removed memory belongs to fadump reserved
region. To avoid this make sure that memory from fadump reserved area is
not hot-removable if fadump is registered.

However, if user still wants to remove that memory, he can do so by
manually stopping fadump service before hot-remove operation.

Signed-off-by: Mahesh Salgaonkar 
---
 arch/powerpc/include/asm/fadump.h   |2 +-
 arch/powerpc/kernel/fadump.c|   10 --
 arch/powerpc/platforms/pseries/hotplug-memory.c |7 +--
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 5c0de4508aab..cd28e9b59057 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -208,7 +208,7 @@ struct fad_crash_memory_ranges {
unsigned long long  size;
 };
 
-extern int is_fadump_boot_memory_area(u64 addr, ulong size);
+extern int is_fadump_memory_area(u64 addr, ulong size);
 extern int early_init_dt_scan_fw_dump(unsigned long node,
const char *uname, int depth, void *data);
 extern int fadump_reserve_mem(void);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d1375f3f48c3..18a35f12ffb5 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -116,13 +116,19 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
 
 /*
  * If fadump is registered, check if the memory provided
- * falls within boot memory area.
+ * falls within boot memory area and reserved memory area.
  */
-int is_fadump_boot_memory_area(u64 addr, ulong size)
+int is_fadump_memory_area(u64 addr, ulong size)
 {
+   u64 d_start = fw_dump.reserve_dump_area_start;
+   u64 d_end = d_start + fw_dump.reserve_dump_area_size;
+
if (!fw_dump.dump_registered)
return 0;
 
+   if (((addr + size) > d_start) && (addr <= d_end))
+   return 1;
+
return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size;
 }
 
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index c1578f54c626..e4c658cda3a7 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -389,8 +389,11 @@ static bool lmb_is_removable(struct drmem_lmb *lmb)
phys_addr = lmb->base_addr;
 
 #ifdef CONFIG_FA_DUMP
-   /* Don't hot-remove memory that falls in fadump boot memory area */
-   if (is_fadump_boot_memory_area(phys_addr, block_sz))
+   /*
+* Don't hot-remove memory that falls in fadump boot memory area
+* and memory that is reserved for capturing old kernel memory.
+*/
+   if (is_fadump_memory_area(phys_addr, block_sz))
return false;
 #endif