Hi Anshuman,

Thanks for the feedback!

On 22/02/16 21:13, Anshuman Khandual wrote:
On 02/22/2016 11:32 AM, Rashmica Gupta wrote:
Useful to be able to dump the kernel page tables to check permissions and
memory types - derived from arm64's implementation.

Add a debugfs file to check the page tables. To use this the PPC_PTDUMP
config option must be selected.

Tested on 64BE and 64LE with both 4K and 64K page sizes.
---
This statement above must be after the ---- line else it will be part of
the commit message or you wanted the test note as part of commit message
itself ?

The patch seems to contain some white space problems. Please clean them up.
Will do!
  arch/powerpc/Kconfig.debug |  12 ++
  arch/powerpc/mm/Makefile   |   1 +
  arch/powerpc/mm/dump.c     | 364 +++++++++++++++++++++++++++++++++++++++++++++
  3 files changed, 377 insertions(+)
  create mode 100644 arch/powerpc/mm/dump.c

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index 638f9ce740f5..e4883880abe3 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -344,4 +344,16 @@ config FAIL_IOMMU
If you are unsure, say N. +config PPC_PTDUMP
+        bool "Export kernel pagetable layout to userspace via debugfs"
+        depends on DEBUG_KERNEL
+        select DEBUG_FS
+        help
+          This options dumps the state of the kernel pagetables in a debugfs
+          file. This is only useful for kernel developers who are working in
+          architecture specific areas of the kernel - probably not a good idea 
to
+          enable this feature in a production kernel.
Just clean the paragraph alignment here 
......................................^^^^^^^^

+
+          If you are unsure, say N.
+
  endmenu
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 1ffeda85c086..16f84bdd7597 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -40,3 +40,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
  obj-$(CONFIG_HIGHMEM)         += highmem.o
  obj-$(CONFIG_PPC_COPRO_BASE)  += copro_fault.o
  obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_iommu.o
+obj-$(CONFIG_PPC_PTDUMP)       += dump.o
File name like "[kernel_]pgtable_dump.c" will sound better ? Or
just use like the X86 one "dump_pagetables.c". "dump.c" sounds
very generic.
Yup, good point.
diff --git a/arch/powerpc/mm/dump.c b/arch/powerpc/mm/dump.c
new file mode 100644
index 000000000000..937b10fc40cc
--- /dev/null
+++ b/arch/powerpc/mm/dump.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright 2016, Rashmica Gupta, IBM Corp.
+ *
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * Derived from the arm64 implementation:
+ * Copyright (c) 2014, The Linux Foundation, Laura Abbott.
+ * (C) Copyright 2008 Intel Corporation, Arjan van de Ven.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <asm/fixmap.h>
+#include <asm/pgtable.h>
+#include <linux/const.h>
+#include <asm/page.h>
+
+#define PUD_TYPE_MASK           (_AT(u64, 3) << 0)
+#define PUD_TYPE_SECT           (_AT(u64, 1) << 0)
+#define PMD_TYPE_MASK           (_AT(u64, 3) << 0)
+#define PMD_TYPE_SECT           (_AT(u64, 1) << 0)
+
+
+#if CONFIG_PGTABLE_LEVELS == 2
+#include <asm-generic/pgtable-nopmd.h>
+#elif CONFIG_PGTABLE_LEVELS == 3
+#include <asm-generic/pgtable-nopud.h>
+#endif
Really ? Do we have any platform with only 2 level of page table ?
I'm not sure - was trying to cover all the bases. If you're
confident that we don't, I can remove it.
+
+#define pmd_sect(pmd)  ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_SECT)
+#ifdef CONFIG_PPC_64K_PAGES
+#define pud_sect(pud)           (0)
+#else
+#define pud_sect(pud)           ((pud_val(pud) & PUD_TYPE_MASK) == \
+                                               PUD_TYPE_SECT)
+#endif
Can you please explain the use of pmd_sect() and pud_sect() defines ?

+       
+
+struct addr_marker {
+       unsigned long start_address;
+       const char *name;
+};
All the architectures are using the same structure addr_marker.
Cannot we just move it to a generic header file ? There are
other such common structures like these in the file which are
used across architectures and can be moved to some where common ?
Could do that. Where do you think would be the appropriate place
for such a header file?
+
+enum address_markers_idx {
+       VMALLOC_START_NR = 0,
+       VMALLOC_END_NR,
+       ISA_IO_START_NR,
+       ISA_IO_END_NR,
+       PHB_IO_START_NR,
+       PHB_IO_END_NR,
+       IOREMAP_START_NR,
+       IOREMP_END_NR,
+};
Where these are used ? ^^^^^^^^^ I dont see any where.
Whoops, yes those are not used anymore.

Also as it dumps only the kernel virtual mapping, should not we
mention about it some where.
See my question below...
+
+static struct addr_marker address_markers[] = {
+       { VMALLOC_START,        "vmalloc() Area" },
+       { VMALLOC_END,          "vmalloc() End" },
+       { ISA_IO_BASE,          "isa I/O start" },
+       { ISA_IO_END,           "isa I/O end" },
+       { PHB_IO_BASE,          "phb I/O start" },
+       { PHB_IO_END,           "phb I/O end" },
+       { IOREMAP_BASE,         "I/O remap start" },
+       { IOREMAP_END,          "I/O remap end" },
+       { -1,                   NULL },
+};

I understand that VMEMMAP range is not covered under the kernel virtual
mapping page table. But we can hook it up looking into the linked list
we track for the VA-PA mappings in there. Just a suggestion.
I'm not sure that I understand, can you please explain why we
would want to do that?

+
+/*
+ * The page dumper groups page table entries of the same type into a single
+ * description. It uses pg_state to track the range information while
+ * iterating over the pte entries. When the continuity is broken it then
+ * dumps out a description of the range.
+ */
This needs to be more detailed. Some where at the starting point of the
file we need to write detailed description about what all information it
is dumping and how.
Will do!

+struct pg_state {
+       struct seq_file *seq;
+       const struct addr_marker *marker;
+       unsigned long start_address;
+       unsigned level;
+       u64 current_prot;
+};
+
+struct prot_bits {
+       u64             mask;
+       u64             val;
+       const char      *set;
+       const char      *clear;
+};
This structure encapsulates various PTE flags bit information.
Then why it is named as "prot_bits" ?
Yup, a lazy oversight on my part.
+
+static const struct prot_bits pte_bits[] = {
+       {
+               .mask   = _PAGE_USER,
+               .val    = _PAGE_USER,
+               .set    = "user",
+               .clear  = "    ",
+       }, {
+               .mask   = _PAGE_RW,
+               .val    = _PAGE_RW,
+               .set    = "rw",
+               .clear  = "ro",
+       }, {
+               .mask   = _PAGE_EXEC,
+               .val    = _PAGE_EXEC,
+               .set    = " X ",
+               .clear  = "   ",
+       }, {
+               .mask   = _PAGE_PTE,
+               .val    = _PAGE_PTE,
+               .set    = "pte",
+               .clear  = "   ",
+       }, {
+               .mask   = _PAGE_PRESENT,
+               .val    = _PAGE_PRESENT,
+               .set    = "present",
+               .clear  = "       ",
+       }, {
+               .mask   = _PAGE_HASHPTE,
+               .val    = _PAGE_HASHPTE,
+               .set    = "htpe",
+               .clear  = "    ",
+       }, {
+               .mask   = _PAGE_GUARDED,
+               .val    = _PAGE_GUARDED,
+               .set    = "guarded",
+               .clear  = "       ",
+       }, {
+               .mask   = _PAGE_DIRTY,
+               .val    = _PAGE_DIRTY,
+               .set    = "dirty",
+               .clear  = "     ",
+       }, {
+               .mask   = _PAGE_ACCESSED,
+               .val    = _PAGE_ACCESSED,
+               .set    = "accessed",
+               .clear  = "        ",
+       }, {
+               .mask   = _PAGE_WRITETHRU,
+               .val    = _PAGE_WRITETHRU,
+               .set    = "write through",
+               .clear  = "             ",
+       }, {
+               .mask   = _PAGE_NO_CACHE,
+               .val    = _PAGE_NO_CACHE,
+               .set    = "no cache",
+               .clear  = "        ",
+       }, {
+               .mask   = _PAGE_BUSY,
+               .val    = _PAGE_BUSY,
+               .set    = "busy",
+       }, {
+               .mask   = _PAGE_F_GIX,
+               .val    = _PAGE_F_GIX,
+               .set    = "gix",
+       }, {
+               .mask   = _PAGE_F_SECOND,
+               .val    = _PAGE_F_SECOND,
+               .set    = "second",
+       }, {
+               .mask   = _PAGE_SPECIAL,
+               .val    = _PAGE_SPECIAL,
+               .set    = "special",
+       }
+};
+
+struct pg_level {
+       const struct prot_bits *bits;
+       size_t num;
+       u64 mask;
+};
+
It describes each level of the page table and what all kind of
PTE flags can be there at each of these levels and their combined
mask. The structure and it's elements must be named accordingly.
Its confusing right now.

+static struct pg_level pg_level[] = {
+       {
+       }, { /* pgd */
+               .bits   = pte_bits,
+               .num    = ARRAY_SIZE(pte_bits),
+       }, { /* pud */
+               .bits   = pte_bits,
+               .num    = ARRAY_SIZE(pte_bits),
+       }, { /* pmd */
+               .bits   = pte_bits,
+               .num    = ARRAY_SIZE(pte_bits),
+       }, { /* pte */
+               .bits   = pte_bits,
+               .num    = ARRAY_SIZE(pte_bits),
+       },
+};
+
+static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
+                       size_t num)
+{
+       unsigned i;
+
+       for (i = 0; i < num; i++, bits++) {
+               const char *s;
+
+               if ((st->current_prot & bits->mask) == bits->val)
+                       s = bits->set;
+               else
+                       s = bits->clear;
+
+               if (s)
+                       seq_printf(st->seq, " %s", s);
+       }
+}
+
+static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
+                               u64 val)
+{
+       static const char units[] = "KMGTPE";
+       u64 prot = val & pg_level[level].mask;
+
+       /* At first no level is set */
+       if (!st->level) {
+               st->level = level;
+               st->current_prot = prot;
+               st->start_address = addr;
+               seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+       /* We are only interested in dumping when something (protection,
+        *  level of PTE or the section of vmalloc) has changed */
Again, these are PTE flags not protection bits. Please change these
comments accordingly. Also this function does a lot of things, can
we split it up ?
Do you think that it would make it easier to understand by
splitting it up?
+       } else if (prot != st->current_prot || level != st->level ||
+                  addr >= st->marker[1].start_address) {
+               const char *unit = units;
+               unsigned long delta;
+
+               /* Check protection on PTE */
+               if (st->current_prot) {
+                       seq_printf(st->seq, "0x%016lx-0x%016lx   ",
+                                  st->start_address, addr-1);
+
+                       delta = (addr - st->start_address) >> 10;
+                       /* Work out what appropriate unit to use */
+                       while (!(delta & 1023) && unit[1]) {
+                               delta >>= 10;
+                               unit++;
+                       }
+                       seq_printf(st->seq, "%9lu%c", delta, *unit);
+                       /* Dump all the protection flags */
+                       if (pg_level[st->level].bits)
+                               dump_prot(st, pg_level[st->level].bits,
+                                         pg_level[st->level].num);
+                       seq_puts(st->seq, "\n");
+               }
+               /* Address indicates we have passed the end of the
+                * current section of vmalloc */
+               while (addr >= st->marker[1].start_address) {
+                       st->marker++;
+                       seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
+               }
Right now with this patch, it does not print anything after [ vmalloc() End ].
But I would expect it to go over all other sections one by one. The above
while loop just goes over the list and prints the remaining address range
names. Why ?

0xd00007fffff00000-0xd00007fffff0ffff          64K      rw     pte present htpe 
        dirty accessed
0xd00007fffff10000-0xd00007fffff3ffff         192K      rw     pte present      
        dirty accessed
0xd00007fffff40000-0xd00007fffff4ffff          64K      rw     pte present htpe 
        dirty accessed
0xd00007fffff50000-0xd00007fffff7ffff         192K      rw     pte present      
        dirty accessed
0xd00007fffff80000-0xd00007fffff8ffff          64K      rw     pte present htpe 
        dirty accessed
0xd00007fffff90000-0xd00007fffffbffff         192K      rw     pte present      
        dirty accessed
0xd00007fffffc0000-0xd00007fffffcffff          64K      rw     pte present htpe 
        dirty accessed
0xd00007fffffd0000-0xd00007ffffffffff         192K      rw     pte present      
        dirty accessed
---[ vmalloc() End ]---
---[ isa I/O start ]---
---[ isa I/O end ]---
---[ phb I/O start ]---
---[ phb I/O end ]---
---[ I/O remap start ]---
---[ I/O remap end ]---

What setup are you using? My output looked something similar to this for all BE and LE
with both 4K and 64K pages:
0xd00007fffff60000-0xd00007fffff7ffff 128K rw pte present dirty
accessed
0xd00007fffff80000-0xd00007fffff9ffff 128K rw pte present htpe dirty
accessed
0xd00007fffffa0000-0xd00007fffffbffff 128K rw pte present dirty
accessed
0xd00007fffffc0000-0xd00007fffffdffff 128K rw pte present htpe dirty
accessed
0xd00007fffffe0000-0xd00007ffffffffff 128K rw pte present dirty
accessed
---[ vmalloc() End ]---
---[ isa I/O start ]---
---[ isa I/O end ]---
---[ phb I/O start ]---
0xd000080000010000-0xd00008000001ffff 64K rw pte present htpe guarded
dirty accessed               no cache
---[ phb I/O end ]---
---[ I/O remap start ]---
0xd000080080020000-0xd00008008002ffff 64K rw pte present htpe guarded
dirty accessed               no cache
0xd000080080040000-0xd00008008004ffff 64K rw pte present htpe guarded
 dirty accessed               no cache
0xd000080080060000-0xd00008008006ffff 64K rw pte present htpe guarded
dirty accessed               no cache
---[ I/O remap end ]---

+
+               st->start_address = addr;
+               st->current_prot = prot;
+               st->level = level;
+       }
+
+}
+
+static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
+{
+       pte_t *pte = pte_offset_kernel(pmd, 0);
+       unsigned long addr;
+       unsigned i;
+
+       for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
+               addr = start + i * PAGE_SIZE;
+               note_page(st, addr, 4, pte_val(*pte));
+       }
+}
+
+static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
+{
+       pmd_t *pmd = pmd_offset(pud, 0);
+       unsigned long addr;
+       unsigned i;
+
+       for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
+               addr = start + i * PMD_SIZE;
+               if (!pmd_none(*pmd) && !pmd_sect(*pmd))
+                       /* pmd exists */
+                       walk_pte(st, pmd, addr);
+               else
+                       note_page(st, addr, 3, pmd_val(*pmd));
+       }
+}
+
+static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
+{
+       pud_t *pud = pud_offset(pgd, 0);
+       unsigned long addr = 0UL;
+       unsigned i;
+
+       for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
+               addr = start + i * PUD_SIZE;
+               if (!pud_none(*pud) && !pud_sect(*pud))
+                       /* pud exists */
+                       walk_pmd(st, pud, addr);
+               else
+                       note_page(st, addr, 2, pud_val(*pud));
+       }
+}
+
+static void walk_pgd(struct pg_state *st, unsigned long start)
+{
+       pgd_t *pgd = pgd_offset_k( 0UL);
+       unsigned i;
+       unsigned long addr;
+
+       for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
+               addr = start + i * PGDIR_SIZE;
+               if(!pgd_none(*pgd))
+                       /* pgd exists */
+                       walk_pud(st, pgd, addr);
+               else
+                       note_page(st, addr, 1, pgd_val(*pgd));
+
+       }
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+       struct pg_state st = {
+               .seq = m,
+               .start_address = KERN_VIRT_START,
+               .marker = address_markers,
+       };
+       /* Traverse kernel page tables */
+       walk_pgd(&st, KERN_VIRT_START);
+       note_page(&st, 0, 0, 0);
+       return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+       .open           = ptdump_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int ptdump_init(void)
+{
+       struct dentry *pe;
'pe' as a pointer ? Have a better name like 'pgtable' or simple
pointer like name 'ptr' will be better.

Agreed.
+       unsigned i, j;
+
+       for (i = 0; i < ARRAY_SIZE(pg_level); i++)
+               if (pg_level[i].bits)
+                       for (j = 0; j < pg_level[i].num; j++)
+                               pg_level[i].mask |= pg_level[i].bits[j].mask;
This is iterating over two data structures at a time. Cant we put
this into a separate function like build_pagetable_complete_mask() ?


_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to