Hi Marc,
I attached the test code that I used to benchmark ReadAllocate hints
performance on Qualcomm hardware.
Shanker
On 11/08/2016 01:18 AM, Shanker Donthineni wrote:
Read-allocation hints are not enabled for either the GIC-ITS or the GICR
tables. This forces the hardware to always read the table contents
from an external memory (DDR) which is slow compared to cache memory.
Most of the tables are often read by hardware. So, it's better to
enable Read-allocate hints in addition to Write-allocate hints in
order to improve the GICR_PEND, GICR_PROP, Collection, Device, and
vCPU tables lookup time.
Signed-off-by: Shanker Donthineni <shank...@codeaurora.org>
---
Implemented a test case to prove that enabling Read Allocation hints
improves ITS lookup time ~15% while delivering a LPI event. Used the
ITS command INT to analyze time spent in device, collection, prop and
pending table lookups.
Pseudo code:
Create a fake ITS device.
Record PMU cycle counter before sending INT command.
Build and send ITS INT command.
ITS hardware triggers device table lookup.
ITTE table & collection table lookup.
ITS property table lookup.
ITS pending table lookup.
Deliver interrupt to CPU interface.
do_IRQ() called.
Measure the total CPU cycles spent to reach this point.
Without ReadAllocation hints:
/sys/kernel/debug # echo 100 > lpitest
[ 94.693968] CPU[1] niter=100 cycles=0x8dfc0 avg=0x16b7 min=0x1652
With ReadAllocation hints:
/sys/kernel/debug # echo 100 > lpitest
[ 98.617873] CPU[1] niter=100 cycles=0x7df49 avg=0x1427 min=0x1388
drivers/irqchip/irq-gic-v3-its.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index c5dee30..227a1eb 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -961,7 +961,7 @@ static bool its_parse_baser_device(struct its_node *its,
struct its_baser *baser
u32 psz, u32 *order)
{
u64 esz = GITS_BASER_ENTRY_SIZE(its_read_baser(its, baser));
- u64 val = GITS_BASER_InnerShareable | GITS_BASER_WaWb;
+ u64 val = GITS_BASER_InnerShareable | GITS_BASER_RaWaWb;
u32 ids = its->device_ids;
u32 new_order = *order;
bool indirect = false;
@@ -1026,7 +1026,7 @@ static int its_alloc_tables(struct its_node *its)
u64 typer = gic_read_typer(its->base + GITS_TYPER);
u32 ids = GITS_TYPER_DEVBITS(typer);
u64 shr = GITS_BASER_InnerShareable;
- u64 cache = GITS_BASER_WaWb;
+ u64 cache = GITS_BASER_RaWaWb;
u32 psz = SZ_64K;
int err, i;
@@ -1123,7 +1123,7 @@ static void its_cpu_init_lpis(void)
/* set PROPBASE */
val = (page_to_phys(gic_rdists->prop_page) |
GICR_PROPBASER_InnerShareable |
- GICR_PROPBASER_WaWb |
+ GICR_PROPBASER_RaWaWb |
((LPI_NRBITS - 1) & GICR_PROPBASER_IDBITS_MASK));
writeq_relaxed(val, rbase + GICR_PROPBASER);
@@ -1148,7 +1148,7 @@ static void its_cpu_init_lpis(void)
/* set PENDBASE */
val = (page_to_phys(pend_page) |
GICR_PENDBASER_InnerShareable |
- GICR_PENDBASER_WaWb);
+ GICR_PENDBASER_RaWaWb);
writeq_relaxed(val, rbase + GICR_PENDBASER);
tmp = readq_relaxed(rbase + GICR_PENDBASER);
@@ -1712,7 +1712,7 @@ static int __init its_probe_one(struct resource *res,
goto out_free_tables;
baser = (virt_to_phys(its->cmd_base) |
- GITS_CBASER_WaWb |
+ GITS_CBASER_RaWaWb |
GITS_CBASER_InnerShareable |
(ITS_CMD_QUEUE_SZ / SZ_4K - 1) |
GITS_CBASER_VALID);
--
Shanker Donthineni
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm
Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux
Foundation Collaborative Project.
From 8a60136b1b805746451b5ff4d82eaa251bfd8c6c Mon Sep 17 00:00:00 2001
From: Shanker Donthineni <shank...@codeaurora.org>
Date: Mon, 7 Nov 2016 23:08:28 -0600
Subject: [PATCH 2/2] irqchip/gicv3-its: Test code for measuring Read-allocate
hints performance
Apply this patch on tip of the v4.9-rc4 kernel and do two steps mentioned
below for measuring performance improvement with Read-allocate hints.
mount -t debugfs none /sys/kernel/debug
echo 10 > /sys/kernel/debug/lpitest
Test displays the CPU cycles that are spent to deliver LPI event.
Example:
[ 93.139710] CPU[1] niter=10 cycles=0xdd8c avg=0x1627 min=0x13a3
Signed-off-by: Shanker Donthineni <shank...@codeaurora.org>
---
arch/arm64/mm/cache.S | 55 +++++++++++++
drivers/irqchip/irq-gic-v3-its.c | 160 +++++++++++++++++++++++++++++++++++++
drivers/irqchip/irq-gic-v3.c | 7 ++
include/linux/irqchip/arm-gic-v3.h | 23 ++++++
4 files changed, 245 insertions(+)
diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 58b5a90..0a03420 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -198,3 +198,58 @@ ENTRY(__dma_unmap_area)
b.ne __dma_inv_area
ret
ENDPIPROC(__dma_unmap_area)
+
+/*
+ * flush_dcache_all(), Flush the whole D-cache.
+ *
+ * Walks the cache levels reported by CLIDR_EL1 (up to the LoC field) and
+ * issues "dc cisw" (clean & invalidate by set/way) for every set and way of
+ * each level that holds a data or unified cache.
+ *
+ * Uses x0-x7 and x9-x12 as scratch; the return address is kept in x12 and
+ * x0 is zeroed before returning.
+ */
+ENTRY(flush_dcache_all)
+ mov x12, lr // keep the return address in x12
+ dmb sy // ensure ordering with previous memory accesses
+ mrs x0, clidr_el1 // read clidr
+ and x3, x0, #0x7000000 // extract loc from clidr
+ lsr x3, x3, #23 // left align loc bit field
+ cbz x3, finished // if loc is 0, then no need to clean
+ mov x10, #0 // start clean at cache level 0
+loop1:
+ add x2, x10, x10, lsr #1 // work out 3x current cache level
+ lsr x1, x0, x2 // extract cache type bits from clidr
+ and x1, x1, #7 // mask of the bits for current cache only
+ cmp x1, #2 // see what cache we have at this level
+ b.lt skip // skip if no cache, or just i-cache
+ mrs x9, daif // save the current interrupt mask bits
+ disable_irq // keep the csselr write and ccsidr read together
+ msr csselr_el1, x10 // select current cache level in csselr
+ isb // isb to sync the new csselr & ccsidr
+ mrs x1, ccsidr_el1 // read the new ccsidr
+ msr daif, x9 // restore the saved interrupt mask bits
+ and x2, x1, #7 // extract the length of the cache lines
+ add x2, x2, #4 // add 4 (line length offset)
+ mov x4, #0x3ff
+ and x4, x4, x1, lsr #3 // find maximum number on the way size
+ clz w5, w4 // find bit position of way size increment
+ mov x7, #0x7fff
+ and x7, x7, x1, lsr #13 // extract max number of the index size
+loop2:
+ mov x9, x4 // create working copy of max way size
+loop3:
+ lsl x6, x9, x5 // shift the way number into position
+ orr x11, x10, x6 // factor way and cache number into x11
+ lsl x6, x7, x2 // shift the index number into position
+ orr x11, x11, x6 // factor index number into x11
+ dc cisw, x11 // clean & invalidate by set/way
+ subs x9, x9, #1 // decrement the way
+ b.ge loop3
+ subs x7, x7, #1 // decrement the index
+ b.ge loop2
+skip:
+ add x10, x10, #2 // increment cache number
+ cmp x3, x10
+ b.gt loop1
+finished:
+ mov x10, #0 // switch back to cache level 0
+ msr csselr_el1, x10 // select current cache level in csselr
+ dsb sy
+ isb
+ mov x0, #0
+ ret x12 // return through the address saved on entry
+ENDPROC(flush_dcache_all)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index 227a1eb..7aec44f 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1882,3 +1882,163 @@ int __init its_init(struct fwnode_handle *handle, struct rdists *rdists,
return 0;
}
+#include <linux/debugfs.h>
+
+/* The single LPI test context; its wait queue head is initialised statically. */
+struct lpitest_cntx lpitest1 = {
+ .wq = __WAIT_QUEUE_HEAD_INITIALIZER(lpitest1.wq),
+};
+/*
+ * Global handle to the test context. It is declared extern in
+ * arm-gic-v3.h and is also dereferenced by gic_handle_irq().
+ */
+struct lpitest_cntx *lpitest = &lpitest1;
+
+/* Fake ITS device created by its_lpitest_init(); source of the test LPIs. */
+static struct its_device *lpi_its_dev;
+
+/*
+ * its_build_int_cmd() - fill @cmd with an ITS command of opcode 0x03.
+ *
+ * The target device and event are taken from the its_inv_cmd member of
+ * @desc (its_send_int() fills that member in). Returns the collection
+ * that dev_event_to_col() reports for that device/event pair.
+ */
+static struct its_collection *its_build_int_cmd(struct its_cmd_block *cmd,
+						struct its_cmd_desc *desc)
+{
+	struct its_device *its_dev = desc->its_inv_cmd.dev;
+	u32 event = desc->its_inv_cmd.event_id;
+	struct its_collection *target;
+
+	target = dev_event_to_col(its_dev, event);
+
+	its_encode_cmd(cmd, 0x03);
+	its_encode_devid(cmd, its_dev->device_id);
+	its_encode_event_id(cmd, event);
+
+	its_fixup_cmd(cmd);
+
+	return target;
+}
+
+/*
+ * its_send_int() - send the command built by its_build_int_cmd() for
+ * @event_id of @dev through the ITS that owns @dev.
+ */
+static void its_send_int(struct its_device *dev, u32 event_id)
+{
+	struct its_cmd_desc int_desc;
+
+	/* its_build_int_cmd() reads its arguments from the its_inv_cmd member. */
+	int_desc.its_inv_cmd.event_id = event_id;
+	int_desc.its_inv_cmd.dev = dev;
+
+	its_send_single_command(dev->its, its_build_int_cmd, &int_desc);
+}
+
+/*
+ * lpitest_write() - debugfs write handler: run <n> LPI delivery measurements.
+ *
+ * Parses a decimal iteration count from user space. With preemption disabled
+ * (so the task stays on one CPU) it then repeatedly reads the PMU cycle
+ * counter, sends an ITS INT command for the event numbered after the current
+ * CPU and waits until gic_handle_irq() has stored lpitest->end_cycles and
+ * cleared lpitest->done. The total, average and minimum cycle counts are
+ * logged with pr_info().
+ *
+ * Return: @count on success, -EINVAL for a zero iteration count, -ENODEV when
+ * the test ITS device does not exist, -ETIMEDOUT when an LPI is not observed
+ * within one second, or the error from kstrtoul_from_user().
+ */
+static ssize_t lpitest_write(struct file *file, const char __user *buffer,
+			     size_t count, loff_t *pos)
+{
+	const unsigned long timeout_us = 1000000;
+	unsigned long val, lcnt, waited_us;
+	u64 cycles, dcycles, mcycles = ~0ULL;
+	int cpu, ret;
+
+	ret = kstrtoul_from_user(buffer, count, 10, &val);
+	if (ret)
+		return ret;
+	/* Returning 0 from write() makes callers such as "echo" retry forever. */
+	if (!val)
+		return -EINVAL;
+	/* The device pointer stays NULL when its_lpitest_init() bailed out. */
+	if (!lpi_its_dev)
+		return -ENODEV;
+
+	preempt_disable();
+
+	flush_dcache_all();
+
+	/*
+	 * NOTE(review): the CPU number is used as the event id, so it must be
+	 * below the number of events its_lpitest_init() mapped -- confirm on
+	 * systems with many CPUs.
+	 */
+	cpu = smp_processor_id();
+	lpitest->irqnr = lpi_its_dev->event_map.lpi_base + cpu;
+
+	lpitest->total_cycles = 0;
+	lcnt = val;
+	while (val) {
+		cycles = pmu_read_cycles();
+		WRITE_ONCE(lpitest->done, 1);
+		its_send_int(lpi_its_dev, cpu);
+
+		/*
+		 * Preemption is disabled, so sleeping on the wait queue is not
+		 * allowed here; poll the flag that gic_handle_irq() clears.
+		 * The polling delay does not skew the result because
+		 * end_cycles is sampled in the interrupt handler.
+		 */
+		for (waited_us = 0; READ_ONCE(lpitest->done); waited_us++) {
+			if (waited_us >= timeout_us) {
+				ret = -ETIMEDOUT;
+				break;
+			}
+			udelay(1);
+		}
+		if (ret)
+			break;
+
+		dcycles = lpitest->end_cycles - cycles;
+		lpitest->total_cycles += dcycles;
+		if (mcycles > dcycles)
+			mcycles = dcycles;
+		val--;
+	}
+	preempt_enable();
+
+	if (ret)
+		return ret;
+
+	*pos += count;
+
+	pr_info("CPU[%d] niter=%lu cycles=0x%lx avg=0x%lx min=0x%lx\n", cpu,
+		lcnt,
+		(unsigned long)lpitest->total_cycles,
+		(unsigned long)lpitest->total_cycles / lcnt,
+		(unsigned long)mcycles);
+
+	return count;
+}
+
+/*
+ * lpitest_proc_open() - open handler for the "lpitest" debugfs file.
+ *
+ * The file is a write-only trigger and needs no per-open state. It is
+ * deliberately not a seq_file: there is nothing to show, and single_open()
+ * with a NULL show callback would make seq_read() call through a NULL
+ * pointer, besides allocating state that nothing releases.
+ */
+static int lpitest_proc_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+/* File operations for "lpitest": open and write only, no read side. */
+static const struct file_operations lpitest_fops = {
+	.owner = THIS_MODULE,
+	.open = lpitest_proc_open,
+	.write = lpitest_write,
+};
+
+/*
+ * pmu_enable_cycle_counter() - per-CPU PMU setup, run through on_each_cpu().
+ *
+ * Read-modify-writes PMCR_EL0 (sets bits 0, 2 and 6, clears bit 3), selects
+ * counter 31 through PMSELR_EL0, adjusts PMXEVTYPER_EL0 (sets bit 27, clears
+ * bits 28-31) and sets bit 31 of PMCNTENSET_EL0.
+ *
+ * NOTE(review): presumably this enables and resets the 64-bit cycle counter
+ * read by pmu_read_cycles(), without the clock divider, and adjusts its
+ * exception-level filtering -- confirm the bit meanings against the ARMv8
+ * Architecture Reference Manual.
+ *
+ * @discard: unused; present only to match the on_each_cpu() callback type.
+ */
+static void pmu_enable_cycle_counter(void *discard)
+{
+ u64 tmp;
+
+ /* tmp is only a scratch register for the read-modify-write sequences. */
+ asm volatile("mrs %0, pmcr_el0\n"
+ "orr %0, %0, #(1 << 0)\n"
+ "orr %0, %0, #(1 << 2)\n"
+ "bic %0, %0, #(1 << 3)\n"
+ "orr %0, %0, #(1 << 6)\n"
+ "msr pmcr_el0, %0\n"
+ /* select counter index 31 before touching pmxevtyper_el0 */
+ "mov %0, #0b11111\n"
+ "msr pmselr_el0, %0\n"
+ "isb \n"
+ "mrs %0, pmxevtyper_el0\n"
+ "orr %0, %0, #(1 << 27)\n"
+ "bic %0, %0, #(3 << 30)\n"
+ "bic %0, %0, #(3 << 28)\n"
+ "msr pmxevtyper_el0, %0\n"
+ /* set enable bit 31 in the counter-enable set register */
+ "mrs %0, pmcntenset_el0\n"
+ "orr %0, %0, #(1 << 31)\n"
+ "msr pmcntenset_el0, %0\n"
+ : "=r" (tmp));
+}
+
+/*
+ * its_lpitest_init() - set up the LPI latency test at late_initcall time.
+ *
+ * On the first registered ITS, creates a fake device (device id 0xFFFF) with
+ * one event per possible CPU. For every online CPU the event numbered after
+ * the CPU is mapped to that CPU's collection, enabled in the LPI property
+ * table and invalidated. The PMU cycle counter is then enabled on all CPUs
+ * and, only once everything is ready, the root-writable "lpitest" debugfs
+ * file is created.
+ *
+ * Return: 0 on success or when no ITS is present, -ENOMEM on failure.
+ */
+static int __init its_lpitest_init(void)
+{
+	struct its_device *its_dev;
+	struct its_node *its;
+	struct dentry *dentry;
+	irq_hw_number_t hwirq;
+	/* Events are indexed by CPU, so size the device by nr_cpu_ids. */
+	int cpu, nvec = nr_cpu_ids;
+	u8 *cfg;
+
+	if (list_empty(&its_nodes))
+		return 0;
+	its = list_first_entry(&its_nodes, struct its_node, entry);
+
+	its_dev = its_create_device(its, 0xFFFF, nvec);
+	if (!its_dev) {
+		pr_err("failed to create its device for lpitest\n");
+		return -ENOMEM;
+	}
+
+	hwirq = its_dev->event_map.lpi_base;
+	/* One property byte per LPI; LPI numbering starts at 8192. */
+	cfg = page_address(gic_rdists->prop_page) + hwirq - 8192;
+
+	/*
+	 * Only map events for CPUs that are online: the collection index is
+	 * the CPU number, so it must stay below nr_cpu_ids.
+	 */
+	for_each_online_cpu(cpu) {
+		its_dev->event_map.col_map[cpu] = cpu;
+		its_send_mapvi(its_dev, hwirq + cpu, cpu);
+		cfg[cpu] |= LPI_PROP_ENABLED;
+		/* Make the property update visible before invalidating. */
+		dsb(ishst);
+		its_send_inv(its_dev, cpu);
+	}
+
+	on_each_cpu(pmu_enable_cycle_counter, NULL, 1);
+
+	/* Publish the device before the file that uses it becomes visible. */
+	lpi_its_dev = its_dev;
+
+	dentry = debugfs_create_file("lpitest", 0200, NULL, NULL, &lpitest_fops);
+	if (IS_ERR_OR_NULL(dentry)) {
+		/* NOTE(review): the ITS device is not torn down on this path. */
+		pr_err("failed to create debugfs for its-lpitest\n");
+		return -ENOMEM;
+	}
+
+	pr_info("lpitest successfully initialized lpi_base=%u\n", (u32)hwirq);
+
+	return 0;
+}
+late_initcall(its_lpitest_init);
diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 19d642e..ab44a0f 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -355,6 +355,13 @@ static asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs
if (static_key_true(&supports_deactivate))
gic_write_eoir(irqnr);
+ if (irqnr == lpitest->irqnr) {
+ lpitest->end_cycles = pmu_read_cycles();
+ lpitest->done = 0;
+ wake_up_interruptible(&lpitest->wq);
+ continue;
+ }
+
err = handle_domain_irq(gic_data.domain, irqnr, regs);
if (err) {
WARN_ONCE(true, "Unexpected interrupt received!\n");
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index b7e3431..986b7f4 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -450,6 +450,29 @@ static inline bool gic_enable_sre(void)
return !!(val & ICC_SRE_EL1_SRE);
}
+#include <linux/wait.h>
+
+extern void flush_dcache_all(void);
+
+/*
+ * pmu_read_cycles() - read the PMU cycle counter register (pmccntr_el0).
+ *
+ * The register read is followed by an ISB. The value is returned as-is.
+ * The return type carries no "volatile": a qualifier on a by-value return
+ * is ignored by the compiler and only triggers a warning.
+ */
+static __always_inline u64 pmu_read_cycles(void)
+{
+	u64 cycles;
+
+	asm volatile("mrs %0, pmccntr_el0\n"
+		     "isb \n\t": [reg] "=r" (cycles));
+	return cycles;
+}
+
+/*
+ * State shared between the lpitest debugfs write handler and
+ * gic_handle_irq() while an LPI delivery measurement is running.
+ */
+struct lpitest_cntx {
+ u64 total_cycles; /* sum of the per-iteration cycle deltas */
+ u64 end_cycles; /* cycle counter sampled by gic_handle_irq() */
+ u32 irqnr; /* interrupt number gic_handle_irq() treats as the test LPI */
+ u32 done; /* set to 1 before the INT is sent, cleared to 0 by the handler */
+ wait_queue_head_t wq; /* woken by gic_handle_irq() when the test LPI arrives */
+};
+
+extern struct lpitest_cntx *lpitest;
+
#endif
#endif
--
Qualcomm Datacenter Technologies, Inc. on behalf of the Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.