Ping............
> On 05/23/2013 04:47 PM, Wanlong Gao wrote: >> Use mbind to pin guest numa node memory to host nodes manually. >> >> If we are not able to pin memory to a host node, we may hit a >> cross-node memory access performance regression. >> >> With this patch, we can manually pin to a host node like this: >> -m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa >> node,nodeid=1,cpus=1,mem=512,pin=1 >> >> And, if PCI-passthrough is used, the direct-attached device uses DMA transfer >> between the device and the qemu process. All pages of the guest will be pinned by >> get_user_pages(). >> >> KVM_ASSIGN_PCI_DEVICE ioctl >> kvm_vm_ioctl_assign_device() >> =>kvm_assign_device() >> => kvm_iommu_map_memslots() >> => kvm_iommu_map_pages() >> => kvm_pin_pages() >> >> So, with a direct-attached device, every guest page's page count will be +1 and >> page migration will not work. AutoNUMA won't work either. And any placement directive from >> libvirt is *ignored*. >> >> In summary, we need to manually pin memory to a host node to avoid >> such cross-node memory access performance regressions. > > Any comments? 
> > Thanks, > Wanlong Gao > >> >> Signed-off-by: Wanlong Gao <gaowanl...@cn.fujitsu.com> >> --- >> exec.c | 21 +++++++++++++++++++++ >> include/sysemu/sysemu.h | 1 + >> vl.c | 13 +++++++++++++ >> 3 files changed, 35 insertions(+) >> >> diff --git a/exec.c b/exec.c >> index aec65c5..fe929ef 100644 >> --- a/exec.c >> +++ b/exec.c >> @@ -36,6 +36,8 @@ >> #include "qemu/config-file.h" >> #include "exec/memory.h" >> #include "sysemu/dma.h" >> +#include "sysemu/sysemu.h" >> +#include "qemu/bitops.h" >> #include "exec/address-spaces.h" >> #if defined(CONFIG_USER_ONLY) >> #include <qemu.h> >> @@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, >> void *host, >> memory_try_enable_merging(new_block->host, size); >> } >> } >> + >> + if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) { >> + int i; >> + uint64_t nodes_mem = 0; >> + unsigned long *maskp = g_malloc0(sizeof(*maskp)); >> + for (i = 0; i < nb_numa_nodes; i++) { >> + *maskp = 0; >> + if (node_pin[i] != -1) { >> + set_bit(node_pin[i], maskp); >> + if (qemu_mbind(new_block->host + nodes_mem, node_mem[i], >> + QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) { >> + perror("qemu_mbind"); >> + exit(1); >> + } >> + } >> + nodes_mem += node_mem[i]; >> + } >> + } >> + >> new_block->length = size; >> >> /* Keep the list sorted from biggest to smallest block. 
*/ >> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h >> index 2fb71af..ebf6580 100644 >> --- a/include/sysemu/sysemu.h >> +++ b/include/sysemu/sysemu.h >> @@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock; >> #define MAX_CPUMASK_BITS 255 >> extern int nb_numa_nodes; >> extern uint64_t node_mem[MAX_NODES]; >> +extern int node_pin[MAX_NODES]; >> extern unsigned long *node_cpumask[MAX_NODES]; >> >> #define MAX_OPTION_ROMS 16 >> diff --git a/vl.c b/vl.c >> index 5555b1d..3768002 100644 >> --- a/vl.c >> +++ b/vl.c >> @@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = >> >> int nb_numa_nodes; >> uint64_t node_mem[MAX_NODES]; >> +int node_pin[MAX_NODES]; >> unsigned long *node_cpumask[MAX_NODES]; >> >> uint8_t qemu_uuid[16]; >> @@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg) >> } >> node_mem[nodenr] = sval; >> } >> + >> + if (get_param_value(option, 128, "pin", optarg) != 0) { >> + int unsigned long long pin_node; >> + if (parse_uint_full(option, &pin_node, 10) < 0) { >> + fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", >> optarg); >> + exit(1); >> + } else { >> + node_pin[nodenr] = pin_node; >> + } >> + } >> + >> if (get_param_value(option, 128, "cpus", optarg) != 0) { >> numa_node_parse_cpus(nodenr, option); >> } >> @@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp) >> >> for (i = 0; i < MAX_NODES; i++) { >> node_mem[i] = 0; >> + node_pin[i] = -1; >> node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS); >> } >> >> > >