On Tue, May 06, 2014 at 05:27:46PM +0800, Hu Tao wrote: [...] > @@ -203,6 +296,20 @@ host_memory_backend_memory_init(UserCreatable *uc, Error > **errp) > if (backend->prealloc) { > os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz); > } > + > +#ifdef CONFIG_NUMA > + unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES); > + > + /* This is a workaround for a long standing bug in Linux' > + * mbind implementation, which cuts off the last specified > + * node. > + */
What if the bug is fixed? mbind() documentation says "nodemask points to a bit mask of nodes containing up to maxnode bits", so we must ensure backend->host_nodes has the one extra bit. Also, if no bit is set, we can pass nodemask=NULL or maxnode=0 as argument. We could address both issues, and do this: struct HostMemoryBackend { [...] DECLARE_BITMAP(host_nodes, MAX_NODES + 1); [...] lastbit = find_last_bit(backend->host_nodes, MAX_NODES); /* lastbit == MAX_NODES means maxnode=0 */ maxnode = (lastbit + 1) % (MAX_NODES + 1); /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1 * as argument to mbind() due to an old Linux bug (feature?) which * cuts off the last specified node. This means backend->host_nodes * must have MAX_NODES+1 bits available. */ assert(sizeof(backend->host_nodes) >= BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long)); assert(maxnode <= MAX_NODES); mbind(ptr, sz, policy, maxnode ? backend->host_nodes : NULL, maxnode + 1, flags); (I am starting to wonder if it was worth dropping the libnuma requirement and implementing our own mbind()-calling code.) > + if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, > 0)) { > + error_setg_errno(errp, errno, > + "cannot bind memory to host NUMA nodes"); Don't we want to set flags to MPOL_MF_STRICT here? I believe we shouldn't have any pages preallocated at this point, but in case we do, I would expect them to be moved instead of ignoring the policy set by the user. > + return; > + } > +#endif > } > > MemoryRegion * > diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h > index 819b72d..4e96298 100644 > --- a/include/sysemu/hostmem.h > +++ b/include/sysemu/hostmem.h > @@ -12,8 +12,10 @@ > #ifndef QEMU_HOSTMEM_H > #define QEMU_HOSTMEM_H > > +#include "sysemu/sysemu.h" /* for MAX_NODES */ > #include "qom/object.h" > #include "exec/memory.h" > +#include "qemu/bitmap.h" > > #define TYPE_MEMORY_BACKEND "memory" > #define MEMORY_BACKEND(obj) \ > @@ -52,6 +54,8 @@ struct HostMemoryBackend { > uint64_t size; > bool merge, dump; > bool prealloc, force_prealloc; > + DECLARE_BITMAP(host_nodes, MAX_NODES); > + HostMemPolicy policy; > > MemoryRegion mr; > }; > diff --git a/qapi-schema.json b/qapi-schema.json > index 5dd30eb..bea3476 100644 > --- a/qapi-schema.json > +++ b/qapi-schema.json > @@ -4732,3 +4732,23 @@ > '*cpus': ['uint16'], > '*mem': 'size', > '*memdev': 'str' }} > + > +## > +# @HostMemPolicy > +# > +# Host memory policy types > +# > +# @default: restore default policy, remove any nondefault policy > +# > +# @preferred: set the preferred host nodes for allocation > +# > +# @bind: a strict policy that restricts memory allocation to the > +# host nodes specified > +# > +# @interleave: memory allocations are interleaved across the set > +# of host nodes specified > +# > +# Since 2.1 > +## > +{ 'enum': 'HostMemPolicy', > + 'data': [ 'default', 'preferred', 'bind', 'interleave' ] } > -- > 1.8.5.2.229.g4448466 > > -- Eduardo