[ it looks my email client had some glitch and failed to add mst in cc list ]
copy mst On 11/27/17 12:35 +0800, Haozhong Zhang wrote: > When mmap(2) the backend files, QEMU uses the host page size > (getpagesize(2)) by default as the alignment of mapping address. > However, some backends may require alignments different than the page > size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux > kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned, > fails with a kernel message like > > [617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma > (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff) > > Because there is no common approach to get such alignment requirement, > we add the 'align' option to 'memory-backend-file', so that users or > management utils, which have enough knowledge about the backend, can > specify a proper alignment via this option. > > Signed-off-by: Haozhong Zhang <haozhong.zh...@intel.com> > --- > backends/hostmem-file.c | 41 ++++++++++++++++++++++++++++++++++++++++- > docs/nvdimm.txt | 16 ++++++++++++++++ > exec.c | 8 +++++++- > include/exec/memory.h | 3 +++ > memory.c | 2 ++ > numa.c | 2 +- > 6 files changed, 69 insertions(+), 3 deletions(-) > > diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c > index e44c319915..e319ec1ad8 100644 > --- a/backends/hostmem-file.c > +++ b/backends/hostmem-file.c > @@ -34,6 +34,7 @@ struct HostMemoryBackendFile { > bool share; > bool discard_data; > char *mem_path; > + uint64_t align; > }; > > static void > @@ -58,7 +59,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error > **errp) > path = object_get_canonical_path(OBJECT(backend)); > memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), > path, > - backend->size, fb->share, > + backend->size, fb->align, fb->share, > fb->mem_path, errp); > g_free(path); > } > @@ -115,6 +116,40 @@ static void file_memory_backend_set_discard_data(Object > *o, bool value, > MEMORY_BACKEND_FILE(o)->discard_data = value; > } > > +static void file_memory_backend_get_align(Object *o, Visitor *v, > + const char *name, void *opaque, > + Error **errp) > +{ > + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); > + uint64_t val = fb->align; > + > + visit_type_size(v, name, &val, errp); > +} > + > +static void file_memory_backend_set_align(Object *o, Visitor *v, > + const char *name, void *opaque, > + Error **errp) > +{ > + HostMemoryBackend *backend = MEMORY_BACKEND(o); > + HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o); > + Error *local_err = NULL; > + uint64_t val; > + > + if (host_memory_backend_mr_inited(backend)) { > + error_setg(&local_err, "cannot change property value"); > + goto out; > + } > + > + visit_type_size(v, name, &val, &local_err); > + if (local_err) { > + goto out; > + } > + fb->align = val; > + > + out: > + error_propagate(errp, local_err); > +} > + > static void file_backend_unparent(Object *obj) > { > HostMemoryBackend *backend = MEMORY_BACKEND(obj); > @@ -145,6 +180,10 @@ file_backend_class_init(ObjectClass *oc, void *data) > object_class_property_add_str(oc, "mem-path", > get_mem_path, set_mem_path, > &error_abort); > + object_class_property_add(oc, "align", "int", > + file_memory_backend_get_align, > + file_memory_backend_set_align, > + NULL, NULL, &error_abort); > } > > static void file_backend_instance_finalize(Object *o) > diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt > index 2d9f8c0e8c..21249dd062 100644 > --- a/docs/nvdimm.txt > +++ b/docs/nvdimm.txt > @@ -122,3 +122,19 @@ Note: > M >= size of RAM devices + > size of statically plugged vNVDIMM devices + > size of hotplugged vNVDIMM devices > + > +Alignment > +--------- > + > +QEMU uses mmap(2) to maps vNVDIMM backends and aligns the mapping > +address to the page size (getpagesize(2)) by default. However, some > +types of backends may require an alignment different than the page > +size. In that case, QEMU v2.12.0 and later provide 'align' option to > +memory-backend-file to allow users to specify the proper alignment. > + > +For example, device dax require the 2 MB alignment, so we can use > +following QEMU command line options to use it (/dev/dax0.0) as the > +backend of vNVDIMM: > + > + -object > memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M > + -device nvdimm,id=nvdimm1,memdev=mem1 > diff --git a/exec.c b/exec.c > index 03238a3449..90440efecd 100644 > --- a/exec.c > +++ b/exec.c > @@ -1600,7 +1600,13 @@ static void *file_ram_alloc(RAMBlock *block, > void *area; > > block->page_size = qemu_fd_getpagesize(fd); > - block->mr->align = block->page_size; > + if (block->mr->align % block->page_size) { > + error_setg(errp, "aligment 0x%" PRIx64 > + " must be multiples of page size 0x%" PRIx64, > + block->mr->align, block->page_size); > + return NULL; > + } > + block->mr->align = MAX(block->page_size, block->mr->align); > #if defined(__s390x__) > if (kvm_enabled()) { > block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN); > diff --git a/include/exec/memory.h b/include/exec/memory.h > index 5ed4042f87..a1be8b06d3 100644 > --- a/include/exec/memory.h > +++ b/include/exec/memory.h > @@ -465,6 +465,8 @@ void memory_region_init_resizeable_ram(MemoryRegion *mr, > * @name: Region name, becomes part of RAMBlock name used in migration stream > * must be unique within any device > * @size: size of the region. > + * @align: alignment of the region base address; if 0, the default alignment > + * (getpagesize()) will be used. > * @share: %true if memory must be mmaped with the MAP_SHARED flag > * @path: the path in which to allocate the RAM. > * @errp: pointer to Error*, to store an error if it happens. > @@ -476,6 +478,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, > struct Object *owner, > const char *name, > uint64_t size, > + uint64_t align, > bool share, > const char *path, > Error **errp); > diff --git a/memory.c b/memory.c > index e26e5a3b1d..2c80656e39 100644 > --- a/memory.c > +++ b/memory.c > @@ -1570,6 +1570,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, > struct Object *owner, > const char *name, > uint64_t size, > + uint64_t align, > bool share, > const char *path, > Error **errp) > @@ -1578,6 +1579,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr, > mr->ram = true; > mr->terminates = true; > mr->destructor = memory_region_destructor_ram; > + mr->align = align; > mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp); > mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0; > } > diff --git a/numa.c b/numa.c > index 7151b24d1c..b0fe22a60c 100644 > --- a/numa.c > +++ b/numa.c > @@ -551,7 +551,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion > *mr, Object *owner, > if (mem_path) { > #ifdef __linux__ > Error *err = NULL; > - memory_region_init_ram_from_file(mr, owner, name, ram_size, false, > + memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false, > mem_path, &err); > if (err) { > error_report_err(err); > -- > 2.14.1 >