On Mon, Sep 16, 2019 at 09:23:46AM -0400, Igor Mammedov wrote:
> Max memslot size supported by kvm on s390 is 8Tb,
> move logic of splitting RAM in chunks up to 8T to KVM code.
>
> This way it will hide KVM specific restrictions in KVM code
> and won't affect board level design decisions. Which would allow
> us to avoid misusing memory_region_allocate_system_memory() API
> and eventually use a single hostmem backend for guest RAM.
>
> Signed-off-by: Igor Mammedov <imamm...@redhat.com>
> ---
> Since there is quite a bit of migration related code churn in the patch,
> CCing migration maintainer.
>
> CC: dgilb...@redhat.com
>
> v6:
>   * KVM's migration code was assuming a 1:1 relation between
>     memslot and MemorySection, which is no longer true for s390x
>     with this patch. As a result migration was broken and
>     dirty logging wasn't even started when a MemorySection
>     was split over several memslots. Amend the related KVM dirty
>     log tracking code to account for a split MemorySection.
> v5:
>   * move computation 'size -= slot_size' inside of loop body
>     (David Hildenbrand <da...@redhat.com>)
> v4:
>   * fix compilation issue
>     (Christian Borntraeger <borntrae...@de.ibm.com>)
>   * advance HVA along with GPA in kvm_set_phys_mem()
>     (Christian Borntraeger <borntrae...@de.ibm.com>)
>
> This patch prepares only the KVM side for switching to a single RAM
> memory region; another patch will take care of dropping the manual RAM
> partitioning in s390 code.
>
>  include/sysemu/kvm_int.h   |   1 +
>  accel/kvm/kvm-all.c        | 239 ++++++++++++++++++++++---------------
>  hw/s390x/s390-virtio-ccw.c |   9 --
>  target/s390x/kvm.c         |  12 ++
>  4 files changed, 159 insertions(+), 102 deletions(-)
>
> diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
> index 72b2d1b3ae..ac2d1f8b56 100644
> --- a/include/sysemu/kvm_int.h
> +++ b/include/sysemu/kvm_int.h
> @@ -41,4 +41,5 @@ typedef struct KVMMemoryListener {
>  void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
>                                    AddressSpace *as, int as_id);
>
> +void kvm_set_max_memslot_size(hwaddr max_slot_size);
>  #endif
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index b09bad0804..0635903408 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -140,6 +140,7 @@ bool kvm_direct_msi_allowed;
>  bool kvm_ioeventfd_any_length_allowed;
>  bool kvm_msi_use_devid;
>  static bool kvm_immediate_exit;
> +static hwaddr kvm_max_slot_size = ~0;
>
>  static const KVMCapabilityInfo kvm_required_capabilites[] = {
>      KVM_CAP_INFO(USER_MEMORY),
> @@ -437,7 +438,7 @@ static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
>  static int kvm_section_update_flags(KVMMemoryListener *kml,
>                                      MemoryRegionSection *section)
>  {
> -    hwaddr start_addr, size;
> +    hwaddr start_addr, size, slot_size;
>      KVMSlot *mem;
>      int ret = 0;
>
> @@ -448,13 +449,18 @@ static int kvm_section_update_flags(KVMMemoryListener *kml,
>
>      kvm_slots_lock(kml);
>
> -    mem = kvm_lookup_matching_slot(kml, start_addr, size);
> -    if (!mem) {
> -        /* We don't have a slot if we want to trap every access. */
> -        goto out;
> -    }
> +    while (size && !ret) {
> +        slot_size = kvm_max_slot_size < size ? kvm_max_slot_size : size;

Nit: use MIN()?
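Something like this, I mean (untested one-liner; MIN() is just the
generic helper we already have from "qemu/osdep.h", nothing new):

    slot_size = MIN(kvm_max_slot_size, size);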
> +        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
> +        if (!mem) {
> +            /* We don't have a slot if we want to trap every access. */
> +            goto out;

I don't really understand the rationale behind e377e87ca6a3d just from
reading the commit message...  Just want to make sure: this won't be
triggered when a memslot split has happened for this memory region
section, right?

> +        }
>
> -    ret = kvm_slot_update_flags(kml, mem, section->mr);
> +        ret = kvm_slot_update_flags(kml, mem, section->mr);
> +        start_addr += slot_size;
> +        size -= slot_size;
> +    }
>
>  out:
>      kvm_slots_unlock(kml);
> @@ -497,11 +503,14 @@ static void kvm_log_stop(MemoryListener *listener,
>
>  /* get kvm's dirty pages bitmap and update qemu's */
>  static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
> +                                         hwaddr offset_in_section,
> +                                         hwaddr range_size,
>                                           unsigned long *bitmap)
>  {
>      ram_addr_t start = section->offset_within_region +
> +                       offset_in_section +
>                         memory_region_get_ram_addr(section->mr);
> -    ram_addr_t pages = int128_get64(section->size) / getpagesize();
> +    ram_addr_t pages = range_size / getpagesize();

I feel like we can avoid touching kvm_get_dirty_pages_log_range() at all
by passing in a correct MemoryRegionSection* to it (please see [1] below).

>
>      cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
>      return 0;
> @@ -527,11 +536,13 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
>      struct kvm_dirty_log d = {};
>      KVMSlot *mem;
>      hwaddr start_addr, size;
> +    hwaddr slot_size, slot_offset = 0;
>      int ret = 0;
>
>      size = kvm_align_section(section, &start_addr);
> -    if (size) {
> -        mem = kvm_lookup_matching_slot(kml, start_addr, size);
> +    while (size) {
> +        slot_size = kvm_max_slot_size < size ? kvm_max_slot_size : size;

(use MIN?)

> +        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
>          if (!mem) {
>              /* We don't have a slot if we want to trap every access. */
>              goto out;
> @@ -549,11 +560,11 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
>           * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
>           * a hope that sizeof(long) won't become >8 any time soon.
>           */
> -        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
> -                     /*HOST_LONG_BITS*/ 64) / 8;
>          if (!mem->dirty_bmap) {
> +            hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
> +                                       /*HOST_LONG_BITS*/ 64) / 8;
>              /* Allocate on the first log_sync, once and for all */
> -            mem->dirty_bmap = g_malloc0(size);
> +            mem->dirty_bmap = g_malloc0(bitmap_size);
>          }
>
>          d.dirty_bitmap = mem->dirty_bmap;
> @@ -564,7 +575,12 @@ static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
>              goto out;
>          }
>
> -        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
> +        kvm_get_dirty_pages_log_range(section, slot_offset, slot_size,
> +                                      d.dirty_bitmap);

[1] Here, can we hand-make a MemoryRegionSection that covers the
slot_offset/slot_size range and pass that to
kvm_get_dirty_pages_log_range()?
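Untested sketch of what I mean, using only the existing
MemoryRegionSection fields, so that the helper can keep its original
two-argument signature:

    /* Carve the per-memslot chunk out of the original section */
    MemoryRegionSection subsection = *section;

    /* Shift into the section by the offset of the current memslot... */
    subsection.offset_within_region += slot_offset;
    subsection.offset_within_address_space += slot_offset;
    /* ...and clamp the sub-section to the chunk this memslot covers */
    subsection.size = int128_make64(slot_size);

    kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap);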
> +
> +        slot_offset += slot_size;
> +        start_addr += slot_size;
> +        size -= slot_size;
> +    }
>  out:
>      return ret;
> @@ -575,55 +591,14 @@ out:
>  #define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT)
>  #define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
>
> -/**
> - * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
> - *
> - * NOTE: this will be a no-op if we haven't enabled manual dirty log
> - * protection in the host kernel because in that case this operation
> - * will be done within log_sync().
> - *
> - * @kml: the kvm memory listener
> - * @section: the memory range to clear dirty bitmap
> - */
> -static int kvm_physical_log_clear(KVMMemoryListener *kml,
> -                                  MemoryRegionSection *section)
> +static int kvm_physical_log_slot_clear(uint64_t start, uint64_t size,
> +                                       KVMSlot *mem, int as_id)
>  {
>      KVMState *s = kvm_state;
> -    struct kvm_clear_dirty_log d;
> -    uint64_t start, end, bmap_start, start_delta, bmap_npages, size;
>      unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
> -    KVMSlot *mem = NULL;
> -    int ret, i;
> -
> -    if (!s->manual_dirty_log_protect) {
> -        /* No need to do explicit clear */
> -        return 0;
> -    }
> -
> -    start = section->offset_within_address_space;
> -    size = int128_get64(section->size);
> -
> -    if (!size) {
> -        /* Nothing more we can do... */
> -        return 0;
> -    }
> -
> -    kvm_slots_lock(kml);
> -
> -    /* Find any possible slot that covers the section */
> -    for (i = 0; i < s->nr_slots; i++) {
> -        mem = &kml->slots[i];
> -        if (mem->start_addr <= start &&
> -            start + size <= mem->start_addr + mem->memory_size) {
> -            break;
> -        }
> -    }
> -
> -    /*
> -     * We should always find one memslot until this point, otherwise
> -     * there could be something wrong from the upper layer
> -     */
> -    assert(mem && i != s->nr_slots);
> +    uint64_t end, bmap_start, start_delta, bmap_npages;
> +    struct kvm_clear_dirty_log d;
> +    int ret;
>
>      /*
>       * We need to extend either the start or the size or both to
> @@ -694,7 +669,7 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
>      /* It should never overflow. If it happens, say something */
>      assert(bmap_npages <= UINT32_MAX);
>      d.num_pages = bmap_npages;
> -    d.slot = mem->slot | (kml->as_id << 16);
> +    d.slot = mem->slot | (as_id << 16);
>
>      if (kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d) == -1) {
>          ret = -errno;
> @@ -718,8 +693,66 @@ static int kvm_physical_log_clear(KVMMemoryListener *kml,
>      /* This handles the NULL case well */
>      g_free(bmap_clear);
>
> -    kvm_slots_unlock(kml);
> +    return ret;
> +}
> +
> +/**
> + * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
> + *
> + * NOTE: this will be a no-op if we haven't enabled manual dirty log
> + * protection in the host kernel because in that case this operation
> + * will be done within log_sync().
> + *
> + * @kml: the kvm memory listener
> + * @section: the memory range to clear dirty bitmap
> + */
> +static int kvm_physical_log_clear(KVMMemoryListener *kml,
> +                                  MemoryRegionSection *section)
> +{
> +    KVMState *s = kvm_state;
> +    uint64_t start, size, slot_size;
> +    KVMSlot *mem = NULL;
> +    int i, ret = 0;
> +
> +    if (!s->manual_dirty_log_protect) {
> +        /* No need to do explicit clear */
> +        return 0;
> +    }
> +
> +    start = section->offset_within_address_space;
> +    size = int128_get64(section->size);
> +
> +    if (!size) {
> +        /* Nothing more we can do... */
> +        return 0;
> +    }
> +
> +    kvm_slots_lock(kml);
> +
> +    while (size) {
> +        slot_size = kvm_max_slot_size < size ? kvm_max_slot_size : size;

MIN?  Also, the naming of slot_size might be a bit misleading here,
because log_clear() is different in that it can be invoked on a very
small memory region section, hence slot_size may well not be the size
of the slot.  Maybe range_size or anything else?
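E.g. something like (nothing more than the rename, with the MIN()
change from above folded in):

    while (size) {
        /* may be (much) smaller than the memslot itself */
        range_size = MIN(kvm_max_slot_size, size);

        /* ... find the covering slot and clear [start, start+range_size) ... */

        size -= range_size;
        start += range_size;
    }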
> +        /* Iterate over slots that cover the section */
> +        for (i = 0; i < s->nr_slots; i++) {
> +            mem = &kml->slots[i];
> +            if (mem->start_addr <= start &&
> +                start + slot_size <= mem->start_addr + mem->memory_size) {
> +                break;
> +            }
> +        }
>
> +        /*
> +         * We should always find one memslot until this point, otherwise
> +         * there could be something wrong from the upper layer
> +         */
> +        assert(mem && i != s->nr_slots);
> +
> +        kvm_physical_log_slot_clear(start, slot_size, mem, kml->as_id);

Use ret to capture errors?
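I.e. something like (untested; ret is already initialized to 0 above,
so falling out of the loop on success keeps the current behaviour):

    ret = kvm_physical_log_slot_clear(start, slot_size, mem, kml->as_id);
    if (ret < 0) {
        /* propagate the first error instead of silently dropping it */
        break;
    }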
> +
> +        size -= slot_size;
> +        start += slot_size;
> +    }
> +
> +    kvm_slots_unlock(kml);
>      return ret;
>  }
>
> @@ -953,6 +986,14 @@ kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
>      return NULL;
>  }
>
> +void kvm_set_max_memslot_size(hwaddr max_slot_size)
> +{
> +    g_assert(
> +        ROUND_UP(max_slot_size, qemu_real_host_page_size) == max_slot_size
> +    );
> +    kvm_max_slot_size = max_slot_size;
> +}
> +
>  static void kvm_set_phys_mem(KVMMemoryListener *kml,
>                               MemoryRegionSection *section, bool add)
>  {
> @@ -960,7 +1001,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>      int err;
>      MemoryRegion *mr = section->mr;
>      bool writeable = !mr->readonly && !mr->rom_device;
> -    hwaddr start_addr, size;
> +    hwaddr start_addr, size, slot_size;
>      void *ram;
>
>      if (!memory_region_is_ram(mr)) {
> @@ -985,41 +1026,52 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
>      kvm_slots_lock(kml);
>
>      if (!add) {
> -        mem = kvm_lookup_matching_slot(kml, start_addr, size);
> -        if (!mem) {
> -            goto out;
> -        }
> -        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
> -            kvm_physical_sync_dirty_bitmap(kml, section);
> -        }
> +        do {
> +            slot_size = kvm_max_slot_size < size ? kvm_max_slot_size : size;

Same (use MIN?).

> +            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
> +            if (!mem) {
> +                goto out;
> +            }
> +            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
> +                kvm_physical_sync_dirty_bitmap(kml, section);
> +            }
>
> -        /* unregister the slot */
> -        g_free(mem->dirty_bmap);
> -        mem->dirty_bmap = NULL;
> -        mem->memory_size = 0;
> -        mem->flags = 0;
> -        err = kvm_set_user_memory_region(kml, mem, false);
> -        if (err) {
> -            fprintf(stderr, "%s: error unregistering slot: %s\n",
> -                    __func__, strerror(-err));
> -            abort();
> -        }
> +            /* unregister the slot */
> +            g_free(mem->dirty_bmap);
> +            mem->dirty_bmap = NULL;
> +            mem->memory_size = 0;
> +            mem->flags = 0;
> +            err = kvm_set_user_memory_region(kml, mem, false);
> +            if (err) {
> +                fprintf(stderr, "%s: error unregistering slot: %s\n",
> +                        __func__, strerror(-err));
> +                abort();
> +            }
> +            start_addr += slot_size;
> +            size -= slot_size;
> +        } while (size);
>          goto out;
>      }
>
>      /* register the new slot */
> -    mem = kvm_alloc_slot(kml);
> -    mem->memory_size = size;
> -    mem->start_addr = start_addr;
> -    mem->ram = ram;
> -    mem->flags = kvm_mem_flags(mr);
> -
> -    err = kvm_set_user_memory_region(kml, mem, true);
> -    if (err) {
> -        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
> -                strerror(-err));
> -        abort();
> -    }
> +    do {
> +        slot_size = kvm_max_slot_size < size ? kvm_max_slot_size : size;

Same (use MIN?).

> +        mem = kvm_alloc_slot(kml);
> +        mem->memory_size = slot_size;
> +        mem->start_addr = start_addr;
> +        mem->ram = ram;
> +        mem->flags = kvm_mem_flags(mr);
> +
> +        err = kvm_set_user_memory_region(kml, mem, true);
> +        if (err) {
> +            fprintf(stderr, "%s: error registering slot: %s\n", __func__,
> +                    strerror(-err));
> +            abort();
> +        }
> +        start_addr += slot_size;
> +        ram += slot_size;
> +        size -= slot_size;
> +    } while (size);
>
>  out:
>      kvm_slots_unlock(kml);
> @@ -2859,6 +2911,7 @@ static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
>
>      for (i = 0; i < kvm->nr_as; ++i) {
>          if (kvm->as[i].as == as && kvm->as[i].ml) {
> +            size = kvm_max_slot_size < size ? kvm_max_slot_size : size;
>              return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
>                                                      start_addr, size);
>          }
> diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
> index 8bfb6684cb..baa95aad38 100644
> --- a/hw/s390x/s390-virtio-ccw.c
> +++ b/hw/s390x/s390-virtio-ccw.c
> @@ -154,15 +154,6 @@ static void virtio_ccw_register_hcalls(void)
>                                     virtio_ccw_hcall_early_printk);
>  }
>
> -/*
> - * KVM does only support memory slots up to KVM_MEM_MAX_NR_PAGES pages
> - * as the dirty bitmap must be managed by bitops that take an int as
> - * position indicator. If we have a guest beyond that we will split off
> - * new subregions. The split must happen on a segment boundary (1MB).
> - */
> -#define KVM_MEM_MAX_NR_PAGES ((1ULL << 31) - 1)
> -#define SEG_MSK (~0xfffffULL)
> -#define KVM_SLOT_MAX_BYTES ((KVM_MEM_MAX_NR_PAGES * TARGET_PAGE_SIZE) & SEG_MSK)

(Nit: IMHO it would be cleaner to keep these here, and in the next patch
simply revert bb223055b9b327e.)

>  static void s390_memory_init(ram_addr_t mem_size)
>  {
>      MemoryRegion *sysmem = get_system_memory();
> diff --git a/target/s390x/kvm.c b/target/s390x/kvm.c
> index cea71ac7c3..838d23cb17 100644
> --- a/target/s390x/kvm.c
> +++ b/target/s390x/kvm.c
> @@ -28,6 +28,7 @@
>  #include "cpu.h"
>  #include "internal.h"
>  #include "kvm_s390x.h"
> +#include "sysemu/kvm_int.h"
>  #include "qapi/error.h"
>  #include "qemu/error-report.h"
>  #include "qemu/timer.h"
> @@ -123,6 +124,16 @@
>  #define VCPU_IRQ_BUF_SIZE(max_cpus) (sizeof(struct kvm_s390_irq) * \
>                                       (max_cpus + NR_LOCAL_IRQS))
>
> +/*
> + * KVM does only support memory slots up to KVM_MEM_MAX_NR_PAGES pages
> + * as the dirty bitmap must be managed by bitops that take an int as
> + * position indicator. If we have a guest beyond that we will split off
> + * new subregions. The split must happen on a segment boundary (1MB).
> + */
> +#define KVM_MEM_MAX_NR_PAGES ((1ULL << 31) - 1)

Not related to this series, but... should KVM export this limit in the
uapi headers?

> +#define SEG_MSK (~0xfffffULL)
> +#define KVM_SLOT_MAX_BYTES ((KVM_MEM_MAX_NR_PAGES * TARGET_PAGE_SIZE) & SEG_MSK)
> +
>  static CPUWatchpoint hw_watchpoint;
>  /*
>   * We don't use a list because this structure is also used to transmit the
> @@ -348,6 +359,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>       */
>      /* kvm_vm_enable_cap(s, KVM_CAP_S390_AIS, 0); */
>
> +    kvm_set_max_memslot_size(KVM_SLOT_MAX_BYTES);
>      return 0;
>  }
>
> --
> 2.18.1

Regards,

-- 
Peter Xu