On 19/2/25 12:20, Chenyi Qiang wrote:


On 2/18/2025 5:19 PM, Alexey Kardashevskiy wrote:



[..]

diff --git a/include/system/memory-attribute-manager.h b/include/
system/memory-attribute-manager.h
new file mode 100644
index 0000000000..72adc0028e
--- /dev/null
+++ b/include/system/memory-attribute-manager.h
@@ -0,0 +1,42 @@
+/*
+ * QEMU memory attribute manager
+ *
+ * Copyright Intel
+ *
+ * Author:
+ *      Chenyi Qiang <chenyi.qi...@intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
later.
+ * See the COPYING file in the top-level directory
+ *
+ */
+
+#ifndef SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H
+#define SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H
+
+#include "system/hostmem.h"
+
+#define TYPE_MEMORY_ATTRIBUTE_MANAGER "memory-attribute-manager"
+
+OBJECT_DECLARE_TYPE(MemoryAttributeManager,
MemoryAttributeManagerClass, MEMORY_ATTRIBUTE_MANAGER)
+
+struct MemoryAttributeManager {
+    Object parent;
+
+    MemoryRegion *mr;
+
+    /* 1-setting of the bit represents the memory is populated
(shared) */
+    int32_t bitmap_size;

unsigned.

Also, do either s/bitmap_size/shared_bitmap_size/ or
s/shared_bitmap/bitmap/

Will change it. Thanks.




+    unsigned long *shared_bitmap;
+
+    QLIST_HEAD(, RamDiscardListener) rdl_list;
+};
+
+struct MemoryAttributeManagerClass {
+    ObjectClass parent_class;
+};
+
+int memory_attribute_manager_realize(MemoryAttributeManager *mgr,
MemoryRegion *mr);
+void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr);
+
+#endif
diff --git a/system/memory-attribute-manager.c b/system/memory-
attribute-manager.c
new file mode 100644
index 0000000000..ed97e43dd0
--- /dev/null
+++ b/system/memory-attribute-manager.c
@@ -0,0 +1,292 @@
+/*
+ * QEMU memory attribute manager
+ *
+ * Copyright Intel
+ *
+ * Author:
+ *      Chenyi Qiang <chenyi.qi...@intel.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
later.
+ * See the COPYING file in the top-level directory
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "system/memory-attribute-manager.h"
+
+OBJECT_DEFINE_TYPE_WITH_INTERFACES(MemoryAttributeManager,
+                                   memory_attribute_manager,
+                                   MEMORY_ATTRIBUTE_MANAGER,
+                                   OBJECT,
+                                   { TYPE_RAM_DISCARD_MANAGER },
+                                   { })
+
+static int memory_attribute_manager_get_block_size(const
MemoryAttributeManager *mgr)
+{
+    /*
+     * Because page conversion could be manipulated in the size of at
least 4K or 4K aligned,
+     * Use the host page size as the granularity to track the memory
attribute.
+     * TODO: if necessary, switch to get the page_size from RAMBlock.
+     * i.e. mgr->mr->ram_block->page_size.

I'd assume it is rather necessary already.

OK, Will return the page_size of RAMBlock directly.


+     */
+    return qemu_real_host_page_size();
+}
+
+
+static bool memory_attribute_rdm_is_populated(const RamDiscardManager
*rdm,
+                                              const
MemoryRegionSection *section)
+{
+    const MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
+    int block_size = memory_attribute_manager_get_block_size(mgr);
+    uint64_t first_bit = section->offset_within_region / block_size;
+    uint64_t last_bit = first_bit + int128_get64(section->size) /
block_size - 1;
+    unsigned long first_discard_bit;
+
+    first_discard_bit = find_next_zero_bit(mgr->shared_bitmap,
last_bit + 1, first_bit);
+    return first_discard_bit > last_bit;
+}
+
+typedef int (*memory_attribute_section_cb)(MemoryRegionSection *s,
void *arg);
+
+static int memory_attribute_notify_populate_cb(MemoryRegionSection
*section, void *arg)
+{
+    RamDiscardListener *rdl = arg;
+
+    return rdl->notify_populate(rdl, section);
+}
+
+static int memory_attribute_notify_discard_cb(MemoryRegionSection
*section, void *arg)
+{
+    RamDiscardListener *rdl = arg;
+
+    rdl->notify_discard(rdl, section);
+
+    return 0;
+}
+
+static int memory_attribute_for_each_populated_section(const
MemoryAttributeManager *mgr,
+
MemoryRegionSection *section,
+                                                       void *arg,
+
memory_attribute_section_cb cb)
+{
+    unsigned long first_one_bit, last_one_bit;
+    uint64_t offset, size;
+    int block_size = memory_attribute_manager_get_block_size(mgr);
+    int ret = 0;
+
+    first_one_bit = section->offset_within_region / block_size;
+    first_one_bit = find_next_bit(mgr->shared_bitmap, mgr-
bitmap_size, first_one_bit);
+
+    while (first_one_bit < mgr->bitmap_size) {
+        MemoryRegionSection tmp = *section;
+
+        offset = first_one_bit * block_size;
+        last_one_bit = find_next_zero_bit(mgr->shared_bitmap, mgr-
bitmap_size,
+                                          first_one_bit + 1) - 1;
+        size = (last_one_bit - first_one_bit + 1) * block_size;


What all this math is for if we stuck with VFIO doing 1 page at the
time? (I think I commented on this)

Sorry, I missed your previous comment. IMHO, as we track the status in
bitmap and we want to call the cb() on the shared part within
MemoryRegionSection. Here we do the calculation to find the expected
sub-range.


You find a largest intersection here and call cb() on it which will call VFIO with 1 page at the time. So you could just call cb() for every page from here which will make the code simpler.



+
+        if (!memory_region_section_intersect_range(&tmp, offset,
size)) {
+            break;
+        }
+
+        ret = cb(&tmp, arg);
+        if (ret) {
+            error_report("%s: Failed to notify RAM discard listener:
%s", __func__,
+                         strerror(-ret));
+            break;
+        }
+
+        first_one_bit = find_next_bit(mgr->shared_bitmap, mgr-
bitmap_size,
+                                      last_one_bit + 2);
+    }
+
+    return ret;
+}
+

[..]

+
+static void
memory_attribute_rdm_unregister_listener(RamDiscardManager *rdm,
+
RamDiscardListener *rdl)
+{
+    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
+    int ret;
+
+    g_assert(rdl->section);
+    g_assert(rdl->section->mr == mgr->mr);
+
+    ret = memory_attribute_for_each_populated_section(mgr, rdl-
section, rdl,
+
memory_attribute_notify_discard_cb);
+    if (ret) {
+        error_report("%s: Failed to unregister RAM discard listener:
%s", __func__,
+                     strerror(-ret));
+    }
+
+    memory_region_section_free_copy(rdl->section);
+    rdl->section = NULL;
+    QLIST_REMOVE(rdl, next);
+
+}
+
+typedef struct MemoryAttributeReplayData {
+    void *fn;

ReplayRamDiscard *fn, not void*.

We could cast the void *fn either to ReplayRamPopulate or
ReplayRamDiscard (see below).


Hard to read, hard to maintain, and they take same parameters, only the return value is different (int/void) - if this is really important, have 2 fn pointers in MemoryAttributeReplayData. It is already hard to follow this train on callbacks.


+    void *opaque;
+} MemoryAttributeReplayData;
+
+static int
memory_attribute_rdm_replay_populated_cb(MemoryRegionSection *section,
void *arg)
+{
+    MemoryAttributeReplayData *data = arg;
+
+    return ((ReplayRamPopulate)data->fn)(section, data->opaque);
+}
+
+static int memory_attribute_rdm_replay_populated(const
RamDiscardManager *rdm,
+                                                 MemoryRegionSection
*section,
+                                                 ReplayRamPopulate
replay_fn,
+                                                 void *opaque)
+{
+    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
+    MemoryAttributeReplayData data = { .fn = replay_fn, .opaque =
opaque };
+
+    g_assert(section->mr == mgr->mr);
+    return memory_attribute_for_each_populated_section(mgr, section,
&data,
+
memory_attribute_rdm_replay_populated_cb);
+}
+
+static int
memory_attribute_rdm_replay_discarded_cb(MemoryRegionSection *section,
void *arg)
+{
+    MemoryAttributeReplayData *data = arg;
+
+    ((ReplayRamDiscard)data->fn)(section, data->opaque);
+    return 0;
+}
+
+static void memory_attribute_rdm_replay_discarded(const
RamDiscardManager *rdm,
+                                                  MemoryRegionSection
*section,
+                                                  ReplayRamDiscard
replay_fn,
+                                                  void *opaque)
+{
+    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
+    MemoryAttributeReplayData data = { .fn = replay_fn, .opaque =
opaque };
+
+    g_assert(section->mr == mgr->mr);
+    memory_attribute_for_each_discarded_section(mgr, section, &data,
+
memory_attribute_rdm_replay_discarded_cb);
+}
+
+int memory_attribute_manager_realize(MemoryAttributeManager *mgr,
MemoryRegion *mr)
+{
+    uint64_t bitmap_size;
+    int block_size = memory_attribute_manager_get_block_size(mgr);
+    int ret;
+
+    bitmap_size = ROUND_UP(mr->size, block_size) / block_size;
+
+    mgr->mr = mr;
+    mgr->bitmap_size = bitmap_size;
+    mgr->shared_bitmap = bitmap_new(bitmap_size);
+
+    ret = memory_region_set_ram_discard_manager(mgr->mr,
RAM_DISCARD_MANAGER(mgr));

Move it 3 lines up and avoid stale data in mgr->mr/bitmap_size/
shared_bitmap and avoid g_free below?

Make sense. I will move it up the same as patch 02 before bitmap_new().


+    if (ret) {
+        g_free(mgr->shared_bitmap);
+    }
+
+    return ret;
+}
+
+void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr)
+{
+    memory_region_set_ram_discard_manager(mgr->mr, NULL);
+
+    g_free(mgr->shared_bitmap);
+}
+
+static void memory_attribute_manager_init(Object *obj)

Not used.

+{
+    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(obj);
+
+    QLIST_INIT(&mgr->rdl_list);
+} > +
+static void memory_attribute_manager_finalize(Object *obj)

Not used either. Thanks,

I think it is OK to define it as a placeholder? Just some preference.

At very least gcc should warn on these (I am surprised it did not) and nobody likes this. Thanks,



+{
+}
+
+static void memory_attribute_manager_class_init(ObjectClass *oc, void
*data)
+{
+    RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc);
+
+    rdmc->get_min_granularity =
memory_attribute_rdm_get_min_granularity;
+    rdmc->register_listener = memory_attribute_rdm_register_listener;
+    rdmc->unregister_listener =
memory_attribute_rdm_unregister_listener;
+    rdmc->is_populated = memory_attribute_rdm_is_populated;
+    rdmc->replay_populated = memory_attribute_rdm_replay_populated;
+    rdmc->replay_discarded = memory_attribute_rdm_replay_discarded;
+}
diff --git a/system/meson.build b/system/meson.build
index 4952f4b2c7..ab07ff1442 100644
--- a/system/meson.build
+++ b/system/meson.build
@@ -15,6 +15,7 @@ system_ss.add(files(
     'dirtylimit.c',
     'dma-helpers.c',
     'globals.c',
+  'memory-attribute-manager.c',
     'memory_mapping.c',
     'qdev-monitor.c',
     'qtest.c',



--
Alexey


Reply via email to