On 2/19/2025 11:49 AM, Alexey Kardashevskiy wrote:
> 
> 
> On 19/2/25 12:20, Chenyi Qiang wrote:
>>
>>
>> On 2/18/2025 5:19 PM, Alexey Kardashevskiy wrote:
>>>
>>>
>>
>> [..]
>>
>>>> diff --git a/include/system/memory-attribute-manager.h b/include/
>>>> system/memory-attribute-manager.h
>>>> new file mode 100644
>>>> index 0000000000..72adc0028e
>>>> --- /dev/null
>>>> +++ b/include/system/memory-attribute-manager.h
>>>> @@ -0,0 +1,42 @@
>>>> +/*
>>>> + * QEMU memory attribute manager
>>>> + *
>>>> + * Copyright Intel
>>>> + *
>>>> + * Author:
>>>> + *      Chenyi Qiang <chenyi.qi...@intel.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>>>> later.
>>>> + * See the COPYING file in the top-level directory
>>>> + *
>>>> + */
>>>> +
>>>> +#ifndef SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H
>>>> +#define SYSTEM_MEMORY_ATTRIBUTE_MANAGER_H
>>>> +
>>>> +#include "system/hostmem.h"
>>>> +
>>>> +#define TYPE_MEMORY_ATTRIBUTE_MANAGER "memory-attribute-manager"
>>>> +
>>>> +OBJECT_DECLARE_TYPE(MemoryAttributeManager,
>>>> MemoryAttributeManagerClass, MEMORY_ATTRIBUTE_MANAGER)
>>>> +
>>>> +struct MemoryAttributeManager {
>>>> +    Object parent;
>>>> +
>>>> +    MemoryRegion *mr;
>>>> +
>>>> +    /* 1-setting of the bit represents the memory is populated
>>>> (shared) */
>>>> +    int32_t bitmap_size;
>>>
>>> unsigned.
>>>
>>> Also, do either s/bitmap_size/shared_bitmap_size/ or
>>> s/shared_bitmap/bitmap/
>>
>> Will change it. Thanks.
>>
>>>
>>>
>>>
>>>> +    unsigned long *shared_bitmap;
>>>> +
>>>> +    QLIST_HEAD(, RamDiscardListener) rdl_list;
>>>> +};
>>>> +
>>>> +struct MemoryAttributeManagerClass {
>>>> +    ObjectClass parent_class;
>>>> +};
>>>> +
>>>> +int memory_attribute_manager_realize(MemoryAttributeManager *mgr,
>>>> MemoryRegion *mr);
>>>> +void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr);
>>>> +
>>>> +#endif
>>>> diff --git a/system/memory-attribute-manager.c b/system/memory-
>>>> attribute-manager.c
>>>> new file mode 100644
>>>> index 0000000000..ed97e43dd0
>>>> --- /dev/null
>>>> +++ b/system/memory-attribute-manager.c
>>>> @@ -0,0 +1,292 @@
>>>> +/*
>>>> + * QEMU memory attribute manager
>>>> + *
>>>> + * Copyright Intel
>>>> + *
>>>> + * Author:
>>>> + *      Chenyi Qiang <chenyi.qi...@intel.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL, version 2 or
>>>> later.
>>>> + * See the COPYING file in the top-level directory
>>>> + *
>>>> + */
>>>> +
>>>> +#include "qemu/osdep.h"
>>>> +#include "qemu/error-report.h"
>>>> +#include "system/memory-attribute-manager.h"
>>>> +
>>>> +OBJECT_DEFINE_TYPE_WITH_INTERFACES(MemoryAttributeManager,
>>>> +                                   memory_attribute_manager,
>>>> +                                   MEMORY_ATTRIBUTE_MANAGER,
>>>> +                                   OBJECT,
>>>> +                                   { TYPE_RAM_DISCARD_MANAGER },
>>>> +                                   { })
>>>> +
>>>> +static int memory_attribute_manager_get_block_size(const
>>>> MemoryAttributeManager *mgr)
>>>> +{
>>>> +    /*
>>>> +     * Because page conversion could be manipulated in the size of at
>>>> least 4K or 4K aligned,
>>>> +     * Use the host page size as the granularity to track the memory
>>>> attribute.
>>>> +     * TODO: if necessary, switch to get the page_size from RAMBlock.
>>>> +     * i.e. mgr->mr->ram_block->page_size.
>>>
>>> I'd assume it is rather necessary already.
>>
>> OK, Will return the page_size of RAMBlock directly.
>>
>>>
>>>> +     */
>>>> +    return qemu_real_host_page_size();
>>>> +}
>>>> +
>>>> +
>>>> +static bool memory_attribute_rdm_is_populated(const RamDiscardManager
>>>> *rdm,
>>>> +                                              const
>>>> MemoryRegionSection *section)
>>>> +{
>>>> +    const MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>>> +    uint64_t first_bit = section->offset_within_region / block_size;
>>>> +    uint64_t last_bit = first_bit + int128_get64(section->size) /
>>>> block_size - 1;
>>>> +    unsigned long first_discard_bit;
>>>> +
>>>> +    first_discard_bit = find_next_zero_bit(mgr->shared_bitmap,
>>>> last_bit + 1, first_bit);
>>>> +    return first_discard_bit > last_bit;
>>>> +}
>>>> +
>>>> +typedef int (*memory_attribute_section_cb)(MemoryRegionSection *s,
>>>> void *arg);
>>>> +
>>>> +static int memory_attribute_notify_populate_cb(MemoryRegionSection
>>>> *section, void *arg)
>>>> +{
>>>> +    RamDiscardListener *rdl = arg;
>>>> +
>>>> +    return rdl->notify_populate(rdl, section);
>>>> +}
>>>> +
>>>> +static int memory_attribute_notify_discard_cb(MemoryRegionSection
>>>> *section, void *arg)
>>>> +{
>>>> +    RamDiscardListener *rdl = arg;
>>>> +
>>>> +    rdl->notify_discard(rdl, section);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static int memory_attribute_for_each_populated_section(const
>>>> MemoryAttributeManager *mgr,
>>>> +
>>>> MemoryRegionSection *section,
>>>> +                                                       void *arg,
>>>> +
>>>> memory_attribute_section_cb cb)
>>>> +{
>>>> +    unsigned long first_one_bit, last_one_bit;
>>>> +    uint64_t offset, size;
>>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>>> +    int ret = 0;
>>>> +
>>>> +    first_one_bit = section->offset_within_region / block_size;
>>>> +    first_one_bit = find_next_bit(mgr->shared_bitmap, mgr-
>>>>> bitmap_size, first_one_bit);
>>>> +
>>>> +    while (first_one_bit < mgr->bitmap_size) {
>>>> +        MemoryRegionSection tmp = *section;
>>>> +
>>>> +        offset = first_one_bit * block_size;
>>>> +        last_one_bit = find_next_zero_bit(mgr->shared_bitmap, mgr-
>>>>> bitmap_size,
>>>> +                                          first_one_bit + 1) - 1;
>>>> +        size = (last_one_bit - first_one_bit + 1) * block_size;
>>>
>>>
>>> What all this math is for if we stuck with VFIO doing 1 page at the
>>> time? (I think I commented on this)
>>
>> Sorry, I missed your previous comment. IMHO, as we track the status in
>> bitmap and we want to call the cb() on the shared part within
>> MemoryRegionSection. Here we do the calculation to find the expected
>> sub-range.
> 
> 
> You find a largest intersection here and call cb() on it which will call
> VFIO with 1 page at the time. So you could just call cb() for every page
> from here which will make the code simpler.

I prefer to keep calling cb() on a large intersection . I think in
future after cut_mapping is supported, we don't need to make VFIO call 1
page at a time. VFIO can call on the large range directly.

In addition, calling cb() for every page seems specific to VFIO usage.
It is more generic to call on a large intersection. If more RDM listener
added in future(although VFIO is the only user currently), do the split
in caller is inefficient.

> 
> 
>>>
>>>> +
>>>> +        if (!memory_region_section_intersect_range(&tmp, offset,
>>>> size)) {
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        ret = cb(&tmp, arg);
>>>> +        if (ret) {
>>>> +            error_report("%s: Failed to notify RAM discard listener:
>>>> %s", __func__,
>>>> +                         strerror(-ret));
>>>> +            break;
>>>> +        }
>>>> +
>>>> +        first_one_bit = find_next_bit(mgr->shared_bitmap, mgr-
>>>>> bitmap_size,
>>>> +                                      last_one_bit + 2);
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>
>> [..]
>>
>>>> +
>>>> +static void
>>>> memory_attribute_rdm_unregister_listener(RamDiscardManager *rdm,
>>>> +
>>>> RamDiscardListener *rdl)
>>>> +{
>>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>>> +    int ret;
>>>> +
>>>> +    g_assert(rdl->section);
>>>> +    g_assert(rdl->section->mr == mgr->mr);
>>>> +
>>>> +    ret = memory_attribute_for_each_populated_section(mgr, rdl-
>>>>> section, rdl,
>>>> +
>>>> memory_attribute_notify_discard_cb);
>>>> +    if (ret) {
>>>> +        error_report("%s: Failed to unregister RAM discard listener:
>>>> %s", __func__,
>>>> +                     strerror(-ret));
>>>> +    }
>>>> +
>>>> +    memory_region_section_free_copy(rdl->section);
>>>> +    rdl->section = NULL;
>>>> +    QLIST_REMOVE(rdl, next);
>>>> +
>>>> +}
>>>> +
>>>> +typedef struct MemoryAttributeReplayData {
>>>> +    void *fn;
>>>
>>> ReplayRamDiscard *fn, not void*.
>>
>> We could cast the void *fn either to ReplayRamPopulate or
>> ReplayRamDiscard (see below).
> 
> 
> Hard to read, hard to maintain, and they take same parameters, only the
> return value is different (int/void) - if this is really important, have
> 2 fn pointers in MemoryAttributeReplayData. It is already hard to follow
> this train on callbacks.

Actually, I prefer to make ReplayRamDiscard and ReplayRamPopulate
unified. Make ReplayRamDiscard() also return int. Then we only need to
define one function like:

typedef int (*ReplayMemoryAttributeChange)(MemoryRegionSection *section,
void *opaque);

Maybe David can share his opinions.

> 
> 
>>>> +    void *opaque;
>>>> +} MemoryAttributeReplayData;
>>>> +
>>>> +static int
>>>> memory_attribute_rdm_replay_populated_cb(MemoryRegionSection *section,
>>>> void *arg)
>>>> +{
>>>> +    MemoryAttributeReplayData *data = arg;
>>>> +
>>>> +    return ((ReplayRamPopulate)data->fn)(section, data->opaque);
>>>> +}
>>>> +
>>>> +static int memory_attribute_rdm_replay_populated(const
>>>> RamDiscardManager *rdm,
>>>> +                                                 MemoryRegionSection
>>>> *section,
>>>> +                                                 ReplayRamPopulate
>>>> replay_fn,
>>>> +                                                 void *opaque)
>>>> +{
>>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>>> +    MemoryAttributeReplayData data = { .fn = replay_fn, .opaque =
>>>> opaque };
>>>> +
>>>> +    g_assert(section->mr == mgr->mr);
>>>> +    return memory_attribute_for_each_populated_section(mgr, section,
>>>> &data,
>>>> +
>>>> memory_attribute_rdm_replay_populated_cb);
>>>> +}
>>>> +
>>>> +static int
>>>> memory_attribute_rdm_replay_discarded_cb(MemoryRegionSection *section,
>>>> void *arg)
>>>> +{
>>>> +    MemoryAttributeReplayData *data = arg;
>>>> +
>>>> +    ((ReplayRamDiscard)data->fn)(section, data->opaque);
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void memory_attribute_rdm_replay_discarded(const
>>>> RamDiscardManager *rdm,
>>>> +                                                  MemoryRegionSection
>>>> *section,
>>>> +                                                  ReplayRamDiscard
>>>> replay_fn,
>>>> +                                                  void *opaque)
>>>> +{
>>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(rdm);
>>>> +    MemoryAttributeReplayData data = { .fn = replay_fn, .opaque =
>>>> opaque };
>>>> +
>>>> +    g_assert(section->mr == mgr->mr);
>>>> +    memory_attribute_for_each_discarded_section(mgr, section, &data,
>>>> +
>>>> memory_attribute_rdm_replay_discarded_cb);
>>>> +}
>>>> +
>>>> +int memory_attribute_manager_realize(MemoryAttributeManager *mgr,
>>>> MemoryRegion *mr)
>>>> +{
>>>> +    uint64_t bitmap_size;
>>>> +    int block_size = memory_attribute_manager_get_block_size(mgr);
>>>> +    int ret;
>>>> +
>>>> +    bitmap_size = ROUND_UP(mr->size, block_size) / block_size;
>>>> +
>>>> +    mgr->mr = mr;
>>>> +    mgr->bitmap_size = bitmap_size;
>>>> +    mgr->shared_bitmap = bitmap_new(bitmap_size);
>>>> +
>>>> +    ret = memory_region_set_ram_discard_manager(mgr->mr,
>>>> RAM_DISCARD_MANAGER(mgr));
>>>
>>> Move it 3 lines up and avoid stale data in mgr->mr/bitmap_size/
>>> shared_bitmap and avoid g_free below?
>>
>> Make sense. I will move it up the same as patch 02 before bitmap_new().
>>
>>>
>>>> +    if (ret) {
>>>> +        g_free(mgr->shared_bitmap);
>>>> +    }
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +void memory_attribute_manager_unrealize(MemoryAttributeManager *mgr)
>>>> +{
>>>> +    memory_region_set_ram_discard_manager(mgr->mr, NULL);
>>>> +
>>>> +    g_free(mgr->shared_bitmap);
>>>> +}
>>>> +
>>>> +static void memory_attribute_manager_init(Object *obj)
>>>
>>> Not used.
>>>
>>>> +{
>>>> +    MemoryAttributeManager *mgr = MEMORY_ATTRIBUTE_MANAGER(obj);
>>>> +
>>>> +    QLIST_INIT(&mgr->rdl_list);
>>>> +} > +
>>>> +static void memory_attribute_manager_finalize(Object *obj)
>>>
>>> Not used either. Thanks,
>>
>> I think it is OK to define it as a placeholder? Just some preference.
> 
> At very least gcc should warn on these (I am surprised it did not) and
> nobody likes this. Thanks,

I tried a little. They must be defined. The init() and finalize() calls
are used in the OBJECT_DEFINE_TYPE_WITH_INTERFACES() macro. I think it
is a common template to define in this way.

> 
> 
>>>
>>>> +{
>>>> +}
>>>> +
>>>> +static void memory_attribute_manager_class_init(ObjectClass *oc, void
>>>> *data)
>>>> +{
>>>> +    RamDiscardManagerClass *rdmc = RAM_DISCARD_MANAGER_CLASS(oc);
>>>> +
>>>> +    rdmc->get_min_granularity =
>>>> memory_attribute_rdm_get_min_granularity;
>>>> +    rdmc->register_listener = memory_attribute_rdm_register_listener;
>>>> +    rdmc->unregister_listener =
>>>> memory_attribute_rdm_unregister_listener;
>>>> +    rdmc->is_populated = memory_attribute_rdm_is_populated;
>>>> +    rdmc->replay_populated = memory_attribute_rdm_replay_populated;
>>>> +    rdmc->replay_discarded = memory_attribute_rdm_replay_discarded;
>>>> +}
>>>> diff --git a/system/meson.build b/system/meson.build
>>>> index 4952f4b2c7..ab07ff1442 100644
>>>> --- a/system/meson.build
>>>> +++ b/system/meson.build
>>>> @@ -15,6 +15,7 @@ system_ss.add(files(
>>>>      'dirtylimit.c',
>>>>      'dma-helpers.c',
>>>>      'globals.c',
>>>> +  'memory-attribute-manager.c',
>>>>      'memory_mapping.c',
>>>>      'qdev-monitor.c',
>>>>      'qtest.c',
>>>
>>
> 


Reply via email to