On 2014/12/8 21:49, Peter Maydell wrote: > On 2 December 2014 at 12:56, Shannon Zhao <zhaoshengl...@huawei.com> wrote: >> Add support for NUMA on ARM64. Tested successfully running a guest >> Linux kernel with the following patch applied: > > I'm still hoping for review from somebody who better understands > how QEMU and NUMA should interact, but in the meantime some comments > at a code level: > >> hw/arm/boot.c | 25 ------------ >> hw/arm/virt.c | 120 >> +++++++++++++++++++++++++++++++++++++++++++++++++++++--- >> 2 files changed, 113 insertions(+), 32 deletions(-) >> >> diff --git a/hw/arm/boot.c b/hw/arm/boot.c >> index 0014c34..c20fee4 100644 >> --- a/hw/arm/boot.c >> +++ b/hw/arm/boot.c >> @@ -335,7 +335,6 @@ static int load_dtb(hwaddr addr, const struct >> arm_boot_info *binfo, >> { >> void *fdt = NULL; >> int size, rc; >> - uint32_t acells, scells; >> >> if (binfo->dtb_filename) { >> char *filename; >> @@ -369,30 +368,6 @@ static int load_dtb(hwaddr addr, const struct >> arm_boot_info *binfo, >> return 0; >> } >> >> - acells = qemu_fdt_getprop_cell(fdt, "/", "#address-cells"); >> - scells = qemu_fdt_getprop_cell(fdt, "/", "#size-cells"); >> - if (acells == 0 || scells == 0) { >> - fprintf(stderr, "dtb file invalid (#address-cells or #size-cells >> 0)\n"); >> - goto fail; >> - } >> - >> - if (scells < 2 && binfo->ram_size >= (1ULL << 32)) { >> - /* This is user error so deserves a friendlier error message >> - * than the failure of setprop_sized_cells would provide >> - */ >> - fprintf(stderr, "qemu: dtb file not compatible with " >> - "RAM size > 4GB\n"); >> - goto fail; >> - } >> - >> - rc = qemu_fdt_setprop_sized_cells(fdt, "/memory", "reg", >> - acells, binfo->loader_start, >> - scells, binfo->ram_size); >> - if (rc < 0) { >> - fprintf(stderr, "couldn't set /memory/reg\n"); >> - goto fail; >> - } >> - > > This patchset seems to be moving the initialization of a lot of > the dtb from this generic code into the virt board. That doesn't > seem right to me -- why would NUMA support be specific to the > virt board? I would expect support for this to be in the generic > code (possibly controlled with a board option for "I support NUMA"). > As it is your patch will break support for every other > board that uses device trees, because they rely on this code > which you've deleted here. >
Good suggestion. Will fix this :-) >> if (binfo->kernel_cmdline && *binfo->kernel_cmdline) { >> rc = qemu_fdt_setprop_string(fdt, "/chosen", "bootargs", >> binfo->kernel_cmdline); >> diff --git a/hw/arm/virt.c b/hw/arm/virt.c >> index 78f618d..9d18a91 100644 >> --- a/hw/arm/virt.c >> +++ b/hw/arm/virt.c >> @@ -170,8 +170,6 @@ static void create_fdt(VirtBoardInfo *vbi) >> * to fill in necessary properties later >> */ >> qemu_fdt_add_subnode(fdt, "/chosen"); >> - qemu_fdt_add_subnode(fdt, "/memory"); >> - qemu_fdt_setprop_string(fdt, "/memory", "device_type", "memory"); >> >> /* Clock node, for the benefit of the UART. The kernel device tree >> * binding documentation claims the PL011 node clock properties are >> @@ -235,6 +233,116 @@ static void fdt_add_psci_node(const VirtBoardInfo *vbi) >> qemu_fdt_setprop_cell(fdt, "/psci", "migrate", migrate_fn); >> } >> >> +static int virt_memory_init(MachineState *machine, >> + MemoryRegion *system_memory, >> + const VirtBoardInfo *vbi) >> +{ >> + MemoryRegion *ram = g_new(MemoryRegion, 1); >> + CPUState *cpu; >> + int min_cpu = 0, max_cpu = 0; >> + int i, j, count, len; >> + uint32_t acells, scells; >> + >> + acells = qemu_fdt_getprop_cell(vbi->fdt, "/", "#address-cells"); >> + scells = qemu_fdt_getprop_cell(vbi->fdt, "/", "#size-cells"); >> + if (acells == 0 || scells == 0) { >> + fprintf(stderr, "dtb file invalid (#address-cells or #size-cells >> 0)\n"); >> + goto fail; >> + } >> + >> + if (scells < 2 && machine->ram_size >= (1ULL << 32)) { >> + /* This is user error so deserves a friendlier error message >> + * than the failure of setprop_sized_cells would provide >> + */ >> + fprintf(stderr, "qemu: dtb file not compatible with " >> + "RAM size > 4GB\n"); >> + goto fail; >> + } >> + >> + memory_region_allocate_system_memory(ram, NULL, "mach-virt.ram", >> + machine->ram_size); >> + memory_region_add_subregion(system_memory, vbi->memmap[VIRT_MEM].base, >> ram); >> + >> + hwaddr mem_base = vbi->memmap[VIRT_MEM].base; >> + >> + if (!nb_numa_nodes) { >> + qemu_fdt_add_subnode(vbi->fdt, "/memory"); >> + qemu_fdt_setprop_string(vbi->fdt, "/memory", "device_type", >> "memory"); >> + qemu_fdt_setprop_sized_cells(vbi->fdt, "/memory", "reg", >> + acells, mem_base, >> + scells, machine->ram_size); >> + return 0; >> + } >> + >> + qemu_fdt_add_subnode(vbi->fdt, "/numa-map"); >> + qemu_fdt_setprop_cell(vbi->fdt, "/numa-map", "#address-cells", 0x2); >> + qemu_fdt_setprop_cell(vbi->fdt, "/numa-map", "#size-cells", 0x1); >> + qemu_fdt_setprop_cell(vbi->fdt, "/numa-map", "#node-count", 0x2); >> + >> + uint64_t *mem_map = g_malloc0(nb_numa_nodes * sizeof(uint64_t) * 6); >> + uint64_t *cpu_map = g_malloc0(nb_numa_nodes * sizeof(uint64_t) * 6); >> + uint64_t *node_matrix = g_malloc0(nb_numa_nodes * nb_numa_nodes >> + * sizeof(uint64_t) * 6); > > g_new0() is usually better than g_malloc0(count * sizeof(thing)); > Ok >> + >> + for (i = 0; i < nb_numa_nodes; i++) { >> + uint64_t buffer[6] = {1, 0x00000000, 1, mem_base, 1, i}; >> + char *nodename; >> + nodename = g_strdup_printf("/memory@%" PRIx64, mem_base); >> + qemu_fdt_add_subnode(vbi->fdt, nodename); >> + qemu_fdt_setprop_string(vbi->fdt, nodename, "device_type", >> "memory"); >> + qemu_fdt_setprop_sized_cells(vbi->fdt, nodename, "reg", >> + acells, mem_base, >> + scells, numa_info[i].node_mem-1); >> + memcpy(mem_map + 6 * i, buffer, 6 * sizeof(*buffer)); > > Why do we create a local buffer array and then immediately do nothing > with it but memcpy() it somewhere else? I suspect this code would > be clearer if you defined a type (possibly a struct) where you're > currently using raw uint64_t array[6]. Then you could make your > mem_map, cpu_map and node_matrix variables have more sensible types > than raw uint64_t*, and you could just set the right element in them > using C array syntax rather than these memcpy calls. > OK, thanks for your suggestion. >> + mem_base += numa_info[i].node_mem; >> + g_free(nodename); >> + } >> + qemu_fdt_setprop_sized_cells_from_array(vbi->fdt, "/numa-map", >> "mem-map", >> + (nb_numa_nodes * 6) / 2, mem_map); > > Lots of unnecessary hardcoded 6s here (and above and below). > Will replace them. >> + >> + for (i = 0; i < nb_numa_nodes; i++) { >> + CPU_FOREACH(cpu) { >> + if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) { >> + if (cpu->cpu_index < min_cpu) { >> + min_cpu = cpu->cpu_index; >> + } >> + if (cpu->cpu_index > max_cpu) { >> + max_cpu = cpu->cpu_index; >> + } >> + } >> + } >> + >> + uint64_t buffer[6] = {1, min_cpu, 1, max_cpu, 1, i}; >> + memcpy(cpu_map + 6 * i, buffer, 6 * sizeof(*buffer)); >> + min_cpu = max_cpu + 1; >> + } >> + >> + qemu_fdt_setprop_sized_cells_from_array(vbi->fdt, "/numa-map", >> "cpu-map", >> + (nb_numa_nodes * 6) / 2, cpu_map); >> + count = 0; >> + for (i = 0; i < nb_numa_nodes; i++) { >> + for (j = 0; j < nb_numa_nodes; j++) { >> + len = 20; >> + if (i == j) { >> + len = 10; >> + } >> + uint64_t buffer[6] = {1, i, 1, j, 1, len}; >> + memcpy(node_matrix + 6 * count, buffer, 6 * sizeof(*buffer)); >> + count++; >> + } >> + } >> + qemu_fdt_setprop_sized_cells_from_array(vbi->fdt, "/numa-map", >> + "node-matrix", (nb_numa_nodes * nb_numa_nodes * 6) / 2, >> node_matrix); >> + >> + g_free(mem_map); >> + g_free(cpu_map); >> + g_free(node_matrix); >> + >> + return 0; >> +fail: >> + return -1; >> +} >> + >> static void fdt_add_timer_nodes(const VirtBoardInfo *vbi) >> { >> /* Note that on A15 h/w these interrupts are level-triggered, >> @@ -532,7 +640,6 @@ static void machvirt_init(MachineState *machine) >> qemu_irq pic[NUM_IRQS]; >> MemoryRegion *sysmem = get_system_memory(); >> int n; >> - MemoryRegion *ram = g_new(MemoryRegion, 1); >> const char *cpu_model = machine->cpu_model; >> VirtBoardInfo *vbi; >> >> @@ -585,10 +692,9 @@ static void machvirt_init(MachineState *machine) >> fdt_add_cpu_nodes(vbi); >> fdt_add_psci_node(vbi); >> >> - memory_region_init_ram(ram, NULL, "mach-virt.ram", machine->ram_size, >> - &error_abort); >> - vmstate_register_ram_global(ram); >> - memory_region_add_subregion(sysmem, vbi->memmap[VIRT_MEM].base, ram); >> + if (virt_memory_init(machine, sysmem, vbi) < 0) { >> + exit(1); >> + } >> >> create_flash(vbi); >> >> -- >> 1.7.1 > > thanks > -- PMM > > . > Thanks for your comments. Anyone else has more comments about this? Thanks, Shannon