David Gibson [da...@gibson.dropbear.id.au] wrote: | On Wed, Nov 04, 2015 at 03:06:05PM -0800, Sukadev Bhattiprolu wrote: | > Implement RTAS_SYSPARM_PROCESSOR_MODULE_INFO parameter to rtas_get_sysparm() | > call in qemu. This call returns the processor module (socket), chip and core | > information as specified in section 7.3.16.18 of PAPR v2.7. | | PAPR v2.7 isn't available publically. For upstream patches, please | reference LoPAPR instead (where it's section 7.3.16.17 AFAICT).
Ok. | | > We walk the /proc/device-tree to determine the number of chips, cores and | > modules in the _host_ system and return this info to the guest application | > that makes the rtas_get_sysparm() call. | > | > We currently hard code the number of module_types to 1, but we should determine | > that dynamically somehow later. | > | > Thanks to input from Nishanth Aravamudan and Alexey Kardashevsk. | > | > Signed-off-by: Sukadev Bhattiprolu <suka...@linux.vnet.ibm.com> | | This isn't ready to go yet - you need to put some more consideration | into the uncommon cases: PR KVM, TCG and non-Power hosts. Ok. Is there a we can make this code applicable only a Powerpc host? (would moving this code to target-ppc/kvm.c do that?) | | > --- | > Changelog[v2]: | > [Alexey Kardashevsk] Use existing interfaces like ldl_be_p(), | > stw_be_phys(), g_hash_table_new_full(), error_report() rather | > than re-inventing; fix indentation, function prottypes etc; | > Drop the fts() interface (which doesn't seem to be available | > on mingw32/mingw64) and use opendir() to walk specific | > directories in the directory tree. | > --- | > hw/ppc/Makefile.objs | 1 + | > hw/ppc/spapr_rtas.c | 35 +++++++ | > hw/ppc/spapr_rtas_modinfo.c | 230 ++++++++++++++++++++++++++++++++++++++++++++ | > hw/ppc/spapr_rtas_modinfo.h | 12 +++ | > include/hw/ppc/spapr.h | 1 + | > 5 files changed, 279 insertions(+) | > create mode 100644 hw/ppc/spapr_rtas_modinfo.c | > create mode 100644 hw/ppc/spapr_rtas_modinfo.h | > | > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs | > index c1ffc77..57c6b02 100644 | > --- a/hw/ppc/Makefile.objs | > +++ b/hw/ppc/Makefile.objs | > @@ -4,6 +4,7 @@ obj-y += ppc.o ppc_booke.o | > obj-$(CONFIG_PSERIES) += spapr.o spapr_vio.o spapr_events.o | > obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o | > obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o spapr_rng.o | > +obj-$(CONFIG_PSERIES) += spapr_rtas_modinfo.o | > ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) | > obj-y += spapr_pci_vfio.o | > endif | > diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c | > index 34b12a3..41fd8a6 100644 | > --- a/hw/ppc/spapr_rtas.c | > +++ b/hw/ppc/spapr_rtas.c | > @@ -34,6 +34,8 @@ | > #include "hw/ppc/spapr.h" | > #include "hw/ppc/spapr_vio.h" | > #include "qapi-event.h" | > + | > +#include "spapr_rtas_modinfo.h" | > #include "hw/boards.h" | > | > #include <libfdt.h> | > @@ -240,6 +242,39 @@ static void rtas_ibm_get_system_parameter(PowerPCCPU *cpu, | > target_ulong ret = RTAS_OUT_SUCCESS; | > | > switch (parameter) { | > + case RTAS_SYSPARM_PROCESSOR_MODULE_INFO: { | > + int i; | > + int offset = 0; | > + int size; | > + struct rtas_module_info modinfo; | > + | > + if (rtas_get_module_info(&modinfo)) { | > + break; | > + } | | So, you handle the variable size of this structure before sending it | to the guest, but you don't handle it in allocation of the structure | right here. You'll get away with that because for now you only ever | have one entry in the sockets array, but it's a bit icky. Can we assume that the size is static for now... | | > + | > + size = sizeof(modinfo); | > + size += (modinfo.module_types - 1) * sizeof(struct rtas_socket_info); | | More seriously, this calculation will break horribly if you change the | size of the array in struct rtas_module_info. and just set 'size' to sizeof(modinfo)?. | | > + stw_be_phys(&address_space_memory, buffer+offset, size); | > + offset += 2; | > + | > + stw_be_phys(&address_space_memory, buffer+offset, modinfo.module_types); | > + offset += 2; | > + | > + for (i = 0; i < modinfo.module_types; i++) { | > + stw_be_phys(&address_space_memory, buffer+offset, | > + modinfo.si[i].sockets); | > + offset += 2; | > + stw_be_phys(&address_space_memory, buffer+offset, | > + modinfo.si[i].chips); | > + offset += 2; | > + stw_be_phys(&address_space_memory, buffer+offset, | > + modinfo.si[i].cores_per_chip); | > + offset += 2; | > + } | > + break; | > + } | > + | > case RTAS_SYSPARM_SPLPAR_CHARACTERISTICS: { | > char *param_val = g_strdup_printf("MaxEntCap=%d," | > "DesMem=%llu," | > diff --git a/hw/ppc/spapr_rtas_modinfo.c b/hw/ppc/spapr_rtas_modinfo.c | > new file mode 100644 | > index 0000000..068fc2c | > --- /dev/null | > +++ b/hw/ppc/spapr_rtas_modinfo.c | > @@ -0,0 +1,230 @@ | > +/* | > + * QEMU PowerPC pSeries Logical Partition (aka sPAPR) hardware System Emulator | > + * | > + * Hypercall based emulated RTAS | > + * | > + * Copyright (c) 2015 Sukadev Bhattiprolu, IBM Corporation. | > + * | > + * Permission is hereby granted, free of charge, to any person obtaining a copy | > + * of this software and associated documentation files (the "Software"), to deal | > + * in the Software without restriction, including without limitation the rights | > + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | > + * copies of the Software, and to permit persons to whom the Software is | > + * furnished to do so, subject to the following conditions: | > + * | > + * The above copyright notice and this permission notice shall be included in | > + * all copies or substantial portions of the Software. | > + * | > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL | > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | > + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | > + * THE SOFTWARE. | > + * | > + */ | > +#include <stdio.h> | > +#include <dirent.h> | > +#include <sys/types.h> | > +#include <sys/stat.h> | > +#include <string.h> | > +#include <errno.h> | > + | > +#include <glib.h> | > +#include "spapr_rtas_modinfo.h" | > +#include "qemu/error-report.h" | > +#include "qemu/bswap.h" | | This entirely file assumes that (a) you have a ppc64 Linux host, which | may not be true, and (b) that it's ok to expose host details to the guest. Is there a way we can skip/ignore this code if not on a ppc64 Linux host? Are there steps I should take on other hosts to not expose host details to the guests? | | > +static int file_read_buf(char *file_name, char *buf, int len) | > +{ | > + int rc; | > + FILE *fp; | > + | > + fp = fopen(file_name, "r"); | > + if (!fp) { | > + error_report("%s: Error opening %s\n", __func__, file_name); | > + return -1; | > + } | > + | > + rc = fread(buf, 1, len, fp); | > + fclose(fp); | > + | > + if (rc != len) { | > + return -1; | > + } | > + | > + return 0; | > +} | > + | > +/* | > + * Read an identifier from the file @path and add the identifier | > + * to the hash table @gt unless its already in the table. | > + */ | > +static int hash_table_add_contents(GHashTable *gt, char *path) | > +{ | > + int idx; | > + char buf[16]; | > + int *key; | > + | > + if (file_read_buf(path, buf, sizeof(int))) { | | If you're just reading sizeof(int), why is the buf 16 bytes? You | should allocate buf to be the right size and use sizeof(buf). | | Also since this is a value you're getting directly from externally, | you should use a fixed width type, rather than 'int'. Maybe uint32_t? | | > + return -1; | > + } | > + | > + idx = ldl_be_p(buf); | | Easier to use the 'beNN_to_cpu' functions than ldl_be here. Ok. | | > + if (g_hash_table_contains(gt, &idx)) { | > + return 0; | > + } | > + | > + key = g_malloc(sizeof(int)); | > + | > + *key = idx; | > + if (!g_hash_table_insert(gt, key, NULL)) { | > + error_report("Unable to insert key %d into chips hash table\n", idx); | > + g_free(key); | > + return -1; | > + } | > + | > + return 0; | > +} | > + | > +/* | > + * Each core in the system is represented by a directory with the | > + * prefix 'PowerPC,POWER' in the directory /proc/device-tree/cpus/. | > + * Process that directory and count the number of cores in the system. | | True on IBM POWER systems, but not necessarily everywhere - e.g. PR | KVM on an embedded PowerPC host. What is PR KVM? | | > + */ | > +static int count_cores(int *num_cores) | > +{ | > + int rc; | > + DIR *dir; | > + struct dirent *ent; | > + const char *cpus_dir = "/proc/device-tree/cpus"; | > + const char *ppc_prefix = "PowerPC,POWER"; | > + | > + dir = opendir(cpus_dir); | > + if (!dir) { | > + error_report("Unable to open %s, error %d\n", cpus_dir, errno); | > + return -1; | > + } | | You could probably do this more simply with glob(3). Agree, it simplifiies code a lot. Thanks. | | > + *num_cores = 0; | > + while (1) { | > + errno = 0; | > + ent = readdir(dir); | > + if (!ent) { | > + break; | > + } | > + | > + if (!strncmp(ppc_prefix, ent->d_name, strlen(ppc_prefix))) { | > + (*num_cores)++; | > + } | > + } | > + | > + rc = 0; | > + if (errno) { | > + rc = -1; | > + } | > + | > + closedir(dir); | > + return rc; | > +} | > + | > +/* | > + * Each module's (aka socket's) id is contained in the 'ibm,hw-module-id' | > + * file in the "xscom" directory (/proc/device-tree/xscom*). Similarly each | > + * chip's id is contained in the 'ibm,chip-id' file in the xscom directory. | > + * | > + * A module can contain more than one chip and a chip can contain more | > + * than one core. So there are likely to be duplicates in the module | > + * and chip identifiers (i.e more than one xscom directory can contain | > + * the same module/chip id). | > + * | > + * Search the xscom directories and count the number of _UNIQUE_ module | > + * and chip identifiers in the system. | | There's no direct way to go from a core | (i.e. /proc/device-tree/cpus/cpu@NNN) to the corresponding chip and/or | module? Yes, it would logical to find the chip and module from the core :-) While 'ibm,chip-id' is in the core dir (/proc/device-tree/cpus/PowerPC,*/), the 'ibm,hw-module-id' is not there (on my Tuleta system). Maybe the 'ibm,hw-module-id' will be added in the future? I am using the xscom node to be consistent in counting chips and modules. | | > + */ | > +static int count_modules_chips(int *num_modules, int *num_chips) | > +{ | > + int rc; | > + DIR *dir; | > + struct dirent *ent; | > + char path[PATH_MAX]; | > + const char *xscom_dir = "/proc/device-tree"; | > + const char *xscom_prefix = "xscom@"; | > + GHashTable *chips_tbl, *modules_tbl; | > + | > + dir = opendir(xscom_dir); | > + if (!dir) { | > + error_report("Unable to open %s, error %d\n", xscom_dir, errno); | > + return -1; | > + } | > + | > + modules_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL); | > + chips_tbl = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, NULL); | > + | > + rc = -1; | > + while (1) { | > + errno = 0; | > + ent = readdir(dir); | > + if (!ent) { | > + break; | > + } | | Again glob(3) could simplify this. Agree. | | > + | > + if (strncmp(xscom_prefix, ent->d_name, strlen(xscom_prefix))) { | > + continue; | > + } | > + | > + sprintf(path, "%s/%s/ibm,chip-id", xscom_dir, ent->d_name); | > + if (hash_table_add_contents(chips_tbl, path)) { | > + goto cleanup; | > + } | > + | > + sprintf(path, "%s/%s/ibm,hw-module-id", xscom_dir, ent->d_name); | > + if (hash_table_add_contents(modules_tbl, path)) { | > + goto cleanup; | > + } | > + } | > + | > + if (errno) { | > + goto cleanup; | > + } | > + | > + *num_modules = g_hash_table_size(modules_tbl); | > + *num_chips = g_hash_table_size(chips_tbl); | > + rc = 0; | > + | > +cleanup: | > + g_hash_table_remove_all(modules_tbl); | > + g_hash_table_destroy(modules_tbl); | > + | > + g_hash_table_remove_all(chips_tbl); | > + g_hash_table_destroy(chips_tbl); | > + | > + closedir(dir); | > + | > + return rc; | > +} | > + | > +int rtas_get_module_info(struct rtas_module_info *modinfo) | > +{ | > + int cores; | > + int chips; | > + int modules; | > + | > + memset(modinfo, 0, sizeof(struct rtas_module_info)); | > + | > + if (count_modules_chips(&modules, &chips) || count_cores(&cores)) { | > + return -1; | > + } | > + | > + /* | > + * TODO: Hard code number of module_types for now till | > + * we figure out how to determine it dynamically. | > + */ | > + modinfo->module_types = 1; | > + modinfo->si[0].sockets = modules; | > + modinfo->si[0].chips = chips; | > + modinfo->si[0].cores_per_chip = cores / chips; | > + | > + return 0; | > +} | > diff --git a/hw/ppc/spapr_rtas_modinfo.h b/hw/ppc/spapr_rtas_modinfo.h | > new file mode 100644 | > index 0000000..356a2b5 | > --- /dev/null | > +++ b/hw/ppc/spapr_rtas_modinfo.h | > @@ -0,0 +1,12 @@ | > + | > +struct rtas_socket_info { | > + unsigned short sockets; | > + unsigned short chips; | > + unsigned short cores_per_chip; | > +}; | > +struct rtas_module_info { | > + unsigned short module_types; | > + struct rtas_socket_info si[1]; | | You probably want a "MAX_MODULE_TYPES" #define or similar instead of | harcoding this to 1. Agree. | | > +}; | > + | > +extern int rtas_get_module_info(struct rtas_module_info *topo); | > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h | > index 5baa906..cfe7fa2 100644 | > --- a/include/hw/ppc/spapr.h | > +++ b/include/hw/ppc/spapr.h | > @@ -463,6 +463,7 @@ int spapr_allocate_irq_block(int num, bool lsi, bool msi); | > /* RTAS ibm,get-system-parameter token values */ | > #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20 | > #define RTAS_SYSPARM_DIAGNOSTICS_RUN_MODE 42 | > +#define RTAS_SYSPARM_PROCESSOR_MODULE_INFO 43 | > #define RTAS_SYSPARM_UUID 48 | > | > /* RTAS indicator/sensor types | | -- | David Gibson | I'll have my music baroque, and my code | david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | | _way_ _around_! | http://www.ozlabs.org/~dgibson