On 07/03/2019 14:57, David Gibson wrote:
> On Thu, Mar 07, 2019 at 01:40:33PM +1100, Alexey Kardashevskiy wrote:
>>
>>
>> On 05/03/2019 12:47, David Gibson wrote:
>>> On Thu, Feb 28, 2019 at 05:11:32PM +1100, Alexey Kardashevskiy wrote:
>>>> On 28/02/2019 14:31, David Gibson wrote:
>>>>> On Wed, Feb 27, 2019 at 07:51:49PM +1100, Alexey Kardashevskiy wrote:
>>>>>> NVIDIA V100 GPUs have on-board RAM which is mapped into the host memory
>>>>>> space and accessible as normal RAM via an NVLink bus. The VFIO-PCI driver
>>>>>> implements special regions for such GPUs and emulates an NVLink bridge.
>>>>>> NVLink2-enabled POWER9 CPUs also provide address translation services
>>>>>> which include an ATS shootdown (ATSD) register exported via the NVLink
>>>>>> bridge device.
>>>>>>
>>>>>> This adds a quirk to VFIO to map the GPU memory and create an MR;
>>>>>> the new MR is stored in a PCI device as a QOM link. The sPAPR PCI code
>>>>>> uses this to get the MR and map it into the system address space.
>>>>>> Another quirk does the same for ATSD.
>>>>>>
>>>>>> This adds additional steps to sPAPR PHB setup:
>>>>>>
>>>>>> 1. Search for specific GPUs and NPUs, collect findings in
>>>>>> sPAPRPHBState::nvgpus, manage system address space mappings;
>>>>>>
>>>>>> 2. Add device-specific properties such as "ibm,npu", "ibm,gpu",
>>>>>> "memory-block", "link-speed" to advertise the NVLink2 function to
>>>>>> the guest;
>>>>>>
>>>>>> 3. Add "mmio-atsd" to vPHB to advertise the ATSD capability;
>>>>>>
>>>>>> 4. Add new memory blocks (with extra "linux,memory-usable" to prevent
>>>>>> the guest OS from accessing the new memory until it is onlined) and
>>>>>> npuphb# nodes representing an NPU unit for every vPHB as the GPU driver
>>>>>> uses it for link discovery.
>>>>>>
>>>>>> This allocates space for GPU RAM and ATSD like we do for MMIOs by
>>>>>> adding 2 new parameters to the phb_placement() hook. Older machine types
>>>>>> set these to zero.
>>>>>>
>>>>>> This puts new memory nodes in a separate NUMA node to replicate the host
>>>>>> system setup as the GPU driver relies on this.
>>>>>>
>>>>>> This adds a requirement similar to EEH - one IOMMU group per vPHB.
>>>>>> The reason for this is that ATSD registers belong to a physical NPU
>>>>>> so they cannot invalidate translations on GPUs attached to another NPU.
>>>>>> This is guaranteed by the host platform as it does not mix NVLink bridges
>>>>>> or GPUs from different NPUs in the same IOMMU group. If more than one
>>>>>> IOMMU group is detected on a vPHB, this disables ATSD support for that
>>>>>> vPHB and prints a warning.
>>>>>>
>>>>>> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
>>>>>> ---
>>>>>> Changes:
>>>>>> v3:
>>>>>> * moved GPU RAM above PCI MMIO limit
>>>>>> * renamed QOM property to nvlink2-tgt
>>>>>> * moved nvlink2 code to its own file
>>>>>>
>>>>>> ---
>>>>>>
>>>>>> An example command line for a redbud system:
>>>>>>
>>>>>> pbuild/qemu-aiku1804le-ppc64/ppc64-softmmu/qemu-system-ppc64 \
>>>>>> -nodefaults \
>>>>>> -chardev stdio,id=STDIO0,signal=off,mux=on \
>>>>>> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \
>>>>>> -mon id=MON0,chardev=STDIO0,mode=readline -nographic -vga none \
>>>>>> -enable-kvm -m 384G \
>>>>>> -chardev socket,id=SOCKET0,server,nowait,host=localhost,port=40000 \
>>>>>> -mon chardev=SOCKET0,mode=control \
>>>>>> -smp 80,sockets=1,threads=4 \
>>>>>> -netdev "tap,id=TAP0,helper=/home/aik/qemu-bridge-helper --br=br0" \
>>>>>> -device "virtio-net-pci,id=vnet0,mac=52:54:00:12:34:56,netdev=TAP0" \
>>>>>> img/vdisk0.img \
>>>>>> -device "vfio-pci,id=vfio0004_04_00_0,host=0004:04:00.0" \
>>>>>> -device "vfio-pci,id=vfio0006_00_00_0,host=0006:00:00.0" \
>>>>>> -device "vfio-pci,id=vfio0006_00_00_1,host=0006:00:00.1" \
>>>>>> -device "vfio-pci,id=vfio0006_00_00_2,host=0006:00:00.2" \
>>>>>> -device "vfio-pci,id=vfio0004_05_00_0,host=0004:05:00.0" \
>>>>>> -device "vfio-pci,id=vfio0006_00_01_0,host=0006:00:01.0" \
>>>>>> -device "vfio-pci,id=vfio0006_00_01_1,host=0006:00:01.1" \
>>>>>> -device "vfio-pci,id=vfio0006_00_01_2,host=0006:00:01.2" \
>>>>>> -device spapr-pci-host-bridge,id=phb1,index=1 \
>>>>>> -device "vfio-pci,id=vfio0035_03_00_0,host=0035:03:00.0" \
>>>>>> -device "vfio-pci,id=vfio0007_00_00_0,host=0007:00:00.0" \
>>>>>> -device "vfio-pci,id=vfio0007_00_00_1,host=0007:00:00.1" \
>>>>>> -device "vfio-pci,id=vfio0007_00_00_2,host=0007:00:00.2" \
>>>>>> -device "vfio-pci,id=vfio0035_04_00_0,host=0035:04:00.0" \
>>>>>> -device "vfio-pci,id=vfio0007_00_01_0,host=0007:00:01.0" \
>>>>>> -device "vfio-pci,id=vfio0007_00_01_1,host=0007:00:01.1" \
>>>>>> -device "vfio-pci,id=vfio0007_00_01_2,host=0007:00:01.2" -snapshot \
>>>>>> -machine pseries \
>>>>>> -L /home/aik/t/qemu-ppc64-bios/ -d guest_errors
>>>>>>
>>>>>> Note that QEMU attaches PCI devices to the last added vPHB, so the first
>>>>>> 8 devices - 4:04:00.0 through 6:00:01.2 - go to the default vPHB, and
>>>>>> 35:03:00.0..7:00:01.2 go to the vPHB with id=phb1.
>>>>>> ---
>>>>>> hw/ppc/Makefile.objs | 2 +-
>>>>>> hw/vfio/pci.h | 2 +
>>>>>> include/hw/pci-host/spapr.h | 41 ++++
>>>>>> include/hw/ppc/spapr.h | 3 +-
>>>>>> hw/ppc/spapr.c | 29 ++-
>>>>>> hw/ppc/spapr_pci.c | 8 +
>>>>>> hw/ppc/spapr_pci_nvlink2.c | 419 ++++++++++++++++++++++++++++++++++++
>>>>>> hw/vfio/pci-quirks.c | 120 +++++++++++
>>>>>> hw/vfio/pci.c | 14 ++
>>>>>> hw/vfio/trace-events | 4 +
>>>>>> 10 files changed, 637 insertions(+), 5 deletions(-)
>>>>>> create mode 100644 hw/ppc/spapr_pci_nvlink2.c
>>>>>>
>>>>>> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
>>>>>> index 1111b21..636e717 100644
>>>>>> --- a/hw/ppc/Makefile.objs
>>>>>> +++ b/hw/ppc/Makefile.objs
>>>>>> @@ -9,7 +9,7 @@ obj-$(CONFIG_SPAPR_RNG) += spapr_rng.o
>>>>>> # IBM PowerNV
>>>>>> obj-$(CONFIG_POWERNV) += pnv.o pnv_xscom.o pnv_core.o pnv_lpc.o pnv_psi.o pnv_occ.o pnv_bmc.o
>>>>>> ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
>>>>>> -obj-y += spapr_pci_vfio.o
>>>>>> +obj-y += spapr_pci_vfio.o spapr_pci_nvlink2.o
>>>>>> endif
>>>>>> obj-$(CONFIG_PSERIES) += spapr_rtas_ddw.o
>>>>>> # PowerPC 4xx boards
>>>>>> diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
>>>>>> index b1ae4c0..706c304 100644
>>>>>> --- a/hw/vfio/pci.h
>>>>>> +++ b/hw/vfio/pci.h
>>>>>> @@ -194,6 +194,8 @@ int vfio_populate_vga(VFIOPCIDevice *vdev, Error **errp);
>>>>>> int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
>>>>>> struct vfio_region_info *info,
>>>>>> Error **errp);
>>>>>> +int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp);
>>>>>> +int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp);
>>>>>>
>>>>>> void vfio_display_reset(VFIOPCIDevice *vdev);
>>>>>> int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
>>>>>> diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h
>>>>>> index ab0e3a0..e791dd4 100644
>>>>>> --- a/include/hw/pci-host/spapr.h
>>>>>> +++ b/include/hw/pci-host/spapr.h
>>>>>> @@ -87,6 +87,9 @@ struct sPAPRPHBState {
>>>>>> uint32_t mig_liobn;
>>>>>> hwaddr mig_mem_win_addr, mig_mem_win_size;
>>>>>> hwaddr mig_io_win_addr, mig_io_win_size;
>>>>>> + hwaddr nv2_gpa_win_addr;
>>>>>> + hwaddr nv2_atsd_win_addr;
>>>>>> + struct spapr_phb_pci_nvgpu_config *nvgpus;
>>>>>> };
>>>>>>
>>>>>> #define SPAPR_PCI_MEM_WIN_BUS_OFFSET 0x80000000ULL
>>>>>> @@ -105,6 +108,23 @@ struct sPAPRPHBState {
>>>>>>
>>>>>> #define SPAPR_PCI_MSI_WINDOW 0x40000000000ULL
>>>>>>
>>>>>> +#define SPAPR_PCI_NV2RAM64_WIN_BASE SPAPR_PCI_LIMIT
>>>>>> +#define SPAPR_PCI_NV2RAM64_WIN_SIZE 0x10000000000ULL /* 1 TiB for all 6xGPUs */
>>>>>
>>>>> The comments and values below suggest that it is 1TiB for each GPU,
>>>>> rather than 1TiB shared by all 6. Which is it?
>>>>
>>>>
>>>> 1TiB for all of them within 1 vPHB. Not sure where it suggests 1TiB for
>>>> each GPU.
>>>
>>> The fact that NV2ATSD_WIN_BASE is set at 6TiB above NV2RAM64_WIN_BASE
>>> is what suggested to me that there was one 1TiB window for each of the
>>> 6 possible GPUs.
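
Just to spell out the current layout: SPAPR_PCI_NV2ATSD_WIN_BASE ==
SPAPR_PCI_NV2RAM64_WIN_BASE + NVGPU_MAX_NUM * SPAPR_PCI_NV2RAM64_WIN_SIZE,
i.e. 6 x 1TiB above the RAM window base, but each of those 1TiB slots is
the window of one vPHB (shared by all its GPUs), not of one GPU.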
>>>
>>>>>> +
>>>>>> +/* Max number of these GPUs per physical box */
>>>>>> +#define NVGPU_MAX_NUM 6
>>>>>
>>>>> Is there any possibility later hardware revisions could increase this?
>>>>> If so we should probably leave some extra room in the address space.
>>>>
>>>> A GPU RAM window is 256GiB (and only 32GiB is used), and 3 is the
>>>> maximum in one group so far. So 1TiB should be enough for quite some
>>>> time. Having more GPUs in a box is probably possible but for now 6xGPU
>>>> require water cooling while 4xGPU does not, so unless a new generation
>>>> of these GPUs comes out, the numbers won't change much.
>>>
>>> Hm, ok.
>>>
>>>> I'll double SPAPR_PCI_NV2RAM64_WIN_SIZE.
>>>
>>> Um.. I'm not sure how that follows from the above.
>>
>> 1TiB is enough now but 2TiB is more future proof. That was it.
>
> Ok.
>
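So the follow-up is simply doubling the window; roughly this (assuming
qemu/units.h for TiB):

    /* 2 TiB of GPU RAM space per vPHB, shared by all its GPUs */
    #define SPAPR_PCI_NV2RAM64_WIN_SIZE (2 * TiB)
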
>>>>>> +/*
>>>>>> + * One NVLink bridge provides one ATSD register so in theory it should
>>>>>> + * be 18. In practice though we allow only one IOMMU group per vPHB,
>>>>>> + * which equals one NPU2 with a maximum of 6 NVLink bridges.
>>>>>> + */
>>>>>> +#define NVGPU_MAX_ATSD 6
>>>>>> +
>>>>>> +#define SPAPR_PCI_NV2ATSD_WIN_BASE (SPAPR_PCI_NV2RAM64_WIN_BASE + \
>>>>>> + SPAPR_PCI_NV2RAM64_WIN_SIZE * \
>>>>>> + NVGPU_MAX_NUM)
>>>>>> +#define SPAPR_PCI_NV2ATSD_WIN_SIZE (NVGPU_MAX_ATSD * 0x10000)
>>>>>
>>>>> What's the significance of the 64 kiB constant here? Should it be a
>>>>> symbolic name, or spelled "64 * kiB"?
>>>>
>>>> Ok.
>>>
>>>
>>> Hmm. Am I right in thinking that each 64-bit RAM slot and each ATSD
>>> slot is per-vPHB?
>>
>> These are windows from which I allocated RAM base and ATSD per GPU/NPU.
>
> Ok, I guess that per-vPHB set of windows is what I'm meaning by "slot"
> then.
>
>>> Would it make more sense to directly index into
>>> the array of slots with the phb index, rather than having a separate
>>> GPU index?
>>
>> There can be 1 or many "slots" per PHB ("many" is not really encouraged
>> as they will miss ATSD but nevertheless), and "slots" are not in a
>> global list of any kind.
>
> Ok, I think we're using different meanings of "slot" here. By "slot"
> I was meaning one 64-bit and one ATS window with a common index
> (i.e. a slot in the array indices, rather than a physical slot on the
> system). IIUC all the GPUs and NPUs on a vPHB will sit in a single
> "slot" by that sense.
This is not what I meant though. A slot is a physical SXM2 (or whatever
that acronym is) slot with all these links, and one vPHB has 2 or 3 of
these. I could use vPHB index + slot index for ATSD and RAM but I chose
to do more or less the same thing as for BARs, which are allocated
within the selected windows.
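
In other words, within one vPHB the addresses are carved out of the
per-vPHB windows as devices are collected, pretty much like BARs; this
is what spapr_pci_collect_nvgpu() below does:

    nvslot->gpa = nvgpus->nv2_ram_current;
    nvgpus->nv2_ram_current += memory_region_size(mr);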
>
>>>>>> +
>>>>>> static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb,
>>>>>> int pin)
>>>>>> {
>>>>>> sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
>>>>>> @@ -135,6 +155,11 @@ int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state);
>>>>>> int spapr_phb_vfio_eeh_reset(sPAPRPHBState *sphb, int option);
>>>>>> int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb);
>>>>>> void spapr_phb_vfio_reset(DeviceState *qdev);
>>>>>> +void spapr_phb_nvgpu_setup(sPAPRPHBState *sphb);
>>>>>> +void spapr_phb_nvgpu_populate_dt(sPAPRPHBState *sphb, void *fdt, int bus_off);
>>>>>> +void spapr_phb_nvgpu_ram_populate_dt(sPAPRPHBState *sphb, void *fdt);
>>>>>> +void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
>>>>>> +                                        sPAPRPHBState *sphb);
>>>>>> #else
>>>>>> static inline bool spapr_phb_eeh_available(sPAPRPHBState *sphb)
>>>>>> {
>>>>>> @@ -161,6 +186,22 @@ static inline int spapr_phb_vfio_eeh_configure(sPAPRPHBState *sphb)
>>>>>> static inline void spapr_phb_vfio_reset(DeviceState *qdev)
>>>>>> {
>>>>>> }
>>>>>> +static inline void spapr_phb_nvgpu_setup(sPAPRPHBState *sphb)
>>>>>> +{
>>>>>> +}
>>>>>> +static inline void spapr_phb_nvgpu_populate_dt(sPAPRPHBState *sphb, void *fdt,
>>>>>> +                                               int bus_off)
>>>>>> +{
>>>>>> +}
>>>>>> +static inline void spapr_phb_nvgpu_ram_populate_dt(sPAPRPHBState *sphb,
>>>>>> + void *fdt)
>>>>>> +{
>>>>>> +}
>>>>>> +static inline void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt,
>>>>>> +                                                      int offset,
>>>>>> +                                                      sPAPRPHBState *sphb)
>>>>>> +{
>>>>>> +}
>>>>>
>>>>> I'm guessing some of these should never get called on systems without
>>>>> NVLink2, in which case they should probably have a
>>>>> g_assert_not_reached() in there.
>>>>
>>>> I guess if you compile QEMU for --target-list=ppc64-softmmu on Windows
>>>> (i.e. TCG + pseries + PCI but no VFIO), these will be called and then
>>>> crash, no?
>>>
>>> Well, if they can be called in that situation then, yes, they need to
>>> be no-ops like they are now. But is that true for all of them?
>>> Hmm.. yes it might be, never mind.
>>>
>>>>>
>>>>>> #endif
>>>>>>
>>>>>> void spapr_phb_dma_reset(sPAPRPHBState *sphb);
>>>>>> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
>>>>>> index 358bb38..9acf867 100644
>>>>>> --- a/include/hw/ppc/spapr.h
>>>>>> +++ b/include/hw/ppc/spapr.h
>>>>>> @@ -113,7 +113,8 @@ struct sPAPRMachineClass {
>>>>>> void (*phb_placement)(sPAPRMachineState *spapr, uint32_t index,
>>>>>> uint64_t *buid, hwaddr *pio,
>>>>>> hwaddr *mmio32, hwaddr *mmio64,
>>>>>> -                          unsigned n_dma, uint32_t *liobns, Error **errp);
>>>>>> +                          unsigned n_dma, uint32_t *liobns, hwaddr *nv2gpa,
>>>>>> +                          hwaddr *nv2atsd, Error **errp);
>>>>>> sPAPRResizeHPT resize_hpt_default;
>>>>>> sPAPRCapabilities default_caps;
>>>>>> sPAPRIrq *irq;
>>>>>> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
>>>>>> index 74c9b07..fda6e7e 100644
>>>>>> --- a/hw/ppc/spapr.c
>>>>>> +++ b/hw/ppc/spapr.c
>>>>>> @@ -3929,7 +3929,9 @@ static void spapr_phb_pre_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>>> smc->phb_placement(spapr, sphb->index,
>>>>>> &sphb->buid, &sphb->io_win_addr,
>>>>>> &sphb->mem_win_addr, &sphb->mem64_win_addr,
>>>>>> - windows_supported, sphb->dma_liobn, errp);
>>>>>> + windows_supported, sphb->dma_liobn,
>>>>>> +                       &sphb->nv2_gpa_win_addr, &sphb->nv2_atsd_win_addr,
>>>>>> + errp);
>>>>>> }
>>>>>>
>>>>>> static void spapr_phb_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
>>>>>> @@ -4129,7 +4131,8 @@ static const CPUArchIdList *spapr_possible_cpu_arch_ids(MachineState *machine)
>>>>>> static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
>>>>>> uint64_t *buid, hwaddr *pio,
>>>>>> hwaddr *mmio32, hwaddr *mmio64,
>>>>>> -                                unsigned n_dma, uint32_t *liobns, Error **errp)
>>>>>> +                                unsigned n_dma, uint32_t *liobns,
>>>>>> +                                hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
>>>>>> {
>>>>>> /*
>>>>>> * New-style PHB window placement.
>>>>>> @@ -4174,6 +4177,9 @@ static void spapr_phb_placement(sPAPRMachineState *spapr, uint32_t index,
>>>>>> *pio = SPAPR_PCI_BASE + index * SPAPR_PCI_IO_WIN_SIZE;
>>>>>> *mmio32 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM32_WIN_SIZE;
>>>>>> *mmio64 = SPAPR_PCI_BASE + (index + 1) * SPAPR_PCI_MEM64_WIN_SIZE;
>>>>>> +
>>>>>
>>>>> This doesn't look right. SPAPR_PCI_NV2ATSD_WIN_BASE appears to be
>>>>> defined such that there slots for NVGPU_MAX_NUM gpa "slots" of size
>>>>> SPAPR_PCI_NV2RAM64_WIN_SIZE before we get to the ATSD base.
>>>>>
>>>>>> +    *nv2gpa = SPAPR_PCI_NV2RAM64_WIN_BASE + index * SPAPR_PCI_NV2RAM64_WIN_SIZE;
>>>>>
>>>>> But this implies you need a "slot" for every possible PHB index, which
>>>>> is rather more than NVGPU_MAX_NUM.
>>>>>
>>>>>> +    *nv2atsd = SPAPR_PCI_NV2ATSD_WIN_BASE + index * SPAPR_PCI_NV2ATSD_WIN_SIZE;
>>>>
>>>>
>>>> Ah right :( These should then go above 128TiB I guess, as I do not
>>>> really want them to appear inside a huge DMA window.
>>>
>>> Right. So actually looks like you are already indexing the window
>>> slots by phb index, in which case you need to allow for 32 slots even
>>> though only 6 can be populated at the moment.
>>
>>
>> Why precisely 32? Round up of 18?
>
> Because 32 is the allowed number of vPHBs.
>
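Ok, so the windows need to be sized for 32 index slots then. A
hypothetical layout (the exact base is still to be picked, somewhere
above the huge DMA window):

    #define SPAPR_PCI_NV2RAM64_WIN_BASE (128 * TiB)
    #define SPAPR_PCI_NV2ATSD_WIN_BASE  (SPAPR_PCI_NV2RAM64_WIN_BASE + \
                                         32 * SPAPR_PCI_NV2RAM64_WIN_SIZE)
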
>>>>>> }
>>>>>>
>>>>>> static ICSState *spapr_ics_get(XICSFabric *dev, int irq)
>>>>>> @@ -4376,6 +4382,18 @@ DEFINE_SPAPR_MACHINE(4_0, "4.0", true);
>>>>>> /*
>>>>>> * pseries-3.1
>>>>>> */
>>>>>> +static void phb_placement_3_1(sPAPRMachineState *spapr, uint32_t index,
>>>>>> + uint64_t *buid, hwaddr *pio,
>>>>>> + hwaddr *mmio32, hwaddr *mmio64,
>>>>>> + unsigned n_dma, uint32_t *liobns,
>>>>>> +                              hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
>>>>>> +{
>>>>>> +    spapr_phb_placement(spapr, index, buid, pio, mmio32, mmio64, n_dma, liobns,
>>>>>> + nv2gpa, nv2atsd, errp);
>>>>>> + *nv2gpa = 0;
>>>>>> + *nv2atsd = 0;
>>>>>> +}
>>>>>> +
>>>>>> static void spapr_machine_3_1_class_options(MachineClass *mc)
>>>>>> {
>>>>>> sPAPRMachineClass *smc = SPAPR_MACHINE_CLASS(mc);
>>>>>> @@ -4391,6 +4409,7 @@ static void spapr_machine_3_1_class_options(MachineClass *mc)
>>>>>> mc->default_cpu_type = POWERPC_CPU_TYPE_NAME("power8_v2.0");
>>>>>> smc->update_dt_enabled = false;
>>>>>> smc->dr_phb_enabled = false;
>>>>>> + smc->phb_placement = phb_placement_3_1;
>>>>>> }
>>>>>>
>>>>>> DEFINE_SPAPR_MACHINE(3_1, "3.1", false);
>>>>>> @@ -4522,7 +4541,8 @@ DEFINE_SPAPR_MACHINE(2_8, "2.8", false);
>>>>>> static void phb_placement_2_7(sPAPRMachineState *spapr, uint32_t index,
>>>>>> uint64_t *buid, hwaddr *pio,
>>>>>> hwaddr *mmio32, hwaddr *mmio64,
>>>>>> -                              unsigned n_dma, uint32_t *liobns, Error **errp)
>>>>>> +                              unsigned n_dma, uint32_t *liobns,
>>>>>> +                              hwaddr *nv2gpa, hwaddr *nv2atsd, Error **errp)
>>>>>> {
>>>>>> /* Legacy PHB placement for pseries-2.7 and earlier machine types */
>>>>>> const uint64_t base_buid = 0x800000020000000ULL;
>>>>>> @@ -4566,6 +4586,9 @@ static void phb_placement_2_7(sPAPRMachineState *spapr, uint32_t index,
>>>>>> * fallback behaviour of automatically splitting a large "32-bit"
>>>>>> * window into contiguous 32-bit and 64-bit windows
>>>>>> */
>>>>>> +
>>>>>> + *nv2gpa = 0;
>>>>>> + *nv2atsd = 0;
>>>>>> }
>>>>>>
>>>>>> static void spapr_machine_2_7_class_options(MachineClass *mc)
>>>>>> diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
>>>>>> index 06a5ffd..f076462 100644
>>>>>> --- a/hw/ppc/spapr_pci.c
>>>>>> +++ b/hw/ppc/spapr_pci.c
>>>>>> @@ -1355,6 +1355,8 @@ static void spapr_populate_pci_child_dt(PCIDevice *dev, void *fdt, int offset,
>>>>>> if (sphb->pcie_ecs && pci_is_express(dev)) {
>>>>>>         _FDT(fdt_setprop_cell(fdt, offset, "ibm,pci-config-space-type", 0x1));
>>>>>> }
>>>>>> +
>>>>>> + spapr_phb_nvgpu_populate_pcidev_dt(dev, fdt, offset, sphb);
>>>>>> }
>>>>>>
>>>>>> /* create OF node for pci device and required OF DT properties */
>>>>>> @@ -1878,6 +1880,7 @@ static void spapr_phb_reset(DeviceState *qdev)
>>>>>> sPAPRPHBState *sphb = SPAPR_PCI_HOST_BRIDGE(qdev);
>>>>>>
>>>>>> spapr_phb_dma_reset(sphb);
>>>>>> + spapr_phb_nvgpu_setup(sphb);
>>>>>>
>>>>>> /* Reset the IOMMU state */
>>>>>> object_child_foreach(OBJECT(qdev), spapr_phb_children_reset, NULL);
>>>>>> @@ -1910,6 +1913,8 @@ static Property spapr_phb_properties[] = {
>>>>>> pre_2_8_migration, false),
>>>>>> DEFINE_PROP_BOOL("pcie-extended-configuration-space", sPAPRPHBState,
>>>>>> pcie_ecs, true),
>>>>>> + DEFINE_PROP_UINT64("gpa", sPAPRPHBState, nv2_gpa_win_addr, 0),
>>>>>> + DEFINE_PROP_UINT64("atsd", sPAPRPHBState, nv2_atsd_win_addr, 0),
>>>>>> DEFINE_PROP_END_OF_LIST(),
>>>>>> };
>>>>>>
>>>>>> @@ -2282,6 +2287,9 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, uint32_t intc_phandle, void *fdt,
>>>>>> return ret;
>>>>>> }
>>>>>>
>>>>>> + spapr_phb_nvgpu_populate_dt(phb, fdt, bus_off);
>>>>>> + spapr_phb_nvgpu_ram_populate_dt(phb, fdt);
>>>>>> +
>>>>>> return 0;
>>>>>> }
>>>>>>
>>>>>> diff --git a/hw/ppc/spapr_pci_nvlink2.c b/hw/ppc/spapr_pci_nvlink2.c
>>>>>> new file mode 100644
>>>>>> index 0000000..965a6be
>>>>>> --- /dev/null
>>>>>> +++ b/hw/ppc/spapr_pci_nvlink2.c
>>>>>> @@ -0,0 +1,419 @@
>>>>>> +/*
>>>>>> + * QEMU sPAPR PCI for NVLink2 pass through
>>>>>> + *
>>>>>> + * Copyright (c) 2019 Alexey Kardashevskiy, IBM Corporation.
>>>>>> + *
>>>>>> + * Permission is hereby granted, free of charge, to any person obtaining a copy
>>>>>> + * of this software and associated documentation files (the "Software"), to deal
>>>>>> + * in the Software without restriction, including without limitation the rights
>>>>>> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
>>>>>> + * copies of the Software, and to permit persons to whom the Software is
>>>>>> + * furnished to do so, subject to the following conditions:
>>>>>> + *
>>>>>> + * The above copyright notice and this permission notice shall be included in
>>>>>> + * all copies or substantial portions of the Software.
>>>>>> + *
>>>>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>>>>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>>>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>>>>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>>>>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
>>>>>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
>>>>>> + * THE SOFTWARE.
>>>>>> + */
>>>>>> +#include "qemu/osdep.h"
>>>>>> +#include "qapi/error.h"
>>>>>> +#include "qemu-common.h"
>>>>>> +#include "hw/pci/pci.h"
>>>>>> +#include "hw/pci-host/spapr.h"
>>>>>> +#include "qemu/error-report.h"
>>>>>> +#include "hw/ppc/fdt.h"
>>>>>> +#include "hw/pci/pci_bridge.h"
>>>>>> +
>>>>>> +#define PHANDLE_PCIDEV(phb, pdev) (0x12000000 | \
>>>>>> +                                        (((phb)->index) << 16) | ((pdev)->devfn))
>>>>>> +#define PHANDLE_GPURAM(phb, n) (0x110000FF | ((n) << 8) | \
>>>>>> + (((phb)->index) << 16))
>>>>>> +/* NVLink2 wants a separate NUMA node for its RAM */
>>>>>> +#define GPURAM_ASSOCIATIVITY(phb, n) (255 - ((phb)->index * 3 + (n)))
>>>>>> +#define PHANDLE_NVLINK(phb, gn, nn) (0x00130000 | (((phb)->index) << 8) | \
>>>>>> + ((gn) << 4) | (nn))
>>>>>> +
>>>>>> +/* Max number of NVLinks per GPU in any physical box */
>>>>>> +#define NVGPU_MAX_LINKS 3
>>>>>> +
>>>>>> +struct spapr_phb_pci_nvgpu_config {
>>>>>> + uint64_t nv2_ram_current;
>>>>>> + uint64_t nv2_atsd_current;
>>>>>> + int num; /* number of non empty (i.e. tgt!=0) entries in slots[] */
>>>>>> + struct spapr_phb_pci_nvgpu_slot {
>>>>>> + uint64_t tgt;
>>>>>> + uint64_t gpa;
>>>>>> + PCIDevice *gpdev;
>>>>>> + int linknum;
>>>>>> + struct {
>>>>>> + uint64_t atsd_gpa;
>>>>>> + PCIDevice *npdev;
>>>>>> + uint32_t link_speed;
>>>>>> + } links[NVGPU_MAX_LINKS];
>>>>>> + } slots[NVGPU_MAX_NUM];
>>>>>> +};
>>>>>> +
>>>>>> +static int spapr_pci_nvgpu_get_slot(struct spapr_phb_pci_nvgpu_config *nvgpus,
>>>>>> + uint64_t tgt)
>>>>>> +{
>>>>>> + int i;
>>>>>> +
>>>>>> + /* Search for partially collected "slot" */
>>>>>> + for (i = 0; i < nvgpus->num; ++i) {
>>>>>> + if (nvgpus->slots[i].tgt == tgt) {
>>>>>> + return i;
>>>>>> + }
>>>>>> + }
>>>>>> +
>>>>>> + if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
>>>>>> + warn_report("Found too many NVLink bridges per GPU");
>>>>>> + return -1;
>>>>>
>>>>> This is within qemu so it would be better to use the qemu error API
>>>>> than returning an error code.
>>>>
>>>> You mean returning Error**? Oh. Ok.
>>>
>>> Well, not returning, technically, but taking an Error ** parameter
>>> which is checked by the caller to detect errors.
>>
>>
>> None of these is actually propagated to the upper level as neither of
>> these is fatal (well, except one which I am turning into assert).
>
> Oh, ok. In that case you don't need an Error **.
Well, I added them already in my local tree and print the faults in
spapr_pci.c.
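
Roughly this pattern in the callers, for example in spapr_phb_reset():

    Error *local_err = NULL;

    spapr_phb_nvgpu_setup(sphb, &local_err);
    if (local_err) {
        error_report_err(local_err);
    }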
>>>>>> + }
>>>>>> +
>>>>>> + i = nvgpus->num;
>>>>>> + nvgpus->slots[i].tgt = tgt;
>>>>>> + ++nvgpus->num;
>>>>>> +
>>>>>> + return i;
>>>>>
>>>>> Might be nicer to return a pointer to the slot structure.
>>>>
>>>>
>>>> This can work.
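
Something like this then (untested sketch):

    static struct spapr_phb_pci_nvgpu_slot *
    spapr_pci_nvgpu_get_slot(struct spapr_phb_pci_nvgpu_config *nvgpus,
                             uint64_t tgt)
    {
        int i;

        /* Search for a partially collected "slot" */
        for (i = 0; i < nvgpus->num; ++i) {
            if (nvgpus->slots[i].tgt == tgt) {
                return &nvgpus->slots[i];
            }
        }

        if (nvgpus->num == ARRAY_SIZE(nvgpus->slots)) {
            return NULL; /* the caller warns/errors out */
        }

        nvgpus->slots[nvgpus->num].tgt = tgt;
        return &nvgpus->slots[nvgpus->num++];
    }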
>>>>
>>>>
>>>>>
>>>>>> +}
>>>>>> +
>>>>>> +static void spapr_pci_collect_nvgpu(struct spapr_phb_pci_nvgpu_config *nvgpus,
>>>>>> + PCIDevice *pdev, uint64_t tgt,
>>>>>> + MemoryRegion *mr)
>>>>>> +{
>>>>>> + int i = spapr_pci_nvgpu_get_slot(nvgpus, tgt);
>>>>>> +
>>>>>> + if (i < 0) {
>>>>>> + return;
>>>>>> + }
>>>>>> + g_assert(!nvgpus->slots[i].gpdev);
>>>>>> + nvgpus->slots[i].gpdev = pdev;
>>>>>> +
>>>>>> + nvgpus->slots[i].gpa = nvgpus->nv2_ram_current;
>>>>>> + nvgpus->nv2_ram_current += memory_region_size(mr);
>>>>>> +}
>>>>>> +
>>>>>> +static void spapr_pci_collect_nvnpu(struct spapr_phb_pci_nvgpu_config *nvgpus,
>>>>>> + PCIDevice *pdev, uint64_t tgt,
>>>>>> + MemoryRegion *mr)
>>>>>> +{
>>>>>> + int i = spapr_pci_nvgpu_get_slot(nvgpus, tgt), j;
>>>>>> + struct spapr_phb_pci_nvgpu_slot *nvslot;
>>>>>> +
>>>>>> + if (i < 0) {
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> + nvslot = &nvgpus->slots[i];
>>>>>> + j = nvslot->linknum;
>>>>>> + if (j == ARRAY_SIZE(nvslot->links)) {
>>>>>> + warn_report("Found too many NVLink2 bridges");
>>>>>> + return;
>>>>>> + }
>>>>>> + ++nvslot->linknum;
>>>>>> +
>>>>>> + g_assert(!nvslot->links[j].npdev);
>>>>>> + nvslot->links[j].npdev = pdev;
>>>>>> + nvslot->links[j].atsd_gpa = nvgpus->nv2_atsd_current;
>>>>>> + nvgpus->nv2_atsd_current += memory_region_size(mr);
>>>>>> + nvslot->links[j].link_speed =
>>>>>> +        object_property_get_uint(OBJECT(pdev), "nvlink2-link-speed", NULL);
>>>>>> +}
>>>>>> +
>>>>>> +static void spapr_phb_pci_collect_nvgpu(PCIBus *bus, PCIDevice *pdev,
>>>>>> + void *opaque)
>>>>>> +{
>>>>>> + PCIBus *sec_bus;
>>>>>> + Object *po = OBJECT(pdev);
>>>>>> + uint64_t tgt = object_property_get_uint(po, "nvlink2-tgt", NULL);
>>>>>> +
>>>>>> + if (tgt) {
>>>>>> +        Object *mr_gpu = object_property_get_link(po, "nvlink2-mr[0]", NULL);
>>>>>> +        Object *mr_npu = object_property_get_link(po, "nvlink2-atsd-mr[0]",
>>>>>> +                                                  NULL);
>>>>>> +
>>>>>> + if (mr_gpu) {
>>>>>> +            spapr_pci_collect_nvgpu(opaque, pdev, tgt, MEMORY_REGION(mr_gpu));
>>>>>> +        } else if (mr_npu) {
>>>>>> +            spapr_pci_collect_nvnpu(opaque, pdev, tgt, MEMORY_REGION(mr_npu));
>>>>>> + } else {
>>>>>> + warn_report("Unexpected device with \"nvlink2-tgt\"");
>>>>>
>>>>> IIUC this would have to be a code error, so should be an assert() not
>>>>> a warning.
>>>>
>>>>
>>>> Ok.
>>>>
>>>>>
>>>>>> + }
>>>>>> + }
>>>>>> + if ((pci_default_read_config(pdev, PCI_HEADER_TYPE, 1) !=
>>>>>> + PCI_HEADER_TYPE_BRIDGE)) {
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> + sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(pdev));
>>>>>> + if (!sec_bus) {
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> + pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
>>>>>> + spapr_phb_pci_collect_nvgpu, opaque);
>>>>>> +}
>>>>>> +
>>>>>> +void spapr_phb_nvgpu_setup(sPAPRPHBState *sphb)
>>>>>> +{
>>>>>> + int i, j, valid_gpu_num;
>>>>>> +
>>>>>> +    /* If there are existing NVLink2 MRs, unmap those before recreating */
>>>>>> + if (sphb->nvgpus) {
>>>>>> + for (i = 0; i < sphb->nvgpus->num; ++i) {
>>>>>> +            struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
>>>>>> +            Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
>>>>>> +                                                        "nvlink2-mr[0]", NULL);
>>>>>> +
>>>>>> + if (nv_mrobj) {
>>>>>> + memory_region_del_subregion(get_system_memory(),
>>>>>> + MEMORY_REGION(nv_mrobj));
>>>>>> + }
>>>>>> + for (j = 0; j < nvslot->linknum; ++j) {
>>>>>> + PCIDevice *npdev = nvslot->links[j].npdev;
>>>>>> + Object *atsd_mrobj;
>>>>>> +                atsd_mrobj = object_property_get_link(OBJECT(npdev),
>>>>>> +                                                      "nvlink2-atsd-mr[0]",
>>>>>> +                                                      NULL);
>>>>>> +                if (atsd_mrobj) {
>>>>>> +                    memory_region_del_subregion(get_system_memory(),
>>>>>> +                                                MEMORY_REGION(atsd_mrobj));
>>>>>> + }
>>>>>> + }
>>>>>> + }
>>>>>> + g_free(sphb->nvgpus);
>>>>>
>>>>> Probably worth collecting the above into a nvgpu_free() helper -
>>>>> chances are you'll want it on cleanup paths as well.
>>>>
>>>> The only other cleanup path is below and it only executes if there is no
>>>> MR added so for now it does not seem useful.
>>>
>>> Hrm... I've merged PHB hotplug recently.. so there should be a cleanup
>>> path for unplug as well.
>>
>>
>> ah right. Wooohooo :) btw with phb hotplug we can try supporting EEH on
>> hotplugged VFIO devices.
>
> Yeah, Sam is looking into it.
>
>>>>>> + sphb->nvgpus = NULL;
>>>>>> + }
>>>>>> +
>>>>>> + /* Search for GPUs and NPUs */
>>>>>> + if (sphb->nv2_gpa_win_addr && sphb->nv2_atsd_win_addr) {
>>>>>> + PCIBus *bus = PCI_HOST_BRIDGE(sphb)->bus;
>>>>>> +
>>>>>> + sphb->nvgpus = g_new0(struct spapr_phb_pci_nvgpu_config, 1);
>>>>>> + sphb->nvgpus->nv2_ram_current = sphb->nv2_gpa_win_addr;
>>>>>> + sphb->nvgpus->nv2_atsd_current = sphb->nv2_atsd_win_addr;
>>>>>> +
>>>>>> + pci_for_each_device(bus, pci_bus_num(bus),
>>>>>> + spapr_phb_pci_collect_nvgpu, sphb->nvgpus);
>>>>>> + }
>>>>>> +
>>>>>> + /* Add found GPU RAM and ATSD MRs if found */
>>>>>> + for (i = 0, valid_gpu_num = 0; i < sphb->nvgpus->num; ++i) {
>>>>>> + Object *nvmrobj;
>>>>>> +        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
>>>>>> +
>>>>>> + if (!nvslot->gpdev) {
>>>>>> + continue;
>>>>>> + }
>>>>>> + nvmrobj = object_property_get_link(OBJECT(nvslot->gpdev),
>>>>>> + "nvlink2-mr[0]", NULL);
>>>>>> + /* ATSD is pointless without GPU RAM MR so skip those */
>>>>>> + if (!nvmrobj) {
>>>>>> + continue;
>>>>>> + }
>>>>>> +
>>>>>> + ++valid_gpu_num;
>>>>>> + memory_region_add_subregion(get_system_memory(), nvslot->gpa,
>>>>>> + MEMORY_REGION(nvmrobj));
>>>>>> +
>>>>>> + for (j = 0; j < nvslot->linknum; ++j) {
>>>>>> + Object *atsdmrobj;
>>>>>> +
>>>>>> +            atsdmrobj = object_property_get_link(OBJECT(nvslot->links[j].npdev),
>>>>>> + "nvlink2-atsd-mr[0]",
>>>>>> + NULL);
>>>>>> + if (!atsdmrobj) {
>>>>>> + continue;
>>>>>> + }
>>>>>> + memory_region_add_subregion(get_system_memory(),
>>>>>> + nvslot->links[j].atsd_gpa,
>>>>>> + MEMORY_REGION(atsdmrobj));
>>>>>> + }
>>>>>> + }
>>>>>> +
>>>>>> + if (!valid_gpu_num) {
>>>>>> + /* We did not find any interesting GPU */
>>>>>> + g_free(sphb->nvgpus);
>>>>>> + sphb->nvgpus = NULL;
>>>>>> + }
>>>>>> +}
>>>>>> +
>>>>>> +void spapr_phb_nvgpu_populate_dt(sPAPRPHBState *sphb, void *fdt, int bus_off)
>>>>>> +{
>>>>>> + int i, j, atsdnum = 0;
>>>>>> + uint64_t atsd[8]; /* The existing limitation of known guests */
>>>>>> +
>>>>>> + if (!sphb->nvgpus) {
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> +    for (i = 0; (i < sphb->nvgpus->num) && (atsdnum < ARRAY_SIZE(atsd)); ++i) {
>>>>>> +        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
>>>>>> +
>>>>>> + if (!nvslot->gpdev) {
>>>>>> + continue;
>>>>>> + }
>>>>>> + for (j = 0; j < nvslot->linknum; ++j) {
>>>>>> + if (!nvslot->links[j].atsd_gpa) {
>>>>>> + continue;
>>>>>> + }
>>>>>> +
>>>>>> + if (atsdnum == ARRAY_SIZE(atsd)) {
>>>>>> + warn_report("Only %ld ATSD registers allowed",
>>>>>> + ARRAY_SIZE(atsd));
>>>>>
>>>>> Probably should be an error not a warning.
>>>>
>>>> We can still continue though, it is not fatal. These things come from
>>>> skiboot (which we control), but skiboot could also compose the
>>>> properties itself or use whatever hostboot provides (this does not
>>>> happen now though), and I would not like to be blocked by hostboot
>>>> if/when this happens.
>>>
>>> Um.. what? atsdnum is just a counter incremented below, it doesn't
>>> come from skiboot or any other host-significant value. The situation
>>> here is that we have more nvlinks assigned to a guest than qemu can
>>> support. Yes, you could technically run the guest with some of the
>>> links unavailable, but that seems pretty clearly not what the user
>>> wanted. Hence, an error is appropriate.
>>
>>
>> Not exactly. NVLinks are available whether they come with an ATSD VFIO
>> region or not; it was my choice to accompany ATSD with an NVLink2 bridge.
>> So it is quite possible to pass way too many links, and yes QEMU won't
>> expose all accompanying ATSDs to the guest, but 1) the guest might not need
>> this many ATSDs anyway (right now the NVIDIA driver always uses just one
>> and nobody complained about performance) and 2) an nvlink is functional as
>> long as the guest can access its config space.
>
> Sure, it can work. But remember the qemu user is setting up this
> configuration. I think it makes sense to error if it's a stupid and
> pointless configuration, even if the guest could technically work with
> it.
The ability of the guest to work kinda tells us that it is not that
pointless. BTW what exactly do you mean by reporting an error instead of
a warning? A fatal hw_error() or error_report() (which is just a message)?
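
If the idea is to propagate it, then I suppose
spapr_phb_nvgpu_populate_dt() grows an errp parameter (it does not have
one in this patch) and the check becomes something like:

    if (atsdnum == ARRAY_SIZE(atsd)) {
        error_setg(errp, "Only %zu ATSD registers supported",
                   ARRAY_SIZE(atsd));
        return;
    }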
>>>>>> + break;
>>>>>> + }
>>>>>> + atsd[atsdnum] = cpu_to_be64(nvslot->links[j].atsd_gpa);
>>>>>> + ++atsdnum;
>>>>>> + }
>>>>>> + }
>>>>>> +
>>>>>> + if (!atsdnum) {
>>>>>> + warn_report("No ATSD registers found");
>>>>>> + } else if (!spapr_phb_eeh_available(sphb)) {
>>>>>> + /*
>>>>>> +         * ibm,mmio-atsd contains ATSD registers; these belong to an NPU PHB
>>>>>> +         * which we do not emulate as a separate device. Instead we put
>>>>>> +         * ibm,mmio-atsd to the vPHB with GPU and make sure that we do not
>>>>>> +         * put GPUs from different IOMMU groups to the same vPHB to ensure
>>>>>> +         * that the guest will use ATSDs from the corresponding NPU.
>>>>>> + */
>>>>>> + warn_report("ATSD requires separate vPHB per GPU IOMMU group");
>>>>>> + } else {
>>>>>> + _FDT((fdt_setprop(fdt, bus_off, "ibm,mmio-atsd",
>>>>>> + atsd, atsdnum * sizeof(atsd[0]))));
>>>>>> + }
>>>>>> +}
>>>>>> +
>>>>>> +void spapr_phb_nvgpu_ram_populate_dt(sPAPRPHBState *sphb, void *fdt)
>>>>>> +{
>>>>>> + int i, j, linkidx, npuoff;
>>>>>> + char *npuname;
>>>>>> +
>>>>>> + if (!sphb->nvgpus) {
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> + npuname = g_strdup_printf("npuphb%d", sphb->index);
>>>>>> + npuoff = fdt_add_subnode(fdt, 0, npuname);
>>>>>> + _FDT(npuoff);
>>>>>> + _FDT(fdt_setprop_cell(fdt, npuoff, "#address-cells", 1));
>>>>>> + _FDT(fdt_setprop_cell(fdt, npuoff, "#size-cells", 0));
>>>>>> + /* Advertise NPU as POWER9 so the guest can enable NPU2 contexts */
>>>>>> +    _FDT((fdt_setprop_string(fdt, npuoff, "compatible", "ibm,power9-npu")));
>>>>>> + g_free(npuname);
>>>>>> +
>>>>>> + for (i = 0, linkidx = 0; i < sphb->nvgpus->num; ++i) {
>>>>>> + for (j = 0; j < sphb->nvgpus->slots[i].linknum; ++j) {
>>>>>> + char *linkname = g_strdup_printf("link@%d", linkidx);
>>>>>> + int off = fdt_add_subnode(fdt, npuoff, linkname);
>>>>>> +
>>>>>> + _FDT(off);
>>>>>> +            /* _FDT((fdt_setprop_cell(fdt, off, "reg", linkidx))); */
>>>>>
>>>>> Are the indices you're using for 'reg' and the unit name arbitrary?
>>>>> If so it's generally best to base them on some static property of the
>>>>> device, rather than just allocating sequentially.
>>>>
>>>> On the host, reg is the link index. Here it is actually commented out
>>>> as a reminder for the future.
>>>>
>>>>>
>>>>>> + _FDT((fdt_setprop_string(fdt, off, "compatible",
>>>>>> + "ibm,npu-link")));
>>>>>> + _FDT((fdt_setprop_cell(fdt, off, "phandle",
>>>>>> + PHANDLE_NVLINK(sphb, i, j))));
>>>>>> +            _FDT((fdt_setprop_cell(fdt, off, "ibm,npu-link-index", linkidx)));
>>>>>
>>>>> Why do you need the index here as well as in reg?
>>>>
>>>> I do not need "reg" really and I need index for this:
>>>>
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/powerpc/platforms/powernv/npu-dma.c?h=v4.20#n692
>>>
>>>
>>> Ok, because of a silly binding. That's a good enough reason.
>>>
>>>>>> + g_free(linkname);
>>>>>> + ++linkidx;
>>>>>> + }
>>>>>> + }
>>>>>> +
>>>>>> + /* Add memory nodes for GPU RAM and mark them unusable */
>>>>>> + for (i = 0; i < sphb->nvgpus->num; ++i) {
>>>>>> +        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
>>>>>> +        Object *nv_mrobj = object_property_get_link(OBJECT(nvslot->gpdev),
>>>>>> +                                                    "nvlink2-mr[0]", NULL);
>>>>>> + uint32_t at = cpu_to_be32(GPURAM_ASSOCIATIVITY(sphb, i));
>>>>>> + uint32_t associativity[] = { cpu_to_be32(0x4), at, at, at, at };
>>>>>> +        uint64_t size = object_property_get_uint(nv_mrobj, "size", NULL);
>>>>>> +        uint64_t mem_reg[2] = { cpu_to_be64(nvslot->gpa), cpu_to_be64(size) };
>>>>>> + char *mem_name = g_strdup_printf("memory@%lx", nvslot->gpa);
>>>>>> + int off = fdt_add_subnode(fdt, 0, mem_name);
>>>>>> +
>>>>>> + _FDT(off);
>>>>>> + _FDT((fdt_setprop_string(fdt, off, "device_type", "memory")));
>>>>>> + _FDT((fdt_setprop(fdt, off, "reg", mem_reg, sizeof(mem_reg))));
>>>>>> + _FDT((fdt_setprop(fdt, off, "ibm,associativity", associativity,
>>>>>> + sizeof(associativity))));
>>>>>> +
>>>>>> + _FDT((fdt_setprop_string(fdt, off, "compatible",
>>>>>> + "ibm,coherent-device-memory")));
>>>>>> +
>>>>>> + mem_reg[1] = cpu_to_be64(0);
>>>>>> + _FDT((fdt_setprop(fdt, off, "linux,usable-memory", mem_reg,
>>>>>> + sizeof(mem_reg))));
>>>>>> + _FDT((fdt_setprop_cell(fdt, off, "phandle",
>>>>>> + PHANDLE_GPURAM(sphb, i))));
>>>>>> + g_free(mem_name);
>>>>>> + }
>>>>>> +
>>>>>> +}
>>>>>> +
>>>>>> +void spapr_phb_nvgpu_populate_pcidev_dt(PCIDevice *dev, void *fdt, int offset,
>>>>>> + sPAPRPHBState *sphb)
>>>>>> +{
>>>>>> + int i, j;
>>>>>> +
>>>>>> + if (!sphb->nvgpus) {
>>>>>> + return;
>>>>>> + }
>>>>>> +
>>>>>> + for (i = 0; i < sphb->nvgpus->num; ++i) {
>>>>>> +        struct spapr_phb_pci_nvgpu_slot *nvslot = &sphb->nvgpus->slots[i];
>>>>>> +
>>>>>> + /* Skip "slot" without attached GPU */
>>>>>
>>>>> IIUC a "slot" should always have at least one GPU. You need to handle
>>>>> the case of an unitialized GPU in the "collect" functions because you
>>>>> don't know if you'll discover the GPU or an NPU first. But here not
>>>>> having a GPU should be an error, shouldn't it?
>>>>
>>>>
>>>> If someone decides to pass 1 GPU with all related nvlinks, and also
>>>> nvlinks from another GPU but without the related GPU, for whatever
>>>> reason, should we really stop him/her? Things won't work at their best
>>>> but this still might be useful for weird debugging.
>>>
>>> Hm, ok, I guess so.
>>>
>>>>>> + if (!nvslot->gpdev) {
>>>>>> + continue;
>>>>>> + }
>>>>>> + if (dev == nvslot->gpdev) {
>>>>>> + uint32_t npus[nvslot->linknum];
>>>>>> +
>>>>>> + for (j = 0; j < nvslot->linknum; ++j) {
>>>>>> + PCIDevice *npdev = nvslot->links[j].npdev;
>>>>>> +
>>>>>> + npus[j] = cpu_to_be32(PHANDLE_PCIDEV(sphb, npdev));
>>>>>> + }
>>>>>> + _FDT(fdt_setprop(fdt, offset, "ibm,npu", npus,
>>>>>> + j * sizeof(npus[0])));
>>>>>> + _FDT((fdt_setprop_cell(fdt, offset, "phandle",
>>>>>> + PHANDLE_PCIDEV(sphb, dev))));
>>>>>> + continue;
>>>>>> + }
>>>>>> +
>>>>>> + for (j = 0; j < nvslot->linknum; ++j) {
>>>>>> + if (dev != nvslot->links[j].npdev) {
>>>>>> + continue;
>>>>>> + }
>>>>>> +
>>>>>> + _FDT((fdt_setprop_cell(fdt, offset, "phandle",
>>>>>> + PHANDLE_PCIDEV(sphb, dev))));
>>>>>> + _FDT(fdt_setprop_cell(fdt, offset, "ibm,gpu",
>>>>>> + PHANDLE_PCIDEV(sphb, nvslot->gpdev)));
>>>>>> + _FDT((fdt_setprop_cell(fdt, offset, "ibm,nvlink",
>>>>>> + PHANDLE_NVLINK(sphb, i, j))));
>>>>>> + /*
>>>>>> +             * If we ever want to emulate GPU RAM at the same location as on
>>>>>> + * the host - here is the encoding GPA->TGT:
>>>>>> + *
>>>>>> + * gta = ((sphb->nv2_gpa >> 42) & 0x1) << 42;
>>>>>> + * gta |= ((sphb->nv2_gpa >> 45) & 0x3) << 43;
>>>>>> + * gta |= ((sphb->nv2_gpa >> 49) & 0x3) << 45;
>>>>>> + * gta |= sphb->nv2_gpa & ((1UL << 43) - 1);
>>>>>> + */
>>>>>> + _FDT(fdt_setprop_cell(fdt, offset, "memory-region",
>>>>>> + PHANDLE_GPURAM(sphb, i)));
>>>>>> + _FDT(fdt_setprop_u64(fdt, offset, "ibm,device-tgt-addr",
>>>>>> + nvslot->tgt));
>>>>>> + _FDT(fdt_setprop_cell(fdt, offset, "ibm,nvlink-speed",
>>>>>> + nvslot->links[j].link_speed));
>>>>>> + }
>>>>>> + }
>>>>>> +}
>>>>>> diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
>>>>>> index 40a1200..15ec0b4 100644
>>>>>> --- a/hw/vfio/pci-quirks.c
>>>>>> +++ b/hw/vfio/pci-quirks.c
>>>>>> @@ -2180,3 +2180,123 @@ int vfio_add_virt_caps(VFIOPCIDevice *vdev, Error **errp)
>>>>>>
>>>>>> return 0;
>>>>>> }
>>>>>> +
>>>>>> +static void vfio_pci_nvlink2_get_tgt(Object *obj, Visitor *v,
>>>>>> + const char *name,
>>>>>> + void *opaque, Error **errp)
>>>>>> +{
>>>>>> + uint64_t tgt = (uint64_t) opaque;
>>>>>> + visit_type_uint64(v, name, &tgt, errp);
>>>>>> +}
>>>>>> +
>>>>>> +static void vfio_pci_nvlink2_get_link_speed(Object *obj, Visitor *v,
>>>>>> + const char *name,
>>>>>> +                                            void *opaque, Error **errp)
>>>>>> +{
>>>>>> + uint32_t link_speed = (uint32_t)(uint64_t) opaque;
>>>>>> + visit_type_uint32(v, name, &link_speed, errp);
>>>>>> +}
>>>>>> +
>>>>>> +int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp)
>>>>>> +{
>>>>>> + int ret;
>>>>>> + void *p;
>>>>>> + struct vfio_region_info *nv2region = NULL;
>>>>>> + struct vfio_info_cap_header *hdr;
>>>>>> + MemoryRegion *nv2mr = g_malloc0(sizeof(*nv2mr));
>>>>>> +
>>>>>> + ret = vfio_get_dev_region_info(&vdev->vbasedev,
>>>>>> + VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
>>>>>> + PCI_VENDOR_ID_NVIDIA,
>>>>>> +                                   VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
>>>>>> + &nv2region);
>>>>>> + if (ret) {
>>>>>> + return ret;
>>>>>> + }
>>>>>> +
>>>>>> + p = mmap(NULL, nv2region->size, PROT_READ | PROT_WRITE | PROT_EXEC,
>>>>>> + MAP_SHARED, vdev->vbasedev.fd, nv2region->offset);
>>>>>> +
>>>>>> + if (!p) {
>>>>>> + return -errno;
>>>>>> + }
>>>>>> +
>>>>>> + memory_region_init_ram_ptr(nv2mr, OBJECT(vdev), "nvlink2-mr",
>>>>>> + nv2region->size, p);
>>>>>> +
>>>>>> + hdr = vfio_get_region_info_cap(nv2region,
>>>>>> + VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
>>>>>> + if (hdr) {
>>>>>> + struct vfio_region_info_cap_nvlink2_ssatgt *cap = (void *) hdr;
>>>>>> +
>>>>>> + object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
>>>>>> + vfio_pci_nvlink2_get_tgt, NULL, NULL,
>>>>>> + (void *) cap->tgt, NULL);
>>>>>> +        trace_vfio_pci_nvidia_gpu_setup_quirk(vdev->vbasedev.name, cap->tgt,
>>>>>> + nv2region->size);
>>>>>> + }
>>>>>> + g_free(nv2region);
>>>>>> +
>>>>>> + return 0;
>>>>>> +}
>>>>>> +
>>>>>> +int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp)
>>>>>> +{
>>>>>> + int ret;
>>>>>> + void *p;
>>>>>> + struct vfio_region_info *atsd_region = NULL;
>>>>>> + struct vfio_info_cap_header *hdr;
>>>>>> +
>>>>>> + ret = vfio_get_dev_region_info(&vdev->vbasedev,
>>>>>> + VFIO_REGION_TYPE_PCI_VENDOR_TYPE |
>>>>>> + PCI_VENDOR_ID_IBM,
>>>>>> + VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
>>>>>> + &atsd_region);
>>>>>> + if (ret) {
>>>>>> + return ret;
>>>>>> + }
>>>>>> +
>>>>>> + /* Some NVLink bridges come without assigned ATSD, skip MR part */
>>>>>> + if (atsd_region->size) {
>>>>>> + MemoryRegion *atsd_mr = g_malloc0(sizeof(*atsd_mr));
>>>>>> +
>>>>>> +        p = mmap(NULL, atsd_region->size, PROT_READ | PROT_WRITE | PROT_EXEC,
>>>>>> + MAP_SHARED, vdev->vbasedev.fd, atsd_region->offset);
>>>>>> +
>>>>>> + if (!p) {
>>>>>> + return -errno;
>>>>>> + }
>>>>>> +
>>>>>> + memory_region_init_ram_device_ptr(atsd_mr, OBJECT(vdev),
>>>>>> + "nvlink2-atsd-mr",
>>>>>> + atsd_region->size,
>>>>>> + p);
>>>>>> + }
>>>>>> +
>>>>>> + hdr = vfio_get_region_info_cap(atsd_region,
>>>>>> + VFIO_REGION_INFO_CAP_NVLINK2_SSATGT);
>>>>>> + if (hdr) {
>>>>>> + struct vfio_region_info_cap_nvlink2_ssatgt *cap = (void *) hdr;
>>>>>> +
>>>>>> + object_property_add(OBJECT(vdev), "nvlink2-tgt", "uint64",
>>>>>> + vfio_pci_nvlink2_get_tgt, NULL, NULL,
>>>>>> + (void *) cap->tgt, NULL);
>>>>>> +        trace_vfio_pci_nvlink2_setup_quirk_ssatgt(vdev->vbasedev.name, cap->tgt,
>>>>>> + atsd_region->size);
>>>>>> + }
>>>>>> +
>>>>>> + hdr = vfio_get_region_info_cap(atsd_region,
>>>>>> + VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD);
>>>>>> + if (hdr) {
>>>>>> + struct vfio_region_info_cap_nvlink2_lnkspd *cap = (void *) hdr;
>>>>>> +
>>>>>> +        object_property_add(OBJECT(vdev), "nvlink2-link-speed", "uint32",
>>>>>> + vfio_pci_nvlink2_get_link_speed, NULL, NULL,
>>>>>> + (void *) (uint64_t) cap->link_speed, NULL);
>>>>>> + trace_vfio_pci_nvlink2_setup_quirk_lnkspd(vdev->vbasedev.name,
>>>>>> + cap->link_speed);
>>>>>> + }
>>>>>> + g_free(atsd_region);
>>>>>> +
>>>>>> + return 0;
>>>>>> +}
>>>>>> diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
>>>>>> index dd12f36..07aa141 100644
>>>>>> --- a/hw/vfio/pci.c
>>>>>> +++ b/hw/vfio/pci.c
>>>>>> @@ -3069,6 +3069,20 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
>>>>>> goto out_teardown;
>>>>>> }
>>>>>>
>>>>>> + if (vdev->vendor_id == PCI_VENDOR_ID_NVIDIA) {
>>>>>> + ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
>>>>>> + if (ret && ret != -ENODEV) {
>>>>>> + error_report("Failed to setup NVIDIA V100 GPU RAM");
>>>>>> + }
>>>>>> + }
>>>>>> +
>>>>>> + if (vdev->vendor_id == PCI_VENDOR_ID_IBM) {
>>>>>> + ret = vfio_pci_nvlink2_init(vdev, errp);
>>>>>> + if (ret && ret != -ENODEV) {
>>>>>> + error_report("Failed to setup NVlink2 bridge");
>>>>>> + }
>>>>>> + }
>>>>>> +
>>>>>> vfio_register_err_notifier(vdev);
>>>>>> vfio_register_req_notifier(vdev);
>>>>>> vfio_setup_resetfn_quirk(vdev);
>>>>>> diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
>>>>>> index cf1e886..88841e9 100644
>>>>>> --- a/hw/vfio/trace-events
>>>>>> +++ b/hw/vfio/trace-events
>>>>>> @@ -87,6 +87,10 @@ vfio_pci_igd_opregion_enabled(const char *name) "%s"
>>>>>> vfio_pci_igd_host_bridge_enabled(const char *name) "%s"
>>>>>> vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
>>>>>>
>>>>>> +vfio_pci_nvidia_gpu_setup_quirk(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64
>>>>>> +vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64
>>>>>> +vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t link_speed) "%s link_speed=0x%x"
>>>>>> +
>>>>>> # hw/vfio/common.c
>>>>>> vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
>>>>>> vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
>>>>>
>>>>
>>>
>>
>
--
Alexey