date:20220615

On Wed, Jun 15, 2022 at 06:07:44AM +0900, Akihiko Odaki wrote:
> softmmu/datadir.c had its own implementation to find files in the
> build tree, but now bundle mechanism provides the unified
> implementation which works for datadir and the other files.
> 
> Signed-off-by: Akihiko Odaki 
> ---
>  .travis.yml |  2 +-
>  meson.build |  3 ++-
>  pc-bios/keymaps/meson.build |  3 +++
>  pc-bios/meson.build | 17 +
>  scripts/oss-fuzz/build.sh   |  2 +-
>  softmmu/datadir.c   | 35 ---
>  tests/qtest/fuzz/fuzz.c | 15 ---
>  tests/vm/fedora |  2 +-
>  tests/vm/freebsd|  2 +-
>  tests/vm/netbsd |  2 +-
>  tests/vm/openbsd|  2 +-
>  11 files changed, 32 insertions(+), 53 deletions(-)
> 
> diff --git a/.travis.yml b/.travis.yml
> index 9afc4a54b8f..9fee2167b95 100644
> --- a/.travis.yml
> +++ b/.travis.yml
> @@ -223,7 +223,7 @@ jobs:
>  - BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$?
>  - |
>if [ "$BUILD_RC" -eq 0 ] ; then
> -  mv pc-bios/s390-ccw/*.img pc-bios/ ;
> +  mv pc-bios/s390-ccw/*.img qemu-bundle/share/qemu ;
>${TEST_CMD} ;
>else
>$(exit $BUILD_RC);
> diff --git a/meson.build b/meson.build
> index 0c2e11ff071..c573815813f 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -32,6 +32,7 @@ if get_option('qemu_suffix').startswith('/')
>error('qemu_suffix cannot start with a /')
>  endif
>  
> +qemu_bundledir = meson.project_build_root() / 'qemu-bundle'
>  qemu_confdir = get_option('sysconfdir') / get_option('qemu_suffix')
>  qemu_datadir = get_option('datadir') / get_option('qemu_suffix')
>  qemu_docdir = get_option('docdir') / get_option('qemu_suffix')
> @@ -1682,7 +1683,7 @@ endif
>  config_host_data.set_quoted('CONFIG_BINDIR', get_option('prefix') / 
> get_option('bindir'))
>  config_host_data.set_quoted('CONFIG_PREFIX', get_option('prefix'))
>  config_host_data.set_quoted('CONFIG_QEMU_CONFDIR', get_option('prefix') / 
> qemu_confdir)
> -config_host_data.set_quoted('CONFIG_QEMU_DATADIR', get_option('prefix') / 
> qemu_datadir)
> +config_host_data.set_quoted('CONFIG_QEMU_BUNDLE_DATADIR', qemu_datadir)
>  config_host_data.set_quoted('CONFIG_QEMU_DESKTOPDIR', get_option('prefix') / 
> qemu_desktopdir)
>  config_host_data.set_quoted('CONFIG_QEMU_FIRMWAREPATH', get_option('prefix') 
> / get_option('qemu_firmwarepath'))
>  config_host_data.set_quoted('CONFIG_QEMU_HELPERDIR', get_option('prefix') / 
> get_option('libexecdir'))
> diff --git a/pc-bios/keymaps/meson.build b/pc-bios/keymaps/meson.build
> index 44247a12b54..b8bac138756 100644
> --- a/pc-bios/keymaps/meson.build
> +++ b/pc-bios/keymaps/meson.build
> @@ -67,3 +67,6 @@ if native_qemu_keymap.found()
>  endif
>  
>  install_data(['sl', 'sv'], install_dir: qemu_datadir / 'keymaps')
> +
> +run_command('ln', '-sf', '../../../pc-bios/keymaps', qemu_bundledir / 
> qemu_datadir,
> +check: true)
> diff --git a/pc-bios/meson.build b/pc-bios/meson.build
> index 41ba1c0ec7b..d1ff75b0b13 100644
> --- a/pc-bios/meson.build
> +++ b/pc-bios/meson.build
> @@ -1,3 +1,5 @@
> +run_command('mkdir', '-p', qemu_bundledir / qemu_datadir, check: true)
> +
>  roms = []
>  if unpack_edk2_blobs
>fds = [
> @@ -20,6 +22,9 @@ if unpack_edk2_blobs
>install: get_option('install_blobs'),
>install_dir: qemu_datadir,
>command: [ bzip2, '-dc', '@INPUT0@' ])
> +
> +run_command('ln', '-sf', '../../../pc-bios' / f, qemu_bundledir / 
> qemu_datadir,
> +check: true)
>endforeach
>  endif
>  
> @@ -85,15 +90,11 @@ blobs = [
>'vof-nvram.bin',
>  ]
>  
> -ln_s = [find_program('ln', required: true), '-sf']
> +install_data(blobs, install_dir: qemu_datadir)
> +
>  foreach f : blobs
> -  roms += custom_target(f,
> -build_by_default: have_system,
> -output: f,
> -input: files('meson.build'),# dummy input
> -install: get_option('install_blobs'),
> -install_dir: qemu_datadir,
> -command: [ ln_s, meson.project_source_root() / 'pc-bios' / 
> f, '@OUTPUT@' ])
> +  run_command('ln', '-sf', meson.current_source_dir() / f, qemu_bundledir / 
> qemu_datadir,
> +  check: true)
>  endforeach
>  
>  subdir('descriptors')
> diff --git a/scripts/oss-fuzz/build.sh b/scripts/oss-fuzz/build.sh
> index 98b56e05210..cbf8b3080e9 100755
> --- a/scripts/oss-fuzz/build.sh
> +++ b/scripts/oss-fuzz/build.sh
> @@ -88,7 +88,7 @@ if [ "$GITLAB_CI" != "true" ]; then
>  fi
>  
>  # Copy over the datadir
> -cp  -r ../pc-bios/ "$DEST_DIR/pc-bios"
> +cp  -r ../pc-bios/ "$DEST_DIR/qemu-bundle/share/qemu"
>  
>  targets=$(./qemu-fuzz-i386 | awk '$1 ~ /\*/  {print $2}')
>  base_copy="$DEST_DIR/qemu-fuzz-i386-target-$(echo "$targets" | head -n 1)"
> diff --git a/softmmu/d

Re: [PATCH] build: fix check for -fsanitize-coverage-allowlist


On 6/14/22 17:54, Alexander Bulekov wrote:

The existing check has two problems:
1. Meson uses a private directory for the get_supported_arguments check.
./instrumentation-filter does not exist in that private directory (it is
copied into the root of the build-directory).

2. fsanitize-coverage-allowlist is unused when coverage instrumentation
is not configured. No instrumentation flags are passed for the
get_supported_arguments check.

Thus the check always fails. To work around this, change the check to an
"if cc.compiles" check and provide /dev/null, instead of the real
filter.

Meson log:
Working directory:  build/meson-private/tmpl6wld2d9
Command line:  clang-13 -m64 -mcx16
build/meson-private/tmpl6wld2d9/output.obj -c -O3 -D_FILE_OFFSET_BITS=64
-O0 -Werror=implicit-function-declaration -Werror=unknown-warning-option
-Werror=unused-command-line-argument
-Werror=ignored-optimization-argument
-fsanitize-coverage-allowlist=instrumentation-filter

Error:
error: argument unused during compilation:
'-fsanitize-coverage-allowlist=instrumentation-filter'

Signed-off-by: Alexander Bulekov 
---
  meson.build | 10 +++---
  1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/meson.build b/meson.build
index 0c2e11ff07..85134267b5 100644
--- a/meson.build
+++ b/meson.build
@@ -209,9 +209,13 @@ if get_option('fuzzing')
configure_file(output: 'instrumentation-filter',
   input: 'scripts/oss-fuzz/instrumentation-filter-template',
   copy: true)
-  add_global_arguments(
-  
cc.get_supported_arguments('-fsanitize-coverage-allowlist=instrumentation-filter'),
-  native: false, language: ['c', 'cpp', 'objc'])
+
+  if cc.compiles('int main () { return 0; }',
+  name: '-fsanitize-coverage-allowlist=/dev/null',
+ args: ['-fsanitize-coverage-allowlist=/dev/null'] )
+
add_global_arguments('-fsanitize-coverage-allowlist=instrumentation-filter',
+ native: false, language: ['c', 'cpp', 'objc'])
+  endif
  
if get_option('fuzzing_engine') == ''

  # Add CFLAGS to tell clang to add fuzzer-related instrumentation to all 
the


Queued, thanks.

Paolo

Re: [PATCH v4 2/4] datadir: Use bundle mechanism


On 6/14/22 23:07, Akihiko Odaki wrote:

)
+
  roms = []
  if unpack_edk2_blobs
fds = [
@@ -20,6 +22,9 @@ if unpack_edk2_blobs
install: get_option('install_blobs'),
install_dir: qemu_datadir,
command: [ bzip2, '-dc', '@INPUT0@' ])
+
+run_command('ln', '-sf', '../../../pc-bios' / f, qemu_bundledir / 
qemu_datadir,
+check: true)
endforeach
  endif
  
@@ -85,15 +90,11 @@ blobs = [

'vof-nvram.bin',
  ]
  
-ln_s = [find_program('ln', required: true), '-sf']

+install_data(blobs, install_dir: qemu_datadir)


This needs to be conditional on get_option('install_blobs').

Paolo


  foreach f : blobs
-  roms += custom_target(f,
-build_by_default: have_system,
-output: f,
-input: files('meson.build'),# dummy input
-install: get_option('install_blobs'),
-install_dir: qemu_datadir,
-command: [ ln_s, meson.project_source_root() / 'pc-bios' / f, 
'@OUTPUT@' ])
+  run_command('ln', '-sf', meson.current_source_dir() / f, qemu_bundledir / 
qemu_datadir,
+  check: true)
  endforeach
  
  subdir('descriptors')

Re: [PATCH v4 1/4] cutils: Introduce bundle mechanism


On 6/14/22 23:07, Akihiko Odaki wrote:

diff --git a/util/cutils.c b/util/cutils.c
index a58bcfd80e7..fe3bbb1c4eb 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -1086,3 +1086,36 @@ char *get_relocated_path(const char *dir)
  }
  return g_string_free(result, false);
  }
+
+static const char * const bundle_formats[] = {
+"%s" G_DIR_SEPARATOR_S ".." G_DIR_SEPARATOR_S "%s",
+"%s" G_DIR_SEPARATOR_S "qemu-bundle" G_DIR_SEPARATOR_S "%s"
+};


Why do you need both?

Paolo

Re: [PATCH] hw/mem/nvdimm: fix error message for 'unarmed' flag

2022-06-15 Thread David Hildenbrand

On 14.06.22 14:13, Julia Suvorova wrote:
> On Tue, Jun 14, 2022 at 11:50 AM David Hildenbrand  wrote:
>>
>> On 14.06.22 10:54, Igor Mammedov wrote:
>>> On Mon, 13 Jun 2022 16:09:53 +0100
>>> Stefan Hajnoczi  wrote:
>>>
 On Mon, Jun 13, 2022 at 05:01:10PM +0200, Julia Suvorova wrote:
> On Tue, May 31, 2022 at 5:32 PM Stefan Hajnoczi  
> wrote:
>>
>> On Tue, May 31, 2022 at 04:51:47PM +0200, Julia Suvorova wrote:
>>> In the ACPI specification [1], the 'unarmed' bit is set when a device
>>> cannot accept a persistent write. This means that when a memdev is
>>> read-only, the 'unarmed' flag must be turned on. The logic is correct,
>>> just changing the error message.
>>>
>>> [1] ACPI NFIT NVDIMM Region Mapping Structure "NVDIMM State Flags" Bit 3
>>>
>>> Signed-off-by: Julia Suvorova 
>>> ---
>>>  hw/mem/nvdimm.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> Reviewed-by: Stefan Hajnoczi 
>
> It seems like Xiao is not active, whose tree should this patch go to?
>>
>> Is that a temporary or a permanent thing? Do we know?
> 
> No idea. But his last signed-off was three years ago.

I sent a patch to Xiao, asking if he's still active in QEMU. If I don't
get a reply this week, I'll move forward with proposing an update to
MAINTAINERS as described.

-- 
Thanks,

David / dhildenb

Re: [PATCH v4 0/4] cutils: Introduce bundle mechanism

On Wed, Jun 15, 2022 at 06:07:42AM +0900, Akihiko Odaki wrote:
> Developers often run QEMU without installing. The bundle mechanism
> allows to look up files which should be present in installation even in
> such a situation.
> 
> It is a general mechanism and can find any files located relative
> to the installation tree. The build tree must have a new directory,
> qemu-bundle, to represent what files the installation tree would
> have for reference by the executables.

I don't think this is an attractive approach to the problem,
because it results in us adding a bunch of meson rules to
simulate 'make install' within the build dir. This is undesirable
clutter IMHO, and can be solved more simply by just modifying the
qemu_find_file() method.

The core problem is the impl of qemu_find_file is taking the wrong
approach, in several ways, but mostly because of its use of a single
'data_dirs' array for all types of file. This is bad because it
has the assumption that build dir and install dir layouts match,
and second because when we add extra firmware data dirs, we don't
want this used for non-firmware files.

We need to separate out the handling of different types of resources
for this to work correctly.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v12 00/14] vfio-user server in QEMU

2022-06-15 Thread Stefan Hajnoczi

On Tue, 14 Jun 2022 at 17:38, Stefan Hajnoczi  wrote:
>
> On Tue, Jun 14, 2022 at 02:37:02PM +0000, Jag Raman wrote:
> > > On Jun 14, 2022, at 3:06 AM, Stefan Hajnoczi  wrote:
> > >
> > > On Mon, Jun 13, 2022 at 04:26:20PM -0400, Jagannathan Raman wrote:
> > >> This is v12 of the server side changes to enable vfio-user in QEMU.
> > >>
> > >> Thanks so much for reviewing this series and sharing your feedback.
> > >>
> > >> We made the following changes in this series:
> > >> [PATCH v12 13/14] vfio-user: handle device interrupts
> > >> - Renamed msi_set_irq_state() and msix_set_irq_state() as
> > >>   msi_set_mask() and msix_set_mask() respectively
> > >> - Added missing return statement for error case in msi_set_mask()
> > >
> > > Thanks, applied to my block tree:
> > > https://gitlab.com/stefanha/qemu/commits/block
> >
> > Thank you very much, Stefan! :)
>
> You're welcome! Thanks for the persistence in getting the vfio-user
> server into QEMU.
>
> I have mirrored libvfio-user here:
> https://gitlab.com/qemu-project/libvfio-user
>
> The QEMU project's policy is to mirror dependencies so full source code
> can be provided even in the event that dependencies become unavailable.
> The mirror is currently manually updated, so please ping me if you want
> newer commits.

I have semi-automated the mirroring. As long as I use my laptop the
repo will be kept up-to-date. :)

Stefan

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support

On Jun 14 08:41, Keith Busch wrote:
> It's a pretty nasty hack, and definitely not in compliance with the spec: the
> db_addr is supposed to be read-only from the device side, though I do think
> it's safe for this environment. Unless Klaus or anyone finds something I'm
> missing, I feel this is an acceptable compromise to address this odd
> discrepancy.
> 

No, I love this hack! :D

I have tested your hack against a dbbuf enabled driver that enables
shadow doorbells on the admin queue by default. I can confirm that this
works as well as on "broken" (or, let's call them "reasonable") drivers.

> By the way, I noticed that the patch never updates the cq's ei_addr value. Is
> that on purpose?

Yeah, I also mentioned this previously[1] and I still think we need to
update the event index. Otherwise (and my testing confirms this), we end
up in a situation where the driver skips the mmio, leaving a completion
queue entry "in use" on the device until some other completion comes
along.

I have folded these changes into a patch for testing[2]. Note, your
patch was missing equivalent changes in nvme_post_cqes(), so I added
that as well as updating of the event index.

  [1]: https://lore.kernel.org/qemu-devel/YqEMwsclktptJvQI@apples/
  [2]: 
http://git.infradead.org/qemu-nvme.git/commitdiff/60712930e441b684490a792b00ef6698cc85f116

Cheers,
Klaus

signature.asc
Description: PGP signature

Re: [PATCH v4 0/4] cutils: Introduce bundle mechanism


On 6/14/22 23:07, Akihiko Odaki wrote:

Developers often run QEMU without installing. The bundle mechanism
allows to look up files which should be present in installation even in
such a situation.

It is a general mechanism and can find any files located relative
to the installation tree. The build tree must have a new directory,
qemu-bundle, to represent what files the installation tree would
have for reference by the executables.

v4:
* Add Daniel P. Berrangé to CC. Hopefully this helps merging his patch:
   https://mail.gnu.org/archive/html/qemu-devel/2022-06/msg02276.html
* Rebased to the latest QEMU.

v3:
* Note that the bundle mechanism is for any files located relative to the
   installation tree including but not limited to datadir. (Peter Maydell)
* Fix "bridge" typo (Philippe Mathieu-Daudé)

v2: Rebased to the latest QEMU.


I like the idea, but I have a couple issues with the implementation:

- at the meson level, there is some repetition of mkdir and ln 
run_commands.  Perhaps you could just fill in a dictionary, and then do 
something like


  created_paths = {}
  foreach source, dest: var
path = fs.parent(qemu_bundledir / dest)
created_paths += {path: true}
  endforeach
  run_command('mkdir', '-p', created_paths.keys())
  foreach source, dest: var
run_command('ln', '-sf', meson.project_source_root() / source,
qemu_bundledir / dest)
  endforeach

at the end of the toplevel meson.build.

- at the code level, it seems to me that this could reuse a lot of the 
logic of get_relocated_path().  In particular, I would include $prefix 
in the qemu_bundledir, so that the files in the bundle directory would 
look like qemu-bundle/usr/share/qemu/bios.bin: just like an install that 
uses DESTDIR.  Then, if an uninstalled QEMU somehow returns 
$exec_path/qemu-bundle/$prefix/$bindir for qemu_get_exec_dir() instead 
of $exec_path, then get_relocated_path() will automatically return the 
correct paths from qemu-bundle/.


Thanks,

Paolo

Re: [PATCH] q35：Enable TSEG only when G_SMRAME and TSEG_EN both enabled

Queued, thanks.

Paolo

Re: [PATCH] q35：Enable TSEG only when G_SMRAME and TSEG_EN both enabled


On 6/15/22 05:45, Zhenzhong Duan wrote:

According to spec:
"TSEG Enable (T_EN): Enabling of SMRAM memory for Extended SMRAM space
only. When G_SMRAME = 1 and TSEG_EN = 1, the TSEG is enabled to appear
in the appropriate physical address space. Note that once D_LCK is set,
this bit becomes read only."

Changed to match the spec description.

Signed-off-by: Zhenzhong Duan 
---
  hw/pci-host/q35.c | 3 ++-
  1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index ab5a47aff560..20da1213747c 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -379,7 +379,8 @@ static void mch_update_smram(MCHPCIState *mch)
  memory_region_set_enabled(&mch->high_smram, false);
  }
  
-if (pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_T_EN) {

+if ((pd->config[MCH_HOST_BRIDGE_ESMRAMC] & MCH_HOST_BRIDGE_ESMRAMC_T_EN) &&
+(pd->config[MCH_HOST_BRIDGE_SMRAM] & SMRAM_G_SMRAME)) {
  switch (pd->config[MCH_HOST_BRIDGE_ESMRAMC] &
  MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_MASK) {
  case MCH_HOST_BRIDGE_ESMRAMC_TSEG_SZ_1MB:


Queued, thanks.

paolo

Re: [PATCH 0/2] Two sets of trivials

On Jun 14 11:40, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" 
> 
> I've sent the 3 char set last month, but have updated
> it a little; I cleaned up a comment style that was already
> broken so checkpatch is happy.
> 
> The 'namesapce' is a new patch; it's amazing how many places
> make the same typo!
> 
> Dave
> 
> Dr. David Alan Gilbert (2):
>   Trivial: 3 char repeat typos
>   trivial typos: namesapce
> 
>  hw/9pfs/9p-xattr-user.c  | 8 
>  hw/acpi/nvdimm.c | 2 +-
>  hw/intc/openpic.c| 2 +-
>  hw/net/imx_fec.c | 2 +-
>  hw/nvme/ctrl.c   | 2 +-
>  hw/pci/pcie_aer.c| 2 +-
>  hw/pci/shpc.c| 3 ++-
>  hw/ppc/spapr_caps.c  | 2 +-
>  hw/scsi/spapr_vscsi.c| 2 +-
>  qapi/net.json| 2 +-
>  tools/virtiofsd/passthrough_ll.c | 2 +-
>  ui/input.c   | 2 +-
>  12 files changed, 16 insertions(+), 15 deletions(-)
> 
> -- 
> 2.36.1
> 

Nice (and Thanks)!

Reviewed-by: Klaus Jensen 


signature.asc
Description: PGP signature

Re: [PATCH v6 3/8] mm/memfd: Introduce MFD_INACCESSIBLE flag

2022-06-15 Thread Chao Peng

On Tue, Jun 14, 2022 at 08:23:46PM +0000, Sean Christopherson wrote:
> On Thu, Jun 02, 2022, Chao Peng wrote:
> > On Wed, Jun 01, 2022 at 02:11:42PM +0200, Gupta, Pankaj wrote:
> > > 
> > > > > > Introduce a new memfd_create() flag indicating the content of the
> > > > > > created memfd is inaccessible from userspace through ordinary MMU
> > > > > > access (e.g., read/write/mmap). However, the file content can be
> > > > > > accessed via a different mechanism (e.g. KVM MMU) indirectly.
> > > > > > 
> > > > > 
> > > > > SEV, TDX, pkvm and software-only VMs seem to have usecases to set up
> > > > > initial guest boot memory with the needed blobs.
> > > > > TDX already supports a KVM IOCTL to transfer contents to private
> > > > > memory using the TDX module but rest of the implementations will need
> > > > > to invent
> > > > > a way to do this.
> > > > 
> > > > There are some discussions in 
> > > > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flkml.org%2Flkml%2F2022%2F5%2F9%2F1292&data=05%7C01%7Cpankaj.gupta%40amd.com%7Cb81ef334e2dd44c6143308da43b87d17%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637896756895977587%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&sdata=oQbM2Hj7GlhJTwnTM%2FPnwsfJlmTL7JR9ULBysAqm6V8%3D&reserved=0
> > > > already. I somehow agree with Sean. TDX is using a dedicated ioctl to
> > > > copy guest boot memory to private fd so the rest can do that similarly.
> > > > The concern is the performance (extra memcpy) but it's trivial since the
> > > > initial guest payload is usually optimized in size.
> > > > 
> > > > > 
> > > > > Is there a plan to support a common implementation for either allowing
> > > > > initial write access from userspace to private fd or adding a KVM
> > > > > IOCTL to transfer contents to such a file,
> > > > > as part of this series through future revisions?
> > > > 
> > > > Indeed, adding pre-boot private memory populating on current design
> > > > isn't impossible, but there are still some opens, e.g. how to expose
> > > > private fd to userspace for access, pKVM and CC usages may have
> > > > different requirements. Before that's well-studied I would tend to not
> > > > add that and instead use an ioctl to copy. Whether we need a generic
> > > > ioctl or feature-specific ioctl, I don't have strong opinion here.
> > > > Current TDX uses a feature-specific ioctl so it's not covered in this
> > > > series.
> > > 
> > > Common function or ioctl to populate preboot private memory actually makes
> > > sense.
> > > 
> > > Sorry, did not follow much of TDX code yet, Is it possible to filter out
> > > the current TDX specific ioctl to common function so that it can be used 
> > > by
> > > other technologies?
> > 
> > TDX code is here:
> > https://patchwork.kernel.org/project/kvm/patch/70ed041fd47c1f7571aa259450b3f9244edda48d.1651774250.git.isaku.yamah...@intel.com/
> > 
> > AFAICS It might be possible to filter that out to a common function. But
> > would like to hear from Paolo/Sean for their opinion.
> 
> Eh, I wouldn't put too much effort into creating a common helper, I would be 
> very
> surprised if TDX and SNP can share a meaningful amount of code that isn't 
> already
> shared, e.g. provided by MMU helpers.
> 
> The only part I truly care about sharing is whatever ioctl(s) get added, i.e. 
> I
> don't want to end up with two ioctls that do the same thing for TDX vs. SNP.

OK, then that part would be better to be added in TDX or SNP series.

Chao

Re: [PATCH v6 0/8] KVM: mm: fd-based approach for supporting KVM guest private memory

2022-06-15 Thread Chao Peng

On Tue, Jun 14, 2022 at 01:59:41PM -0700, Andy Lutomirski wrote:
> On Tue, Jun 14, 2022 at 12:09 PM Sean Christopherson  
> wrote:
> >
> > On Tue, Jun 14, 2022, Andy Lutomirski wrote:
> > > On Tue, Jun 14, 2022 at 12:32 AM Chao Peng  
> > > wrote:
> > > >
> > > > On Thu, Jun 09, 2022 at 08:29:06PM +0000, Sean Christopherson wrote:
> > > > > On Wed, Jun 08, 2022, Vishal Annapurve wrote:
> > > > >
> > > > > One argument is that userspace can simply rely on cgroups to detect 
> > > > > misbehaving
> > > > > guests, but (a) those types of OOMs will be a nightmare to debug and 
> > > > > (b) an OOM
> > > > > kill from the host is typically considered a _host_ issue and will be 
> > > > > treated as
> > > > > a missed SLO.
> > > > >
> > > > > An idea for handling this in the kernel without too much complexity 
> > > > > would be to
> > > > > add F_SEAL_FAULT_ALLOCATIONS (terrible name) that would prevent page 
> > > > > faults from
> > > > > allocating pages, i.e. holes can only be filled by an explicit 
> > > > > fallocate().  Minor
> > > > > faults, e.g. due to NUMA balancing stupidity, and major faults due to 
> > > > > swap would
> > > > > still work, but writes to previously unreserved/unallocated memory 
> > > > > would get a
> > > > > SIGSEGV on something it has mapped.  That would allow the userspace 
> > > > > VMM to prevent
> > > > > unintentional allocations without having to coordinate 
> > > > > unmapping/remapping across
> > > > > multiple processes.
> > > >
> > > > Since this is mainly for shared memory and the motivation is catching
> > > > misbehaved access, can we use mprotect(PROT_NONE) for this? We can mark
> > > > those ranges backed by private fd as PROT_NONE during the conversion so
> > > > subsequent misbehaving accesses will be blocked instead of causing 
> > > > double
> > > > allocation silently.
> >
> > PROT_NONE, a.k.a. mprotect(), has the same vma downsides as munmap().

Yes, right.

> >
> > > This patch series is fairly close to implementing a rather more
> > > efficient solution.  I'm not familiar enough with hypervisor userspace
> > > to really know if this would work, but:
> > >
> > > What if shared guest memory could also be file-backed, either in the
> > > same fd or with a second fd covering the shared portion of a memslot?
> > > This would allow changes to the backing store (punching holes, etc) to
> > > > be done without mmap_lock or host-userspace TLB flushes?  Depending on
> > > what the guest is doing with its shared memory, userspace might need
> > > the memory mapped or it might not.
> >
> > That's what I'm angling for with the F_SEAL_FAULT_ALLOCATIONS idea.  The 
> > issue,
> > unless I'm misreading code, is that punching a hole in the shared memory 
> > backing
> > store doesn't prevent reallocating that hole on fault, i.e. a helper 
> > process that
> > keeps a valid mapping of guest shared memory can silently fill the hole.
> >
> > What we're hoping to achieve is a way to prevent allocating memory without 
> > a very
> > explicit action from userspace, e.g. fallocate().
> 
> Ah, I misunderstood.  I thought your goal was to mmap it and prevent
> page faults from allocating.

I think we still need the mmap, but want to prevent allocation when
userspace touches a previously mmapped area whose pages have never been
filled. I don't have a clear answer on whether other operations like
read/write should also be prevented (probably yes). Only after an explicit
fallocate() to allocate the page would these operations act normally.

> 
> It is indeed the case (and has been since before quite a few of us
> were born) that a hole in a sparse file is logically just a bunch of
> zeros.  A way to make a file for which a hole is an actual hole seems
> like it would solve this problem nicely.  It could also be solved more
> specifically for KVM by making sure that the private/shared mode that
> userspace programs is strict enough to prevent accidental allocations
> -- if a GPA is definitively private, shared, neither, or (potentially,
> on TDX only) both, then a page that *isn't* shared will never be
> accidentally allocated by KVM.

KVM is clever enough not to allocate, since it knows whether a GPA is shared
or not. In this case it's the host userspace that can cause the allocation,
and that is too complex to check on every access from the guest.

> If the shared backing is not mmapped,
> it also won't be accidentally allocated by host userspace on a stray
> or careless write.

As said above, mmap is still preferred; otherwise too many changes are
needed for the userspace VMM.

Thanks,
Chao
> 
> 
> --Andy

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support

2022-06-15 Thread John Levon

On Wed, Jun 15, 2022 at 10:48:26AM +0200, Klaus Jensen wrote:

> > By the way, I noticed that the patch never updates the cq's ei_addr value. 
> > Is
> > that on purpose?
> 
> Yeah, I also mentioned this previously[1] and I still think we need to
> update the event index. Otherwise (and my testing confirms this), we end
> up in a situation where the driver skips the mmio, leaving a completion
> queue entry "in use" on the device until some other completion comes
> along.

Hmm, can you expand on this a little bit? We don't touch cq eventidx in
SPDK either, on the basis that mmio exits are expensive, and we only ever need
to look at cq_head when we're checking for room when posting a completion - and
in that case, we can just look directly at shadow cq_head value.

Can you clarify the exact circumstance that needs an mmio write when the driver
updates cq_head?

BTW I'm surprised that this patch has just this:

+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
+  sizeof(sq->tail));
+}

Isn't this racy against the driver? Compare
https://github.com/spdk/spdk/blob/master/lib/nvmf/vfio_user.c#L1317

thanks
john

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support

On Jun 15 10:07, John Levon wrote:
> On Wed, Jun 15, 2022 at 10:48:26AM +0200, Klaus Jensen wrote:
> 
> > > By the way, I noticed that the patch never updates the cq's ei_addr 
> > > value. Is
> > > that on purpose?
> > 
> > Yeah, I also mentioned this previously[1] and I still think we need to
> > update the event index. Otherwise (and my testing confirms this), we end
> > up in a situation where the driver skips the mmio, leaving a completion
> > queue entry "in use" on the device until some other completion comes
> > along.
> 
> Hmm, can you expand on this a little bit? We don't touch cq eventidx in
> SPDK either, on the basis that mmio exits are expensive, and we only ever need
> to look at cq_head when we're checking for room when posting a completion - 
> and
> in that case, we can just look directly at shadow cq_head value.
> 
> Can you clarify the exact circumstance that needs an mmio write when the 
> driver
> updates cq_head?
> 

No, I see, you are correct that not updating the eventidx reduces MMIO
and that we read the cq head anyway prior to posting completions.
I guess it's a perfectly reasonable device-side optimization in this
case. We can safely drop that addition again I think.

> BTW I'm surprised that this patch has just this:
> 
> +static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
> +{
> +pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
> +  sizeof(sq->tail));
> +}
> 
> Isn't this racy against the driver? Compare
> https://github.com/spdk/spdk/blob/master/lib/nvmf/vfio_user.c#L1317
> 
> thanks
> john

QEMU has full memory barriers on dma read/write, so I believe this is
safe?


signature.asc
Description: PGP signature

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support

On Jun 15 11:58, Jinhao Fan wrote:
> 
> > On Jun 14, 2022, at 11:41 PM, Keith Busch  wrote:
> > 
> > It's a pretty nasty hack, and definitely not in compliance with the spec: 
> > the
> > db_addr is supposed to be read-only from the device side, though I do think
> > it's safe for this environment. Unless Klaus or anyone finds something I'm
> > missing, I feel this is an acceptable compromise to address this odd
> > discrepancy.
> 
> :) In my next patch I will check the performance numbers with this hack. Not
> sure if updating db_addr value from the host will have any performance 
> implications but I guess it should be OK.
> 

I prefer we use the NVMe terminology to minimize misunderstandings, so
"host" means the driver and "device" means the qemu side of things

> > By the way, I noticed that the patch never updates the cq's ei_addr value. 
> > Is
> > that on purpose?
> 
> Klaus also raised a similar question in a prior comment. I think we need to 
> figure
> this out before we move on to the v2 patch. I did this because the original 
> Google
> extension patch did not update cq’s ei_addr. This seems to make sense because
> the purpose of cq’s ei_addr is for the guest to notify the host about cq head
> changes when necessary. However, the host does not need this notification 
> because we let the host proactively check for cq’s db_addr value when it wants
> to post a new cqe.
> This is also the only point where the host uses the cq’s
> db_addr. Therefore, it is OK to postpone the check for cq’s db_addr to this 
> point,
> instead of getting timely but not useful notifications by updating cq’s 
> ei_addr.
> This helps to reduce the number of MMIO’s on the cq’s doorbell register.
> 

True, it does reduce it, but it may leave CQEs "lingering" on the device
side (since the device has not been notified that the host has consumed
them).

> Klaus, Keith, do you think this design makes sense?

As I mentioned in my reply to John, I can see why this is a perfectly
reasonable optimization, we don't really care about the lingering CQEs
since we read the head anyway prior to posting completions. I jumped the
gun here in my eagerness to be "spec compliant" ;)


signature.asc
Description: PGP signature

Re: [PATCH] target/ppc: cpu_init: Clean up stop state on cpu reset

2022-06-15 Thread Cédric Le Goater

On 6/15/22 09:17, Frederic Barrat wrote:

On 15/06/2022 07:23, Cédric Le Goater wrote:

On 6/14/22 10:29, Frederic Barrat wrote:

The 'resume_as_sreset' attribute of a cpu can be set when a thread is
entering a stop state on ppc books. It causes the thread to be
re-routed to vector 0x100 when woken up by an exception. So it must be
cleaned on reset or a thread might be re-routed unexpectedly after a
reset, when it was not in a stop state and/or when the appropriate
exception handler isn't set up yet.

What is the test scenario ? and what are the symptoms ?

I was hitting it because of another bug in skiboot: if you have many chips, we
spend way too much time in add_opal_interrupts(), especially on powernv10 (I'm
working on a separate patch in skiboot to fix that). Sufficiently so that the
watchdog timer resets the system. When it happens, all the secondary threads
are in stopped state, only the main thread is working. That's how I was
reproducing.

What happens after the reset can vary a bit due to timing, but the most likely scenario is that we go through another primary thread election in skiboot. If the primary thread is the same as before, then there's no problem. If it's a different primary, then it will enter main_cpu_entry() while the other threads wait as secondaries. At some point, the primary thread (which still carries the wrong resume_as_sreset value from before reset) will enable the decrementer interrupt. The vector for the decrementer exception 0x900 is defined, so that shouldn't be a problem. However, because of the wrong resume_as_sreset value, it is re-routed to vector 0x100, which is still defined as the default boot-time handler, which is the entry point for BML. So the thread restarts as new, but this time it will be elected secondary. And we end up with all threads waiting as secondaries and a system stuck. All of that happens before we've initialized the UART, so there's not a single trace on the console.
Fun :-)

Great analysis !

I think this deserves a v2 just to put in the commit log what you
just wrote :)

Thanks,

Re: [PATCH v16 3/9] linux-user: Add LoongArch elf support

2022-06-15 Thread gaosong


Hi Richard.

On 2022/6/15 上午12:21, Richard Henderson wrote:

On 6/14/22 02:05, Song Gao wrote:

+#define ELF_HWCAP get_elf_hwcap()
+
+static uint32_t get_elf_hwcap(void)
+{
+    return 0;
+}


This should not be zero.  See cpu_probe_common in the kernel.  At 
minimum HWCAP_LOONGARCH_CRC32 and HWCAP_LOONGARCH_FPU are missing.  I 
don't know how many of the other features are implemented in 
target/loongarch/.



HWCAP_LOONGARCH_LAM and HWCAP_LOONGARCH_UAL are needed.
Missing ELF_PLATFORM, per the kernel's set_elf_platform(cpu, 
"loongarch").

OK, I will correct it in the next version.

Thanks.
Song Gao

Re: [PULL 00/10] Block jobs & NBD patches

2022-06-15 Thread Vladimir Sementsov-Ogievskiy


On 6/14/22 21:05, Richard Henderson wrote:

On 6/14/22 03:29, Vladimir Sementsov-Ogievskiy wrote:

The following changes since commit debd0753663bc89c86f5462a53268f2e3f680f60:

   Merge tag 'pull-testing-next-140622-1' of https://github.com/stsquad/qemu 
into staging (2022-06-13 21:10:57 -0700)

are available in the Git repository at:

   https://gitlab.com/vsementsov/qemu.git tags/pull-block-2022-06-14

for you to fetch changes up to 5aef6747a250f545ff53ba7e1a3ed7a3d166011a:

   MAINTAINERS: update Vladimir's address and repositories (2022-06-14 12:51:48 
+0300)


Block jobs & NBD patches

- add new options for copy-before-write filter
- new trace points for NBD
- prefer unsigned type for some 'in_flight' fields
- update my addresses in MAINTAINERS (already in Stefan's tree, but
   I think it's OK to send it with this PULL)


Note also, that I've recently updated my pgp key with new address and
new expire time.
Updated key is here: 
https://keys.openpgp.org/search?q=vsementsov%40yandex-team.ru


This introduces or exposes new timeouts:

https://gitlab.com/qemu-project/qemu/-/pipelines/563590515/failures



Not obvious from logs, which iotest hangs. But excluding iotests that passed, 
it becomes obvious that problem is in copy-before-write iotest, which is added 
and then updated in the series..

And most probably, that's a new timeout feature, that doesn't work (patches 
04-07).. It works for me locally still. I'd be glad if someone could look it 
through.

I think, for now, I'll just resend a pull request without these 4 patches.

Also, could/should I run all these test pipelines on gitlab by hand before 
sending a PULL request? Or can I rerun them on my qemu fork for debugging?


--
Best regards,
Vladimir

Re: [PATCH v16 7/9] target/loongarch: Adjust functions and structure to support user-mode

2022-06-15 Thread gaosong




On 2022/6/15 上午12:43, Richard Henderson wrote:

On 6/14/22 02:05, Song Gao wrote:
@@ -172,17 +173,20 @@ static void loongarch_cpu_do_interrupt(CPUState 
*cs)

  update_badinstr = 0;
  break;
  case EXCCODE_ADEM:
+    case EXCCODE_BCE:
  case EXCCODE_SYS:
  case EXCCODE_BRK:
+    case EXCCODE_INE:
+    case EXCCODE_IPE:
+    case EXCCODE_FPE:
+    env->badvaddr = env->pc;
+    QEMU_FALLTHROUGH;


This is incorrect still.

(1) env->badaddr (in this patch renamed badvaddr) is actually unused 
prior to this patch and should go away.  It seems to have been copied 
from RISC-V?  The correct LoongArch variable is env->CSR_BADV (see 
raise_mmu_exception in tlb_helper.c).



I also think we should remove env->badaddr,
(2) EXCCODE_ADEM is on the wrong side of this FALLTHROUGH.  This is 
the exception raised by TLB faults, and should retain the BADV address 
of the fault, not the faulting instruction.


Also, this patch is trying to do too many things at once.  Please 
split it into smaller logical changes.  Any bug fixes for the system 
code, for instance raising EXCCODE_BCE instead of EXCCODE_ADEM for 
helper_asrtle_d should be completely separated.



Thank you for your advice, I will correct them in the next version.

Thanks.
Song Gao

Re: [PATCH v16 2/9] linux-user: Add LoongArch signal support

2022-06-15 Thread gaosong



On 2022/6/15 上午12:15, Richard Henderson wrote:

+static void *get_ctx(struct target_sctx_info *info)
+{
+    return (void *)((char *)info + sizeof(struct target_sctx_info));
+}


Return type should be struct target_sctx_info *. 


I wonder if returning target_fpu_context * and renaming get_ctx to
get_fpu_context() would be better.


So we needn't cast like this:

    struct target_fpu_context *fpu_ctx = (struct target_fpu_context *)
 get_ctx(info);


Thanks.
Song Gao

Re: [RFC PATCH v8 00/21] Net Control VQ support with asid in vDPA SVQ

2022-06-15 Thread Eugenio Perez Martin

On Wed, Jun 15, 2022 at 5:04 AM Jason Wang  wrote:
>
> On Tue, Jun 14, 2022 at 5:32 PM Eugenio Perez Martin
>  wrote:
> >
> > On Tue, Jun 14, 2022 at 10:20 AM Jason Wang  wrote:
> > >
> > > On Tue, Jun 14, 2022 at 4:14 PM Eugenio Perez Martin
> > >  wrote:
> > > >
> > > > On Tue, Jun 14, 2022 at 10:02 AM Jason Wang  wrote:
> > > > >
> > > > > On Tue, Jun 14, 2022 at 12:32 AM Eugenio Perez Martin
> > > > >  wrote:
> > > > > >
> > > > > > On Wed, Jun 8, 2022 at 9:28 PM Eugenio Perez Martin 
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Wed, Jun 8, 2022 at 7:51 AM Jason Wang  
> > > > > > > wrote:
> > > > > > > >
> > > > > > > >
> > > > > > > > 在 2022/5/20 03:12, Eugenio Pérez 写道:
> > > > > > > > > Control virtqueue is used by networking device for accepting 
> > > > > > > > > various
> > > > > > > > > commands from the driver. It's a must to support multiqueue 
> > > > > > > > > and other
> > > > > > > > > configurations.
> > > > > > > > >
> > > > > > > > > Shadow VirtQueue (SVQ) already makes possible migration of 
> > > > > > > > > virtqueue
> > > > > > > > > states, effectively intercepting them so qemu can track what 
> > > > > > > > > regions of memory
> > > > > > > > > are dirty because device action and needs migration. However, 
> > > > > > > > > this does not
> > > > > > > > > solve networking device state seen by the driver because CVQ 
> > > > > > > > > messages, like
> > > > > > > > > changes on MAC addresses from the driver.
> > > > > > > > >
> > > > > > > > > > To solve that, this series uses SVQ infrastructure proposed 
> > > > > > > > > to intercept
> > > > > > > > > networking control messages used by the device. This way, 
> > > > > > > > > qemu is able to
> > > > > > > > > update VirtIONet device model and to migrate it.
> > > > > > > > >
> > > > > > > > > However, to intercept all queues would slow device data 
> > > > > > > > > forwarding. To solve
> > > > > > > > > that, only the CVQ must be intercepted all the time. This is 
> > > > > > > > > achieved using
> > > > > > > > > > the ASID infrastructure, that allows different translations 
> > > > > > > > > for different
> > > > > > > > > virtqueues. The most updated kernel part of ASID is proposed 
> > > > > > > > > at [1].
> > > > > > > > >
> > > > > > > > > You can run qemu in two modes after applying this series: 
> > > > > > > > > only intercepting
> > > > > > > > > cvq with x-cvq-svq=on or intercept all the virtqueues adding 
> > > > > > > > > cmdline x-svq=on:
> > > > > > > > >
> > > > > > > > > -netdev 
> > > > > > > > > type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0,x-cvq-svq=on,x-svq=on
> > > > > > > > >
> > > > > > > > > First three patches enable the update of the virtio-net 
> > > > > > > > > device model for each
> > > > > > > > > > CVQ message acknowledged by the device.
> > > > > > > > >
> > > > > > > > > Patches from 5 to 9 enables individual SVQ to copy the 
> > > > > > > > > buffers to QEMU's VA.
> > > > > > > > > > This allows simplifying the memory mapping, instead of map all 
> > > > > > > > > the guest's
> > > > > > > > > memory like in the data virtqueues.
> > > > > > > > >
> > > > > > > > > Patch 10 allows to inject control messages to the device. 
> > > > > > > > > This allows to set
> > > > > > > > > state to the device both at QEMU startup and at live 
> > > > > > > > > migration destination. In
> > > > > > > > > the future, this may also be used to emulate _F_ANNOUNCE.
> > > > > > > > >
> > > > > > > > > Patch 11 updates kernel headers, but it assign random numbers 
> > > > > > > > > to needed ioctls
> > > > > > > > > because they are still not accepted in the kernel.
> > > > > > > > >
> > > > > > > > > Patches 12-16 enables the set of the features of the net 
> > > > > > > > > device model to the
> > > > > > > > > vdpa device at device start.
> > > > > > > > >
> > > > > > > > > > Last ones enables the separated ASID and SVQ.
> > > > > > > > >
> > > > > > > > > Comments are welcomed.
> > > > > > > >
> > > > > > > >
> > > > > > > > As discussed, I think we need to split this huge series into 
> > > > > > > > smaller ones:
> > > > > > > >
> > > > > > > > 1) shadow CVQ only, this makes rx-filter-event work
> > > > > > > > 2) ASID support for CVQ
> > > > > > > >
> > > > > > > > And for 1) we need consider whether or not it could be 
> > > > > > > > simplified.
> > > > > > > >
> > > > > > > > Or do it in reverse order, since if we do 1) first, we may have 
> > > > > > > > security
> > > > > > > > issues.
> > > > > > > >
> > > > > > >
> > > > > > > I'm ok with both, but I also think 2) before 1) might make more 
> > > > > > > sense.
> > > > > > > There is no way to only shadow CVQ otherwise ATM.
> > > > > > >
> > > > > >
> > > > > > On second thought, that order is kind of harder.
> > > > > >
> > > > > > If we only map CVQ buffers, we need to either:
> > > > > > a. Copy them to controlled buffers
> > > > > > b. Track properly when to unmap them
> > > > >
> > > > > > Just to make sure we're on the same page:
> >

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support

2022-06-15 Thread John Levon

On Wed, Jun 15, 2022 at 11:33:02AM +0200, Klaus Jensen wrote:

> > BTW I'm surprised that this patch has just this:
> > 
> > +static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
> > +{
> > +pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
> > +  sizeof(sq->tail));
> > +}
> > 
> > Isn't this racy against the driver? Compare
> > https://github.com/spdk/spdk/blob/master/lib/nvmf/vfio_user.c#L1317
> > 
> > thanks
> > john
> 
> QEMU has full memory barriers on dma read/write, so I believe this is
> safe?

But don't you need to re-read the tail still, for example:


driver  device

eventidx is 3

write 4 to tail
read tail of 4
write 5 to tail
read eventidx of 3
nvme_dbbuf_need_event (1)

set eventidx to 4
go to sleep

at (1), our tail update of 4->5 doesn't straddle the eventidx, so we don't send
any MMIO, and the device won't wake up. This is why the above code checks the
tail twice for any concurrent update.

regards
john

[PATCH v3 0/4] softmmu: make qemu_find_file more flexible wrt build dir layout

The qemu_find_file method impl is rather crude with a variety of
problems (detailed in commit message of first patch).

This series addresses those problems, making qemu_find_file
much more flexible and able to be trivially extended to find
any type of file, both in a (optionally relocated) install
tree location and the local build tree.

This is proposed as an alternative to

  https://lists.gnu.org/archive/html/qemu-devel/2022-06/msg02589.html

avoiding the need to add many more meson rules to simulate
results of 'make install' in the build dir.

It has been tested as follows:

  mkdir -p build/quick
  cd build/quick
  ./configure --target-list=x86_64-softmmu --prefix=/usr
  make -j 8 install DESTDIR=`pwd`/../vroot

Now running from build dir:

$ ./build/local/qemu-system-x86_64 -vnc :1 -k fr -trace 'datadir*' -display 
sdl
datadir_init default data dir 
/home/berrange/src/virt/qemu/build/local/../share/qemu icon dir 
/home/berrange/src/virt/qemu/build/local/../share/icons helper dir 
/home/berrange/src/virt/qemu/build/local/../libexec in build dir 1
datadir_load_file name bios-256k.bin location 
/home/berrange/src/virt/qemu/build/local/pc-bios/bios-256k.bin errno 0
datadir_load_file name bios-256k.bin location 
/home/berrange/src/virt/qemu/build/local/pc-bios/bios-256k.bin errno 0
datadir_load_file name kvmvapic.bin location 
/home/berrange/src/virt/qemu/build/local/pc-bios/kvmvapic.bin errno 0
datadir_load_file name vgabios-stdvga.bin location 
/home/berrange/src/virt/qemu/build/local/pc-bios/vgabios-stdvga.bin errno 0
datadir_load_file name efi-e1000.rom location 
/home/berrange/src/virt/qemu/build/local/pc-bios/efi-e1000.rom errno 0
datadir_load_file name 128x128/apps/qemu.png location 
/home/berrange/src/virt/qemu/build/local/ui/icons/128x128/apps/qemu.png errno 0
datadir_load_file name fr location 
/home/berrange/src/virt/qemu/build/local/ui/keymaps/fr errno 0

$ ./build/local/qemu-system-x86_64 -vnc :1 -k fr -trace 'datadir*' -display 
sdl -net bridge
datadir_init default data dir 
/home/berrange/src/virt/qemu/build/local/../share/qemu icon dir 
/home/berrange/src/virt/qemu/build/local/../share/icons helper dir 
/home/berrange/src/virt/qemu/build/local/../libexec in build dir 1
datadir_load_file name qemu-bridge-helper location 
/home/berrange/src/virt/qemu/build/local/qemu-bridge-helper errno 0
Helper /home/berrange/src/virt/qemu/build/local/qemu-bridge-helper
access denied by acl file

And running from the (relocated) install dir:

$ ./build/vroot/usr/bin/qemu-system-x86_64 -vnc :1 -k fr -trace 'datadir*' 
-display sdl
datadir_init default data dir 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/qemu icon dir 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/icons helper dir 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../libexec in build dir 0
datadir_load_file name bios-256k.bin location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../qemu-firmware/bios-256k.bin 
errno 2
datadir_load_file name bios-256k.bin location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/qemu/bios-256k.bin 
errno 0
datadir_load_file name bios-256k.bin location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../qemu-firmware/bios-256k.bin 
errno 2
datadir_load_file name bios-256k.bin location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/qemu/bios-256k.bin 
errno 0
datadir_load_file name kvmvapic.bin location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../qemu-firmware/kvmvapic.bin 
errno 2
datadir_load_file name kvmvapic.bin location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/qemu/kvmvapic.bin 
errno 0
datadir_load_file name vgabios-stdvga.bin location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../qemu-firmware/vgabios-stdvga.bin
 errno 2
datadir_load_file name vgabios-stdvga.bin location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/qemu/vgabios-stdvga.bin
 errno 0
datadir_load_file name efi-e1000.rom location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../qemu-firmware/efi-e1000.rom 
errno 2
datadir_load_file name efi-e1000.rom location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/qemu/efi-e1000.rom 
errno 0
datadir_load_file name 128x128/apps/qemu.png location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/icons/hicolor/128x128/apps/qemu.png
 errno 0
datadir_load_file name fr location 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/qemu/keymaps/fr errno 0

$ ./build/vroot/usr/bin/qemu-system-x86_64 -vnc :1 -k fr -trace 'datadir*' 
-display sdl -net bridge
datadir_init default data dir 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/qemu icon dir 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../share/icons helper dir 
/home/berrange/src/virt/qemu/build/vroot/usr/bin/../libexec in build dir 0
datadir_load_file name qemu-bridge-helper location

[PATCH v3 1/4] softmmu: rewrite handling of qemu_find_file

The qemu_find_file method has a couple of flaws in its current
implementation:

 * The configure time 'qemu-firmware' search path is mistakenly
   also used to find keymaps

 * The configure time 'qemu-firmware' search path is mistakenly
   relocated even when running from build dir resulting in
   non-sensical paths that won't resolve

 * When searching for files it has the assumption that the
   in-build-tree layout will match the installed root layout

The latter problem has forced us to keep the keymap files under a
sub-dir of the pc-bios/ instead of ui/.

This all stems from the way qemu_find_file tries to use a single
list of data directory locations, appending a type specific
subdir.

This can be addressed by refactoring the logic as follows

For each type of file to be found identify

  * Optional: any user specified dir (non-relocated)
  * Path relative to build dir
  * Path relative to install dir
  * Optional: extra configure time install dirs (bios only, relocated)
  * The default install directory (relocated)

We can then search through:

 * User specified dir
 * If running from build dir
 - Path relative to build dir
   else
 - Extra configure time dirs
 - Path relative to default install dir

This is now flexible enough to extend to find any type of file,
by plugging in different input values, regardless of what layout
might be used in build dir vs install dir.

Signed-off-by: Daniel P. Berrangé 
---
 include/qemu/datadir.h  |   5 +-
 softmmu/datadir.c   | 145 +++-
 softmmu/trace-events|   5 +-
 softmmu/vl.c|   2 +-
 tests/qtest/fuzz/fuzz.c |   2 +-
 5 files changed, 92 insertions(+), 67 deletions(-)

diff --git a/include/qemu/datadir.h b/include/qemu/datadir.h
index 21f9097f58..a333cd9b0d 100644
--- a/include/qemu/datadir.h
+++ b/include/qemu/datadir.h
@@ -15,6 +15,9 @@
  * configured at build time (DATADIR) or registered with the -L command
  * line option.
  *
+ * @name may be NULL to indicate the caller just wants the
+ * first search directory that is found.
+ *
  * The caller must use g_free() to free the returned data when it is
  * no longer required.
  *
@@ -22,7 +25,7 @@
  */
 char *qemu_find_file(int type, const char *name);
 void qemu_add_default_firmwarepath(void);
-void qemu_add_data_dir(char *path);
+void qemu_set_user_data_dir(const char *path);
 void qemu_list_data_dirs(void);
 
 #endif
diff --git a/softmmu/datadir.c b/softmmu/datadir.c
index 160cac999a..7457717542 100644
--- a/softmmu/datadir.c
+++ b/softmmu/datadir.c
@@ -27,102 +27,121 @@
 #include "qemu/cutils.h"
 #include "trace.h"
 
-static const char *data_dir[16];
-static int data_dir_idx;
+/* User specified data directory */
+static char *user_data_dir;
+
+/* Extra build time defined search locations for firmware (NULL terminated) */
+static char **extra_firmware_dirs;
+
+/* Default built-in directories */
+static char *default_data_dir;
+
+/* Whether we're known to be executing from a build tree */
+static bool in_build_dir;
 
 char *qemu_find_file(int type, const char *name)
 {
-int i;
-const char *subdir;
-char *buf;
-
-/* Try the name as a straight path first */
-if (access(name, R_OK) == 0) {
-trace_load_file(name, name);
-return g_strdup(name);
-}
+const char *user_install_dir = NULL;
+char **extra_install_dirs = NULL;
+const char *rel_build_dir;
+const char *rel_install_dir;
+const char *default_install_dir;
+char *maybepath = NULL;
+size_t i;
+int ret;
 
 switch (type) {
 case QEMU_FILE_TYPE_BIOS:
-subdir = "";
+user_install_dir = user_data_dir;
+extra_install_dirs = extra_firmware_dirs;
+rel_install_dir = "";
+rel_build_dir = "pc-bios";
+default_install_dir = default_data_dir;
 break;
+
 case QEMU_FILE_TYPE_KEYMAP:
-subdir = "keymaps/";
+user_install_dir = user_data_dir;
+rel_install_dir = "keymaps";
+rel_build_dir = "pc-bios/keymaps";
+default_install_dir = default_data_dir;
 break;
+
 default:
 abort();
 }
 
-for (i = 0; i < data_dir_idx; i++) {
-buf = g_strdup_printf("%s/%s%s", data_dir[i], subdir, name);
-if (access(buf, R_OK) == 0) {
-trace_load_file(name, buf);
-return buf;
-}
-g_free(buf);
-}
-return NULL;
-}
-
-void qemu_add_data_dir(char *path)
-{
-int i;
+#define TRY_LOAD(path)  \
+do {\
+ret = access(path, R_OK);   \
+trace_datadir_load_file(name, path, ret == 0 ? 0 : errno);  \
+if (ret == 0) { \
+return maybepath;   \
+}

[PATCH v3 3/4] ui: find icons using qemu_find_file

The SDL/GTK/Cocoa UIs currently fail to load icons when run from the
build directory as get_resource returns a bogus path.

To address this we first re-arrange the ui/icons sub-directory
so that its layout reflects the contents that will be installed.

Then we introduce QEMU_FILE_TYPE_ICON to qemu_find_file such
that it can locate icons from the build dir.

Signed-off-by: Daniel P. Berrangé 
---
 configure |   1 +
 docs/conf.py  |   4 +--
 include/qemu/datadir.h|   2 ++
 softmmu/datadir.c |  12 +++-
 softmmu/trace-events  |   2 +-
 ui/cocoa.m|   3 +-
 ui/gtk.c  |   3 +-
 .../apps/qemu.png}| Bin
 .../{qemu_16x16.png => 16x16/apps/qemu.png}   | Bin
 .../{qemu_24x24.png => 24x24/apps/qemu.png}   | Bin
 .../apps/qemu.png}| Bin
 .../{qemu_32x32.bmp => 32x32/apps/qemu.bmp}   | Bin
 .../{qemu_32x32.png => 32x32/apps/qemu.png}   | Bin
 .../{qemu_48x48.png => 48x48/apps/qemu.png}   | Bin
 .../apps/qemu.png}| Bin
 .../{qemu_64x64.png => 64x64/apps/qemu.png}   | Bin
 ui/icons/meson.build  |  27 --
 ui/icons/{ => scalable/apps}/qemu.svg |   0
 ui/sdl2.c |   5 ++--
 19 files changed, 43 insertions(+), 16 deletions(-)
 rename ui/icons/{qemu_128x128.png => 128x128/apps/qemu.png} (100%)
 rename ui/icons/{qemu_16x16.png => 16x16/apps/qemu.png} (100%)
 rename ui/icons/{qemu_24x24.png => 24x24/apps/qemu.png} (100%)
 rename ui/icons/{qemu_256x256.png => 256x256/apps/qemu.png} (100%)
 rename ui/icons/{qemu_32x32.bmp => 32x32/apps/qemu.bmp} (100%)
 rename ui/icons/{qemu_32x32.png => 32x32/apps/qemu.png} (100%)
 rename ui/icons/{qemu_48x48.png => 48x48/apps/qemu.png} (100%)
 rename ui/icons/{qemu_512x512.png => 512x512/apps/qemu.png} (100%)
 rename ui/icons/{qemu_64x64.png => 64x64/apps/qemu.png} (100%)
 rename ui/icons/{ => scalable/apps}/qemu.svg (100%)

diff --git a/configure b/configure
index 4b12a8094c..fdcbfbc1b1 100755
--- a/configure
+++ b/configure
@@ -2218,6 +2218,7 @@ LINKS="$LINKS tests/avocado tests/data"
 LINKS="$LINKS tests/qemu-iotests/check"
 LINKS="$LINKS python"
 LINKS="$LINKS contrib/plugins/Makefile "
+LINKS="$LINKS ui/icons "
 for f in $LINKS ; do
 if [ -e "$source_path/$f" ]; then
 mkdir -p `dirname ./$f`
diff --git a/docs/conf.py b/docs/conf.py
index 49dab44cca..16d5d96228 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -180,9 +180,9 @@
 "navigation_with_keys": True,
 }
 
-html_logo = os.path.join(qemu_docdir, "../ui/icons/qemu_128x128.png")
+html_logo = os.path.join(qemu_docdir, "../ui/icons/128x128/apps/qemu.png")
 
-html_favicon = os.path.join(qemu_docdir, "../ui/icons/qemu_32x32.png")
+html_favicon = os.path.join(qemu_docdir, "../ui/icons/32x32/apps/qemu.png")
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
diff --git a/include/qemu/datadir.h b/include/qemu/datadir.h
index a333cd9b0d..427e90787a 100644
--- a/include/qemu/datadir.h
+++ b/include/qemu/datadir.h
@@ -3,6 +3,8 @@
 
 #define QEMU_FILE_TYPE_BIOS   0
 #define QEMU_FILE_TYPE_KEYMAP 1
+#define QEMU_FILE_TYPE_ICON   2
+
 /**
  * qemu_find_file:
  * @type: QEMU_FILE_TYPE_BIOS (for BIOS, VGA BIOS)
diff --git a/softmmu/datadir.c b/softmmu/datadir.c
index 32c765d228..e5d1fd0116 100644
--- a/softmmu/datadir.c
+++ b/softmmu/datadir.c
@@ -35,6 +35,7 @@ static char **extra_firmware_dirs;
 
 /* Default built-in directories */
 static char *default_data_dir;
+static char *default_icon_dir;
 
 /* Whether we're known to be executing from a build tree */
 static bool in_build_dir;
@@ -66,6 +67,12 @@ char *qemu_find_file(int type, const char *name)
 default_install_dir = default_data_dir;
 break;
 
+case QEMU_FILE_TYPE_ICON:
+rel_install_dir = "hicolor";
+rel_build_dir = "ui/icons";
+default_install_dir = default_icon_dir;
+break;
+
 default:
 abort();
 }
@@ -132,8 +139,11 @@ void qemu_add_default_firmwarepath(void)
 
 /* Add default dirs relative to the executable path */
 default_data_dir = get_relocated_path(CONFIG_QEMU_DATADIR);
+default_icon_dir = get_relocated_path(CONFIG_QEMU_ICONDIR);
 
-trace_datadir_init(default_data_dir, in_build_dir);
+trace_datadir_init(default_data_dir,
+   default_icon_dir,
+   in_build_dir);
 }
 
 void qemu_list_data_dirs(void)
diff --git a/softmmu/trace-events b/softmmu/trace-events
index a9ba53f50d..9c00e9f389 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -6,7 +6,7 @@ balloon_event(void *opaque, unsigned long addr) "opaque %p addr 
%lu"
 
 # datadir.c
 datadir_load_fi

Re: [PATCH v4 0/4] cutils: Introduce bundle mechanism

On Wed, Jun 15, 2022 at 10:39:29AM +0200, Paolo Bonzini wrote:
> On 6/14/22 23:07, Akihiko Odaki wrote:
> > Developers often run QEMU without installing. The bundle mechanism
> > allows to look up files which should be present in installation even in
> > such a situation.
> > 
> > It is a general mechanism and can find any files located relative
> > to the installation tree. The build tree must have a new directory,
> > qemu-bundle, to represent what files the installation tree would
> > have for reference by the executables.
> > 
> > v4:
> > * Add Daniel P. Berrangé to CC. Hopefully this helps merging his patch:
> >https://mail.gnu.org/archive/html/qemu-devel/2022-06/msg02276.html
> > * Rebased to the latest QEMU.
> > 
> > v3:
> > * Note that the bundle mechanism is for any files located relative to the
> >installation tree including but not limited to datadir. (Peter Maydell)
> > * Fix "bridge" typo (Philippe Mathieu-Daudé)
> > 
> > v2: Rebased to the latest QEMU.
> 
> I like the idea, but I have a couple issues with the implementation:
> 
> - at the meson level, there is some repetition of mkdir and ln run_commands.
> Perhaps you could just fill in a dictionary, and then do something like
> 
>   created_paths = {}
>   foreach source, dest: var
> path = fs.parent(qemu_bundledir / dest)
> created_paths += {path: true}
>   endforeach
>   run_command('mkdir', '-p', created_paths.keys())
>   foreach source, dest: var
> run_command('ln', '-sf', meson.project_source_root() / source,
> qemu_bundledir / dest)
>   endforeach

Per my other reply, IMHO, all the meson changes are redundant.

I've just sent a series that illustrates how we can improve the
qemu_find_file method so it correctly copes with install dir
vs build dir being different layouts, and be extensible to
any types of file (bios, keymaps, icons, helper exes, and
more).

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

[PATCH v3 4/4] net: convert to use qemu_find_file to locate bridge helper

The TAP device code currently uses get_relocated_path to find the bridge
helper, however, this fails when run from the build dir. Adding support
to qemu_find_file for helper binaries, allows it to work from both the
(relocated) install tree and build dir.

Signed-off-by: Daniel P. Berrangé 
---
 include/net/net.h  | 3 ++-
 include/qemu/datadir.h | 1 +
 net/tap.c  | 5 -
 qemu-options.hx| 4 ++--
 softmmu/datadir.c  | 9 +
 softmmu/trace-events   | 2 +-
 6 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/include/net/net.h b/include/net/net.h
index 523136c7ac..6a853512ac 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -228,7 +228,8 @@ NetClientState *net_hub_port_find(int hub_id);
 
 #define DEFAULT_NETWORK_SCRIPT CONFIG_SYSCONFDIR "/qemu-ifup"
 #define DEFAULT_NETWORK_DOWN_SCRIPT CONFIG_SYSCONFDIR "/qemu-ifdown"
-#define DEFAULT_BRIDGE_HELPER CONFIG_QEMU_HELPERDIR "/qemu-bridge-helper"
+#define DEFAULT_BRIDGE_HELPER "qemu-bridge-helper"
+#define DEFAULT_BRIDGE_HELPER_PATH CONFIG_QEMU_HELPERDIR "/qemu-bridge-helper"
 #define DEFAULT_BRIDGE_INTERFACE "br0"
 
 void qdev_set_nic_properties(DeviceState *dev, NICInfo *nd);
diff --git a/include/qemu/datadir.h b/include/qemu/datadir.h
index 427e90787a..a211b6b235 100644
--- a/include/qemu/datadir.h
+++ b/include/qemu/datadir.h
@@ -4,6 +4,7 @@
 #define QEMU_FILE_TYPE_BIOS   0
 #define QEMU_FILE_TYPE_KEYMAP 1
 #define QEMU_FILE_TYPE_ICON   2
+#define QEMU_FILE_TYPE_HELPER 3
 
 /**
  * qemu_find_file:
diff --git a/net/tap.c b/net/tap.c
index b3ddfd4a74..161608e34a 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -42,6 +42,7 @@
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "qemu/sockets.h"
+#include "qemu/datadir.h"
 
 #include "net/tap.h"
 
@@ -507,9 +508,11 @@ static int net_bridge_run_helper(const char *helper, const 
char *bridge,
 sigprocmask(SIG_BLOCK, &mask, &oldmask);
 
 if (!helper) {
-helper = default_helper = get_relocated_path(DEFAULT_BRIDGE_HELPER);
+helper = default_helper = qemu_find_file(QEMU_FILE_TYPE_HELPER,
+ DEFAULT_BRIDGE_HELPER);
 }
 
+g_printerr("Helper %s\n", helper);
 if (socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
 error_setg_errno(errp, errno, "socketpair() failed");
 return -1;
diff --git a/qemu-options.hx b/qemu-options.hx
index 377d22fbd8..b5b7e75048 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2665,7 +2665,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
 "to configure it and 'dfile' (default=" 
DEFAULT_NETWORK_DOWN_SCRIPT ")\n"
 "to deconfigure it\n"
 "use '[down]script=no' to disable script execution\n"
-"use network helper 'helper' (default=" 
DEFAULT_BRIDGE_HELPER ") to\n"
+"use network helper 'helper' (default=" 
DEFAULT_BRIDGE_HELPER_PATH ") to\n"
 "configure it\n"
 "use 'fd=h' to connect to an already opened TAP 
interface\n"
 "use 'fds=x:y:...:z' to connect to already opened 
multiqueue capable TAP interfaces\n"
@@ -2684,7 +2684,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
 "-netdev bridge,id=str[,br=bridge][,helper=helper]\n"
 "configure a host TAP network backend with ID 'str' that 
is\n"
 "connected to a bridge (default=" DEFAULT_BRIDGE_INTERFACE 
")\n"
-"using the program 'helper (default=" 
DEFAULT_BRIDGE_HELPER ")\n"
+"using the program 'helper (default=" 
DEFAULT_BRIDGE_HELPER_PATH ")\n"
 #endif
 #ifdef __linux__
 "-netdev 
l2tpv3,id=str,src=srcaddr,dst=dstaddr[,srcport=srcport][,dstport=dstport]\n"
diff --git a/softmmu/datadir.c b/softmmu/datadir.c
index e5d1fd0116..a68fe7167a 100644
--- a/softmmu/datadir.c
+++ b/softmmu/datadir.c
@@ -36,6 +36,7 @@ static char **extra_firmware_dirs;
 /* Default built-in directories */
 static char *default_data_dir;
 static char *default_icon_dir;
+static char *default_helper_dir;
 
 /* Whether we're known to be executing from a build tree */
 static bool in_build_dir;
@@ -73,6 +74,12 @@ char *qemu_find_file(int type, const char *name)
 default_install_dir = default_icon_dir;
 break;
 
+case QEMU_FILE_TYPE_HELPER:
+rel_install_dir = "";
+rel_build_dir = "";
+default_install_dir = default_helper_dir;
+break;
+
 default:
 abort();
 }
@@ -140,9 +147,11 @@ void qemu_add_default_firmwarepath(void)
 /* Add default dirs relative to the executable path */
 default_data_dir = get_relocated_path(CONFIG_QEMU_DATADIR);
 default_icon_dir = get_relocated_path(CONFIG_QEMU_ICONDIR);
+default_helper_dir = get_relocated_path(CONFIG_QEMU_HELPERDIR);
 
 trace_datadir_init(default_data_dir,
default_icon_dir,
+   default_helper_dir,

[PATCH v3 2/4] ui: move 'pc-bios/keymaps' to 'ui/keymaps'

The 'keymaps' directory contents have nothing to do with the firmware
blobs. The 'pc-bios/keymaps' directory appears to have been used
previously as a convenience for getting the files installed into
a subdir of the firmware install dir, as well as to make it easier
to launch QEMU directly from the build tree. These requirements
do not need to be reflected in the source tree arrangement. The
keymaps logically belong with the UI code, and meson can install
them into the right place. For in-tree execution, we merely need
a suitable symlink from the source tree to the build tree.

Signed-off-by: Daniel P. Berrangé 
---
 pc-bios/meson.build | 1 -
 softmmu/datadir.c   | 2 +-
 {pc-bios => ui}/keymaps/ar  | 0
 {pc-bios => ui}/keymaps/bepo| 0
 {pc-bios => ui}/keymaps/cz  | 0
 {pc-bios => ui}/keymaps/da  | 0
 {pc-bios => ui}/keymaps/de  | 0
 {pc-bios => ui}/keymaps/de-ch   | 0
 {pc-bios => ui}/keymaps/en-gb   | 0
 {pc-bios => ui}/keymaps/en-us   | 0
 {pc-bios => ui}/keymaps/es  | 0
 {pc-bios => ui}/keymaps/et  | 0
 {pc-bios => ui}/keymaps/fi  | 0
 {pc-bios => ui}/keymaps/fo  | 0
 {pc-bios => ui}/keymaps/fr  | 0
 {pc-bios => ui}/keymaps/fr-be   | 0
 {pc-bios => ui}/keymaps/fr-ca   | 0
 {pc-bios => ui}/keymaps/fr-ch   | 0
 {pc-bios => ui}/keymaps/hr  | 0
 {pc-bios => ui}/keymaps/hu  | 0
 {pc-bios => ui}/keymaps/is  | 0
 {pc-bios => ui}/keymaps/it  | 0
 {pc-bios => ui}/keymaps/ja  | 0
 {pc-bios => ui}/keymaps/lt  | 0
 {pc-bios => ui}/keymaps/lv  | 0
 {pc-bios => ui}/keymaps/meson.build | 0
 {pc-bios => ui}/keymaps/mk  | 0
 {pc-bios => ui}/keymaps/nl  | 0
 {pc-bios => ui}/keymaps/no  | 0
 {pc-bios => ui}/keymaps/pl  | 0
 {pc-bios => ui}/keymaps/pt  | 0
 {pc-bios => ui}/keymaps/pt-br   | 0
 {pc-bios => ui}/keymaps/ru  | 0
 {pc-bios => ui}/keymaps/sl  | 0
 {pc-bios => ui}/keymaps/sv  | 0
 {pc-bios => ui}/keymaps/th  | 0
 {pc-bios => ui}/keymaps/tr  | 0
 ui/meson.build  | 1 +
 38 files changed, 2 insertions(+), 2 deletions(-)
 rename {pc-bios => ui}/keymaps/ar (100%)
 rename {pc-bios => ui}/keymaps/bepo (100%)
 rename {pc-bios => ui}/keymaps/cz (100%)
 rename {pc-bios => ui}/keymaps/da (100%)
 rename {pc-bios => ui}/keymaps/de (100%)
 rename {pc-bios => ui}/keymaps/de-ch (100%)
 rename {pc-bios => ui}/keymaps/en-gb (100%)
 rename {pc-bios => ui}/keymaps/en-us (100%)
 rename {pc-bios => ui}/keymaps/es (100%)
 rename {pc-bios => ui}/keymaps/et (100%)
 rename {pc-bios => ui}/keymaps/fi (100%)
 rename {pc-bios => ui}/keymaps/fo (100%)
 rename {pc-bios => ui}/keymaps/fr (100%)
 rename {pc-bios => ui}/keymaps/fr-be (100%)
 rename {pc-bios => ui}/keymaps/fr-ca (100%)
 rename {pc-bios => ui}/keymaps/fr-ch (100%)
 rename {pc-bios => ui}/keymaps/hr (100%)
 rename {pc-bios => ui}/keymaps/hu (100%)
 rename {pc-bios => ui}/keymaps/is (100%)
 rename {pc-bios => ui}/keymaps/it (100%)
 rename {pc-bios => ui}/keymaps/ja (100%)
 rename {pc-bios => ui}/keymaps/lt (100%)
 rename {pc-bios => ui}/keymaps/lv (100%)
 rename {pc-bios => ui}/keymaps/meson.build (100%)
 rename {pc-bios => ui}/keymaps/mk (100%)
 rename {pc-bios => ui}/keymaps/nl (100%)
 rename {pc-bios => ui}/keymaps/no (100%)
 rename {pc-bios => ui}/keymaps/pl (100%)
 rename {pc-bios => ui}/keymaps/pt (100%)
 rename {pc-bios => ui}/keymaps/pt-br (100%)
 rename {pc-bios => ui}/keymaps/ru (100%)
 rename {pc-bios => ui}/keymaps/sl (100%)
 rename {pc-bios => ui}/keymaps/sv (100%)
 rename {pc-bios => ui}/keymaps/th (100%)
 rename {pc-bios => ui}/keymaps/tr (100%)

diff --git a/pc-bios/meson.build b/pc-bios/meson.build
index 41ba1c0ec7..e49c0e5f56 100644
--- a/pc-bios/meson.build
+++ b/pc-bios/meson.build
@@ -97,4 +97,3 @@ foreach f : blobs
 endforeach
 
 subdir('descriptors')
-subdir('keymaps')
diff --git a/softmmu/datadir.c b/softmmu/datadir.c
index 7457717542..32c765d228 100644
--- a/softmmu/datadir.c
+++ b/softmmu/datadir.c
@@ -62,7 +62,7 @@ char *qemu_find_file(int type, const char *name)
 case QEMU_FILE_TYPE_KEYMAP:
 user_install_dir = user_data_dir;
 rel_install_dir = "keymaps";
-rel_build_dir = "pc-bios/keymaps";
+rel_build_dir = "ui/keymaps";
 default_install_dir = default_data_dir;
 break;
 
diff --git a/pc-bios/keymaps/ar b/ui/keymaps/ar
similarity index 100%
rename from pc-bios/keymaps/ar
rename to ui/keymaps/ar
diff --git a/pc-bios/keymaps/bepo b/ui/keymaps/bepo
similarity index 100%
rename from pc-bios/keymaps/bepo
rename to ui/keymaps/bepo
diff --git a/pc-bios/keymaps/cz b/ui/keymaps/cz
similarity index 100%
rename from pc-bios/keymaps/cz
rename to ui/keymaps/cz
diff --git a/pc-bios/keymaps/da b/ui/keymaps/da
similarity index 100%
rename from pc-bios/keymaps/da
rename to ui/keymaps/da
diff --git a/pc-bios/keymaps/de b/ui/keymaps/de
s

Re: [PATCH v4 0/4] cutils: Introduce bundle mechanism


On 6/15/22 10:30, Daniel P. Berrangé wrote:

> I don't think this is an attractive approach to the problem,
> because it results in us adding a bunch of meson rules to
> simulate 'make install' within the build dir. This is undesirable
> clutter IMHO, and can be solved more simply by just modifying the
> qemu_find_file() method.
>
> The core problem is the impl of qemu_find_file is taking the wrong
> approach, in several ways, but mostly because of its use of a single
> 'data_dirs' array for all types of file. This is bad because it
> has the assumption that build dir and install dir layouts match,
> and second because when we add extra firmware data dirs, we don't
> want this used for non-firmware files.
>
> We need to separate out the handling of different types of resources
> for this to work correctly.


In some sense this is what Akihiko did - instead of separating them in 
qemu_find_file(), the "pre-install" layout separates them in the 
filesystem.  While I had remarks on the implementation I think it's a 
sensible approach.


The pre-install directory could even be created as a custom_target, 
using the JSON files from Meson introspection.


Paolo

Re: [PATCH] hw/mem/nvdimm: fix error message for 'unarmed' flag

2022-06-15 Thread Xiao Guangrong

On Wed, Jun 15, 2022 at 4:24 PM David Hildenbrand  wrote:

> >> Is that a temporary or a permanent thing? Do we know?
> >
> > No idea. But his last signed-off was three years ago.
>
> I sent a patch to Xiao, asking if he's still active in QEMU. If I don't
> get a reply this week, I'll move forward with proposing an update to
> MAINTAINERS as described.
>

Okay, please do it.

Sorry, I am just roughly reading the mailing list of qemu & kvm usually,
and do not get enough time to actively review or contribute on these
fields. :-(

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support




> On Jun 15, 2022, at 6:11 PM, John Levon  wrote:
> 
> On Wed, Jun 15, 2022 at 11:33:02AM +0200, Klaus Jensen wrote:
> 
>>> BTW I'm surprised that this patch has just this:
>>> 
>>> +static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
>>> +{
>>> +pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
>>> +  sizeof(sq->tail));
>>> +}
>>> 
>>> Isn't this racy against the driver? Compare
>>> https://github.com/spdk/spdk/blob/master/lib/nvmf/vfio_user.c#L1317
>>> 
>>> thanks
>>> john
>> 
>> QEMU has full memory barriers on dma read/write, so I believe this is
>> safe?
> 
> But don't you need to re-read the tail still, for example:


Hi John,

I think we also have a check for concurrent update on the tail. After writing 
eventidx, we read the tail again. It is here:

@@ -5854,6 +5943,11 @@ static void nvme_process_sq(void *opaque)
 req->status = status;
 nvme_enqueue_req_completion(cq, req);
 }
+
+if (n->dbbuf_enabled) {
+nvme_update_sq_eventidx(sq);
+nvme_update_sq_tail(sq);
+}

> 
> 
> driver                  device
> 
>   eventidx is 3
> 
> write 4 to tail
>   read tail of 4
> write 5 to tail
> read eventidx of 3
> nvme_dbbuf_need_event (1)
> 
>   set eventidx to 4

Therefore, at this point, we read the tail of 5.

>   go to sleep
> 
> at (1), our tail update of 4->5 doesn't straddle the eventidx, so we don't 
> send
> any MMIO, and the device won't wake up. This is why the above code checks the
> tail twice for any concurrent update.

Thanks,
Jinhao Fan

> 
> regards
> john

Re: [PATCH v4 0/4] cutils: Introduce bundle mechanism

On Wed, Jun 15, 2022 at 01:02:08PM +0200, Paolo Bonzini wrote:
> On 6/15/22 10:30, Daniel P. Berrangé wrote:
> > I don't think this is an attractive approach to the problem,
> > because it results in us adding a bunch of meson rules to
> > simulate 'make install' within the build dir. This is undesirable
> > clutter IMHO, and can be solved more simply by just modifying the
> > qemu_find_file() method.
> > 
> > The core problem is the impl of qemu_find_file is taking the wrong
> > approach, in several ways, but mostly because of its use of a single
> > 'data_dirs' array for all types of file. This is bad because it
> > has the assumption that build dir and install dir layouts match,
> > and second because when we add extra firmware data dirs, we don't
> > want this used for non-firmware files.
> > 
> > We need to separate out the handling of different types of resources
> > for this to work correctly.
> 
> In some sense this is what Akihiko did - instead of separating them in
> qemu_find_file(), the "pre-install" layout separates them in the filesystem.
> While I had remarks on the implementation I think it's a sensible approach.
> 
> The pre-install directory could even be created as a custom_target, using
> the JSON files from Meson introspection.

Doing that is more complicated than just refactoring qemu_find_file,
such that its search locations can be tailored per file type, just
by setting a couple variables in the code IMHO.

With regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH v3 4/4] net: convert to use qemu_find_file to locate bridge helper


On 6/15/22 12:52, Daniel P. Berrangé wrote:
  
> +case QEMU_FILE_TYPE_HELPER:
> +rel_install_dir = "";
> +rel_build_dir = "";
> +default_install_dir = default_helper_dir;
> +break;
> +


You're replacing ad hoc rules in Akihiko's meson.build with an ad hoc 
enum + the corresponding "case"s here in qemu_find_file().  There is 
duplication anyway, in this case between Meson and QEMU (plus QEMU needs 
to know about two filesystem layouts).


Paolo

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support

2022-06-15 Thread John Levon

On Wed, Jun 15, 2022 at 07:22:22PM +0800, Jinhao Fan wrote:

> >>> Isn't this racy against the driver? Compare
> >>> https://github.com/spdk/spdk/blob/master/lib/nvmf/vfio_user.c#L1317
> >> 
> >> QEMU has full memory barriers on dma read/write, so I believe this is
> >> safe?
> > 
> > But don't you need to re-read the tail still, for example:
> 
> I think we also have a check for concurrent update on the tail. After writing 
> eventidx, we read the tail again. It is here:
> 
> @@ -5854,6 +5943,11 @@ static void nvme_process_sq(void *opaque)
>  req->status = status;
>  nvme_enqueue_req_completion(cq, req);
>  }
> +
> +if (n->dbbuf_enabled) {
> +nvme_update_sq_eventidx(sq);
> +nvme_update_sq_tail(sq);
> +}

Ah, and we go around the loop another time in this case.

> > driver  device
> > 
> > eventidx is 3
> > 
> > write 4 to tail
> > read tail of 4
> > write 5 to tail
> > read eventidx of 3
> > nvme_dbbuf_need_event (1)
> > 
> > set eventidx to 4
> 
> Therefore, at this point, we read the tail of 5.

The driver could still update the tail after the nvme_update_sq_tail() above.
However, the driver ordering (read tail, then eventidx), does mean that it would
then do an mmio write, so yes, this looks safe, thank you.

regards
john

Re: [RFC PATCH v2 3/8] qapi: net: add stream and dgram netdevs

2022-06-15 Thread Markus Armbruster

Laurent Vivier  writes:

> On 13/05/2022 13:44, Markus Armbruster wrote:
>> Laurent Vivier  writes:
>> 
>>> Copied from socket netdev file and modified to use SocketAddress
>>> to be able to introduce new features like unix socket.
>>>
>>> "udp" and "mcast" are squashed into dgram netdev, multicast is detected
>>> according to the IP address type.
>>> "listen" and "connect" modes are managed by stream netdev. An optional
>>> parameter "server" defines the mode (server by default)
>>>
>>> Signed-off-by: Laurent Vivier 
>>> ---
>>>   hmp-commands.hx |   2 +-
>>>   net/clients.h   |   6 +
>>>   net/dgram.c | 630 
>>>   net/hub.c   |   2 +
>>>   net/meson.build |   2 +
>>>   net/net.c   |  24 +-
>>>   net/stream.c| 425 
>>>   qapi/net.json   |  38 ++-
>>>   8 files changed, 1125 insertions(+), 4 deletions(-)
>>>   create mode 100644 net/dgram.c
>>>   create mode 100644 net/stream.c
>>>
>>> diff --git a/hmp-commands.hx b/hmp-commands.hx
>>> index 03e6a73d1f55..172dbab1dfed 100644
>>> --- a/hmp-commands.hx
>>> +++ b/hmp-commands.hx
>>> @@ -1269,7 +1269,7 @@ ERST
>>>   {
>>>   .name   = "netdev_add",
>>>   .args_type  = "netdev:O",
>>> -.params = 
>>> "[user|tap|socket|vde|bridge|hubport|netmap|vhost-user],id=str[,prop=value][,...]",
>>> +.params = 
>>> "[user|tap|socket|stream|dgram|vde|bridge|hubport|netmap|vhost-user],id=str[,prop=value][,...]",
>>>   .help   = "add host network device",
>>>   .cmd= hmp_netdev_add,
>>>   .command_completion = netdev_add_completion,
>> 
>> Does qemu-options.hx need an update, too?
>
> Done
>
>> 
>>> diff --git a/net/clients.h b/net/clients.h
>>> index 92f9b59aedce..c1b51d79b147 100644
>>> --- a/net/clients.h
>>> +++ b/net/clients.h
>>> @@ -40,6 +40,12 @@ int net_init_hubport(const Netdev *netdev, const char 
>>> *name,
>>>   int net_init_socket(const Netdev *netdev, const char *name,
>>>   NetClientState *peer, Error **errp);
>>>   
>>> +int net_init_stream(const Netdev *netdev, const char *name,
>>> +NetClientState *peer, Error **errp);
>>> +
>>> +int net_init_dgram(const Netdev *netdev, const char *name,
>>> +   NetClientState *peer, Error **errp);
>>> +
>>>   int net_init_tap(const Netdev *netdev, const char *name,
>>>NetClientState *peer, Error **errp);
>>>   
>>> diff --git a/net/dgram.c b/net/dgram.c
>>> new file mode 100644
>>> index ..aa4240501ed0
>>> --- /dev/null
>>> +++ b/net/dgram.c
>>> @@ -0,0 +1,630 @@
>>> +/*
>>> + * QEMU System Emulator
>>> + *
>>> + * Copyright (c) 2003-2008 Fabrice Bellard
>>> + *
>>> + * Permission is hereby granted, free of charge, to any person obtaining a 
>>> copy
>>> + * of this software and associated documentation files (the "Software"), 
>>> to deal
>>> + * in the Software without restriction, including without limitation the 
>>> rights
>>> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or 
>>> sell
>>> + * copies of the Software, and to permit persons to whom the Software is
>>> + * furnished to do so, subject to the following conditions:
>>> + *
>>> + * The above copyright notice and this permission notice shall be included 
>>> in
>>> + * all copies or substantial portions of the Software.
>>> + *
>>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
>>> OR
>>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 
>>> OTHER
>>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
>>> FROM,
>>> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
>>> IN
>>> + * THE SOFTWARE.
>>> + */
>> 
>> Blank line here, please.
>> 
>> Why not GPLv2+?
>
> I've kept the original text copied from net/socket.c, but I can move this to 
> GPL2+

If the file's contents is derived from net/socket.c, copying the
legalese from there makes sense.

>>> +#include "qemu/osdep.h"
>> 
>> [...]
>> 
>>> diff --git a/net/net.c b/net/net.c
>>> index 2aab7167316c..fd6b30a10c57 100644
>>> --- a/net/net.c
>>> +++ b/net/net.c
>>> @@ -1015,6 +1015,8 @@ static int (* const 
>>> net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
>>>   #endif
>>>   [NET_CLIENT_DRIVER_TAP]   = net_init_tap,
>>>   [NET_CLIENT_DRIVER_SOCKET]= net_init_socket,
>>> +[NET_CLIENT_DRIVER_STREAM]= net_init_stream,
>>> +[NET_CLIENT_DRIVER_DGRAM] = net_init_dgram,
>>>   #ifdef CONFIG_VDE
>>>   [NET_CLIENT_DRIVER_VDE]   = net_init_vde,
>>>   #endif
>>> @@ -1097,6 +1099,8 @@ void show_netdevs(void)
>>>   int idx;
>>>   const char *available_netdevs[] = {
>>>   "socket",
>>> +"stream",
>>>

Re: [PATCH] hw/mem/nvdimm: fix error message for 'unarmed' flag

2022-06-15 Thread David Hildenbrand

On 15.06.22 13:17, Xiao Guangrong wrote:
> On Wed, Jun 15, 2022 at 4:24 PM David Hildenbrand  wrote:
> 
>>>> Is that a temporary or a permanent thing? Do we know?
>>>
>>> No idea. But his last signed-off was three years ago.
>>
>> I sent a patch to Xiao, asking if he's still active in QEMU. If I don't

s/patch/mail/ :)

>> get a reply this week, I'll move forward with proposing an update to
>> MAINTAINERS as described.
>>
> 
> Okay, please do it.
> 
> Sorry, I am just roughly reading the mailing list of qemu & kvm usually,
> and do not get enough time to actively review or contribute on these
> fields. :-(

Not an issue, thanks for that information and thanks for your work in
the past on that!

Should I keep you entered as a reviewer for the new section?

-- 
Thanks,

David / dhildenb

Re: [RFC PATCH v2 2/8] qapi: net: introduce a way to bypass qemu_opts_parse_noisily()

2022-06-15 Thread Markus Armbruster

Laurent Vivier  writes:

> On 13/05/2022 13:21, Markus Armbruster wrote:
>> Laurent Vivier  writes:
>> 
>>> As qemu_opts_parse_noisily() flattens the QAPI structures ("type" field
>>> of Netdev structure can collide with "type" field of SocketAddress),
>> 
>> To remember how this works, I have to write a more verbose version of
>> the above.  Why not post it then, so here goes.
>> 
>> qemu_init() passes the argument of -netdev, -nic, and -net to
>> net_client_parse().
>> 
>> net_client_parse() parses with qemu_opts_parse_noisily(), passing
>> QemuOptsList qemu_netdev_opts for -netdev, qemu_nic_opts for -nic, and
>> qemu_net_opts for -net.  Their desc[] are all empty, which means any
>> keys are accepted.  The result of the parse (a QemuOpts) is stored in
>> the QemuOptsList.
>> 
>> Note that QemuOpts is flat by design.  In some places, we layer non-flat
>> on top using dotted keys convention, but not here.
>> 
>> net_init_clients() iterates over the stored QemuOpts, and passes them to
>> net_init_netdev(), net_param_nic(), or net_init_client(), respectively.
>> 
>> These functions pass the QemuOpts to net_client_init().  They also do
>> other things with the QemuOpts, which we can ignore here.
>> 
>> net_client_init() uses the opts visitor to convert the (flat) QemuOpts to
>> a (non-flat) QAPI object Netdev.  Netdev is also the argument of QMP
>> command netdev_add.
>> 
>> The opts visitor was an early attempt to support QAPI in
>> (QemuOpts-based) CLI.  It restricts QAPI types to a certain shape; see
>> commit eb7ee2cbeb "qapi: introduce OptsVisitor".
>> 
>> A more modern way to support QAPI is qobject_input_visitor_new_str().
>> It uses keyval_parse() instead of QemuOpts for KEY=VALUE,... syntax, and
>> it also supports JSON syntax.  The former isn't quite as expressive as
>> JSON, but it's a lot closer than QemuOpts + opts visitor.
>> 
>>> we introduce a way to bypass qemu_opts_parse_noisily() and use directly
>>> visit_type_Netdev() to parse the backend parameters.
>> 
>> This commit paves the way to use of the modern way instead.
>
> I'm going to copy your analysis to the commit message of the patch.

Go right ahead :)

>>> Signed-off-by: Laurent Vivier 
>>> ---
>>>   net/net.c | 54 ++
>>>   1 file changed, 54 insertions(+)
>>>
>>> diff --git a/net/net.c b/net/net.c
>>> index 58c05c200622..2aab7167316c 100644
>>> --- a/net/net.c
>>> +++ b/net/net.c
>>> @@ -54,6 +54,7 @@
>>>   #include "net/colo-compare.h"
>>>   #include "net/filter.h"
>>>   #include "qapi/string-output-visitor.h"
>>> +#include "qapi/qobject-input-visitor.h"
>>>   
>>>   /* Net bridge is currently not supported for W32. */
>>>   #if !defined(_WIN32)
>>> @@ -63,6 +64,17 @@
>>>   static VMChangeStateEntry *net_change_state_entry;
>>>   static QTAILQ_HEAD(, NetClientState) net_clients;
>>>   
>>> +typedef struct NetdevQueueEntry {
>>> +bool is_netdev;
>>> +Netdev *nd;
>>> +Location loc;
>>> +QSIMPLEQ_ENTRY(NetdevQueueEntry) entry;
>>> +} NetdevQueueEntry;
>>> +
>>> +typedef QSIMPLEQ_HEAD(, NetdevQueueEntry) NetdevQueue;
>>> +
>>> +static NetdevQueue nd_queue = QSIMPLEQ_HEAD_INITIALIZER(nd_queue);
>>> +
>>>   /***/
>>>   /* network device redirectors */
>>>   
>>> @@ -1559,6 +1571,19 @@ int net_init_clients(Error **errp)
>>>   
>>>   QTAILQ_INIT(&net_clients);
>>>   
>>> +while (!QSIMPLEQ_EMPTY(&nd_queue)) {
>>> +NetdevQueueEntry *nd = QSIMPLEQ_FIRST(&nd_queue);
>>> +
>>> +QSIMPLEQ_REMOVE_HEAD(&nd_queue, entry);
>>> +loc_push_restore(&nd->loc);
>>> +if (net_client_init1(nd->nd, nd->is_netdev, errp) < 0) {
>> 
>> I think you need to loc_pop() here.
>> 
>>> +return -1;
>>> +}
>> 
>> Since the only caller passes &error_fatal, I'd be tempted to ditch the
>> @errp argument, and simply do
>> 
>> net_client_init1(nd->nd, nd->is_netdev, &error_fatal);
>> 
>> It's what we do for -blockdev, -device, and -object.
>
> I've added a patch to remove the @errp from the net_init_clients() arguments.
>
>> 
>>> +loc_pop(&nd->loc);
>>> +qapi_free_Netdev(nd->nd);
>>> +g_free(nd);
>>> +}
>>> +
>>>   if (qemu_opts_foreach(qemu_find_opts("netdev"),
>>> net_init_netdev, NULL, errp)) {
>>>   return -1;
>>> @@ -1575,8 +1600,37 @@ int net_init_clients(Error **errp)
>>>   return 0;
>>>   }
>>>   
>>> +/*
>>> + * netdev_is_modern() returns true when the backend needs to bypass
>>> + * qemu_opts_parse_noisily()
>>> + */
>>> +static bool netdev_is_modern(const char *optarg)
>>> +{
>>> +return false;
>>> +}
>>> +
>>>   int net_client_parse(QemuOptsList *opts_list, const char *optarg)
>>>   {
>>> +if (netdev_is_modern(optarg)) {
>>> +/*
>>> + * We need to bypass qemu_opts_parse_noisily() to accept
>>> + * new style object like addr.type=inet in SocketAddress
>>> +

Re: [PULL 20/33] configure: handle host compiler in probe_target_compiler

2022-06-15 Thread Matheus Kowalczuk Ferst

On 01/06/2022 15:05, Alex Bennée wrote:
> From: Paolo Bonzini 
> 
> In preparation for handling more binaries than just cc, handle
> the case of "probe_target_compiler $cpu" directly in the function,
> setting the target_* variables based on the ones that are used to
> build QEMU.  The clang check also needs to be moved after this
> fallback.
> 
> Signed-off-by: Paolo Bonzini 
> Reviewed-by: Richard Henderson 
> Message-Id: <20220517092616.1272238-10-pbonz...@redhat.com>
> Signed-off-by: Alex Bennée 
> Message-Id: <20220527153603.887929-21-alex.ben...@linaro.org>

Hi,

After this patch, a clean build in ppc64le hosts will not build 
ppc64{,le}-linux-user tests with "make check-tcg"

> 
> diff --git a/configure b/configure
> index fbf6d39f96..217c8b3cac 100755
> --- a/configure
> +++ b/configure
> @@ -954,10 +954,6 @@ case $git_submodules_action in
>   ;;
>   esac
> 
> -if eval test -z "\${cross_cc_$cpu}"; then
> -eval "cross_cc_${cpu}=\$cc"
> -fi
> -
>   default_target_list=""
>   mak_wilds=""
> 
> @@ -2008,13 +2004,6 @@ probe_target_compiler() {
> if eval test -n "\"\${cross_cc_$1}\""; then
>   if eval has "\"\${cross_cc_$1}\""; then
> eval "target_cc=\"\${cross_cc_$1}\""
> -  case $1 in
> -i386|x86_64)
> -  if $target_cc --version | grep -qi "clang"; then
> -unset target_cc
> -  fi
> -  ;;
> -  esac
>   fi
> fi
> if eval test -n "\"\${cross_as_$1}\""; then
> @@ -2027,6 +2016,20 @@ probe_target_compiler() {
> eval "target_ld=\"\${cross_ld_$1}\""
>   fi
> fi
> +  if test "$1" = $cpu; then
> +: ${target_cc:=$cc}
> +: ${target_as:=$as}
> +: ${target_ld:=$ld}
> +  fi

$cpu is normalized[1] to ppc64 on little-endian hosts, so 
ppc64le-linux-user will not have $target_{cc,as,ld} set, and 
ppc64-linux-user will have them set to a toolchain that may not support 
-mbig-endian. I suppose we have a similar problem with MIPS targets on 
MIPS hosts.

[1] 
https://gitlab.com/qemu-project/qemu/-/blob/2ad60f6f8c12ca0acd8834fdd70e088361b8791f/configure#L611

-- 
Matheus K. Ferst
Instituto de Pesquisas ELDORADO 
Analista de Software
Aviso Legal - Disclaimer

Re: [PATCH v3 4/4] net: convert to use qemu_find_file to locate bridge helper

On Wed, Jun 15, 2022 at 01:42:58PM +0200, Paolo Bonzini wrote:
> On 6/15/22 12:52, Daniel P. Berrangé wrote:
> > +case QEMU_FILE_TYPE_HELPER:
> > +rel_install_dir = "";
> > +rel_build_dir = "";
> > +default_install_dir = default_helper_dir;
> > +break;
> > +
> 
> You're replacing ad hoc rules in Akihiko's meson.build with an ad hoc enum +
> the corresponding "case"s here in qemu_find_file().  There is duplication
> anyway, in this case between Meson and QEMU (plus QEMU needs to know about
> two filesystem layouts).

IMHO this is simpler to deal with than the meson additions, and also
avoids the confusion of having files appearing in two places in the
build dir.

If we really want to have the build dir look just like the install
dir though, why write custom meson commands per file type at all,
instead add a rule that always invokes

   DESTDIR=$(BUILDDIR)/vroot ninja install

to populate a dir that's guaranteed identical to the install layout

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|

Re: [PATCH 2/9] target/riscv: debug: Introduce build_tdata1() to build tdata1 register content

On Fri, Jun 10, 2022 at 1:14 PM  wrote:
>
> From: Frank Chang 
>
> Introduce build_tdata1() to build tdata1 register content, which can be
> shared among all types of triggers.
>
> Signed-off-by: Frank Chang 
> ---
>  target/riscv/debug.c | 15 ++-
>  1 file changed, 10 insertions(+), 5 deletions(-)
>
> diff --git a/target/riscv/debug.c b/target/riscv/debug.c
> index abbcd38a17..089aae0696 100644
> --- a/target/riscv/debug.c
> +++ b/target/riscv/debug.c
> @@ -94,18 +94,23 @@ static inline target_ulong get_trigger_type(CPURISCVState 
> *env,
>  return extract_trigger_type(env, tdata1);
>  }
>
> -static inline target_ulong trigger_type(CPURISCVState *env,
> -trigger_type_t type)
> +static inline target_ulong build_tdata1(CPURISCVState *env,
> +trigger_type_t type,
> +bool dmode, target_ulong data)
>  {
>  target_ulong tdata1;
>
>  switch (riscv_cpu_mxl(env)) {
>  case MXL_RV32:
> -tdata1 = RV32_TYPE(type);
> +tdata1 = RV32_TYPE(type) |
> + (dmode ? RV32_DMODE : 0) |
> + (data & RV32_DATA_MASK);

RV32_DATA_MASK should be introduced in this patch

>  break;
>  case MXL_RV64:
>  case MXL_RV128:
> -tdata1 = RV64_TYPE(type);
> +tdata1 = RV64_TYPE(type) |
> + (dmode ? RV64_DMODE : 0) |
> + (data & RV64_DATA_MASK);

ditto

>  break;
>  default:
>  g_assert_not_reached();
> @@ -490,7 +495,7 @@ bool riscv_cpu_debug_check_watchpoint(CPUState *cs, 
> CPUWatchpoint *wp)
>
>  void riscv_trigger_init(CPURISCVState *env)
>  {
> -target_ulong tdata1 = trigger_type(env, TRIGGER_TYPE_AD_MATCH);
> +target_ulong tdata1 = build_tdata1(env, TRIGGER_TYPE_AD_MATCH, 0, 0);
>  int i;
>
>  /* init to type 2 triggers */
> --
>

Otherwise,
Reviewed-by: Bin Meng

Re: [PATCH 3/9] target/riscv: debug: Introduce tdata1, tdata2, and tdata3 CSRs

On Fri, Jun 10, 2022 at 1:15 PM  wrote:
>
> From: Frank Chang 
>
> Replace type2_trigger_t with the real tdata1, tdata2, and tdata3 CSRs,
> which allows us to support more types of triggers in the future.
>
> Signed-off-by: Frank Chang 
> ---
>  target/riscv/cpu.h |   6 ++-
>  target/riscv/debug.c   | 101 -
>  target/riscv/debug.h   |   7 ---
>  target/riscv/machine.c |  20 ++--
>  4 files changed, 48 insertions(+), 86 deletions(-)
>
> diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> index 535123a989..bac5f00722 100644
> --- a/target/riscv/cpu.h
> +++ b/target/riscv/cpu.h
> @@ -289,7 +289,11 @@ struct CPUArchState {
>
>  /* trigger module */
>  target_ulong trigger_cur;
> -type2_trigger_t type2_trig[RV_MAX_TRIGGERS];
> +target_ulong tdata1[RV_MAX_TRIGGERS];
> +target_ulong tdata2[RV_MAX_TRIGGERS];
> +target_ulong tdata3[RV_MAX_TRIGGERS];
> +struct CPUBreakpoint *cpu_breakpoint[RV_MAX_TRIGGERS];
> +struct CPUWatchpoint *cpu_watchpoint[RV_MAX_TRIGGERS];

I believe the breakpoint and watchpoint here do not make sense for
every type of trigger. They only make sense for types 2 and 6; type 3 only
has a breakpoint.

>
>  /* machine specific rdtime callback */
>  uint64_t (*rdtime_fn)(void *);
> diff --git a/target/riscv/debug.c b/target/riscv/debug.c
> index 089aae0696..6913682f75 100644
> --- a/target/riscv/debug.c
> +++ b/target/riscv/debug.c
> @@ -90,8 +90,7 @@ static inline target_ulong 
> extract_trigger_type(CPURISCVState *env,
>  static inline target_ulong get_trigger_type(CPURISCVState *env,
>  target_ulong trigger_index)
>  {
> -target_ulong tdata1 = env->type2_trig[trigger_index].mcontrol;
> -return extract_trigger_type(env, tdata1);
> +return extract_trigger_type(env, env->tdata1[trigger_index]);
>  }
>
>  static inline target_ulong build_tdata1(CPURISCVState *env,
> @@ -187,6 +186,8 @@ static inline void warn_always_zero_bit(target_ulong val, 
> target_ulong mask,
>  }
>  }
>
> +/* type 2 trigger */
> +
>  static uint32_t type2_breakpoint_size(CPURISCVState *env, target_ulong ctrl)
>  {
>  uint32_t size, sizelo, sizehi = 0;
> @@ -246,8 +247,8 @@ static target_ulong type2_mcontrol_validate(CPURISCVState 
> *env,
>
>  static void type2_breakpoint_insert(CPURISCVState *env, target_ulong index)
>  {
> -target_ulong ctrl = env->type2_trig[index].mcontrol;
> -target_ulong addr = env->type2_trig[index].maddress;
> +target_ulong ctrl = env->tdata1[index];
> +target_ulong addr = env->tdata2[index];
>  bool enabled = type2_breakpoint_enabled(ctrl);
>  CPUState *cs = env_cpu(env);
>  int flags = BP_CPU | BP_STOP_BEFORE_ACCESS;
> @@ -258,7 +259,7 @@ static void type2_breakpoint_insert(CPURISCVState *env, 
> target_ulong index)
>  }
>
>  if (ctrl & TYPE2_EXEC) {
> -cpu_breakpoint_insert(cs, addr, flags, &env->type2_trig[index].bp);
> +cpu_breakpoint_insert(cs, addr, flags, &env->cpu_breakpoint[index]);
>  }
>
>  if (ctrl & TYPE2_LOAD) {
> @@ -272,10 +273,10 @@ static void type2_breakpoint_insert(CPURISCVState *env, 
> target_ulong index)
>  size = type2_breakpoint_size(env, ctrl);
>  if (size != 0) {
>  cpu_watchpoint_insert(cs, addr, size, flags,
> -  &env->type2_trig[index].wp);
> +  &env->cpu_watchpoint[index]);
>  } else {
>  cpu_watchpoint_insert(cs, addr, 8, flags,
> -  &env->type2_trig[index].wp);
> +  &env->cpu_watchpoint[index]);
>  }
>  }
>  }
> @@ -284,34 +285,15 @@ static void type2_breakpoint_remove(CPURISCVState *env, 
> target_ulong index)
>  {
>  CPUState *cs = env_cpu(env);
>
> -if (env->type2_trig[index].bp) {
> -cpu_breakpoint_remove_by_ref(cs, env->type2_trig[index].bp);
> -env->type2_trig[index].bp = NULL;
> +if (env->cpu_breakpoint[index]) {
> +cpu_breakpoint_remove_by_ref(cs, env->cpu_breakpoint[index]);
> +env->cpu_breakpoint[index] = NULL;
>  }
>
> -if (env->type2_trig[index].wp) {
> -cpu_watchpoint_remove_by_ref(cs, env->type2_trig[index].wp);
> -env->type2_trig[index].wp = NULL;
> -}
> -}
> -
> -static target_ulong type2_reg_read(CPURISCVState *env,
> -   target_ulong index, int tdata_index)
> -{
> -target_ulong tdata;
> -
> -switch (tdata_index) {
> -case TDATA1:
> -tdata = env->type2_trig[index].mcontrol;
> -break;
> -case TDATA2:
> -tdata = env->type2_trig[index].maddress;
> -break;
> -default:
> -g_assert_not_reached();
> +if (env->cpu_watchpoint[index]) {
> +cpu_watchpoint_remove_by_ref(cs, env->cpu_watchpoint[index]);
> +env->cpu_watchpoint[index] = NULL;
>  }
> -
> -return tdata;
>  }
>
>  static voi

Re: [PATCH 4/9] target/riscv: debug: Restrict the range of tselect value can be written

On Fri, Jun 10, 2022 at 1:14 PM  wrote:
>
> From: Frank Chang 
>
> The value that can be written to the tselect CSR should be limited to
> the range given by the number of supported triggers.
>
> Signed-off-by: Frank Chang 
> ---
>  target/riscv/debug.c | 9 +++--
>  1 file changed, 3 insertions(+), 6 deletions(-)
>

Reviewed-by: Bin Meng

[PATCH] MAINTAINERS: Add softmmu/runstate.c to "Main loop"

2022-06-15 Thread Markus Armbruster

Signed-off-by: Markus Armbruster 
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4cf6174f9f..4c921c07db 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2745,6 +2745,7 @@ F: softmmu/cpu-throttle.c
 F: softmmu/cpu-timers.c
 F: softmmu/icount.c
 F: softmmu/runstate-action.c
+F: softmmu/runstate.c
 F: qapi/run-state.json
 
 Read, Copy, Update (RCU)
-- 
2.35.3

Re: [PATCH 5/9] target/riscv: debug: Introduce tinfo CSR

On Fri, Jun 10, 2022 at 1:21 PM  wrote:
>
> From: Frank Chang 
>
> tinfo.info:
>   One bit for each possible type enumerated in tdata1.
>   If the bit is set, then that type is supported by the currently
>   selected trigger.
>
> Signed-off-by: Frank Chang 
> ---
>  target/riscv/cpu_bits.h |  1 +
>  target/riscv/csr.c  |  8 
>  target/riscv/debug.c| 10 +++---
>  target/riscv/debug.h|  2 ++
>  4 files changed, 18 insertions(+), 3 deletions(-)
>

Reviewed-by: Bin Meng

Re: [PATCH 6/9] target/riscv: debug: Create common trigger actions function

On Fri, Jun 10, 2022 at 1:21 PM  wrote:
>
> From: Frank Chang 
>
> Trigger actions are shared among all triggers. Extract to a common
> function.
>
> Signed-off-by: Frank Chang 
> ---
>  target/riscv/debug.c | 55 ++--
>  target/riscv/debug.h | 13 +++
>  2 files changed, 66 insertions(+), 2 deletions(-)
>
> diff --git a/target/riscv/debug.c b/target/riscv/debug.c
> index 1668b8abda..ab23566113 100644
> --- a/target/riscv/debug.c
> +++ b/target/riscv/debug.c
> @@ -91,6 +91,35 @@ static inline target_ulong get_trigger_type(CPURISCVState 
> *env,
>  return extract_trigger_type(env, env->tdata1[trigger_index]);
>  }
>
> +static trigger_action_t get_trigger_action(CPURISCVState *env,
> +   target_ulong trigger_index)
> +{
> +target_ulong tdata1 = env->tdata1[trigger_index];
> +int trigger_type = get_trigger_type(env, trigger_index);
> +trigger_action_t action = DBG_ACTION_NONE;
> +
> +switch (trigger_type) {
> +case TRIGGER_TYPE_AD_MATCH:
> +action = (tdata1 & TYPE2_ACTION) >> 12;
> +break;
> +case TRIGGER_TYPE_INST_CNT:
> +case TRIGGER_TYPE_INT:
> +case TRIGGER_TYPE_EXCP:
> +case TRIGGER_TYPE_AD_MATCH6:
> +case TRIGGER_TYPE_EXT_SRC:
> +qemu_log_mask(LOG_UNIMP, "trigger type: %d is not supported\n",
> +  trigger_type);
> +break;
> +case TRIGGER_TYPE_NO_EXIST:
> +case TRIGGER_TYPE_UNAVAIL:
> +break;
> +default:
> +g_assert_not_reached();
> +}
> +
> +return action;
> +}
> +
>  static inline target_ulong build_tdata1(CPURISCVState *env,
>  trigger_type_t type,
>  bool dmode, target_ulong data)
> @@ -181,6 +210,28 @@ static inline void warn_always_zero_bit(target_ulong 
> val, target_ulong mask,
>  }
>  }
>
> +static void do_trigger_action(CPURISCVState *env, target_ulong trigger_index)
> +{
> +trigger_action_t action = get_trigger_action(env, trigger_index);
> +
> +switch (action) {
> +case DBG_ACTION_BP:
> +riscv_raise_exception(env, RISCV_EXCP_BREAKPOINT, 0);
> +break;
> +case DBG_ACTION_DBG_MODE:
> +case DBG_ACTION_TRACE0:
> +case DBG_ACTION_TRACE1:
> +case DBG_ACTION_TRACE2:
> +case DBG_ACTION_TRACE3:
> +case DBG_ACTION_EXT_DBG0:
> +case DBG_ACTION_EXT_DBG1:
> +qemu_log_mask(LOG_UNIMP, "action: %d is not supported\n", action);
> +break;

A case for DBG_ACTION_NONE should be added here, as get_trigger_action() may
return that value; otherwise it falls through to the default branch and hits
g_assert_not_reached().

> +default:
> +g_assert_not_reached();
> +}
> +}
> +
>  /* type 2 trigger */
>
>  static uint32_t type2_breakpoint_size(CPURISCVState *env, target_ulong ctrl)
> @@ -381,11 +432,11 @@ void riscv_cpu_debug_excp_handler(CPUState *cs)
>  if (cs->watchpoint_hit) {
>  if (cs->watchpoint_hit->flags & BP_CPU) {
>  cs->watchpoint_hit = NULL;
> -riscv_raise_exception(env, RISCV_EXCP_BREAKPOINT, 0);
> +do_trigger_action(env, DBG_ACTION_BP);
>  }
>  } else {
>  if (cpu_breakpoint_test(cs, env->pc, BP_CPU)) {
> -riscv_raise_exception(env, RISCV_EXCP_BREAKPOINT, 0);
> +do_trigger_action(env, DBG_ACTION_BP);
>  }
>  }
>  }
> diff --git a/target/riscv/debug.h b/target/riscv/debug.h
> index 9f69c64591..0e4859cf74 100644
> --- a/target/riscv/debug.h
> +++ b/target/riscv/debug.h
> @@ -44,6 +44,19 @@ typedef enum {
>  TRIGGER_TYPE_NUM
>  } trigger_type_t;
>
> +/* actions */
> +typedef enum {
> +DBG_ACTION_NONE = -1,   /* sentinel value */
> +DBG_ACTION_BP = 0,
> +DBG_ACTION_DBG_MODE,
> +DBG_ACTION_TRACE0,
> +DBG_ACTION_TRACE1,
> +DBG_ACTION_TRACE2,
> +DBG_ACTION_TRACE3,
> +DBG_ACTION_EXT_DBG0 = 8,
> +DBG_ACTION_EXT_DBG1
> +} trigger_action_t;
> +
>  /* tdata1 field masks */
>
>  #define RV32_TYPE(t)((uint32_t)(t) << 28)

Regards,
Bin

Re: [PATCH 7/9] target/riscv: debug: Check VU/VS modes for type 2 trigger

On Fri, Jun 10, 2022 at 1:25 PM  wrote:
>
> From: Frank Chang 
>
> Type 2 trigger cannot be fired in VU/VS modes.
>
> Signed-off-by: Frank Chang 
> ---
>  target/riscv/debug.c | 10 ++
>  1 file changed, 10 insertions(+)
>

Reviewed-by: Bin Meng

[PATCH v5 2/4] datadir: Use bundle mechanism

softmmu/datadir.c had its own implementation to find files in the
build tree, but now the bundle mechanism provides a unified
implementation which works for datadir and other files.

Signed-off-by: Akihiko Odaki 
---
 .travis.yml |  2 +-
 meson.build |  2 +-
 pc-bios/keymaps/meson.build |  2 ++
 pc-bios/meson.build | 19 +--
 scripts/oss-fuzz/build.sh   |  2 +-
 softmmu/datadir.c   | 35 ---
 tests/qtest/fuzz/fuzz.c | 15 ---
 tests/vm/fedora |  2 +-
 tests/vm/freebsd|  2 +-
 tests/vm/netbsd |  2 +-
 tests/vm/openbsd|  2 +-
 11 files changed, 30 insertions(+), 55 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 9afc4a54b8f..9fee2167b95 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -223,7 +223,7 @@ jobs:
 - BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$?
 - |
   if [ "$BUILD_RC" -eq 0 ] ; then
-  mv pc-bios/s390-ccw/*.img pc-bios/ ;
+  mv pc-bios/s390-ccw/*.img qemu-bundle/share/qemu ;
   ${TEST_CMD} ;
   else
   $(exit $BUILD_RC);
diff --git a/meson.build b/meson.build
index 2f9bb27554c..9bdcd26a4af 100644
--- a/meson.build
+++ b/meson.build
@@ -1683,7 +1683,7 @@ endif
 config_host_data.set_quoted('CONFIG_BINDIR', get_option('prefix') / 
get_option('bindir'))
 config_host_data.set_quoted('CONFIG_PREFIX', get_option('prefix'))
 config_host_data.set_quoted('CONFIG_QEMU_CONFDIR', get_option('prefix') / 
qemu_confdir)
-config_host_data.set_quoted('CONFIG_QEMU_DATADIR', get_option('prefix') / 
qemu_datadir)
+config_host_data.set_quoted('CONFIG_QEMU_BUNDLE_DATADIR', qemu_datadir)
 config_host_data.set_quoted('CONFIG_QEMU_DESKTOPDIR', get_option('prefix') / 
qemu_desktopdir)
 config_host_data.set_quoted('CONFIG_QEMU_FIRMWAREPATH', get_option('prefix') / 
get_option('qemu_firmwarepath'))
 config_host_data.set_quoted('CONFIG_QEMU_HELPERDIR', get_option('prefix') / 
get_option('libexecdir'))
diff --git a/pc-bios/keymaps/meson.build b/pc-bios/keymaps/meson.build
index 44247a12b54..dd103092290 100644
--- a/pc-bios/keymaps/meson.build
+++ b/pc-bios/keymaps/meson.build
@@ -67,3 +67,5 @@ if native_qemu_keymap.found()
 endif
 
 install_data(['sl', 'sv'], install_dir: qemu_datadir / 'keymaps')
+
+bundles += { qemu_datadir / 'keymaps': '../../../pc-bios/keymaps' }
diff --git a/pc-bios/meson.build b/pc-bios/meson.build
index 41ba1c0ec7b..0d2119836bd 100644
--- a/pc-bios/meson.build
+++ b/pc-bios/meson.build
@@ -20,6 +20,8 @@ if unpack_edk2_blobs
   install: get_option('install_blobs'),
   install_dir: qemu_datadir,
   command: [ bzip2, '-dc', '@INPUT0@' ])
+
+bundles += { qemu_datadir / f: '../../../pc-bios' / f  }
   endforeach
 endif
 
@@ -85,16 +87,13 @@ blobs = [
   'vof-nvram.bin',
 ]
 
-ln_s = [find_program('ln', required: true), '-sf']
-foreach f : blobs
-  roms += custom_target(f,
-build_by_default: have_system,
-output: f,
-input: files('meson.build'),# dummy input
-install: get_option('install_blobs'),
-install_dir: qemu_datadir,
-command: [ ln_s, meson.project_source_root() / 'pc-bios' / f, 
'@OUTPUT@' ])
-endforeach
+if get_option('install_blobs')
+  install_data(blobs, install_dir: qemu_datadir)
+
+  foreach f : blobs
+bundles += { qemu_datadir / f: meson.current_source_dir() / f }
+  endforeach
+endif
 
 subdir('descriptors')
 subdir('keymaps')
diff --git a/scripts/oss-fuzz/build.sh b/scripts/oss-fuzz/build.sh
index 98b56e05210..cbf8b3080e9 100755
--- a/scripts/oss-fuzz/build.sh
+++ b/scripts/oss-fuzz/build.sh
@@ -88,7 +88,7 @@ if [ "$GITLAB_CI" != "true" ]; then
 fi
 
 # Copy over the datadir
-cp  -r ../pc-bios/ "$DEST_DIR/pc-bios"
+cp  -r ../pc-bios/ "$DEST_DIR/qemu-bundle/share/qemu"
 
 targets=$(./qemu-fuzz-i386 | awk '$1 ~ /\*/  {print $2}')
 base_copy="$DEST_DIR/qemu-fuzz-i386-target-$(echo "$targets" | head -n 1)"
diff --git a/softmmu/datadir.c b/softmmu/datadir.c
index 160cac999a6..4dadf0e010c 100644
--- a/softmmu/datadir.c
+++ b/softmmu/datadir.c
@@ -35,6 +35,7 @@ char *qemu_find_file(int type, const char *name)
 int i;
 const char *subdir;
 char *buf;
+char *bundle;
 
 /* Try the name as a straight path first */
 if (access(name, R_OK) == 0) {
@@ -61,6 +62,16 @@ char *qemu_find_file(int type, const char *name)
 }
 g_free(buf);
 }
+
+bundle = g_strdup_printf("%s/%s%s",
+ CONFIG_QEMU_BUNDLE_DATADIR, subdir, name);
+buf = find_bundle(bundle);
+g_free(bundle);
+if (buf) {
+trace_load_file(name, buf);
+return buf;
+}
+
 return NULL;
 }
 
@@ -83,26 +94,6 @@ void qemu_add_data_dir(char *path)
 data_dir[data_dir_idx++] = path;
 }
 
-/*
- * Find a likely location for support files using the loca

[PATCH v5 1/4] cutils: Introduce bundle mechanism

Developers often run QEMU without installing it. The bundle mechanism
allows looking up files which should be present in an installation even
in such a situation.

It is a general mechanism and can find any files located relative
to the installation tree. The build tree must have a new directory,
qemu-bundle, to represent what files the installation tree would
have for reference by the executables.

Signed-off-by: Akihiko Odaki 
---
 include/qemu/cutils.h | 19 +++
 meson.build   | 12 
 util/cutils.c | 33 +
 3 files changed, 64 insertions(+)

diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h
index 40e10e19a7e..3b66026cd3c 100644
--- a/include/qemu/cutils.h
+++ b/include/qemu/cutils.h
@@ -213,6 +213,25 @@ const char *qemu_get_exec_dir(void);
  */
 char *get_relocated_path(const char *dir);
 
+/**
+ * find_bundle:
+ * @path: Relative path
+ *
+ * Returns a path for the specified directory or file bundled in QEMU. It
+ * first looks in the "qemu-bundle" directory in the directory of the running
+ * executable, which is typically present in the build tree. The next
+ * candidate uses the parent of the directory of the running executable as
+ * the prefix. See get_relocated_path() for the details.
+ *
+ * The returned string should be freed by the caller.
+ *
+ * Returns: a path that can access the bundle, or NULL if no matching bundle
+ * exists.
+ */
+char *find_bundle(const char *path);
+
+void list_bundle_candidates(const char *path);
+
 static inline const char *yes_no(bool b)
 {
  return b ? "yes" : "no";
diff --git a/meson.build b/meson.build
index 0c2e11ff071..2f9bb27554c 100644
--- a/meson.build
+++ b/meson.build
@@ -32,6 +32,7 @@ if get_option('qemu_suffix').startswith('/')
   error('qemu_suffix cannot start with a /')
 endif
 
+qemu_bundledir = meson.project_build_root() / 'qemu-bundle'
 qemu_confdir = get_option('sysconfdir') / get_option('qemu_suffix')
 qemu_datadir = get_option('datadir') / get_option('qemu_suffix')
 qemu_docdir = get_option('docdir') / get_option('qemu_suffix')
@@ -2844,6 +2845,8 @@ target_arch = {}
 target_softmmu_arch = {}
 target_user_arch = {}
 
+bundles = {}
+
 ###
 # Trace files #
 ###
@@ -3614,6 +3617,15 @@ if host_machine.system() == 'windows'
   alias_target('installer', nsis)
 endif
 
+###
+# Bundles #
+###
+
+foreach dst, src: bundles
+  run_command('mkdir', '-p', qemu_bundledir / fs.parent(dst), check: true)
+  run_command('ln', '-sf', src, qemu_bundledir / dst, check: true)
+endforeach
+
 #
 # Configuration summary #
 #
diff --git a/util/cutils.c b/util/cutils.c
index a58bcfd80e7..100e6c03c5c 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -1086,3 +1086,36 @@ char *get_relocated_path(const char *dir)
 }
 return g_string_free(result, false);
 }
+
+static const char * const bundle_formats[] = {
+"%s" G_DIR_SEPARATOR_S "qemu-bundle" G_DIR_SEPARATOR_S "%s",
+"%s" G_DIR_SEPARATOR_S ".." G_DIR_SEPARATOR_S "%s"
+};
+
+char *find_bundle(const char *path)
+{
+const char *dir = qemu_get_exec_dir();
+char *candidate;
+int i;
+
+for (i = 0; i < ARRAY_SIZE(bundle_formats); i++) {
+candidate = g_strdup_printf(bundle_formats[i], dir, path);
+if (access(candidate, R_OK) == 0) {
+return candidate;
+}
+g_free(candidate);
+}
+
+return NULL;
+}
+
+void list_bundle_candidates(const char *path)
+{
+const char *dir = qemu_get_exec_dir();
+int i;
+
+for (i = 0; i < ARRAY_SIZE(bundle_formats); i++) {
+printf(bundle_formats[i], dir, path);
+putc('\n', stdout);
+}
+}
-- 
2.32.1 (Apple Git-133)

[PATCH v5 0/4] cutils: Introduce bundle mechanism

Developers often run QEMU without installing it. The bundle mechanism
allows looking up files which should be present in an installation even
in such a situation.

It is a general mechanism and can find any files located relative
to the installation tree. The build tree must have a new directory,
qemu-bundle, to represent what files the installation tree would
have for reference by the executables.

v5:
* Prefer qemu-bundle if it exists. (Daniel P. Berrangé)
* Check install_blobs option before installing BIOSes (Paolo Bonzini)
* Add common code to set up qemu-bundle to the top level meson.build
  (Paolo Bonzini)

v4:
* Add Daniel P. Berrangé to CC. Hopefully this helps merging his patch:
  https://mail.gnu.org/archive/html/qemu-devel/2022-06/msg02276.html
* Rebased to the latest QEMU.

v3:
* Note that the bundle mechanism is for any files located relative to the
  installation tree including but not limited to datadir. (Peter Maydell)
* Fix "bridge" typo (Philippe Mathieu-Daudé)

v2: Rebased to the latest QEMU.

Akihiko Odaki (4):
  cutils: Introduce bundle mechanism
  datadir: Use bundle mechanism
  ui/icons: Use bundle mechanism
  net: Use bundle mechanism

 .travis.yml |  2 +-
 include/net/net.h   |  2 +-
 include/qemu/cutils.h   | 19 +++
 meson.build | 20 +---
 net/tap.c   |  6 +-
 pc-bios/keymaps/meson.build |  2 ++
 pc-bios/meson.build | 19 +--
 qemu-options.hx |  4 ++--
 scripts/oss-fuzz/build.sh   |  2 +-
 softmmu/datadir.c   | 35 ---
 tests/qtest/fuzz/fuzz.c | 15 ---
 tests/vm/fedora |  2 +-
 tests/vm/freebsd|  2 +-
 tests/vm/netbsd |  2 +-
 tests/vm/openbsd|  2 +-
 ui/cocoa.m  | 29 -
 ui/gtk.c|  6 +-
 ui/icons/meson.build| 32 
 ui/sdl2.c   | 18 +++---
 util/cutils.c   | 33 +
 20 files changed, 162 insertions(+), 90 deletions(-)

-- 
2.32.1 (Apple Git-133)

Re: [PATCH 0/2] Make local migration with TAP network device possible

2022-06-15 Thread Stefan Hajnoczi

On Tue, Jun 14, 2022 at 02:18:41PM +0300, Andrey Ryabinin wrote:
> Hi
> 
> These couple of patches aim to make local migration (within one host)
> possible on the same TAP device used by the source and destination QEMU.
> 
> The scenario looks like this
>  1. Create TAP devices and pass file descriptors to source QEMU
>  2. Launch destination QEMU (-incoming defer) and pass same descriptors to it.
>  3. Start migration
> 
> 
> Regarding the first patch: it makes it possible to receive a file descriptor
> in non-blocking state. But I probably didn't cover all FD users which might
> need to set blocking state after the patch. So I'm hoping for hints on where
> else, besides fd_start_incoming_migration(), I need to put
> qemu_socket_set_block() calls.

Nice feature. I am worried that these patches are unsafe/incomplete
though.

Tap local migration isn't explicitly visible in the code. How will other
developers know the feature is there and how to avoid breaking it when
modifying the code? Maybe a migration test case, comments that explain
the rules about accessing the tap fd, and/or assertions?

How does this interact with hw/net/vhost_net.c, which uses tap_get_fd()
to borrow the fd? I guess the idea is that the source VM is paused and
no tap activity is expected. Then migration handover happens and the
destination VM starts running and is allowed to access the tap fd.
However, the source VM still has vhost_net with the tap fd set up. I
wonder if there is any issue with interference between the two vhost_net
instances?

These kinds of questions should be answered, mostly in the code but also
in the cover letter. It should be clear why this approach is correct.

Thanks,
Stefan

> 
> 
> Andrey Ryabinin (2):
>   chardev: don't set O_NONBLOCK on SCM_RIGHTS file descriptors.
>   tap: initialize TAPState->enabled according to the actual state of
> queue
> 
>  chardev/char-socket.c |  3 ---
>  io/channel-socket.c   |  3 ---
>  migration/fd.c|  2 ++
>  net/tap-bsd.c |  5 +
>  net/tap-linux.c   | 12 
>  net/tap-solaris.c |  5 +
>  net/tap.c |  2 +-
>  net/tap_int.h |  1 +
>  8 files changed, 26 insertions(+), 7 deletions(-)
> 
> -- 
> 2.35.1
> 

signature.asc
Description: PGP signature

[PATCH v5 3/4] ui/icons: Use bundle mechanism

Signed-off-by: Akihiko Odaki 
---
 meson.build  |  2 +-
 ui/cocoa.m   | 29 -
 ui/gtk.c |  6 +-
 ui/icons/meson.build | 32 
 ui/sdl2.c| 18 +++---
 5 files changed, 57 insertions(+), 30 deletions(-)

diff --git a/meson.build b/meson.build
index 9bdcd26a4af..e17c1ebc1c9 100644
--- a/meson.build
+++ b/meson.build
@@ -1687,7 +1687,7 @@ config_host_data.set_quoted('CONFIG_QEMU_BUNDLE_DATADIR', 
qemu_datadir)
 config_host_data.set_quoted('CONFIG_QEMU_DESKTOPDIR', get_option('prefix') / 
qemu_desktopdir)
 config_host_data.set_quoted('CONFIG_QEMU_FIRMWAREPATH', get_option('prefix') / 
get_option('qemu_firmwarepath'))
 config_host_data.set_quoted('CONFIG_QEMU_HELPERDIR', get_option('prefix') / 
get_option('libexecdir'))
-config_host_data.set_quoted('CONFIG_QEMU_ICONDIR', get_option('prefix') / 
qemu_icondir)
+config_host_data.set_quoted('CONFIG_QEMU_BUNDLE_ICONDIR', qemu_icondir)
 config_host_data.set_quoted('CONFIG_QEMU_LOCALEDIR', get_option('prefix') / 
get_option('localedir'))
 config_host_data.set_quoted('CONFIG_QEMU_LOCALSTATEDIR', get_option('prefix') 
/ get_option('localstatedir'))
 config_host_data.set_quoted('CONFIG_QEMU_MODDIR', get_option('prefix') / 
qemu_moddir)
diff --git a/ui/cocoa.m b/ui/cocoa.m
index 84c84e98fc5..bd8a3211d3b 100644
--- a/ui/cocoa.m
+++ b/ui/cocoa.m
@@ -1562,21 +1562,24 @@ - (BOOL)verifyQuit
 - (IBAction) do_about_menu_item: (id) sender
 {
 NSAutoreleasePool *pool = [[NSAutoreleasePool alloc] init];
-char *icon_path_c = get_relocated_path(CONFIG_QEMU_ICONDIR 
"/hicolor/512x512/apps/qemu.png");
-NSString *icon_path = [NSString stringWithUTF8String:icon_path_c];
-g_free(icon_path_c);
-NSImage *icon = [[NSImage alloc] initWithContentsOfFile:icon_path];
+char *icon_path_c = find_bundle(CONFIG_QEMU_BUNDLE_ICONDIR 
"/hicolor/512x512/apps/qemu.png");
 NSString *version = @"QEMU emulator version " QEMU_FULL_VERSION;
 NSString *copyright = @QEMU_COPYRIGHT;
-NSDictionary *options;
-if (icon) {
-options = @{
-NSAboutPanelOptionApplicationIcon : icon,
-NSAboutPanelOptionApplicationVersion : version,
-@"Copyright" : copyright,
-};
-[icon release];
-} else {
+NSDictionary *options = nil;
+if (icon_path_c) {
+NSString *icon_path = [NSString stringWithUTF8String:icon_path_c];
+g_free(icon_path_c);
+NSImage *icon = [[NSImage alloc] initWithContentsOfFile:icon_path];
+if (icon) {
+options = @{
+NSAboutPanelOptionApplicationIcon : icon,
+NSAboutPanelOptionApplicationVersion : version,
+@"Copyright" : copyright,
+};
+[icon release];
+}
+}
+if (!options) {
 options = @{
 NSAboutPanelOptionApplicationVersion : version,
 @"Copyright" : copyright,
diff --git a/ui/gtk.c b/ui/gtk.c
index 2a791dd2aa0..8f7afe795f4 100644
--- a/ui/gtk.c
+++ b/ui/gtk.c
@@ -2321,7 +2321,11 @@ static void gtk_display_init(DisplayState *ds, 
DisplayOptions *opts)
 s->opts = opts;
 
 theme = gtk_icon_theme_get_default();
-dir = get_relocated_path(CONFIG_QEMU_ICONDIR);
+dir = find_bundle(CONFIG_QEMU_BUNDLE_ICONDIR);
+if (dir) {
+gtk_icon_theme_prepend_search_path(theme, dir);
+g_free(dir);
+}
 gtk_icon_theme_prepend_search_path(theme, dir);
 g_free(dir);
 g_set_prgname("qemu");
diff --git a/ui/icons/meson.build b/ui/icons/meson.build
index 12c52080ebd..1d99aff10ed 100644
--- a/ui/icons/meson.build
+++ b/ui/icons/meson.build
@@ -1,13 +1,29 @@
+icons = [
+  {
+'source': 'qemu_32x32.bmp',
+'install': 'hicolor' / '32x32' / 'apps' / 'qemu.bmp',
+  },
+  {
+'source': 'qemu.svg',
+'install': 'hicolor' / 'scalable' / 'apps' / 'qemu.svg',
+  },
+]
+
 foreach s: [16, 24, 32, 48, 64, 128, 256, 512]
   s = '@0@x@0@'.format(s.to_string())
-  install_data('qemu_@0@.png'.format(s),
-   rename: 'qemu.png',
-   install_dir: qemu_icondir / 'hicolor' / s / 'apps')
+  icons += {
+'source': 'qemu_@0@.png'.format(s),
+'install': 'hicolor' / s / 'apps' / 'qemu.png',
+  }
 endforeach
 
-install_data('qemu_32x32.bmp',
- rename: 'qemu.bmp',
- install_dir: qemu_icondir / 'hicolor' / '32x32' / 'apps')
+foreach icon: icons
+  source = icon.get('source')
+  install = icon.get('install')
+
+  install_data(source,
+   rename: fs.name(install),
+   install_dir: qemu_icondir / fs.parent(install))
 
-install_data('qemu.svg',
- install_dir: qemu_icondir / 'hicolor' / 'scalable' / 'apps')
+  bundles += { qemu_bundledir / qemu_icondir / install: 
meson.current_source_dir() / source }
+endforeach
diff --git a/ui/sdl2.c b/ui/sdl2.c
index 8cb77416af2..bbcb4762e1b 100644
--- a/ui/sdl2.c
+++ b/ui/sdl2.c
@@ -910,15 +910,19 @@ static vo

Re: [PATCH v4 1/4] cutils: Introduce bundle mechanism


On 2022/06/15 17:19, Paolo Bonzini wrote:

On 6/14/22 23:07, Akihiko Odaki wrote:

diff --git a/util/cutils.c b/util/cutils.c
index a58bcfd80e7..fe3bbb1c4eb 100644
--- a/util/cutils.c
+++ b/util/cutils.c
@@ -1086,3 +1086,36 @@ char *get_relocated_path(const char *dir)
  }
  return g_string_free(result, false);
  }
+
+static const char * const bundle_formats[] = {
+    "%s" G_DIR_SEPARATOR_S ".." G_DIR_SEPARATOR_S "%s",
+    "%s" G_DIR_SEPARATOR_S "qemu-bundle" G_DIR_SEPARATOR_S "%s"
+};


Why do you need both?

Paolo


The former is used when QEMU is installed. The latter is used
in the build tree.


Actually, the order was problematic, as Daniel pointed out. It is fixed in
v5, which I have just sent out.

On 2022/06/15 17:16, Daniel P. Berrangé wrote:
> This is flawed because it looks at the installed paths first, and
> falls back to uninstalled paths afterwards. So if you're building
> and running QEMU 7.1.0 from git, and have QEMU 5.0.0 installed,
> your QEMU 7.1.0 will end up finding files from the 5.0.0 install.

Regards,
Akihiko Odaki

[PATCH v5 4/4] net: Use bundle mechanism

Signed-off-by: Akihiko Odaki 
---
 include/net/net.h | 2 +-
 meson.build   | 4 +++-
 net/tap.c | 6 +-
 qemu-options.hx   | 4 ++--
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/net/net.h b/include/net/net.h
index 523136c7acb..4a5ed27a4b7 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -228,7 +228,7 @@ NetClientState *net_hub_port_find(int hub_id);
 
 #define DEFAULT_NETWORK_SCRIPT CONFIG_SYSCONFDIR "/qemu-ifup"
 #define DEFAULT_NETWORK_DOWN_SCRIPT CONFIG_SYSCONFDIR "/qemu-ifdown"
-#define DEFAULT_BRIDGE_HELPER CONFIG_QEMU_HELPERDIR "/qemu-bridge-helper"
+#define DEFAULT_BUNDLE_BRIDGE_HELPER CONFIG_QEMU_BUNDLE_HELPERDIR 
"/qemu-bridge-helper"
 #define DEFAULT_BRIDGE_INTERFACE "br0"
 
 void qdev_set_nic_properties(DeviceState *dev, NICInfo *nd);
diff --git a/meson.build b/meson.build
index e17c1ebc1c9..909c8a3c63d 100644
--- a/meson.build
+++ b/meson.build
@@ -1686,7 +1686,7 @@ config_host_data.set_quoted('CONFIG_QEMU_CONFDIR', 
get_option('prefix') / qemu_c
 config_host_data.set_quoted('CONFIG_QEMU_BUNDLE_DATADIR', qemu_datadir)
 config_host_data.set_quoted('CONFIG_QEMU_DESKTOPDIR', get_option('prefix') / 
qemu_desktopdir)
 config_host_data.set_quoted('CONFIG_QEMU_FIRMWAREPATH', get_option('prefix') / 
get_option('qemu_firmwarepath'))
-config_host_data.set_quoted('CONFIG_QEMU_HELPERDIR', get_option('prefix') / 
get_option('libexecdir'))
+config_host_data.set_quoted('CONFIG_QEMU_BUNDLE_HELPERDIR', 
get_option('libexecdir'))
 config_host_data.set_quoted('CONFIG_QEMU_BUNDLE_ICONDIR', qemu_icondir)
 config_host_data.set_quoted('CONFIG_QEMU_LOCALEDIR', get_option('prefix') / 
get_option('localedir'))
 config_host_data.set_quoted('CONFIG_QEMU_LOCALSTATEDIR', get_option('prefix') 
/ get_option('localstatedir'))
@@ -3575,6 +3575,8 @@ if have_tools
dependencies: [authz, crypto, io, qom, qemuutil,
   libcap_ng, mpathpersist],
install: true)
+
+bundles += { get_option('libexecdir') / 'qemu-bridge-helper': 
'../../qemu-bridge-helper' }
   endif
 
   if have_ivshmem
diff --git a/net/tap.c b/net/tap.c
index b3ddfd4a74b..ea013ca3873 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -507,7 +507,11 @@ static int net_bridge_run_helper(const char *helper, const 
char *bridge,
 sigprocmask(SIG_BLOCK, &mask, &oldmask);
 
 if (!helper) {
-helper = default_helper = get_relocated_path(DEFAULT_BRIDGE_HELPER);
+helper = default_helper = find_bundle(DEFAULT_BUNDLE_BRIDGE_HELPER);
+if (!helper) {
+error_setg(errp, "bridge helper not found");
+return -1;
+}
 }
 
 if (socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
diff --git a/qemu-options.hx b/qemu-options.hx
index 377d22fbd82..1959db01061 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2665,7 +2665,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
 "to configure it and 'dfile' (default=" 
DEFAULT_NETWORK_DOWN_SCRIPT ")\n"
 "to deconfigure it\n"
 "use '[down]script=no' to disable script execution\n"
-"use network helper 'helper' (default=" 
DEFAULT_BRIDGE_HELPER ") to\n"
+"use network helper 'helper' (default=" 
DEFAULT_BUNDLE_BRIDGE_HELPER ") to\n"
 "configure it\n"
 "use 'fd=h' to connect to an already opened TAP 
interface\n"
 "use 'fds=x:y:...:z' to connect to already opened 
multiqueue capable TAP interfaces\n"
@@ -2684,7 +2684,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev,
 "-netdev bridge,id=str[,br=bridge][,helper=helper]\n"
 "configure a host TAP network backend with ID 'str' that 
is\n"
 "connected to a bridge (default=" DEFAULT_BRIDGE_INTERFACE 
")\n"
-"using the program 'helper (default=" 
DEFAULT_BRIDGE_HELPER ")\n"
+"using the program 'helper (default=" 
DEFAULT_BUNDLE_BRIDGE_HELPER ")\n"
 #endif
 #ifdef __linux__
 "-netdev 
l2tpv3,id=str,src=srcaddr,dst=dstaddr[,srcport=srcport][,dstport=dstport]\n"
-- 
2.32.1 (Apple Git-133)

Re: [PATCH 1/2] chardev: don't set O_NONBLOCK on SCM_RIGHTS file descriptors.

2022-06-15 Thread Stefan Hajnoczi

On Tue, Jun 14, 2022 at 02:18:42PM +0300, Andrey Ryabinin wrote:
> This reverts commit 9b938c7262e4 ("chardev: clear O_NONBLOCK on SCM_RIGHTS 
> file descriptors").
> A file descriptor passed to QEMU via the 'getfd' QMP command is always
> changed to blocking mode. Instead, let the QEMU users of the file
> descriptor change the blocking mode when necessary, e.g. migration.
> 
> We need to preserve the state of the file descriptor in case it's still
> used by an external process and before the QEMU itself started
> using it.
> 
> E.g. our local migration scenario with TAP networking looks like this:
>  1. Create TAP devices and pass file descriptors to source QEMU
>  2. Launch destination QEMU (-incoming defer) and pass same descriptors to it.
>  3. Start migration
> 
> In such a scenario, setting blocking state at stage (2) will hang the source
> QEMU since the TAP fd suddenly becomes blocking.

Is it possible to add a special flag or API for preserving the
O_NONBLOCK open flag? That way the rest of QEMU could continue to safely
reset the flag while the tap fd passing code would explicitly ask for
the O_NONBLOCK open flag to be preserved. That seems safer but I haven't
checked whether it's possible to do this.

> 
> Signed-off-by: Andrey Ryabinin 
> ---
>  chardev/char-socket.c | 3 ---
>  io/channel-socket.c   | 3 ---
>  migration/fd.c| 2 ++
>  3 files changed, 2 insertions(+), 6 deletions(-)
> 
> diff --git a/chardev/char-socket.c b/chardev/char-socket.c
> index dc4e218eeb6..c9592fb5836 100644
> --- a/chardev/char-socket.c
> +++ b/chardev/char-socket.c
> @@ -310,9 +310,6 @@ static ssize_t tcp_chr_recv(Chardev *chr, char *buf, 
> size_t len)
>  continue;
>  }
>  
> -/* O_NONBLOCK is preserved across SCM_RIGHTS so reset it */
> -qemu_socket_set_block(fd);
> -
>  #ifndef MSG_CMSG_CLOEXEC
>  qemu_set_cloexec(fd);
>  #endif
> diff --git a/io/channel-socket.c b/io/channel-socket.c
> index dc9c165de11..8b9679460dc 100644
> --- a/io/channel-socket.c
> +++ b/io/channel-socket.c
> @@ -479,9 +479,6 @@ static void qio_channel_socket_copy_fds(struct msghdr 
> *msg,
>  continue;
>  }
>  
> -/* O_NONBLOCK is preserved across SCM_RIGHTS so reset it */
> -qemu_socket_set_block(fd);
> -
>  #ifndef MSG_CMSG_CLOEXEC
>  qemu_set_cloexec(fd);
>  #endif
> diff --git a/migration/fd.c b/migration/fd.c
> index 6f2f50475f4..793fffeb169 100644
> --- a/migration/fd.c
> +++ b/migration/fd.c
> @@ -60,6 +60,8 @@ void fd_start_incoming_migration(const char *fdname, Error 
> **errp)
>  return;
>  }
>  
> +qemu_socket_set_block(fd);
> +
>  trace_migration_fd_incoming(fd);
>  
>  ioc = qio_channel_new_fd(fd, errp);
> -- 
> 2.35.1
> 


signature.asc
Description: PGP signature

Re: [PATCH 9/9] target/riscv: debug: Add initial support of type 6 trigger

On Fri, Jun 10, 2022 at 1:25 PM  wrote:
>
> From: Frank Chang 
>
> Type 6 trigger is similar to a type 2 trigger, but provides additional
> functionality and should be used instead of type 2 in newer
> implementations.
>
> Signed-off-by: Frank Chang 
> ---
>  target/riscv/debug.c | 174 ++-
>  target/riscv/debug.h |  18 +
>  2 files changed, 188 insertions(+), 4 deletions(-)
>

Reviewed-by: Bin Meng

regarding QEMU ACPI table generation and passing acpi tables/methods to guest OS

2022-06-15 Thread ritul guru

I came across the link below about how QEMU passes ACPI tables to the guest OS.
https://wiki.qemu.org/Features/ACPITableGeneration

Can I get more docs with respect to acpi tables/devices passing to guest OS
from hypervisor or dom0?

Looking for an example how an asl file which gets added in the SSDT table
can be passed to the guest OS with the help of QEMU.



*Thanks & RegardsRitul Guru+91-9916513186*

Re: [PATCH 8/9] target/riscv: debug: Return 0 if previous value written to tselect >= number of triggers

On Fri, Jun 10, 2022 at 1:24 PM  wrote:
>
> From: Frank Chang 
>
> If the value written to tselect is greater than or equal to the number
> of supported triggers, then the following reads of tselect would return
> value 0.

Where is this behavior documented?

>
> Signed-off-by: Frank Chang 
> ---
>  target/riscv/cpu.h   | 1 +
>  target/riscv/debug.c | 6 ++
>  2 files changed, 7 insertions(+)
>
> diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h
> index bac5f00722..c7ee3f80e6 100644
> --- a/target/riscv/cpu.h
> +++ b/target/riscv/cpu.h
> @@ -289,6 +289,7 @@ struct CPUArchState {
>
>  /* trigger module */
>  target_ulong trigger_cur;
> +target_ulong trigger_prev;
>  target_ulong tdata1[RV_MAX_TRIGGERS];
>  target_ulong tdata2[RV_MAX_TRIGGERS];
>  target_ulong tdata3[RV_MAX_TRIGGERS];
> diff --git a/target/riscv/debug.c b/target/riscv/debug.c
> index ce9ff15d75..83b72fa1b9 100644
> --- a/target/riscv/debug.c
> +++ b/target/riscv/debug.c
> @@ -158,6 +158,10 @@ bool tdata_available(CPURISCVState *env, int tdata_index)
>
>  target_ulong tselect_csr_read(CPURISCVState *env)
>  {
> +if (env->trigger_prev >= RV_MAX_TRIGGERS) {
> +return 0;
> +}
> +
>  return env->trigger_cur;
>  }
>
> @@ -166,6 +170,8 @@ void tselect_csr_write(CPURISCVState *env, target_ulong 
> val)
>  if (val < RV_MAX_TRIGGERS) {
>  env->trigger_cur = val;
>  }
> +
> +env->trigger_prev = val;
>  }
>
>  static target_ulong tdata1_validate(CPURISCVState *env, target_ulong val,
> --

The spec mentions "implementations which have 2^n triggers only need
to implement n bits of tselect", so in QEMU we can always implement
2^n triggers and have tselect implement just n bits.

In this way, writing tselect can be: env->trigger_cur = val &
(RV_MAX_TRIGGERS - 1).

and I believe you can squash this patch into patch 4 "target/riscv:
debug: Restrict the range of tselect value can be written" because in
patch 4 you changed the actual tselect range while the original
implementation allowed all bits to be set.

Regards,
Bin

Re: [PATCH 2/5] tests/qemu-iotests: skip 108 when FUSE is not loaded

2022-06-15 Thread John Snow

On Tue, Jun 14, 2022 at 10:30 AM John Snow  wrote:
>
> On Tue, Jun 14, 2022 at 4:59 AM Daniel P. Berrangé  
> wrote:
> >
> > On Tue, Jun 14, 2022 at 06:46:35AM +0200, Thomas Huth wrote:
> > > On 14/06/2022 03.50, John Snow wrote:
> > > > In certain container environments we may not have FUSE at all, so skip
> > > > the test in this circumstance too.
> > > >
> > > > Signed-off-by: John Snow 
> > > > ---
> > > >   tests/qemu-iotests/108 | 6 ++
> > > >   1 file changed, 6 insertions(+)
> > > >
> > > > diff --git a/tests/qemu-iotests/108 b/tests/qemu-iotests/108
> > > > index 9e923d6a59f..e401c5e9933 100755
> > > > --- a/tests/qemu-iotests/108
> > > > +++ b/tests/qemu-iotests/108
> > > > @@ -60,6 +60,12 @@ if sudo -n losetup &>/dev/null; then
> > > >   else
> > > >   loopdev=false
> > > > +# Check for fuse support in the host environment:
> > > > +lsmod | grep fuse &>/dev/null;
> > >
> > > That doesn't work if fuse has been linked statically into the kernel. 
> > > Would
> > > it make sense to test for /sys/fs/fuse instead?
> > >
> > > (OTOH, we likely hardly won't run this on statically linked kernels 
> > > anyway,
> > > so it might not matter too much)
> >
> > But more importantly 'lsmod' may not be installed in our container
> > images. So checking /sys/fs/fuse avoids introducing a dep on the
> > 'kmod' package.
> >
> > >
> > > > +if [[ $? -ne 0 ]]; then
> > >
> > > I'd prefer single "[" instead of "[[" ... but since we're requiring bash
> > > anyway, it likely doesn't matter.
> >
> > Or
> >
> > if  test $? != 0 ; then
> >
> > >
> > > > +_notrun 'No Passwordless sudo nor FUSE kernel module'
> > > > +fi
> > > > +
> > > >   # QSD --export fuse will either yield "Parameter 'id' is missing"
> > > >   # or "Invalid parameter 'fuse'", depending on whether there is
> > > >   # FUSE support or not.
> > >
>
> Good suggestions, thanks!
>

I think I need to test against /dev/fuse instead, because /sys/fs/fuse
actually exists, but because of docker permissions (etc), FUSE isn't
actually usable from the child container.

I wound up with this:

# Check for usable FUSE in the host environment:
if test ! -c "/dev/fuse"; then
_notrun 'No passwordless sudo nor usable /dev/fuse'
fi

Seems to work for my case here, at least, but I don't have a good
sense for how broadly flexible it might be. It might be nicer to
concoct some kind of NOP fuse mount instead, but I wasn't able to
figure out such a command quickly.

The next problem I have is actually related; test-qga (for the
Centos.x86_64 run) is failing because the guest agent is reading
/proc/self/mountinfo -- which contains entries for block devices that
are not visible in the current container scope. I think when QGA goes
to read info about these devices to populate a response, it chokes.
This might be a genuine bug in QGA if we want it to tolerate existing
inside of a container.

--js

Re: regarding QEMU ACPI table generation and passing acpi tables/methods to guest OS

2022-06-15 Thread Igor Mammedov

On Wed, 15 Jun 2022 18:23:28 +0530
ritul guru  wrote:

> Came across below link about QEMU to pass acpi tables to guest OS.
> https://wiki.qemu.org/Features/ACPITableGeneration

that link is a bit outdated (the project was completed, but then later QEMU
moved on to a built-in library for composing ACPI tables)

> Can I get more docs with respect to acpi tables/devices passing to guest OS
> from hypervisor or dom0?
> 
> Looking for an example how an asl file which gets added in the SSDT table
> can be passed to the guest OS with the help of QEMU.


You can look at AML library QEMU utilizes currently to build DSDT/SSDT tables
  ./hw/acpi/aml-build.c
  ./include/hw/acpi/aml-build.h

and see build_dsdt* functions for examples how it's used to compose tables.

> 
> 
> 
> *Thanks & RegardsRitul Guru+91-9916513186*

Re: [PATCH v6 0/8] KVM: mm: fd-based approach for supporting KVM guest private memory

2022-06-15 Thread Sean Christopherson

On Wed, Jun 15, 2022, Chao Peng wrote:
> On Tue, Jun 14, 2022 at 01:59:41PM -0700, Andy Lutomirski wrote:
> > On Tue, Jun 14, 2022 at 12:09 PM Sean Christopherson  
> > wrote:
> > >
> > > On Tue, Jun 14, 2022, Andy Lutomirski wrote:
> > > > This patch series is fairly close to implementing a rather more
> > > > efficient solution.  I'm not familiar enough with hypervisor userspace
> > > > to really know if this would work, but:
> > > >
> > > > What if shared guest memory could also be file-backed, either in the
> > > > same fd or with a second fd covering the shared portion of a memslot?
> > > > This would allow changes to the backing store (punching holes, etc) to
> > > > be some without mmap_lock or host-userspace TLB flushes?  Depending on
> > > > what the guest is doing with its shared memory, userspace might need
> > > > the memory mapped or it might not.
> > >
> > > That's what I'm angling for with the F_SEAL_FAULT_ALLOCATIONS idea.  The 
> > > issue,
> > > unless I'm misreading code, is that punching a hole in the shared memory 
> > > backing
> > > store doesn't prevent reallocating that hole on fault, i.e. a helper 
> > > process that
> > > keeps a valid mapping of guest shared memory can silently fill the hole.
> > >
> > > What we're hoping to achieve is a way to prevent allocating memory 
> > > without a very
> > > explicit action from userspace, e.g. fallocate().
> > 
> > Ah, I misunderstood.  I thought your goal was to mmap it and prevent
> > page faults from allocating.

I don't think you misunderstood, that's also one of the goals.  The use case is
that multiple processes in the host mmap() guest memory, and we'd like to be 
able
to punch a hole without having to rendezvous with all processes and also to 
prevent
an unintentional re-allocation.

> I think we still need the mmap, but want to prevent allocating when
> userspace touches previously mmaped area that has never filled the page.

Yes, or if a chunk was filled at some point but then was removed via PUNCH_HOLE.

> I don't have clear answer if other operations like read/write should be
> also prevented (probably yes). And only after an explicit fallocate() to
> allocate the page these operations would act normally.

I always forget about read/write.  I believe reads should be ok, the semantics 
of
holes are that they return zeros, i.e. can use ZERO_PAGE() and not allocate a 
new
backing page.  Not sure what to do about writes though.  Allocating on direct 
writes
might be ok for our use case, but that could also result in a rather weird API.

> > It is indeed the case (and has been since before quite a few of us
> > were born) that a hole in a sparse file is logically just a bunch of
> > zeros.  A way to make a file for which a hole is an actual hole seems
> > like it would solve this problem nicely.  It could also be solved more
> > specifically for KVM by making sure that the private/shared mode that
> > userspace programs is strict enough to prevent accidental allocations
> > -- if a GPA is definitively private, shared, neither, or (potentially,
> > on TDX only) both, then a page that *isn't* shared will never be
> > accidentally allocated by KVM.
> 
> KVM is clever enough to not allocate since it knows a GPA is shared or
> not. This case it's the host userspace that can cause the allocating and
> is too complex to check on every access from guest.

Yes, KVM is not in the picture at all.  KVM won't trigger allocation, but KVM 
also
is not in a position to prevent userspace from touching memory.

> > If the shared backing is not mmapped,
> > it also won't be accidentally allocated by host userspace on a stray
> > or careless write.
> 
> As said above, mmap is still prefered, otherwise too many changes are
> needed for usespace VMM.

Forcing userspace to change doesn't bother me too much, the biggest concern is
having to take mmap_lock for write in a per-host process.

Re: [PULL 00/10] Block jobs & NBD patches

2022-06-15 Thread Richard Henderson


On 6/15/22 02:47, Vladimir Sementsov-Ogievskiy wrote:
Also, could/should I run all these test pipelines on gitlab by hand before sending a PULL 
request? Or can I rerun them on my qemu fork for debugging?


The first thing I'd try is make vm-build- and make 
docker-test-full@.

Either or both will reproduce the docker environment being used on gitlab.
If that fails to reproduce, it could be a difference in kernels, at which point I don't 
know how to advise.


It would be a good idea to run those test pipelines manually before the next 
PULL.


r~

Re: [External] [PATCH v13 3/8] QIOChannelSocket: Implement io_writev zero copy flag & io_flush for CONFIG_LINUX

2022-06-15 Thread chuang xu



On 2022/6/14 下午10:14, Dr. David Alan Gilbert wrote:

I don't think we can tell which one of them triggered the error; so the
only thing I can suggest is that we document the need for optmem_max
setting; I wonder how we get a better answer than 'a few 100KB'?
I guess it's something like the number of packets inflight *
sizeof(cmsghdr) ?

Dave


Three cases with errno ENOBUFS are described in the official 
doc(https://www.kernel.org/doc/html/v5.12/networking/msg_zerocopy.html):


1.The socket option was not set

2.The socket exceeds its optmem limit

3.The user exceeds its ulimit on locked pages

For case 1, if the code logic is correct, this possibility can be ignored.

For case 2, I asked a kernel developer about the reason for "a few 
100KB". He said that the recommended value should be for the purpose of 
improving the performance of zero_copy send. If the NIC sends data slower 
than the data generation speed, even if optmem is set to 100KB, there is 
a probability that sendmsg returns with errno ENOBUFS.


For case 3, If I do not set max locked memory for the qemu, the max 
locked memory will be unlimited. I set the max locked memory for qemu 
and found that once the memory usage exceeds the max locked memory, oom 
will occur.  Does this mean that sendmsg cannot return with errno 
ENOBUFS at all when user exceeds its ulimit on locked pages?


If the above is true, can we treat the errno as case 2?

I modified the code logic to call sendmsg again when the errno is 
ENOBUFS and set optmem to the initial 20KB(echo 20480 > 
/proc/sys/net/core/optmem_max), now the multifd zero_copy migration goes 
well.


Here are the changes I made to the code:


Signed-off-by: chuang xu 
---
 io/channel-socket.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/io/channel-socket.c b/io/channel-socket.c
index dc9c165de1..9267f55a1d 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -595,9 +595,7 @@ static ssize_t qio_channel_socket_writev(QIOChannel 
*ioc,

 #ifdef QEMU_MSG_ZEROCOPY
 case ENOBUFS:
 if (sflags & MSG_ZEROCOPY) {
-    error_setg_errno(errp, errno,
- "Process can't lock enough memory for 
using MSG_ZEROCOPY");

-    return -1;
+    goto retry;
 }
 break;
 #endif
--

Dave, what's your take?

Best Regards,

chuang xu

Re: [PATCH v16 3/9] linux-user: Add LoongArch elf support

2022-06-15 Thread Richard Henderson


On 6/15/22 02:44, gaosong wrote:

Hi Richard.

On 2022/6/15 上午12:21, Richard Henderson wrote:

On 6/14/22 02:05, Song Gao wrote:

+#define ELF_HWCAP get_elf_hwcap()
+
+static uint32_t get_elf_hwcap(void)
+{
+    return 0;
+}


This should not be zero.  See cpu_probe_common in the kernel.  At minimum 
HWCAP_LOONGARCH_CRC32 and HWCAP_LOONGARCH_FPU are missing.  I don't know how many of the 
other features are implemented in target/loongarch/.



HWCAP_LOONGARCH_LAM  and  HWCAP_LOONGARCH_UAL  are needed.


Ok, good.  For clarity, you should determine these bits just like the kernel does from the 
config registers set at cpu reset, via cpu->env.cpucfg[*].



r~

[PATCH v2 0/2] hw/nvme: Add shadow doorbell buffer support

This patch adds shadow doorbell buffer support in NVMe 1.3 to QEMU
NVMe. The Doorbell Buffer Config admin command is implemented for the
guest to enable shadow doorbell buffer. When this feature is enabled, each
SQ/CQ is associated with two buffers, i.e., Shadow Doorbell buffer and
EventIdx buffer. According to the Spec, each queue's doorbell register
is only updated when the Shadow Doorbell buffer value changes from being
less than or equal to the value of the corresponding EventIdx buffer
entry to being greater than that value. Therefore, the number of MMIO's
on the doorbell registers is greatly reduced.

This patch is adapted from Huaicheng Li's patch[1] in 2018.

[1] 
https://patchwork.kernel.org/project/qemu-devel/patch/20180305194906.ga3...@gmail.com/

IOPS comparison with FIO:

iodepth1  2  4  8
  QEMU   25.1k  25.9k  24.5k  24.0k
 +dbbuf  29.1k  60.1k  99.8k  82.5k

MMIO's per IO measured by perf-kvm:

iodepth1  2  4  8
  QEMU   2.01   1.99   1.99   1.99
 +dbbuf  1.00   0.52   0.27   0.46

The tests are done on Ubuntu 22.04 with 5.15.0-33 kernel with Intel(R) 
Xeon(R) Gold 6248R CPU @ 3.00GHz.

QEMU set up:

bin/x86_64-softmmu/qemu-system-x86_64 \
-name "nvme-test" \
-machine accel=kvm \
-cpu host \
-smp 4 \
-m 8G \
-daemonize \
-device virtio-scsi-pci,id=scsi0 \
-device scsi-hd,drive=hd0 \
-drive 
file=$OSIMGF,if=none,aio=native,cache=none,format=qcow2,id=hd0,snapshot=on \
-drive "id=nvm,if=none,file=null-co://,file.read-zeroes=on,format=raw" \
-device nvme,serial=deadbeef,drive=nvm \
-net user,hostfwd=tcp::8080-:22 \
-net nic,model=virtio

FIO configuration:

[global]
ioengine=libaio
filename=/dev/nvme0n1
thread=1
group_reporting=1
direct=1
verify=0
time_based=1
ramp_time=0
runtime=30
;size=1G
;iodepth=1
rw=randread
bs=4k

[test]
numjobs=1

Changes since v1:
  - Add compatibility with hosts that do not use admin queue shadow doorbell

Jinhao Fan (2):
  hw/nvme: Implement shadow doorbell buffer support
  hw/nvme: Add trace events for shadow doorbell buffer

 hw/nvme/ctrl.c   | 117 ++-
 hw/nvme/nvme.h   |   8 +++
 hw/nvme/trace-events |   5 ++
 include/block/nvme.h |   2 +
 4 files changed, 131 insertions(+), 1 deletion(-)

-- 
2.25.1

[PATCH v2 1/2] hw/nvme: Implement shadow doorbell buffer support

Implement Doorbell Buffer Config command (Section 5.7 in NVMe Spec 1.3)
and Shadow Doorbell buffer & EventIdx buffer handling logic (Section 7.13
in NVMe Spec 1.3). For queues created before the Doorbell Buffer Config
command, the nvme_dbbuf_config function tries to associate each existing
SQ and CQ with its Shadow Doorbell buffer and EventIdx buffer address.
Queues created after the Doorbell Buffer Config command will have the
doorbell buffers associated with them when they are initialized.

In nvme_process_sq and nvme_post_cqe, proactively check for Shadow
Doorbell buffer changes instead of waiting for doorbell register changes.
This reduces the number of MMIOs.

In nvme_process_db(), update the shadow doorbell buffer value with
the doorbell register value if it is the admin queue. This is a hack
since hosts like Linux NVMe driver and SPDK do not use shadow
doorbell buffer for the admin queue. Copying the doorbell register
value to the shadow doorbell buffer allows us to support these hosts
as well as spec-compliant hosts that use shadow doorbell buffer for
the admin queue.

Signed-off-by: Jinhao Fan 
---
 hw/nvme/ctrl.c   | 112 ++-
 hw/nvme/nvme.h   |   8 
 include/block/nvme.h |   2 +
 3 files changed, 121 insertions(+), 1 deletion(-)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 03760ddeae..7be2e43f52 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -223,6 +223,7 @@ static const uint32_t nvme_cse_acs[256] = {
 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_NS_ATTACHMENT]= NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
+[NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
 [NVME_ADM_CMD_FORMAT_NVM]   = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 };
 
@@ -1304,6 +1305,12 @@ static inline void nvme_blk_write(BlockBackend *blk, 
int64_t offset,
 }
 }
 
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
+sizeof(cq->head));
+}
+
 static void nvme_post_cqes(void *opaque)
 {
 NvmeCQueue *cq = opaque;
@@ -1316,6 +1323,10 @@ static void nvme_post_cqes(void *opaque)
 NvmeSQueue *sq;
 hwaddr addr;
 
+if (cq->cqid && n->dbbuf_enabled) {
+nvme_update_cq_head(cq);
+}
+
 if (nvme_cq_full(cq)) {
 break;
 }
@@ -4237,6 +4248,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
  uint16_t sqid, uint16_t cqid, uint16_t size)
 {
+uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
 int i;
 NvmeCQueue *cq;
 
@@ -4256,6 +4268,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
uint64_t dma_addr,
 }
 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
 
+if (n->dbbuf_enabled) {
+sq->db_addr = n->dbbuf_dbs + 2 * sqid * stride;
+sq->ei_addr = n->dbbuf_eis + 2 * sqid * stride;
+}
+
 assert(n->cq[cqid]);
 cq = n->cq[cqid];
 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
@@ -4599,6 +4616,7 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, 
uint64_t dma_addr,
  uint16_t cqid, uint16_t vector, uint16_t size,
  uint16_t irq_enabled)
 {
+uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
 int ret;
 
 if (msix_enabled(&n->parent_obj)) {
@@ -4615,6 +4633,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, 
uint64_t dma_addr,
 cq->head = cq->tail = 0;
 QTAILQ_INIT(&cq->req_list);
 QTAILQ_INIT(&cq->sq_list);
+if (n->dbbuf_enabled) {
+cq->db_addr = n->dbbuf_dbs + (2 * cqid + 1) * stride;
+cq->ei_addr = n->dbbuf_eis + (2 * cqid + 1) * stride;
+}
 n->cq[cqid] = cq;
 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
 }
@@ -5767,6 +5789,47 @@ out:
 return status;
 }
 
+static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
+{
+uint32_t stride = 4 << NVME_CAP_DSTRD(n->bar.cap);
+uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
+uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+int i;
+
+/* Address should be page aligned */
+if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
+return NVME_INVALID_FIELD | NVME_DNR;
+}
+
+/* Save shadow buffer base addr for use during queue creation */
+n->dbbuf_dbs = dbs_addr;
+n->dbbuf_eis = eis_addr;
+n->dbbuf_enabled = true;
+
+for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
+NvmeSQueue *sq = n->sq[i];
+NvmeCQueue *cq = n->cq[i];
+
+if (sq) {
+/* Submission queue tail pointer location, 2 * QID * stride */
+sq->db_addr = dbs_addr + 2 * i * stride;
+sq->ei_addr = eis_addr + 2 * i * stride;
+pci_dma_write(&n->parent_obj,

[PATCH v2 2/2] hw/nvme: Add trace events for shadow doorbell buffer

When shadow doorbell buffer is enabled, doorbell registers are lazily
updated. The actual queue head and tail pointers are stored in Shadow
Doorbell buffers.

Add trace events for updates on the Shadow Doorbell buffers and EventIdx
buffers. Also add trace event for the Doorbell Buffer Config command.

Signed-off-by: Jinhao Fan 
---
 hw/nvme/ctrl.c   | 5 +
 hw/nvme/trace-events | 5 +
 2 files changed, 10 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 7be2e43f52..77fa79143d 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1309,6 +1309,7 @@ static void nvme_update_cq_head(NvmeCQueue *cq)
 {
 pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
 sizeof(cq->head));
+trace_pci_nvme_shadow_doorbell_cq(cq->cqid, cq->head);
 }
 
 static void nvme_post_cqes(void *opaque)
@@ -5827,6 +5828,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const 
NvmeRequest *req)
 }
 }
 
+trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
+
 return NVME_SUCCESS;
 }
 
@@ -5887,12 +5890,14 @@ static void nvme_update_sq_eventidx(const NvmeSQueue 
*sq)
 {
 pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
   sizeof(sq->tail));
+trace_pci_nvme_eventidx_sq(sq->sqid, sq->tail);
 }
 
 static void nvme_update_sq_tail(NvmeSQueue *sq)
 {
 pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
  sizeof(sq->tail));
+trace_pci_nvme_shadow_doorbell_sq(sq->sqid, sq->tail);
 }
 
 static void nvme_process_sq(void *opaque)
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index ff1b458969..00ee42f475 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -3,6 +3,7 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
 pci_nvme_irq_pin(void) "pulsing IRQ pin"
 pci_nvme_irq_masked(void) "IRQ is masked"
 pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" 
prp2=0x%"PRIx64""
+pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) 
"dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64""
 pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
 pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len 
%"PRIu64""
 pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t 
prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 
0x%"PRIx64" num_prps %d"
@@ -81,6 +82,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
 pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
 pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
 pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint32_t dw0, 
uint32_t dw1, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" dw0 0x%"PRIx32" 
dw1 0x%"PRIx32" status 0x%"PRIx16""
+pci_nvme_eventidx_cq(uint16_t cqid, uint16_t new_eventidx) "cqid %"PRIu16" 
new_eventidx %"PRIu16""
+pci_nvme_eventidx_sq(uint16_t sqid, uint16_t new_eventidx) "sqid %"PRIu16" 
new_eventidx %"PRIu16""
 pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
 pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 
0x%"PRIx64" data 0x%"PRIx64" size %d"
 pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" 
new_head %"PRIu16""
@@ -97,6 +100,8 @@ pci_nvme_mmio_start_success(void) "setting controller enable 
bit succeeded"
 pci_nvme_mmio_stopped(void) "cleared controller enable bit"
 pci_nvme_mmio_shutdown_set(void) "shutdown bit set"
 pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+pci_nvme_shadow_doorbell_cq(uint16_t cqid, uint16_t new_shadow_doorbell) "cqid 
%"PRIu16" new_shadow_doorbell %"PRIu16""
+pci_nvme_shadow_doorbell_sq(uint16_t sqid, uint16_t new_shadow_doorbell) "sqid 
%"PRIu16" new_shadow_doorbell %"PRIu16""
 pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, 
slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, 
slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
 pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, 
slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
-- 
2.25.1

[PATCH V8 00/39] Live Update

Provide the cpr-save, cpr-exec, and cpr-load commands for live update.
These save and restore VM state, with minimal guest pause time, so that
qemu may be updated to a new version in between.

cpr-save stops the VM and saves vmstate to an ordinary file.  It supports
any type of guest image and block device, but the caller must not modify
guest block devices between cpr-save and cpr-load.  It supports two modes:
reboot and restart.

In reboot mode, the caller invokes cpr-save and then terminates qemu.
The caller may then update the host kernel and system software and reboot.
The caller resumes the guest by running qemu with the same arguments as the
original process, plus -S so new qemu starts in a paused state, and invoking
cpr-load.  For maximum efficiency in this mode, guest ram should be mapped to
a persistent shared memory file such as /dev/dax0.0, or /dev/shm PKRAM as
proposed in 
https://lore.kernel.org/lkml/1617140178-8773-1-git-send-email-anthony.yzn...@oracle.com.

The reboot mode supports vfio devices if the caller first suspends the
guest, such as by issuing guest-suspend-ram to the qemu guest agent.  The
guest drivers' suspend methods flush outstanding requests and re-initialize
the devices, and thus there is no device state to save and restore.

Restart mode preserves the guest VM across a restart of the qemu process.
After cpr-save, the caller passes the original qemu command-line arguments
plus -S to cpr-exec. The restart mode supports vfio devices by preserving the
vfio container, group, device, and event descriptors across the qemu re-exec,
and by updating DMA mapping virtual addresses using VFIO_DMA_UNMAP_FLAG_VADDR
and VFIO_DMA_MAP_FLAG_VADDR as defined in 
https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sist...@oracle.com/
and integrated in Linux kernel 5.12.

For restart mode, the user must create guest ram using a memory-backend-memfd
or a shared memory-backend-file.  These are re-mmap'd in the updated process,
so guest ram is efficiently preserved in place, albeit with new virtual
addresses.  In addition, qemu allocates secondary guest ram blocks -- those
that cannot be specified as objects on the command line -- using memfd_create.
The memfd's are remembered and kept open across exec, after which they are
re-mmap'd.

The caller resumes the guest by invoking cpr-load, which loads state from
the file. If the VM was running at cpr-save time, then VM execution resumes.
If the VM was suspended at cpr-save time (reboot mode), then the caller must
issue a system_wakeup command to resume.

The first patches add reboot mode:
  - migration: fix populate_vfio_info
  - migration: qemu file wrappers
  - migration: simplify savevm
  - memory: RAM_ANON flag
  - vl: start on wakeup request
  - cpr: reboot mode
  - cpr: reboot HMP interfaces
  - cpr: blockers
  - cpr: register blockers
  - cpr: cpr-enable option
  - cpr: save ram blocks

The next patches add restart mode:
  - memory: flat section iterator
  - oslib: qemu_clear_cloexec
  - qapi: strList_from_string
  - qapi: QAPI_LIST_LENGTH
  - qapi: strv_from_strList
  - qapi: strList unit tests
  - vl: helper to request re-exec
  - cpr: preserve extra state
  - cpr: restart mode
  - cpr: restart HMP interfaces
  - cpr: ram block blockers
  - hostmem-memfd: cpr for memory-backend-memfd

The next patches add vfio support for restart mode:
  - pci: export export msix_is_pending
  - cpr: notifiers
  - vfio-pci: refactor for cpr
  - vfio-pci: cpr part 1 (fd and dma)
  - vfio-pci: cpr part 2 (msi)
  - vfio-pci: cpr part 3 (intx)
  - vfio-pci: recover from unmap-all-vaddr failure

The next patches preserve various descriptor-based backend devices across
cpr-exec:
  - vhost: reset vhost devices for cpr
  - loader: suppress rom_reset during cpr
  - chardev: cpr framework
  - chardev: cpr for simple devices
  - chardev: cpr for pty
  - chardev: cpr for sockets
  - cpr: only-cpr-capable option

The next patches add a test:
  - python/machine: add QEMUMachine accessors
  - tests/avocado: add cpr regression test

Here is an example of updating qemu from v7.0.0 to v7.1.0 using
restart mode.  The software update is performed while the guest is
running to minimize downtime.

window 1| window 2
|
# qemu-system-x86_64 ...|
QEMU 7.0.0 monitor - type 'help' ...|
(qemu) info status  |
VM status: running  |
| # yum update qemu
(qemu) cpr-save /tmp/qemu.sav restart   |
(qemu) cpr-exec qemu-system-x86_64 -S ...   |
QEMU 7.1.0 monitor - type 'help' ...|
(qemu) info status  |
VM status: paused (prelaunch)   |
(qemu) cpr-load /tmp/qemu.sav   |
(qemu) info status  |
VM status: running  |


Here

[PATCH V8 02/39] migration: qemu file wrappers

Add qemu_file_open and qemu_fd_open to create QEMUFile objects for unix
files and file descriptors.

Signed-off-by: Steve Sistare 
---
 migration/qemu-file-channel.c | 36 
 migration/qemu-file-channel.h |  6 ++
 2 files changed, 42 insertions(+)

diff --git a/migration/qemu-file-channel.c b/migration/qemu-file-channel.c
index bb5a575..cc5aebc 100644
--- a/migration/qemu-file-channel.c
+++ b/migration/qemu-file-channel.c
@@ -27,8 +27,10 @@
 #include "qemu-file.h"
 #include "io/channel-socket.h"
 #include "io/channel-tls.h"
+#include "io/channel-file.h"
 #include "qemu/iov.h"
 #include "qemu/yank.h"
+#include "qapi/error.h"
 #include "yank_functions.h"
 
 
@@ -192,3 +194,37 @@ QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc)
 object_ref(OBJECT(ioc));
 return qemu_fopen_ops(ioc, &channel_output_ops, true);
 }
+
+QEMUFile *qemu_fopen_file(const char *path, int flags, int mode,
+  const char *name, Error **errp)
+{
+g_autoptr(QIOChannelFile) fioc = NULL;
+QIOChannel *ioc;
+QEMUFile *f;
+
+if (flags & O_RDWR) {
+error_setg(errp, "qemu_fopen_file %s: O_RDWR not supported", path);
+return NULL;
+}
+
+fioc = qio_channel_file_new_path(path, flags, mode, errp);
+if (!fioc) {
+return NULL;
+}
+
+ioc = QIO_CHANNEL(fioc);
+qio_channel_set_name(ioc, name);
+f = (flags & O_WRONLY) ? qemu_fopen_channel_output(ioc) :
+ qemu_fopen_channel_input(ioc);
+return f;
+}
+
+QEMUFile *qemu_fopen_fd(int fd, bool writable, const char *name)
+{
+g_autoptr(QIOChannelFile) fioc = qio_channel_file_new_fd(fd);
+QIOChannel *ioc = QIO_CHANNEL(fioc);
+QEMUFile *f = writable ? qemu_fopen_channel_output(ioc) :
+ qemu_fopen_channel_input(ioc);
+qio_channel_set_name(ioc, name);
+return f;
+}
diff --git a/migration/qemu-file-channel.h b/migration/qemu-file-channel.h
index 0028a09..75fd0ad 100644
--- a/migration/qemu-file-channel.h
+++ b/migration/qemu-file-channel.h
@@ -29,4 +29,10 @@
 
 QEMUFile *qemu_fopen_channel_input(QIOChannel *ioc);
 QEMUFile *qemu_fopen_channel_output(QIOChannel *ioc);
+
+QEMUFile *qemu_fopen_file(const char *path, int flags, int mode,
+ const char *name, Error **errp);
+
+QEMUFile *qemu_fopen_fd(int fd, bool writable, const char *name);
+
 #endif
-- 
1.8.3.1

[PATCH V8 08/39] cpr: blockers

Add an interface to register a blocker for cpr-save for one or more modes.
Devices and options that do not support a cpr mode can register a blocker,
and cpr-save will fail with a descriptive error message.  Conversely, if
such a device is deleted and un-registers its blocker, cpr will be allowed
again.

Signed-off-by: Steve Sistare 
---
 MAINTAINERS |  1 +
 include/migration/cpr.h |  6 
 migration/cpr.c | 79 +
 stubs/cpr.c | 23 ++
 stubs/meson.build   |  1 +
 5 files changed, 110 insertions(+)
 create mode 100644 stubs/cpr.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 9273891..1e4e72f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3159,6 +3159,7 @@ S: Maintained
 F: include/migration/cpr.h
 F: migration/cpr.c
 F: qapi/cpr.json
+F: stubs/cpr.c
 
 Record/replay
 M: Pavel Dovgalyuk 
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
index 1b6c82f..dfe5a1d 100644
--- a/include/migration/cpr.h
+++ b/include/migration/cpr.h
@@ -13,4 +13,10 @@
 void cpr_set_mode(CprMode mode);
 CprMode cpr_get_mode(void);
 
+#define CPR_MODE_ALL CPR_MODE__MAX
+
+int cpr_add_blocker(Error **reasonp, Error **errp, CprMode mode, ...);
+int cpr_add_blocker_str(const char *reason, Error **errp, CprMode mode, ...);
+void cpr_del_blocker(Error **reasonp);
+
 #endif
diff --git a/migration/cpr.c b/migration/cpr.c
index 24b0bcc..c1da784 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -29,12 +29,91 @@ void cpr_set_mode(CprMode mode)
 cpr_mode = mode;
 }
 
+static GSList *cpr_blockers[CPR_MODE__MAX];
+
+/*
+ * Add blocker for each mode in varargs list, or for all modes if CPR_MODE_ALL
+ * is specified.  Caller terminates the list with 0 or CPR_MODE_ALL.  This
+ * function takes ownership of *reasonp, and frees it on error, or in
+ * cpr_del_blocker.  errp is set in a later patch.
+ */
+int cpr_add_blocker(Error **reasonp, Error **errp, CprMode mode, ...)
+{
+int modes = 0;
+va_list ap;
+ERRP_GUARD();
+
+va_start(ap, mode);
+while (mode != CPR_MODE_NONE && mode != CPR_MODE_ALL) {
+assert(mode > CPR_MODE_NONE && mode < CPR_MODE__MAX);
+modes |= BIT(mode);
+mode = va_arg(ap, CprMode);
+}
+va_end(ap);
+if (mode == CPR_MODE_ALL) {
+modes = BIT(CPR_MODE__MAX) - 1;
+}
+
+for (mode = 0; mode < CPR_MODE__MAX; mode++) {
+if (modes & BIT(mode)) {
+cpr_blockers[mode] = g_slist_prepend(cpr_blockers[mode], *reasonp);
+}
+}
+return 0;
+}
+
+/*
+ * Delete the blocker from all modes it is associated with.
+ */
+void cpr_del_blocker(Error **reasonp)
+{
+CprMode mode;
+
+if (*reasonp) {
+for (mode = 0; mode < CPR_MODE__MAX; mode++) {
+cpr_blockers[mode] = g_slist_remove(cpr_blockers[mode], *reasonp);
+}
+error_free(*reasonp);
+*reasonp = NULL;
+}
+}
+
+/*
+ * Add a blocker which will not be deleted.  Simpler for some callers.
+ */
+int cpr_add_blocker_str(const char *msg, Error **errp, CprMode mode, ...)
+{
+int ret;
+va_list ap;
+Error *reason = NULL;
+
+error_setg(&reason, "%s", msg);
+va_start(ap, mode);
+ret = cpr_add_blocker(&reason, errp, mode, ap);
+va_end(ap);
+return ret;
+}
+
+static bool cpr_is_blocked(Error **errp, CprMode mode)
+{
+if (cpr_blockers[mode]) {
+error_propagate(errp, error_copy(cpr_blockers[mode]->data));
+return true;
+}
+
+return false;
+}
+
 void qmp_cpr_save(const char *filename, CprMode mode, Error **errp)
 {
 int ret;
 QEMUFile *f;
 int saved_vm_running = runstate_is_running();
 
+if (cpr_is_blocked(errp, mode)) {
+return;
+}
+
 if (global_state_store()) {
 error_setg(errp, "Error saving global state");
 return;
diff --git a/stubs/cpr.c b/stubs/cpr.c
new file mode 100644
index 000..06a9a1c
--- /dev/null
+++ b/stubs/cpr.c
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/cpr.h"
+
+int cpr_add_blocker(Error **reasonp, Error **errp, CprMode mode, ...)
+{
+return 0;
+}
+
+int cpr_add_blocker_str(const char *reason, Error **errp, CprMode mode, ...)
+{
+return 0;
+}
+
+void cpr_del_blocker(Error **reasonp)
+{
+}
diff --git a/stubs/meson.build b/stubs/meson.build
index 6f80fec..0d7565b 100644
--- a/stubs/meson.build
+++ b/stubs/meson.build
@@ -4,6 +4,7 @@ stub_ss.add(files('blk-exp-close-all.c'))
 stub_ss.add(files('blockdev-close-all-bdrv-states.c'))
 stub_ss.add(files('change-state-handler.c'))
 stub_ss.add(files('cmos.c'))
+stub_ss.add(files('cpr.c'))
 stub_ss.add(files('cpu-get-clock.c'))
 stub_ss.add(files('cpus-get-virtual-clock.c'))
 stub_ss.add(files('qemu-timer-notify-cb.c'))
-- 
1.8.3.1

Re: [PATCH 1/2] hw/nvme: Implement shadow doorbell buffer support

> On Jun 15, 2022, at 5:38 PM, Klaus Jensen  wrote:
> 
> I prefer we use the NVMe terminology to minimize misunderstandings, so
> "host" means the driver and "device" means the qemu side of things
> 

Thanks for helping me disambiguate this!

Now that we have resolved all issues in v1, I’ve submitted a v2 patch.

[PATCH V8 12/39] memory: flat section iterator

Add an iterator over the sections of a flattened address space.

Signed-off-by: Steve Sistare 
Reviewed-by: Marc-André Lureau 
---
 include/exec/memory.h | 31 +++
 softmmu/memory.c  | 20 
 2 files changed, 51 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index a03301d..6a257a4 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -2343,6 +2343,37 @@ void memory_region_set_ram_discard_manager(MemoryRegion 
*mr,
RamDiscardManager *rdm);
 
 /**
+ * memory_region_section_cb: callback for address_space_flat_for_each_section()
+ *
+ * @mrs: MemoryRegionSection of the range
+ * @opaque: data pointer passed to address_space_flat_for_each_section()
+ * @errp: error message, returned to the address_space_flat_for_each_section
+ *caller.
+ *
+ * Returns: non-zero to stop the iteration, and 0 to continue.  The same
+ * non-zero value is returned to the address_space_flat_for_each_section 
caller.
+ */
+
+typedef int (*memory_region_section_cb)(MemoryRegionSection *mrs,
+void *opaque,
+Error **errp);
+
+/**
+ * address_space_flat_for_each_section: walk the ranges in the address space
+ * flat view and call @func for each.  Return 0 on success, else return 
non-zero
+ * with a message in @errp.
+ *
+ * @as: target address space
+ * @func: callback function
+ * @opaque: passed to @func
+ * @errp: passed to @func
+ */
+int address_space_flat_for_each_section(AddressSpace *as,
+memory_region_section_cb func,
+void *opaque,
+Error **errp);
+
+/**
  * memory_region_find: translate an address/size relative to a
  * MemoryRegion into a #MemoryRegionSection.
  *
diff --git a/softmmu/memory.c b/softmmu/memory.c
index 0fe6fac..e5aefdd 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -2683,6 +2683,26 @@ bool memory_region_is_mapped(MemoryRegion *mr)
 return !!mr->container || mr->mapped_via_alias;
 }
 
+int address_space_flat_for_each_section(AddressSpace *as,
+memory_region_section_cb func,
+void *opaque,
+Error **errp)
+{
+FlatView *view = address_space_get_flatview(as);
+FlatRange *fr;
+int ret;
+
+FOR_EACH_FLAT_RANGE(fr, view) {
+MemoryRegionSection mrs = section_from_flat_range(fr, view);
+ret = func(&mrs, opaque, errp);
+if (ret) {
+return ret;
+}
+}
+
+return 0;
+}
+
 /* Same as memory_region_find, but it does not add a reference to the
  * returned region.  It must be called from an RCU critical section.
  */
-- 
1.8.3.1

[PATCH V8 10/39] cpr: cpr-enable option

Add the '-cpr-enable <mode>' command-line option as a pre-requisite for
using cpr-save and cpr-load for the mode.  Multiple -cpr-enable options
may be specified, one per mode.

Requiring -cpr-enable allows qemu to initialize objects differently, if
necessary, so that cpr-save is not blocked.

Signed-off-by: Steve Sistare 
---
 hmp-commands.hx |  4 
 include/migration/cpr.h |  2 ++
 migration/cpr.c | 22 ++
 qapi/cpr.json   |  4 
 qemu-options.hx | 10 ++
 softmmu/vl.c|  8 
 6 files changed, 50 insertions(+)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index 9d9f984..d621968 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -370,6 +370,8 @@ SRST
   and does not require that guest RAM be saved in the file.  The caller must
   not modify guest block devices between cpr-save and cpr-load.
 
+  cpr-save requires that qemu was started with -cpr-enable for *mode*.
+
   If *mode* is 'reboot', the checkpoint remains valid after a host reboot.
   The guest RAM memory-backend should be shared and non-volatile across
   reboot, else it will be saved to the file.  To resume from the checkpoint,
@@ -391,6 +393,8 @@ SRST
   Load a virtual machine from the checkpoint file *filename* that was created
   earlier by the cpr-save command, and continue the VCPUs.  *mode* must match
   the mode specified for cpr-save.
+
+  cpr-load requires that qemu was started with -cpr-enable for *mode*.
 ERST
 
 {
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
index dfe5a1d..f236cbf 100644
--- a/include/migration/cpr.h
+++ b/include/migration/cpr.h
@@ -10,8 +10,10 @@
 
 #include "qapi/qapi-types-cpr.h"
 
+void cpr_init(int modes);
 void cpr_set_mode(CprMode mode);
 CprMode cpr_get_mode(void);
+bool cpr_enabled(CprMode mode);
 
 #define CPR_MODE_ALL CPR_MODE__MAX
 
diff --git a/migration/cpr.c b/migration/cpr.c
index c1da784..76b9225 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -29,6 +29,18 @@ void cpr_set_mode(CprMode mode)
 cpr_mode = mode;
 }
 
+static int cpr_enabled_modes;
+
+void cpr_init(int modes)
+{
+cpr_enabled_modes = modes;
+}
+
+bool cpr_enabled(CprMode mode)
+{
+return !!(cpr_enabled_modes & BIT(mode));
+}
+
 static GSList *cpr_blockers[CPR_MODE__MAX];
 
 /*
@@ -110,6 +122,11 @@ void qmp_cpr_save(const char *filename, CprMode mode, 
Error **errp)
 QEMUFile *f;
 int saved_vm_running = runstate_is_running();
 
+if (!(cpr_enabled_modes & BIT(mode))) {
+error_setg(errp, "cpr mode is not enabled.  Use -cpr-enable.");
+return;
+}
+
 if (cpr_is_blocked(errp, mode)) {
 return;
 }
@@ -154,6 +171,11 @@ void qmp_cpr_load(const char *filename, CprMode mode, 
Error **errp)
 int ret;
 RunState state;
 
+if (!(cpr_enabled_modes & BIT(mode))) {
+error_setg(errp, "cpr mode is not enabled.  Use -cpr-enable.");
+return;
+}
+
 if (runstate_is_running()) {
 error_setg(errp, "cpr-load called for a running VM");
 return;
diff --git a/qapi/cpr.json b/qapi/cpr.json
index bdaabcb..11c6f88 100644
--- a/qapi/cpr.json
+++ b/qapi/cpr.json
@@ -30,6 +30,8 @@
 # and does not require that guest RAM be saved in the file.  The caller must
 # not modify guest block devices between cpr-save and cpr-load.
 #
+# cpr-save requires that qemu was started with -cpr-enable for @mode.
+#
 # If @mode is 'reboot', the checkpoint remains valid after a host reboot.
 # The guest RAM memory-backend should be shared and non-volatile across
 # reboot, else it will be saved to the file.  To resume from the checkpoint,
@@ -52,6 +54,8 @@
 # earlier by the cpr-save command, and continue the VCPUs.  @mode must match
 # the mode specified for cpr-save.
 #
+# cpr-load requires that qemu was started with -cpr-enable for @mode.
+#
 # @filename: name of checkpoint file
 # @mode: @CprMode mode
 #
diff --git a/qemu-options.hx b/qemu-options.hx
index 377d22f..6e51c33 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4483,6 +4483,16 @@ SRST
 an unmigratable state.
 ERST
 
+DEF("cpr-enable", HAS_ARG, QEMU_OPTION_cpr_enable, \
+"-cpr-enable rebootenable the cpr mode\n",
+QEMU_ARCH_ALL)
+SRST
+``-cpr-enable reboot``
+Enable the specified cpr mode.  May be supplied multiple times, once
+per mode.  This is a pre-requisite for calling the cpr-save and cpr-load
+commands.
+ERST
+
 DEF("nodefaults", 0, QEMU_OPTION_nodefaults, \
 "-nodefaults don't create default devices\n", QEMU_ARCH_ALL)
 SRST
diff --git a/softmmu/vl.c b/softmmu/vl.c
index 54e920a..ce779cf 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -78,6 +78,7 @@
 #include "hw/i386/pc.h"
 #include "migration/misc.h"
 #include "migration/snapshot.h"
+#include "migration/cpr.h"
 #include "sysemu/tpm.h"
 #include "sysemu/dma.h"
 #include "hw/audio/soundhw.h"
@@ -2600,6 +2601,7 @@ void qemu_init(int argc, char **argv, char **envp)
 MachineClass *machine_class;
 bool userconfig =

[PATCH V8 25/39] cpr: notifiers

Add an interface to register notifiers for cpr transitions.  It is used to
support vfio cpr in a subsequent patch.

Signed-off-by: Steve Sistare 
---
 include/migration/cpr.h | 13 +
 migration/cpr.c | 25 +
 stubs/cpr.c | 10 ++
 3 files changed, 48 insertions(+)

diff --git a/include/migration/cpr.h b/include/migration/cpr.h
index b75dec4..ab5f53e 100644
--- a/include/migration/cpr.h
+++ b/include/migration/cpr.h
@@ -9,6 +9,7 @@
 #define MIGRATION_CPR_H
 
 #include "qapi/qapi-types-cpr.h"
+#include "qemu/notify.h"
 
 void cpr_init(int modes);
 void cpr_set_mode(CprMode mode);
@@ -37,4 +38,16 @@ int cpr_add_blocker(Error **reasonp, Error **errp, CprMode 
mode, ...);
 int cpr_add_blocker_str(const char *reason, Error **errp, CprMode mode, ...);
 void cpr_del_blocker(Error **reasonp);
 
+typedef enum CprNotifyState {
+CPR_NOTIFY_EXEC,
+CPR_NOTIFY_SAVE_FAILED,
+CPR_NOTIFY_LOAD_FAILED,
+CPR_NOTIFY_NUM
+} CprNotifyState;
+
+void cpr_add_notifier(Notifier *notify,
+  void (*cb)(Notifier *notifier, void *data),
+  CprNotifyState state);
+void cpr_remove_notifier(Notifier *notify);
+
 #endif
diff --git a/migration/cpr.c b/migration/cpr.c
index 8b3fffd..9d6bca4 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -105,6 +105,28 @@ static bool cpr_is_blocked(Error **errp, CprMode mode)
 return false;
 }
 
+static NotifierList cpr_notifiers[CPR_NOTIFY_NUM];
+
+void cpr_add_notifier(Notifier *notify,
+  void (*cb)(Notifier *notifier, void *data),
+  CprNotifyState state)
+{
+assert(state >= 0 && state < CPR_NOTIFY_NUM);
+notify->notify = cb;
+notifier_list_add(&cpr_notifiers[state], notify);
+}
+
+void cpr_remove_notifier(Notifier *notify)
+{
+notifier_remove(notify);
+notify->notify = NULL;
+}
+
+static void cpr_call_notifiers(CprNotifyState state)
+{
+notifier_list_notify(&cpr_notifiers[state], 0);
+}
+
 void qmp_cpr_save(const char *filename, CprMode mode, Error **errp)
 {
 int ret;
@@ -142,6 +164,7 @@ void qmp_cpr_save(const char *filename, CprMode mode, Error 
**errp)
 qemu_fclose(f);
 if (ret < 0) {
 error_setg(errp, "Error %d while saving VM state", ret);
+cpr_call_notifiers(CPR_NOTIFY_SAVE_FAILED);
 goto err;
 }
 
@@ -182,6 +205,7 @@ void qmp_cpr_exec(strList *args, Error **errp)
 return;
 }
 
+cpr_call_notifiers(CPR_NOTIFY_EXEC);
 assert(qemu_system_exec_request(args, errp) == 0);
 }
 
@@ -218,6 +242,7 @@ void qmp_cpr_load(const char *filename, CprMode mode, Error 
**errp)
 qemu_fclose(f);
 if (ret < 0) {
 error_setg(errp, "Error %d while loading VM state", ret);
+cpr_call_notifiers(CPR_NOTIFY_LOAD_FAILED);
 goto out;
 }
 
diff --git a/stubs/cpr.c b/stubs/cpr.c
index 06a9a1c..9262e78 100644
--- a/stubs/cpr.c
+++ b/stubs/cpr.c
@@ -21,3 +21,13 @@ int cpr_add_blocker_str(const char *reason, Error **errp, 
CprMode mode, ...)
 void cpr_del_blocker(Error **reasonp)
 {
 }
+
+void cpr_add_notifier(Notifier *notify,
+  void (*cb)(Notifier *notifier, void *data),
+  CprNotifyState state)
+{
+}
+
+void cpr_remove_notifier(Notifier *notify)
+{
+}
-- 
1.8.3.1

[PATCH V8 03/39] migration: simplify savevm

Use qemu_fopen_file to simplify a few functions in savevm.c.
No functional change.

Signed-off-by: Steve Sistare 
Reviewed-by: Dr. David Alan Gilbert 
---
 migration/savevm.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/migration/savevm.c b/migration/savevm.c
index d907689..0b2c5cd 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2931,7 +2931,6 @@ void qmp_xen_save_devices_state(const char *filename, 
bool has_live, bool live,
 Error **errp)
 {
 QEMUFile *f;
-QIOChannelFile *ioc;
 int saved_vm_running;
 int ret;
 
@@ -2945,14 +2944,11 @@ void qmp_xen_save_devices_state(const char *filename, 
bool has_live, bool live,
 vm_stop(RUN_STATE_SAVE_VM);
 global_state_store_running();
 
-ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
-0660, errp);
-if (!ioc) {
+f = qemu_fopen_file(filename, O_WRONLY | O_CREAT | O_TRUNC, 0660,
+"migration-xen-save-state", errp);
+if (!f) {
 goto the_end;
 }
-qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
-f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
-object_unref(OBJECT(ioc));
 ret = qemu_save_device_state(f);
 if (ret < 0 || qemu_fclose(f) < 0) {
 error_setg(errp, QERR_IO_ERROR);
@@ -2981,7 +2977,6 @@ void qmp_xen_save_devices_state(const char *filename, 
bool has_live, bool live,
 void qmp_xen_load_devices_state(const char *filename, Error **errp)
 {
 QEMUFile *f;
-QIOChannelFile *ioc;
 int ret;
 
 /* Guest must be paused before loading the device state; the RAM state
@@ -2993,14 +2988,11 @@ void qmp_xen_load_devices_state(const char *filename, 
Error **errp)
 }
 vm_stop(RUN_STATE_RESTORE_VM);
 
-ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
-if (!ioc) {
+f = qemu_fopen_file(filename, O_RDONLY | O_BINARY, 0,
+"migration-xen-load-state", errp);
+if (!f) {
 return;
 }
-qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
-f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
-object_unref(OBJECT(ioc));
-
 ret = qemu_loadvm_state(f);
 qemu_fclose(f);
 if (ret < 0) {
-- 
1.8.3.1

[PATCH V8 19/39] cpr: preserve extra state

cpr must save state that is needed after qemu is restarted, when devices are
realized.  Thus the extra state cannot be saved in the cpr-load vmstate file,
as objects must already exist before that file can be loaded.  Instead,
define auxiliary state structures and vmstate descriptions, not associated
with any registered object, and serialize the aux state to a memfd file.
Deserialize after qemu restarts, before devices are realized.

The following state is saved:
  * cpr mode
  * file descriptor names and values
  * memfd values and properties for anonymous ram blocks

Signed-off-by: Steve Sistare 
---
 MAINTAINERS |   2 +
 include/migration/cpr.h |  16 +++
 migration/cpr-state.c   | 330 
 migration/cpr.c |  12 --
 migration/meson.build   |   1 +
 migration/trace-events  |   8 ++
 stubs/cpr-state.c   |  27 
 stubs/meson.build   |   1 +
 8 files changed, 385 insertions(+), 12 deletions(-)
 create mode 100644 migration/cpr-state.c
 create mode 100644 stubs/cpr-state.c

diff --git a/MAINTAINERS b/MAINTAINERS
index f9a6362..74a43e6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3161,6 +3161,8 @@ F: migration/cpr.c
 F: qapi/cpr.json
 F: stubs/cpr.c
 F: tests/unit/test-strlist.c
+F: migration/cpr-state.c
+F: stubs/cpr-state.c
 
 Record/replay
 M: Pavel Dovgalyuk 
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
index f236cbf..b75dec4 100644
--- a/include/migration/cpr.h
+++ b/include/migration/cpr.h
@@ -15,6 +15,22 @@ void cpr_set_mode(CprMode mode);
 CprMode cpr_get_mode(void);
 bool cpr_enabled(CprMode mode);
 
+typedef int (*cpr_walk_fd_cb)(const char *name, int id, int fd, void *opaque);
+
+void cpr_save_fd(const char *name, int id, int fd);
+void cpr_delete_fd(const char *name, int id);
+int cpr_find_fd(const char *name, int id);
+int cpr_walk_fd(cpr_walk_fd_cb cb, void *handle);
+void cpr_save_memfd(const char *name, int fd, size_t len, size_t maxlen,
+uint64_t align);
+int cpr_find_memfd(const char *name, size_t *lenp, size_t *maxlenp,
+   uint64_t *alignp);
+void cpr_delete_memfd(const char *name);
+int cpr_resave_fd(const char *name, int id, int fd, Error **errp);
+int cpr_state_save(Error **errp);
+int cpr_state_load(Error **errp);
+void cpr_state_print(void);
+
 #define CPR_MODE_ALL CPR_MODE__MAX
 
 int cpr_add_blocker(Error **reasonp, Error **errp, CprMode mode, ...);
diff --git a/migration/cpr-state.c b/migration/cpr-state.c
new file mode 100644
index 000..ff1e122
--- /dev/null
+++ b/migration/cpr-state.c
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qemu/queue.h"
+#include "qemu/memfd.h"
+#include "qapi/error.h"
+#include "migration/vmstate.h"
+#include "migration/cpr.h"
+#include "migration/qemu-file.h"
+#include "migration/qemu-file-channel.h"
+#include "trace.h"
+
+/*/
+/* cpr state container for all information to be saved. */
+
+typedef QLIST_HEAD(CprNameList, CprName) CprNameList;
+
+typedef struct CprState {
+CprMode mode;
+CprNameList fds;/* list of CprFd */
+CprNameList memfd;  /* list of CprMemfd */
+} CprState;
+
+static CprState cpr_state = {
+.mode = CPR_MODE_NONE,
+};
+
+/*/
+/* Misc accessors. */
+
+CprMode cpr_get_mode(void)
+{
+return cpr_state.mode;
+}
+
+void cpr_set_mode(CprMode mode)
+{
+cpr_state.mode = mode;
+}
+
+/*/
+/* Generic list of names. */
+
+typedef struct CprName {
+char *name;
+unsigned int namelen;
+int id;
+QLIST_ENTRY(CprName) next;
+} CprName;
+
+static const VMStateDescription vmstate_cpr_name = {
+.name = "cpr name",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_UINT32(namelen, CprName),
+VMSTATE_VBUFFER_ALLOC_UINT32(name, CprName, 0, NULL, namelen),
+VMSTATE_INT32(id, CprName),
+VMSTATE_END_OF_LIST()
+}
+};
+
+static void
+add_name(CprNameList *head, const char *name, int id, CprName *elem)
+{
+elem->name = g_strdup(name);
+elem->namelen = strlen(name) + 1;
+elem->id = id;
+QLIST_INSERT_HEAD(head, elem, next);
+}
+
+static CprName *find_name(CprNameList *head, const char *name, int id)
+{
+CprName *elem;
+
+QLIST_FOREACH(elem, head, next) {
+if (!strcmp(elem->name, name) && elem->id == id) {
+return elem;
+}
+}
+return NULL;
+}
+
+static void delete_name(CprNameList *head, const char *name, int id)
+{
+CprName *elem = find_name(head, name, id);
+
+if (elem) {
+QLIST_REMOVE(

[PATCH V8 17/39] qapi: strList unit tests

Signed-off-by: Steve Sistare 
---
 MAINTAINERS   |  1 +
 tests/unit/meson.build|  1 +
 tests/unit/test-strlist.c | 81 +++
 3 files changed, 83 insertions(+)
 create mode 100644 tests/unit/test-strlist.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1e4e72f..f9a6362 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3160,6 +3160,7 @@ F: include/migration/cpr.h
 F: migration/cpr.c
 F: qapi/cpr.json
 F: stubs/cpr.c
+F: tests/unit/test-strlist.c
 
 Record/replay
 M: Pavel Dovgalyuk 
diff --git a/tests/unit/meson.build b/tests/unit/meson.build
index 287b367..57d48d5 100644
--- a/tests/unit/meson.build
+++ b/tests/unit/meson.build
@@ -17,6 +17,7 @@ tests = {
   'test-forward-visitor': [testqapi],
   'test-string-input-visitor': [testqapi],
   'test-string-output-visitor': [testqapi],
+  'test-strlist': [testqapi],
   'test-opts-visitor': [testqapi],
   'test-visitor-serialization': [testqapi],
   'test-bitmap': [],
diff --git a/tests/unit/test-strlist.c b/tests/unit/test-strlist.c
new file mode 100644
index 000..ef740dc
--- /dev/null
+++ b/tests/unit/test-strlist.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/util.h"
+#include "qapi/qapi-builtin-types.h"
+
+static strList *make_list(int length)
+{
+strList *head = 0, *list, **prev = &head;
+
+while (length--) {
+list = *prev = g_new0(strList, 1);
+list->value = g_strdup("aaa");
+prev = &list->next;
+}
+return head;
+}
+
+static void test_length(void)
+{
+strList *list;
+int i;
+
+for (i = 0; i < 5; i++) {
+list = make_list(i);
+g_assert_cmpint(i, ==, QAPI_LIST_LENGTH(list));
+qapi_free_strList(list);
+}
+}
+
+struct {
+const char *string;
+char delim;
+const char *args[5];
+} list_data[] = {
+{ 0, ',', { 0 } },
+{ "", ',', { 0 } },
+{ "a", ',', { "a", 0 } },
+{ "a,b", ',', { "a", "b", 0 } },
+{ "a,b,c", ',', { "a", "b", "c", 0 } },
+{ "first last", ' ', { "first", "last", 0 } },
+{ "a:", ':', { "a", 0 } },
+{ "a::b", ':', { "a", "", "b", 0 } },
+{ ":", ':', { "", 0 } },
+{ ":a", ':', { "", "a", 0 } },
+{ "::a", ':', { "", "", "a", 0 } },
+};
+
+static void test_strv(void)
+{
+int i, j;
+const char **expect;
+strList *list;
+GStrv args;
+
+for (i = 0; i < ARRAY_SIZE(list_data); i++) {
+expect = list_data[i].args;
+list = strList_from_string(list_data[i].string, list_data[i].delim);
+args = strv_from_strList(list);
+qapi_free_strList(list);
+for (j = 0; expect[j] && args[j]; j++) {
+g_assert_cmpstr(expect[j], ==, args[j]);
+}
+g_assert_null(expect[j]);
+g_assert_null(args[j]);
+g_strfreev(args);
+}
+}
+
+int main(int argc, char **argv)
+{
+g_test_init(&argc, &argv, NULL);
+g_test_add_func("/test-string/length", test_length);
+g_test_add_func("/test-string/strv", test_strv);
+return g_test_run();
+}
-- 
1.8.3.1

[PATCH V8 11/39] cpr: save ram blocks

Add a vmstate handler to save volatile ram blocks in the state file.  This
is used to preserve secondary guest ram blocks (those that cannot be
specified on the command line) such as video ram and roms for cpr reboot,
as there is no option to allocate them in shared memory.  For efficiency,
the user should create a shared memory-backend-file for the VM's main ram,
so it is not copied to the state file, but this is not enforced.

Signed-off-by: Steve Sistare 
---
 include/exec/memory.h |  6 +
 migration/savevm.c|  2 ++
 softmmu/memory.c  | 18 ++
 softmmu/physmem.c | 67 +++
 4 files changed, 93 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 0daddd7..a03301d 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -3002,6 +3002,12 @@ bool ram_block_discard_is_disabled(void);
  */
 bool ram_block_discard_is_required(void);
 
+/*
+ * Register/unregister a ram block for cpr.
+ */
+void ram_block_register(RAMBlock *rb);
+void ram_block_unregister(RAMBlock *rb);
+
 #endif
 
 #endif
diff --git a/migration/savevm.c b/migration/savevm.c
index 0b2c5cd..9d528ed 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -3108,10 +3108,12 @@ void vmstate_register_ram(MemoryRegion *mr, DeviceState 
*dev)
 qemu_ram_set_idstr(mr->ram_block,
memory_region_name(mr), dev);
 qemu_ram_set_migratable(mr->ram_block);
+ram_block_register(mr->ram_block);
 }
 
 void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
 {
+ram_block_unregister(mr->ram_block);
 qemu_ram_unset_idstr(mr->ram_block);
 qemu_ram_unset_migratable(mr->ram_block);
 }
diff --git a/softmmu/memory.c b/softmmu/memory.c
index 7ba2048..0fe6fac 100644
--- a/softmmu/memory.c
+++ b/softmmu/memory.c
@@ -3541,13 +3541,31 @@ void __attribute__((weak)) fuzz_dma_read_cb(size_t addr,
 }
 #endif
 
+static char *
+memory_region_vmstate_if_get_id(VMStateIf *obj)
+{
+MemoryRegion *mr = MEMORY_REGION(obj);
+return strdup(mr->ram_block->idstr);
+}
+
+static void memory_region_class_init(ObjectClass *class, void *data)
+{
+VMStateIfClass *vc = VMSTATE_IF_CLASS(class);
+vc->get_id = memory_region_vmstate_if_get_id;
+}
+
 static const TypeInfo memory_region_info = {
 .parent = TYPE_OBJECT,
 .name   = TYPE_MEMORY_REGION,
 .class_size = sizeof(MemoryRegionClass),
+.class_init = memory_region_class_init,
 .instance_size  = sizeof(MemoryRegion),
 .instance_init  = memory_region_initfn,
 .instance_finalize  = memory_region_finalize,
+.interfaces = (InterfaceInfo[]) {
+{ TYPE_VMSTATE_IF },
+{ }
+}
 };
 
 static const TypeInfo iommu_memory_region_info = {
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 0f1ce28..822c424 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -66,7 +66,9 @@
 
 #include "qemu/pmem.h"
 
+#include "migration/cpr.h"
 #include "migration/vmstate.h"
+#include "migration/qemu-file.h"
 
 #include "qemu/range.h"
 #ifndef _WIN32
@@ -2450,6 +2452,71 @@ ram_addr_t qemu_ram_addr_from_host(void *ptr)
 return block->offset + offset;
 }
 
+static int put_ram_block(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field, JSONWriter *vmdesc)
+{
+RAMBlock *rb = pv;
+
+if (rb->used_length > 1024 * 1024) {
+warn_report("Large RAM block %s size %ld saved to state file. "
+"Use a shared file memory backend to avoid the copy.",
+rb->idstr, rb->used_length);
+}
+qemu_put_buffer(f, rb->host, rb->used_length);
+return 0;
+}
+
+static int get_ram_block(QEMUFile *f, void *pv, size_t size,
+ const VMStateField *field)
+{
+RAMBlock *rb = pv;
+qemu_get_buffer(f, rb->host, rb->used_length);
+return 0;
+}
+
+static const VMStateInfo vmstate_info_ram_block = {
+.name = "ram block host",
+.get  = get_ram_block,
+.put  = put_ram_block,
+};
+
+#define VMSTATE_RAM_BLOCK() {   \
+.name  = "ram_block_host",  \
+.info  = &vmstate_info_ram_block,   \
+.flags = VMS_SINGLE,\
+}
+
+static bool ram_block_needed(void *opaque)
+{
+RAMBlock *rb = opaque;
+
+return cpr_get_mode() == CPR_MODE_REBOOT &&
+qemu_ram_is_migratable(rb) &&
+(!qemu_ram_is_shared(rb) || ramblock_is_anon(rb));
+}
+
+const VMStateDescription vmstate_ram_block = {
+.name = "RAMBlock",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = ram_block_needed,
+.fields = (VMStateField[]) {
+VMSTATE_UINT64(used_length, RAMBlock),
+VMSTATE_RAM_BLOCK(),
+VMSTATE_END_OF_LIST()
+},
+};
+
+void ram_block_register(RAMBlock *rb)
+{
+vmstate_register(VMSTATE_IF(rb->mr), 0, &vmstate_ram_block, rb);
+}
+
+void ram_block_unregister(RAMBlock *rb)
+{
+vmstate_unregister(VMSTATE_IF(rb->mr), &

[PATCH V8 06/39] cpr: reboot mode

Provide the cpr-save and cpr-load functions for live update.  These save and
restore VM state, with minimal guest pause time, so that qemu may be updated
to a new version in between.

cpr-save stops the VM and saves vmstate to an ordinary file.  It supports
any type of guest image and block device, but the caller must not modify
guest block devices between cpr-save and cpr-load.

cpr-save supports several modes, the first of which is reboot. In this mode
the caller invokes cpr-save and then terminates qemu.  The caller may then
update the host kernel and system software and reboot.  The caller resumes
the guest by running qemu with the same arguments as the original process
and invoking cpr-load.  To use this mode, guest ram must be mapped to a
persistent shared memory file such as /dev/dax0.0 or /dev/shm PKRAM.

The reboot mode supports vfio devices if the caller first suspends the
guest, such as by issuing guest-suspend-ram to the qemu guest agent.  The
guest drivers' suspend methods flush outstanding requests and re-initialize
the devices, and thus there is no device state to save and restore.

cpr-load loads state from the file.  If the VM was running at cpr-save time
then VM execution resumes.  If the VM was suspended at cpr-save time, then
the caller must issue a system_wakeup command to resume.

cpr-save syntax:
  { 'enum': 'CprMode', 'data': [ 'reboot' ] }
  { 'command': 'cpr-save', 'data': { 'filename': 'str', 'mode': 'CprMode' }}

cpr-load syntax:
  { 'command': 'cpr-load', 'data': { 'filename': 'str', 'mode': 'CprMode' }}

Signed-off-by: Steve Sistare 
---
 MAINTAINERS |   8 
 include/migration/cpr.h |  16 +++
 migration/cpr.c | 116 
 migration/meson.build   |   1 +
 qapi/cpr.json   |  62 ++
 qapi/meson.build|   1 +
 qapi/qapi-schema.json   |   1 +
 softmmu/runstate.c  |   1 +
 8 files changed, 206 insertions(+)
 create mode 100644 include/migration/cpr.h
 create mode 100644 migration/cpr.c
 create mode 100644 qapi/cpr.json

diff --git a/MAINTAINERS b/MAINTAINERS
index 4cf6174..9273891 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3152,6 +3152,14 @@ F: net/filter-rewriter.c
 F: net/filter-mirror.c
 F: tests/qtest/test-filter*
 
+CPR
+M: Steve Sistare 
+M: Mark Kanda 
+S: Maintained
+F: include/migration/cpr.h
+F: migration/cpr.c
+F: qapi/cpr.json
+
 Record/replay
 M: Pavel Dovgalyuk 
 R: Paolo Bonzini 
diff --git a/include/migration/cpr.h b/include/migration/cpr.h
new file mode 100644
index 000..1b6c82f
--- /dev/null
+++ b/include/migration/cpr.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2021, 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef MIGRATION_CPR_H
+#define MIGRATION_CPR_H
+
+#include "qapi/qapi-types-cpr.h"
+
+void cpr_set_mode(CprMode mode);
+CprMode cpr_get_mode(void);
+
+#endif
diff --git a/migration/cpr.c b/migration/cpr.c
new file mode 100644
index 000..24b0bcc
--- /dev/null
+++ b/migration/cpr.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021, 2022 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/cpr.h"
+#include "migration/global_state.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-cpr.h"
+#include "qemu-file-channel.h"
+#include "qemu-file.h"
+#include "savevm.h"
+#include "sysemu/cpu-timers.h"
+#include "sysemu/runstate.h"
+#include "sysemu/sysemu.h"
+
+static CprMode cpr_mode = CPR_MODE_NONE;
+
+CprMode cpr_get_mode(void)
+{
+return cpr_mode;
+}
+
+void cpr_set_mode(CprMode mode)
+{
+cpr_mode = mode;
+}
+
+void qmp_cpr_save(const char *filename, CprMode mode, Error **errp)
+{
+int ret;
+QEMUFile *f;
+int saved_vm_running = runstate_is_running();
+
+if (global_state_store()) {
+error_setg(errp, "Error saving global state");
+return;
+}
+
+f = qemu_fopen_file(filename, O_CREAT | O_WRONLY | O_TRUNC, 0600,
+"cpr-save", errp);
+if (!f) {
+return;
+}
+
+if (runstate_check(RUN_STATE_SUSPENDED)) {
+/* Update timers_state before saving.  Suspend did not do so. */
+cpu_disable_ticks();
+}
+vm_stop(RUN_STATE_SAVE_VM);
+
+cpr_set_mode(mode);
+ret = qemu_save_device_state(f);
+qemu_fclose(f);
+if (ret < 0) {
+error_setg(errp, "Error %d while saving VM state", ret);
+goto err;
+}
+
+return;
+
+err:
+if (saved_vm_running) {
+vm_start();
+}
+cpr_set_mode(CPR_MODE_NONE);
+}
+
+void qmp_cpr_load(const char *filename, CprMode mode, Error **errp)
+{
+QEMUFile *f;
+int ret;
+RunState state;
+
+if (runstate_is_running()) {
+error_setg(errp, "cpr-load called for a running VM");
+r

[PATCH V8 13/39] oslib: qemu_clear_cloexec

Define qemu_clear_cloexec, analogous to qemu_set_cloexec.

Reviewed-by: Dr. David Alan Gilbert 
Signed-off-by: Steve Sistare 
---
 include/qemu/osdep.h | 1 +
 util/oslib-posix.c   | 9 +
 util/oslib-win32.c   | 4 
 3 files changed, 14 insertions(+)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index b1c161c..e916f3b 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -548,6 +548,7 @@ ssize_t qemu_write_full(int fd, const void *buf, size_t 
count)
 G_GNUC_WARN_UNUSED_RESULT;
 
 void qemu_set_cloexec(int fd);
+void qemu_clear_cloexec(int fd);
 
 /* Return a dynamically allocated directory path that is appropriate for 
storing
  * local state.
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index 7a34c16..421e987 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -261,6 +261,15 @@ void qemu_set_cloexec(int fd)
 assert(f != -1);
 }
 
+void qemu_clear_cloexec(int fd)
+{
+int f;
+f = fcntl(fd, F_GETFD);
+assert(f != -1);
+f = fcntl(fd, F_SETFD, f & ~FD_CLOEXEC);
+assert(f != -1);
+}
+
 char *
 qemu_get_local_state_dir(void)
 {
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index 5723d3e..5bed148 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -226,6 +226,10 @@ void qemu_set_cloexec(int fd)
 {
 }
 
+void qemu_clear_cloexec(int fd)
+{
+}
+
 int qemu_get_thread_id(void)
 {
 return GetCurrentThreadId();
-- 
1.8.3.1

[PATCH V8 31/39] vhost: reset vhost devices for cpr

A vhost device is implicitly preserved across re-exec because its fd is not
closed, and the value of the fd is specified on the command line for the
new qemu to find.  However, new qemu issues a VHOST_RESET_OWNER ioctl,
which fails because the device already has an owner.  To fix, reset the
owner prior to exec.

Signed-off-by: Mark Kanda 
Signed-off-by: Steve Sistare 
---
 hw/virtio/vhost.c | 17 +
 include/hw/virtio/vhost.h |  1 +
 2 files changed, 18 insertions(+)

diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index dd3263d..efaa28c 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -23,6 +23,7 @@
 #include "standard-headers/linux/vhost_types.h"
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
+#include "migration/cpr.h"
 #include "migration/blocker.h"
 #include "migration/qemu-file-types.h"
 #include "sysemu/dma.h"
@@ -1306,6 +1307,17 @@ static void vhost_virtqueue_cleanup(struct 
vhost_virtqueue *vq)
 event_notifier_cleanup(&vq->masked_notifier);
 }
 
+static void vhost_cpr_exec_notifier(Notifier *notifier, void *data)
+{
+struct vhost_dev *dev = container_of(notifier, struct vhost_dev,
+ cpr_notifier);
+int r = dev->vhost_ops->vhost_reset_device(dev);
+
+if (r < 0) {
+VHOST_OPS_DEBUG(r, "vhost_reset_device failed");
+}
+}
+
 int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
VhostBackendType backend_type, uint32_t busyloop_timeout,
Error **errp)
@@ -1405,6 +1417,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
 hdev->log_enabled = false;
 hdev->started = false;
 memory_listener_register(&hdev->memory_listener, &address_space_memory);
+cpr_add_notifier(&hdev->cpr_notifier, vhost_cpr_exec_notifier,
+ CPR_NOTIFY_EXEC);
 QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
 
 if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
@@ -1444,6 +1458,9 @@ void vhost_dev_cleanup(struct vhost_dev *hdev)
 migrate_del_blocker(hdev->migration_blocker);
 error_free(hdev->migration_blocker);
 }
+if (hdev->cpr_notifier.notify) {
+cpr_remove_notifier(&hdev->cpr_notifier);
+}
 g_free(hdev->mem);
 g_free(hdev->mem_sections);
 if (hdev->vhost_ops) {
diff --git a/include/hw/virtio/vhost.h b/include/hw/virtio/vhost.h
index b291fe4..1316b14 100644
--- a/include/hw/virtio/vhost.h
+++ b/include/hw/virtio/vhost.h
@@ -100,6 +100,7 @@ struct vhost_dev {
 QLIST_ENTRY(vhost_dev) entry;
 QLIST_HEAD(, vhost_iommu) iommu_list;
 IOMMUNotifier n;
+Notifier cpr_notifier;
 const VhostDevConfigOps *config_ops;
 };
 
-- 
1.8.3.1

[PATCH V8 18/39] vl: helper to request re-exec

Add a qemu_system_exec_request() hook that causes the main loop to exit and
re-exec qemu using the specified arguments.

Signed-off-by: Steve Sistare 
---
 include/sysemu/runstate.h |  1 +
 softmmu/runstate.c| 26 ++
 2 files changed, 27 insertions(+)

diff --git a/include/sysemu/runstate.h b/include/sysemu/runstate.h
index 16c1c41..6b0b4f1 100644
--- a/include/sysemu/runstate.h
+++ b/include/sysemu/runstate.h
@@ -63,6 +63,7 @@ void qemu_system_wakeup_enable(WakeupReason reason, bool 
enabled);
 void qemu_register_wakeup_notifier(Notifier *notifier);
 void qemu_register_wakeup_support(void);
 void qemu_system_shutdown_request(ShutdownCause reason);
+int qemu_system_exec_request(const strList *args, Error **errp);
 void qemu_system_powerdown_request(void);
 void qemu_register_powerdown_notifier(Notifier *notifier);
 void qemu_register_shutdown_notifier(Notifier *notifier);
diff --git a/softmmu/runstate.c b/softmmu/runstate.c
index cfd6aa9..c35ab09 100644
--- a/softmmu/runstate.c
+++ b/softmmu/runstate.c
@@ -37,6 +37,7 @@
 #include "monitor/monitor.h"
 #include "net/net.h"
 #include "net/vhost_net.h"
+#include "qapi/util.h"
 #include "qapi/error.h"
 #include "qapi/qapi-commands-run-state.h"
 #include "qapi/qapi-events-run-state.h"
@@ -355,6 +356,7 @@ static NotifierList wakeup_notifiers =
 static NotifierList shutdown_notifiers =
 NOTIFIER_LIST_INITIALIZER(shutdown_notifiers);
 static uint32_t wakeup_reason_mask = ~(1 << QEMU_WAKEUP_REASON_NONE);
+static GStrv exec_argv;
 
 ShutdownCause qemu_shutdown_requested_get(void)
 {
@@ -371,6 +373,11 @@ static int qemu_shutdown_requested(void)
 return qatomic_xchg(&shutdown_requested, SHUTDOWN_CAUSE_NONE);
 }
 
+static int qemu_exec_requested(void)
+{
+return exec_argv != NULL;
+}
+
 static void qemu_kill_report(void)
 {
 if (!qtest_driver() && shutdown_signal) {
@@ -641,6 +648,18 @@ void qemu_system_shutdown_request(ShutdownCause reason)
 qemu_notify_event();
 }
 
+int qemu_system_exec_request(const strList *args, Error **errp)
+{
+exec_argv = strv_from_strList(args);
+if (!exec_argv[0]) {
+error_setg(errp, "qemu_system_exec_request: argv[0] is NULL");
+return 1;
+}
+shutdown_requested = 1;
+qemu_notify_event();
+return 0;
+}
+
 static void qemu_system_powerdown(void)
 {
 qapi_event_send_powerdown();
@@ -689,6 +708,13 @@ static bool main_loop_should_exit(void)
 }
 request = qemu_shutdown_requested();
 if (request) {
+
+if (qemu_exec_requested()) {
+execvp(exec_argv[0], exec_argv);
+error_report("execvp %s failed: %s", exec_argv[0], 
strerror(errno));
+g_strfreev(exec_argv);
+exec_argv = NULL;
+}
 qemu_kill_report();
 qemu_system_shutdown(request);
 if (shutdown_action == SHUTDOWN_ACTION_PAUSE) {
-- 
1.8.3.1

[PATCH V8 09/39] cpr: register blockers

Register the known cpr blockers.

Signed-off-by: Steve Sistare 
---
 accel/xen/xen-all.c| 3 +++
 backends/hostmem-epc.c | 6 ++
 migration/migration.c  | 6 ++
 replay/replay.c| 4 
 4 files changed, 19 insertions(+)

diff --git a/accel/xen/xen-all.c b/accel/xen/xen-all.c
index 69aa7d0..9dd0dc6 100644
--- a/accel/xen/xen-all.c
+++ b/accel/xen/xen-all.c
@@ -21,6 +21,7 @@
 #include "sysemu/runstate.h"
 #include "migration/misc.h"
 #include "migration/global_state.h"
+#include "migration/cpr.h"
 #include "hw/boards.h"
 
 //#define DEBUG_XEN
@@ -181,6 +182,8 @@ static int xen_init(MachineState *ms)
  * opt out of system RAM being allocated by generic code
  */
 mc->default_ram_id = NULL;
+
+cpr_add_blocker_str("xen does not support cpr", &error_fatal, 
CPR_MODE_ALL);
 return 0;
 }
 
diff --git a/backends/hostmem-epc.c b/backends/hostmem-epc.c
index cb06255..094fed9 100644
--- a/backends/hostmem-epc.c
+++ b/backends/hostmem-epc.c
@@ -16,6 +16,7 @@
 #include "qapi/error.h"
 #include "sysemu/hostmem.h"
 #include "hw/i386/hostmem-epc.h"
+#include "migration/cpr.h"
 
 static void
 sgx_epc_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
@@ -23,6 +24,7 @@ sgx_epc_backend_memory_alloc(HostMemoryBackend *backend, 
Error **errp)
 uint32_t ram_flags;
 char *name;
 int fd;
+Error *blocker = NULL;
 
 if (!backend->size) {
 error_setg(errp, "can't create backend with size 0");
@@ -41,6 +43,10 @@ sgx_epc_backend_memory_alloc(HostMemoryBackend *backend, 
Error **errp)
 memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend),
name, backend->size, ram_flags,
fd, 0, errp);
+
+error_setg(&blocker, "RAM_PROTECTED block %s does not support cpr", name);
+cpr_add_blocker(&blocker, errp, CPR_MODE_ALL);
+
 g_free(name);
 }
 
diff --git a/migration/migration.c b/migration/migration.c
index 31739b2..1451bae 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -32,6 +32,7 @@
 #include "savevm.h"
 #include "qemu-file-channel.h"
 #include "qemu-file.h"
+#include "migration/cpr.h"
 #include "migration/vmstate.h"
 #include "block/block.h"
 #include "qapi/error.h"
@@ -1283,6 +1284,11 @@ static bool migrate_caps_check(bool *cap_list,
 return false;
 }
 
+if (cap_list[MIGRATION_CAPABILITY_X_COLO]) {
+return cpr_add_blocker_str("x-colo is not compatible with cpr",
+   errp, CPR_MODE_ALL);
+}
+
 return true;
 }
 
diff --git a/replay/replay.c b/replay/replay.c
index 4c396bb..eb5456f 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -19,6 +19,7 @@
 #include "qemu/option.h"
 #include "sysemu/cpus.h"
 #include "qemu/error-report.h"
+#include "migration/cpr.h"
 
 /* Current version of the replay mechanism.
Increase it when file format changes. */
@@ -232,6 +233,9 @@ static void replay_enable(const char *fname, int mode)
 const char *fmode = NULL;
 assert(!replay_file);
 
+cpr_add_blocker_str("replay is not compatible with cpr",
+&error_fatal, CPR_MODE_ALL);
+
 switch (mode) {
 case REPLAY_MODE_RECORD:
 fmode = "wb";
-- 
1.8.3.1

[PATCH V8 22/39] cpr: ram block blockers

Unlike reboot mode, restart mode cannot save volatile ram blocks in the
vmstate file and recreate them later, because the physical memory for the
blocks is pinned and registered for vfio.  Add a restart-mode blocker for
volatile ram blocks.

Signed-off-by: Steve Sistare 
---
 include/exec/memory.h   |  2 ++
 include/exec/ramblock.h |  1 +
 softmmu/physmem.c   | 48 
 softmmu/vl.c|  2 ++
 4 files changed, 53 insertions(+)

diff --git a/include/exec/memory.h b/include/exec/memory.h
index 6a257a4..812226f 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -3039,6 +3039,8 @@ bool ram_block_discard_is_required(void);
 void ram_block_register(RAMBlock *rb);
 void ram_block_unregister(RAMBlock *rb);
 
+void ram_block_add_cpr_blockers(Error **errp);
+
 #endif
 
 #endif
diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 6cbedf9..a5cbd9e 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -39,6 +39,7 @@ struct RAMBlock {
 /* RCU-enabled, writes protected by the ramlist lock */
 QLIST_ENTRY(RAMBlock) next;
 QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers;
+Error *cpr_blocker;
 int fd;
 size_t page_size;
 /* dirty bitmap used during migration */
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 412cc80..b90ab4e 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -1968,6 +1968,53 @@ static bool memory_region_is_backend(MemoryRegion *mr)
 return !!object_dynamic_cast(mr->parent_obj.parent, TYPE_MEMORY_BACKEND);
 }
 
+/*
+ * Return true if ram contents would be lost during cpr for CPR_MODE_RESTART.
+ * Return false for ram_device because it is remapped after restart.  Do not
+ * exclude rom, even though it is readonly, because the rom file could change
+ * in the new qemu.  Return false for non-migratable blocks.  They are either
+ * re-created after restart, or are handled specially, or are covered by a
+ * device-level cpr blocker.  Return false for an fd, because it is visible and
+ * can be remapped in the new process.
+ */
+static bool ram_is_volatile(RAMBlock *rb)
+{
+MemoryRegion *mr = rb->mr;
+
+return mr &&
+memory_region_is_ram(mr) &&
+!memory_region_is_ram_device(mr) &&
+(!qemu_ram_is_shared(rb) || ramblock_is_anon(rb)) &&
+qemu_ram_is_migratable(rb) &&
+rb->fd < 0;
+}
+
+/*
+ * Add a CPR_MODE_RESTART blocker for each volatile ram block.  This cannot be
+ * performed in ram_block_add because the migratable flag has not been set yet.
+ */
+void ram_block_add_cpr_blockers(Error **errp)
+{
+RAMBlock *rb;
+
+RAMBLOCK_FOREACH(rb) {
+if (ram_is_volatile(rb)) {
+const char *name = memory_region_name(rb->mr);
+rb->cpr_blocker = NULL;
+if (memory_region_is_backend(rb->mr)) {
+error_setg(&rb->cpr_blocker,
+"Memory region %s is volatile. A memory-backend-memfd or"
+" memory-backend-file with share=on is required.", name);
+} else {
+error_setg(&rb->cpr_blocker,
+"Memory region %s is volatile. "
+"-cpr-enable restart is required.", name);
+}
+cpr_add_blocker(&rb->cpr_blocker, errp, CPR_MODE_RESTART, 0);
+}
+}
+}
+
 static void *qemu_anon_memfd_alloc(RAMBlock *rb, size_t maxlen, Error **errp)
 {
 size_t len, align;
@@ -2275,6 +2322,7 @@ void qemu_ram_free(RAMBlock *block)
 
 qemu_mutex_lock_ramlist();
 cpr_delete_memfd(memory_region_name(block->mr));
+cpr_del_blocker(&block->cpr_blocker);
 QLIST_REMOVE_RCU(block, next);
 ram_list.mru_block = NULL;
 /* Write list before version */
diff --git a/softmmu/vl.c b/softmmu/vl.c
index ce779cf..3e19c74 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -28,6 +28,7 @@
 #include "qemu/units.h"
 #include "exec/cpu-common.h"
 #include "exec/page-vary.h"
+#include "exec/memory.h"
 #include "hw/qdev-properties.h"
 #include "qapi/compat-policy.h"
 #include "qapi/error.h"
@@ -2569,6 +2570,7 @@ void qmp_x_exit_preconfig(Error **errp)
 qemu_init_board();
 qemu_create_cli_devices();
 qemu_machine_creation_done();
+ram_block_add_cpr_blockers(&error_fatal);
 
 if (loadvm) {
 load_snapshot(loadvm, NULL, false, NULL, &error_fatal);
-- 
1.8.3.1

[PATCH V8 14/39] qapi: strList_from_string

Generalize strList_from_comma_list() to take any delimiter character, rename
as strList_from_string(), and move it to qapi/util.c.

No functional change.

Signed-off-by: Steve Sistare 
---
 include/qapi/util.h |  9 +
 monitor/hmp-cmds.c  | 29 ++---
 qapi/qapi-util.c| 23 +++
 3 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/include/qapi/util.h b/include/qapi/util.h
index 81a2b13..7d88b09 100644
--- a/include/qapi/util.h
+++ b/include/qapi/util.h
@@ -22,6 +22,8 @@ typedef struct QEnumLookup {
 const int size;
 } QEnumLookup;
 
+struct strList;
+
 const char *qapi_enum_lookup(const QEnumLookup *lookup, int val);
 int qapi_enum_parse(const QEnumLookup *lookup, const char *buf,
 int def, Error **errp);
@@ -31,6 +33,13 @@ bool qapi_bool_parse(const char *name, const char *value, 
bool *obj,
 int parse_qapi_name(const char *name, bool complete);
 
 /*
+ * Produce a strList from the character delimited string @in.
+ * All strings are g_strdup'd.
+ * A NULL or empty input string returns NULL.
+ */
+struct strList *strList_from_string(const char *in, char delim);
+
+/*
  * For any GenericList @list, insert @element at the front.
  *
  * Note that this macro evaluates @element exactly once, so it is safe
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index bb12589..9f58b1f 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -43,6 +43,7 @@
 #include "qapi/qapi-commands-run-state.h"
 #include "qapi/qapi-commands-tpm.h"
 #include "qapi/qapi-commands-ui.h"
+#include "qapi/util.h"
 #include "qapi/qapi-visit-net.h"
 #include "qapi/qapi-visit-migration.h"
 #include "qapi/qmp/qdict.h"
@@ -70,32 +71,6 @@ bool hmp_handle_error(Monitor *mon, Error *err)
 return false;
 }
 
-/*
- * Produce a strList from a comma separated list.
- * A NULL or empty input string return NULL.
- */
-static strList *strList_from_comma_list(const char *in)
-{
-strList *res = NULL;
-strList **tail = &res;
-
-while (in && in[0]) {
-char *comma = strchr(in, ',');
-char *value;
-
-if (comma) {
-value = g_strndup(in, comma - in);
-in = comma + 1; /* skip the , */
-} else {
-value = g_strdup(in);
-in = NULL;
-}
-QAPI_LIST_APPEND(tail, value);
-}
-
-return res;
-}
-
 void hmp_info_name(Monitor *mon, const QDict *qdict)
 {
 NameInfo *info;
@@ -1115,7 +1090,7 @@ void hmp_announce_self(Monitor *mon, const QDict *qdict)
 migrate_announce_params());
 
 qapi_free_strList(params->interfaces);
-params->interfaces = strList_from_comma_list(interfaces_str);
+params->interfaces = strList_from_string(interfaces_str, ',');
 params->has_interfaces = params->interfaces != NULL;
 params->id = g_strdup(id);
 params->has_id = !!params->id;
diff --git a/qapi/qapi-util.c b/qapi/qapi-util.c
index 63596e1..b61c73c 100644
--- a/qapi/qapi-util.c
+++ b/qapi/qapi-util.c
@@ -15,6 +15,7 @@
 #include "qapi/error.h"
 #include "qemu/ctype.h"
 #include "qapi/qmp/qerror.h"
+#include "qapi/qapi-builtin-types.h"
 
 CompatPolicy compat_policy;
 
@@ -152,3 +153,25 @@ int parse_qapi_name(const char *str, bool complete)
 }
 return p - str;
 }
+
+strList *strList_from_string(const char *in, char delim)
+{
+strList *res = NULL;
+strList **tail = &res;
+
+while (in && in[0]) {
+char *next = strchr(in, delim);
+char *value;
+
+if (next) {
+value = g_strndup(in, next - in);
+in = next + 1; /* skip the delim */
+} else {
+value = g_strdup(in);
+in = NULL;
+}
+QAPI_LIST_APPEND(tail, value);
+}
+
+return res;
+}
-- 
1.8.3.1

[PATCH V8 29/39] vfio-pci: cpr part 3 (intx)

Preserve vfio INTX state across cpr restart.  Preserve VFIOINTx fields as
follows:
  pin : Recover this from the vfio config in kernel space
  interrupt : Preserve its eventfd descriptor across exec.
  unmask : Ditto
  route.irq : This could perhaps be recovered in vfio_pci_post_load by
calling pci_device_route_intx_to_irq(pin), whose implementation reads
config space for a bridge device such as ich9.  However, there is no
guarantee that the bridge vmstate is read before vfio vmstate.  Rather
than fiddling with MigrationPriority for vmstate handlers, explicitly
save route.irq in vfio vmstate.
  pending : save in vfio vmstate.
  mmap_timeout, mmap_timer : Re-initialize
  bool kvm_accel : Re-initialize

In vfio_realize, defer calling vfio_intx_enable until the vmstate
is available, in vfio_pci_post_load.  Modify vfio_intx_enable and
vfio_intx_kvm_enable to skip vfio initialization, but still perform
kvm initialization.

Signed-off-by: Steve Sistare 
---
 hw/vfio/pci.c | 92 +--
 1 file changed, 83 insertions(+), 9 deletions(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 2fd7121..b8aee91 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -173,14 +173,45 @@ static void vfio_intx_eoi(VFIODevice *vbasedev)
 vfio_unmask_single_irqindex(vbasedev, VFIO_PCI_INTX_IRQ_INDEX);
 }
 
+#ifdef CONFIG_KVM
+static bool vfio_no_kvm_intx(VFIOPCIDevice *vdev)
+{
+return vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
+   vdev->intx.route.mode != PCI_INTX_ENABLED ||
+   !kvm_resamplefds_enabled();
+}
+#endif
+
+static void vfio_intx_reenable_kvm(VFIOPCIDevice *vdev, Error **errp)
+{
+#ifdef CONFIG_KVM
+if (vfio_no_kvm_intx(vdev)) {
+return;
+}
+
+if (vfio_notifier_init(vdev, &vdev->intx.unmask, "intx-unmask", 0)) {
+error_setg(errp, "vfio_notifier_init intx-unmask failed");
+return;
+}
+
+if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
+   &vdev->intx.interrupt,
+   &vdev->intx.unmask,
+   vdev->intx.route.irq)) {
+error_setg_errno(errp, errno, "failed to setup resample irqfd");
+return;
+}
+
+vdev->intx.kvm_accel = true;
+#endif
+}
+
 static void vfio_intx_enable_kvm(VFIOPCIDevice *vdev, Error **errp)
 {
 #ifdef CONFIG_KVM
 int irq_fd = event_notifier_get_fd(&vdev->intx.interrupt);
 
-if (vdev->no_kvm_intx || !kvm_irqfds_enabled() ||
-vdev->intx.route.mode != PCI_INTX_ENABLED ||
-!kvm_resamplefds_enabled()) {
+if (vfio_no_kvm_intx(vdev)) {
 return;
 }
 
@@ -328,7 +359,13 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
 return 0;
 }
 
-vfio_disable_interrupts(vdev);
+/*
+ * Do not alter interrupt state during vfio_realize and cpr-load.  The
+ * reused flag is cleared thereafter.
+ */
+if (!vdev->vbasedev.reused) {
+vfio_disable_interrupts(vdev);
+}
 
 vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
 pci_config_set_interrupt_pin(vdev->pdev.config, pin);
@@ -353,6 +390,11 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
 fd = event_notifier_get_fd(&vdev->intx.interrupt);
 qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
 
+if (vdev->vbasedev.reused) {
+vfio_intx_reenable_kvm(vdev, &err);
+goto finish;
+}
+
 if (vfio_set_irq_signaling(&vdev->vbasedev, VFIO_PCI_INTX_IRQ_INDEX, 0,
VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) {
 qemu_set_fd_handler(fd, NULL, NULL, vdev);
@@ -365,6 +407,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
 warn_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
 }
 
+finish:
 vdev->interrupt = VFIO_INT_INTx;
 
 trace_vfio_intx_enable(vdev->vbasedev.name);
@@ -3195,9 +3238,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
  vfio_intx_routing_notifier);
 vdev->irqchip_change_notifier.notify = vfio_irqchip_change;
 kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
-ret = vfio_intx_enable(vdev, errp);
-if (ret) {
-goto out_deregister;
+
+/* Wait until cpr-load reads intx routing data to enable */
+if (!vdev->vbasedev.reused) {
+ret = vfio_intx_enable(vdev, errp);
+if (ret) {
+goto out_deregister;
+}
 }
 }
 
@@ -3474,6 +3521,7 @@ static int vfio_pci_post_load(void *opaque, int 
version_id)
 VFIOPCIDevice *vdev = opaque;
 PCIDevice *pdev = &vdev->pdev;
 int nr_vectors;
+int ret = 0;
 
 if (msix_enabled(pdev)) {
 msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
@@ -3486,10 +3534,35 @@ static int vfio_pci_post_load(void *opaque, int 
version_id)

[PATCH V8 32/39] loader: suppress rom_reset during cpr

Reported-by: Zheng Chuan 
Signed-off-by: Steve Sistare 
---
 hw/core/loader.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/core/loader.c b/hw/core/loader.c
index 0548830..7b39c07 100644
--- a/hw/core/loader.c
+++ b/hw/core/loader.c
@@ -51,6 +51,7 @@
 #include "hw/hw.h"
 #include "disas/disas.h"
 #include "migration/vmstate.h"
+#include "migration/cpr.h"
 #include "monitor/monitor.h"
 #include "sysemu/reset.h"
 #include "sysemu/sysemu.h"
@@ -1153,6 +1154,7 @@ ssize_t rom_add_option(const char *file, int32_t 
bootindex)
 static void rom_reset(void *unused)
 {
 Rom *rom;
+bool cpr_is_active = (cpr_get_mode() != CPR_MODE_NONE);
 
 QTAILQ_FOREACH(rom, &roms, next) {
 if (rom->fw_file) {
@@ -1163,7 +1165,7 @@ static void rom_reset(void *unused)
  * the data in during the next incoming migration in all cases.  Note
  * that some of those RAMs can actually be modified by the guest.
  */
-if (runstate_check(RUN_STATE_INMIGRATE)) {
+if (runstate_check(RUN_STATE_INMIGRATE) || cpr_is_active) {
 if (rom->data && rom->isrom) {
 /*
  * Free it so that a rom_reset after migration doesn't
-- 
1.8.3.1

[PATCH V8 20/39] cpr: restart mode

Provide the cpr-save restart mode, which preserves the guest VM across a
restart of the qemu process.  After cpr-save, the caller passes qemu
command-line arguments to cpr-exec, which directly exec's the new qemu
binary.  The arguments must include -S so new qemu starts in a paused state.
The caller resumes the guest by calling cpr-load.

To use the restart mode, guest RAM must be backed by a memory-backend-file
with share=on.  The '-cpr-enable restart' option causes secondary guest
ram blocks (those not specified on the command line) to be allocated by
mmap'ing a memfd.  The memfd values are saved in special cpr state, and the
memfds are kept open across exec.  After exec, new qemu retrieves the values
from the cpr state and re-mmap's the memfds.  Hence guest RAM is preserved in
place, albeit with new virtual addresses in the qemu process.

The restart mode supports vfio devices and memory-backend-memfd in
subsequent patches.

cpr-exec syntax:
  { 'command': 'cpr-exec', 'data': { 'argv': [ 'str' ] } }

Add the restart mode:
  { 'enum': 'CprMode', 'data': [ 'reboot', 'restart' ] }

Signed-off-by: Steve Sistare 
---
 migration/cpr.c   | 35 +++
 qapi/cpr.json | 26 +-
 qemu-options.hx   |  2 +-
 softmmu/physmem.c | 46 +-
 trace-events  |  1 +
 5 files changed, 107 insertions(+), 3 deletions(-)

diff --git a/migration/cpr.c b/migration/cpr.c
index 1cc8738..8b3fffd 100644
--- a/migration/cpr.c
+++ b/migration/cpr.c
@@ -22,6 +22,7 @@ static int cpr_enabled_modes;
 void cpr_init(int modes)
 {
 cpr_enabled_modes = modes;
+cpr_state_load(&error_fatal);
 }
 
 bool cpr_enabled(CprMode mode)
@@ -153,6 +154,37 @@ err:
 cpr_set_mode(CPR_MODE_NONE);
 }
 
+static int preserve_fd(const char *name, int id, int fd, void *opaque)
+{
+qemu_clear_cloexec(fd);
+return 0;
+}
+
+static int unpreserve_fd(const char *name, int id, int fd, void *opaque)
+{
+qemu_set_cloexec(fd);
+return 0;
+}
+
+void qmp_cpr_exec(strList *args, Error **errp)
+{
+if (!runstate_check(RUN_STATE_SAVE_VM)) {
+error_setg(errp, "runstate is not save-vm");
+return;
+}
+if (cpr_get_mode() != CPR_MODE_RESTART) {
+error_setg(errp, "cpr-exec requires cpr-save with restart mode");
+return;
+}
+
+cpr_walk_fd(preserve_fd, 0);
+if (cpr_state_save(errp)) {
+return;
+}
+
+assert(qemu_system_exec_request(args, errp) == 0);
+}
+
 void qmp_cpr_load(const char *filename, CprMode mode, Error **errp)
 {
 QEMUFile *f;
@@ -189,6 +221,9 @@ void qmp_cpr_load(const char *filename, CprMode mode, Error 
**errp)
 goto out;
 }
 
+/* Set cloexec to prevent fd leaks until the next cpr-save */
+cpr_walk_fd(unpreserve_fd, 0);
+
 state = global_state_get_runstate();
 if (state == RUN_STATE_RUNNING) {
 vm_start();
diff --git a/qapi/cpr.json b/qapi/cpr.json
index 11c6f88..47ee4ff 100644
--- a/qapi/cpr.json
+++ b/qapi/cpr.json
@@ -15,11 +15,12 @@
 # @CprMode:
 #
 # @reboot: checkpoint can be cpr-load'ed after a host reboot.
+# @restart: checkpoint can be cpr-load'ed after restarting qemu.
 #
 # Since: 7.1
 ##
 { 'enum': 'CprMode',
-  'data': [ 'none', 'reboot' ] }
+  'data': [ 'none', 'reboot', 'restart' ] }
 
 ##
 # @cpr-save:
@@ -38,6 +39,11 @@
 # issue the quit command, reboot the system, start qemu using the same
 # arguments plus -S, and issue the cpr-load command.
 #
+# If @mode is 'restart', the checkpoint remains valid after restarting
+# qemu using a subsequent cpr-exec.  Guest RAM must be backed by a
+# memory-backend-file with share=on.
+# To resume from the checkpoint, issue the cpr-load command.
+#
 # @filename: name of checkpoint file
 # @mode: @CprMode mode
 #
@@ -48,6 +54,24 @@
 'mode': 'CprMode' } }
 
 ##
+# @cpr-exec:
+#
+# Restart qemu by directly exec'ing @argv[0], replacing the qemu process.
+# The PID remains the same.  Must be called after cpr-save restart.
+#
+# @argv[0] should be the path of a new qemu binary, or a prefix command that
+# in turn exec's the new qemu binary.  The arguments must match those used
+# to initially start qemu, plus the -S option so new qemu starts in a paused
+# state.
+#
+# @argv: arguments to be passed to exec().
+#
+# Since: 7.1
+##
+{ 'command': 'cpr-exec',
+  'data': { 'argv': [ 'str' ] } }
+
+##
 # @cpr-load:
 #
 # Load a virtual machine from the checkpoint file @filename that was created
diff --git a/qemu-options.hx b/qemu-options.hx
index 6e51c33..1b49360 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -4484,7 +4484,7 @@ SRST
 ERST
 
 DEF("cpr-enable", HAS_ARG, QEMU_OPTION_cpr_enable, \
-"-cpr-enable rebootenable the cpr mode\n",
+"-cpr-enable reboot|restartenable the cpr mode\n",
 QEMU_ARCH_ALL)
 SRST
 ``-cpr-enable reboot``
diff --git a/softmmu/physmem.c b/softmmu/physmem.c
index 822c424..412cc80 100644
--- a/softmmu/physmem.c
+++ b/softmmu/physmem.c
@@ -44,6 +44,7 @@

[PATCH V8 23/39] hostmem-memfd: cpr for memory-backend-memfd

Preserve memory-backend-memfd memory objects during cpr.

Signed-off-by: Steve Sistare 
---
 backends/hostmem-memfd.c | 21 -
 hmp-commands.hx  |  2 +-
 qapi/cpr.json|  2 +-
 3 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c
index c9d8001..2aeb5d1 100644
--- a/backends/hostmem-memfd.c
+++ b/backends/hostmem-memfd.c
@@ -14,6 +14,7 @@
 #include "sysemu/hostmem.h"
 #include "qom/object_interfaces.h"
 #include "qemu/memfd.h"
+#include "migration/cpr.h"
 #include "qemu/module.h"
 #include "qapi/error.h"
 #include "qom/object.h"
@@ -36,23 +37,25 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, 
Error **errp)
 {
 HostMemoryBackendMemfd *m = MEMORY_BACKEND_MEMFD(backend);
 uint32_t ram_flags;
-char *name;
-int fd;
+char *name = host_memory_backend_get_name(backend);
+int fd = cpr_find_fd(name, 0);
 
 if (!backend->size) {
 error_setg(errp, "can't create backend with size 0");
 return;
 }
 
-fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size,
-   m->hugetlb, m->hugetlbsize, m->seal ?
-   F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0,
-   errp);
-if (fd == -1) {
-return;
+if (fd < 0) {
+fd = qemu_memfd_create(TYPE_MEMORY_BACKEND_MEMFD, backend->size,
+   m->hugetlb, m->hugetlbsize, m->seal ?
+   F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL : 0,
+   errp);
+if (fd == -1) {
+return;
+}
+cpr_save_fd(name, 0, fd);
 }
 
-name = host_memory_backend_get_name(backend);
 ram_flags = backend->share ? RAM_SHARED : 0;
 ram_flags |= backend->reserve ? 0 : RAM_NORESERVE;
 ram_flags |= RAM_ANON;
diff --git a/hmp-commands.hx b/hmp-commands.hx
index da5dd60..540f9be 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -380,7 +380,7 @@ SRST
 
   If *mode* is 'restart', the checkpoint remains valid after restarting
   qemu using a subsequent cpr-exec.  Guest RAM must be backed by a
-  memory-backend-file with share=on.
+  memory-backend-memfd or memory-backend-file object with share=on.
   To resume from the checkpoint, issue the cpr-load command.
 ERST
 
diff --git a/qapi/cpr.json b/qapi/cpr.json
index 47ee4ff..1ec5aae 100644
--- a/qapi/cpr.json
+++ b/qapi/cpr.json
@@ -41,7 +41,7 @@
 #
 # If @mode is 'restart', the checkpoint remains valid after restarting
 # qemu using a subsequent cpr-exec.  Guest RAM must be backed by a
-# memory-backend-file with share=on.
+# memory-backend-memfd or memory-backend-file object with share=on.
 # To resume from the checkpoint, issue the cpr-load command.
 #
 # @filename: name of checkpoint file
-- 
1.8.3.1

[PATCH V8 21/39] cpr: restart HMP interfaces

cpr-save <filename> <mode>
  mode may be "restart"

cpr-exec <command>
  Call qmp_cpr_exec().
  Arguments:
    command : command line to execute, with space-separated arguments

Signed-off-by: Steve Sistare 
---
 hmp-commands.hx   | 29 ++---
 include/monitor/hmp.h |  1 +
 monitor/hmp-cmds.c| 11 +++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index d621968..da5dd60 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -357,7 +357,7 @@ ERST
 {
 .name   = "cpr-save",
 .args_type  = "filename:s,mode:s",
-.params = "filename 'reboot'",
+.params = "filename 'reboot'|'restart'",
 .help   = "create a checkpoint of the VM in file",
 .cmd= hmp_cpr_save,
 },
@@ -377,13 +377,36 @@ SRST
   reboot, else it will be saved to the file.  To resume from the checkpoint,
   issue the quit command, reboot the system, start qemu using the same
   arguments plus -S, and issue the cpr-load command.
+
+  If *mode* is 'restart', the checkpoint remains valid after restarting
+  qemu using a subsequent cpr-exec.  Guest RAM must be backed by a
+  memory-backend-file with share=on.
+  To resume from the checkpoint, issue the cpr-load command.
+ERST
+
+{
+.name   = "cpr-exec",
+.args_type  = "command:S",
+.params = "command",
+.help   = "Restart qemu by directly exec'ing command",
+.cmd= hmp_cpr_exec,
+},
+
+SRST
+``cpr-exec`` *command*
+  Restart qemu by directly exec'ing *command*, replacing the qemu process.
+  The PID remains the same.  Must be called after cpr-save restart.
+
+  *command*[0] should be the path of a new qemu binary, or a prefix command that
+  in turn exec's the new qemu binary.  The arguments must match those used
+  to initially start qemu, plus the -S option so new qemu starts in a paused
+  state.
 ERST
 
 {
 .name   = "cpr-load",
 .args_type  = "filename:s,mode:s",
-.params = "filename 'reboot'",
-
+.params = "filename 'reboot'|'restart'",
 .help   = "load VM checkpoint from file",
 .cmd= hmp_cpr_load,
 },
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index b44588e..ec4fa44 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -60,6 +60,7 @@ void hmp_loadvm(Monitor *mon, const QDict *qdict);
 void hmp_savevm(Monitor *mon, const QDict *qdict);
 void hmp_delvm(Monitor *mon, const QDict *qdict);
 void hmp_cpr_save(Monitor *mon, const QDict *qdict);
+void hmp_cpr_exec(Monitor *mon, const QDict *qdict);
 void hmp_cpr_load(Monitor *mon, const QDict *qdict);
 void hmp_migrate_cancel(Monitor *mon, const QDict *qdict);
 void hmp_migrate_continue(Monitor *mon, const QDict *qdict);
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index 9f58b1f..b866c7f 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -,6 +,17 @@ void hmp_cpr_save(Monitor *mon, const QDict *qdict)
 hmp_handle_error(mon, err);
 }
 
+void hmp_cpr_exec(Monitor *mon, const QDict *qdict)
+{
+Error *err = NULL;
+const char *command = qdict_get_try_str(qdict, "command");
+strList *args = strList_from_string(command, ' ');
+
+qmp_cpr_exec(args, &err);
+qapi_free_strList(args);
+hmp_handle_error(mon, err);
+}
+
 void hmp_cpr_load(Monitor *mon, const QDict *qdict)
 {
 Error *err = NULL;
-- 
1.8.3.1

[PATCH V8 27/39] vfio-pci: cpr part 1 (fd and dma)

Enable vfio-pci devices to be saved and restored across an exec restart
of qemu.

At vfio creation time, save the value of vfio container, group, and device
descriptors in cpr state.

In the container pre_save handler, suspend the use of virtual addresses in
DMA mappings with VFIO_DMA_UNMAP_FLAG_VADDR, because guest ram will be
remapped at a different VA after exec.  DMA to already-mapped pages
continues.  Save the msi message area as part of vfio-pci vmstate, save the
interrupt and notifier eventfd's in cpr state, and clear the close-on-exec
flag for the vfio descriptors.  The flag is not cleared earlier because the
descriptors should not persist across miscellaneous fork and exec calls
that may be performed during normal operation.

On qemu restart, vfio_realize() finds the saved descriptors, uses
the descriptors, and notes that the device is being reused.  Device and
iommu state is already configured, so operations in vfio_realize that
would modify the configuration are skipped for a reused device, including
vfio ioctl's and writes to PCI configuration space.  Vfio PCI device reset
is also suppressed. The result is that vfio_realize constructs qemu data
structures that reflect the current state of the device.  However, the
reconstruction is not complete until cpr-load is called. cpr-load loads the
msi data.  The vfio post_load handler finds eventfds in cpr state, rebuilds
vector data structures, and attaches the interrupts to the new KVM instance.
The container post_load handler then invokes the main vfio listener
callback, which walks the flattened ranges of the vfio address space and
calls VFIO_DMA_MAP_FLAG_VADDR to inform the kernel of the new VA's.  Lastly,
cpr-load starts the VM.

This functionality is delivered by 3 patches for clarity.  Part 1 handles
device file descriptors and DMA.  Part 2 adds eventfd and MSI/MSI-X vector
support.  Part 3 adds INTX support.

Signed-off-by: Steve Sistare 
---
 MAINTAINERS   |   1 +
 hw/pci/pci.c  |  12 
 hw/vfio/common.c  | 151 +++---
 hw/vfio/cpr.c | 119 +
 hw/vfio/meson.build   |   1 +
 hw/vfio/pci.c |  44 
 hw/vfio/trace-events  |   1 +
 include/hw/vfio/vfio-common.h |  11 +++
 include/migration/vmstate.h   |   1 +
 9 files changed, 317 insertions(+), 24 deletions(-)
 create mode 100644 hw/vfio/cpr.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 74a43e6..864aec6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3156,6 +3156,7 @@ CPR
 M: Steve Sistare 
 M: Mark Kanda 
 S: Maintained
+F: hw/vfio/cpr.c
 F: include/migration/cpr.h
 F: migration/cpr.c
 F: qapi/cpr.json
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 6e70153..a3b19eb 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -32,6 +32,7 @@
 #include "hw/pci/pci_host.h"
 #include "hw/qdev-properties.h"
 #include "hw/qdev-properties-system.h"
+#include "migration/cpr.h"
 #include "migration/qemu-file-types.h"
 #include "migration/vmstate.h"
 #include "monitor/monitor.h"
@@ -341,6 +342,17 @@ static void pci_reset_regions(PCIDevice *dev)
 
 static void pci_do_device_reset(PCIDevice *dev)
 {
+/*
+ * A PCI device that is resuming for cpr is already configured, so do
+ * not reset it here when we are called from qemu_system_reset prior to
+ * cpr-load, else interrupts may be lost for vfio-pci devices.  It is
+ * safe to skip this reset for all PCI devices, because cpr-load will set
+ * all fields that would have been set here.
+ */
+if (cpr_get_mode() == CPR_MODE_RESTART) {
+return;
+}
+
 pci_device_deassert_intx(dev);
 assert(dev->irq_state == 0);
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index ace9562..c7d73b6 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -31,6 +31,7 @@
 #include "exec/memory.h"
 #include "exec/ram_addr.h"
 #include "hw/hw.h"
+#include "migration/cpr.h"
 #include "qemu/error-report.h"
 #include "qemu/main-loop.h"
 #include "qemu/range.h"
@@ -460,6 +461,8 @@ static int vfio_dma_unmap(VFIOContainer *container,
 .size = size,
 };
 
+assert(!container->reused);
+
 if (iotlb && container->dirty_pages_supported &&
 vfio_devices_all_running_and_saving(container)) {
 return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
@@ -496,12 +499,24 @@ static int vfio_dma_map(VFIOContainer *container, hwaddr 
iova,
 {
 struct vfio_iommu_type1_dma_map map = {
 .argsz = sizeof(map),
-.flags = VFIO_DMA_MAP_FLAG_READ,
 .vaddr = (__u64)(uintptr_t)vaddr,
 .iova = iova,
 .size = size,
 };
 
+/*
+ * Set the new vaddr for any mappings registered during cpr-load.
+ * Reused is cleared thereafter.
+ */
+if (container->reused) {
+map.flags = VFIO_DMA_MAP_FLAG_VADDR;
+if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
+goto fail;
+}
+r

[PATCH V8 33/39] chardev: cpr framework

Add QEMU_CHAR_FEATURE_CPR for devices that support cpr by preserving an
open descriptor across exec.  Add the chardev reopen-on-cpr option for
devices that should be closed on cpr and reopened after exec.

Enable cpr for a chardev if it has QEMU_CHAR_FEATURE_CPR and reopen-on-cpr
is false.  Allow cpr-save if either QEMU_CHAR_FEATURE_CPR or reopen-on-cpr
is true for all chardevs in the configuration.

Signed-off-by: Steve Sistare 
---
 chardev/char.c | 49 +
 include/chardev/char.h |  5 +
 qapi/char.json |  7 ++-
 qemu-options.hx| 26 ++
 4 files changed, 78 insertions(+), 9 deletions(-)

diff --git a/chardev/char.c b/chardev/char.c
index 0169d8d..ef3f196 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -36,9 +36,11 @@
 #include "qemu/help_option.h"
 #include "qemu/module.h"
 #include "qemu/option.h"
+#include "migration/cpr.h"
 #include "qemu/id.h"
 #include "qemu/coroutine.h"
 #include "qemu/yank.h"
+#include "sysemu/sysemu.h"
 
 #include "chardev-internal.h"
 
@@ -236,26 +238,55 @@ int qemu_chr_add_client(Chardev *s, int fd)
 static void qemu_char_open(Chardev *chr, ChardevBackend *backend,
bool *be_opened, Error **errp)
 {
+ERRP_GUARD();
+g_autofree char *fdname = NULL;
+
 ChardevClass *cc = CHARDEV_GET_CLASS(chr);
 /* Any ChardevCommon member would work */
 ChardevCommon *common = backend ? backend->u.null.data : NULL;
+bool has_logfile = (common && common->has_logfile);
+bool has_feature_cpr;
 
-if (common && common->has_logfile) {
+if (has_logfile) {
 int flags = O_WRONLY;
+fdname = g_strdup_printf("%s_log", chr->label);
 if (common->has_logappend &&
 common->logappend) {
 flags |= O_APPEND;
 } else {
 flags |= O_TRUNC;
 }
-chr->logfd = qemu_create(common->logfile, flags, 0666, errp);
+chr->logfd = cpr_find_fd(fdname, 0);
+if (chr->logfd < 0) {
+chr->logfd = qemu_create(common->logfile, flags, 0666, errp);
+}
 if (chr->logfd < 0) {
 return;
 }
 }
 
+chr->reopen_on_cpr = (common && common->reopen_on_cpr);
+
 if (cc->open) {
 cc->open(chr, backend, be_opened, errp);
+if (*errp) {
+return;
+}
+}
+
+/* Evaluate this after the open method sets the feature */
+has_feature_cpr = qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_CPR);
+chr->cpr_enabled = !chr->reopen_on_cpr && has_feature_cpr;
+
+if (!chr->reopen_on_cpr && !has_feature_cpr) {
+chr->cpr_blocker = NULL;
+error_setg(&chr->cpr_blocker,
+"chardev %s -> %s does not allow cpr. See reopen-on-cpr.",
+chr->label, chr->filename);
+cpr_add_blocker(&chr->cpr_blocker, errp, CPR_MODE_RESTART, 0);
+
+} else if (chr->cpr_enabled && has_logfile) {
+cpr_resave_fd(fdname, 0, chr->logfd, errp);
 }
 }
 
@@ -297,11 +328,16 @@ static void char_finalize(Object *obj)
 if (chr->be) {
 chr->be->chr = NULL;
 }
-g_free(chr->filename);
-g_free(chr->label);
 if (chr->logfd != -1) {
+g_autofree char *fdname = g_strdup_printf("%s_log", chr->label);
+if (chr->cpr_enabled) {
+cpr_delete_fd(fdname, 0);
+}
 close(chr->logfd);
 }
+cpr_del_blocker(&chr->cpr_blocker);
+g_free(chr->filename);
+g_free(chr->label);
 qemu_mutex_destroy(&chr->chr_write_lock);
 }
 
@@ -501,6 +537,8 @@ void qemu_chr_parse_common(QemuOpts *opts, ChardevCommon 
*backend)
 
 backend->has_logappend = true;
 backend->logappend = qemu_opt_get_bool(opts, "logappend", false);
+
+backend->reopen_on_cpr = qemu_opt_get_bool(opts, "reopen-on-cpr", false);
 }
 
 static const ChardevClass *char_get_class(const char *driver, Error **errp)
@@ -942,6 +980,9 @@ QemuOptsList qemu_chardev_opts = {
 },{
 .name = "abstract",
 .type = QEMU_OPT_BOOL,
+},{
+.name = "reopen-on-cpr",
+.type = QEMU_OPT_BOOL,
 #endif
 },
 { /* end of list */ }
diff --git a/include/chardev/char.h b/include/chardev/char.h
index a319b5f..bbf2560 100644
--- a/include/chardev/char.h
+++ b/include/chardev/char.h
@@ -50,6 +50,8 @@ typedef enum {
 /* Whether the gcontext can be changed after calling
  * qemu_chr_be_update_read_handlers() */
 QEMU_CHAR_FEATURE_GCONTEXT,
+/* Whether the device supports cpr */
+QEMU_CHAR_FEATURE_CPR,
 
 QEMU_CHAR_FEATURE_LAST,
 } ChardevFeature;
@@ -67,6 +69,9 @@ struct Chardev {
 int be_open;
 /* used to coordinate the chardev-change special-case: */
 bool handover_yank_instance;
+bool reopen_on_cpr;
+bool cpr_enabled;
+Error *cpr_blocker;
 GSource *gsource;
 GMainContext *gcontext;
 DECLARE_BITMAP(features, QEMU_CHAR_FEATURE_LAST);
diff --git a

[PATCH V8 24/39] pci: export msix_is_pending

Export msix_is_pending for use by cpr.  No functional change.

Signed-off-by: Steve Sistare 
---
 hw/pci/msix.c | 2 +-
 include/hw/pci/msix.h | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index ae9331c..e492ce0 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -64,7 +64,7 @@ static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
 return dev->msix_pba + vector / 8;
 }
 
-static int msix_is_pending(PCIDevice *dev, int vector)
+int msix_is_pending(PCIDevice *dev, unsigned int vector)
 {
 return *msix_pending_byte(dev, vector) & msix_pending_mask(vector);
 }
diff --git a/include/hw/pci/msix.h b/include/hw/pci/msix.h
index 4c4a60c..0065354 100644
--- a/include/hw/pci/msix.h
+++ b/include/hw/pci/msix.h
@@ -32,6 +32,7 @@ int msix_present(PCIDevice *dev);
 bool msix_is_masked(PCIDevice *dev, unsigned vector);
 void msix_set_pending(PCIDevice *dev, unsigned vector);
 void msix_clr_pending(PCIDevice *dev, int vector);
+int msix_is_pending(PCIDevice *dev, unsigned vector);
 
 int msix_vector_use(PCIDevice *dev, unsigned vector);
 void msix_vector_unuse(PCIDevice *dev, unsigned vector);
-- 
1.8.3.1

[PATCH V8 36/39] chardev: cpr for sockets

Save accepted socket fds before cpr-save, and look for them after cpr-load.
Block cpr-exec if a socket enables the TLS or websocket option.  Allow a
monitor socket by closing it on exec.

Signed-off-by: Mark Kanda 
Signed-off-by: Steve Sistare 
---
 chardev/char-socket.c | 45 +++
 include/chardev/char-socket.h |  1 +
 monitor/hmp.c |  3 +++
 monitor/qmp.c |  3 +++
 4 files changed, 52 insertions(+)

diff --git a/chardev/char-socket.c b/chardev/char-socket.c
index dc4e218..3a1e36b 100644
--- a/chardev/char-socket.c
+++ b/chardev/char-socket.c
@@ -26,6 +26,7 @@
 #include "chardev/char.h"
 #include "io/channel-socket.h"
 #include "io/channel-websock.h"
+#include "migration/cpr.h"
 #include "qemu/error-report.h"
 #include "qemu/module.h"
 #include "qemu/option.h"
@@ -33,6 +34,7 @@
 #include "qapi/clone-visitor.h"
 #include "qapi/qapi-visit-sockets.h"
 #include "qemu/yank.h"
+#include "sysemu/sysemu.h"
 
 #include "chardev/char-io.h"
 #include "chardev/char-socket.h"
@@ -358,6 +360,11 @@ static void tcp_chr_free_connection(Chardev *chr)
 SocketChardev *s = SOCKET_CHARDEV(chr);
 int i;
 
+if (chr->cpr_enabled) {
+cpr_delete_fd(chr->label, 0);
+}
+cpr_del_blocker(&s->cpr_blocker);
+
 if (s->read_msgfds_num) {
 for (i = 0; i < s->read_msgfds_num; i++) {
 close(s->read_msgfds[i]);
@@ -923,6 +930,10 @@ static void tcp_chr_accept(QIONetListener *listener,
QIO_CHANNEL(cioc));
 }
 tcp_chr_new_client(chr, cioc);
+
+if (s->sioc && chr->cpr_enabled) {
+cpr_resave_fd(chr->label, 0, s->sioc->fd, NULL);
+}
 }
 
 
@@ -1178,6 +1189,26 @@ static gboolean socket_reconnect_timeout(gpointer opaque)
 return false;
 }
 
+static int load_char_socket_fd(Chardev *chr, Error **errp)
+{
+SocketChardev *sockchar = SOCKET_CHARDEV(chr);
+QIOChannelSocket *sioc;
+const char *label = chr->label;
+int fd = cpr_find_fd(label, 0);
+
+if (fd != -1) {
+sockchar = SOCKET_CHARDEV(chr);
+sioc = qio_channel_socket_new_fd(fd, errp);
+if (sioc) {
+tcp_chr_accept(sockchar->listener, sioc, chr);
+object_unref(OBJECT(sioc));
+} else {
+error_setg(errp, "could not restore socket for %s", label);
+return -1;
+}
+}
+return 0;
+}
 
 static int qmp_chardev_open_socket_server(Chardev *chr,
   bool is_telnet,
@@ -1388,6 +1419,18 @@ static void qmp_chardev_open_socket(Chardev *chr,
 }
 s->registered_yank = true;
 
+if (!s->tls_creds && !s->is_websock) {
+qemu_chr_set_feature(chr, QEMU_CHAR_FEATURE_CPR);
+} else if (!chr->reopen_on_cpr) {
+s->cpr_blocker = NULL;
+error_setg(&s->cpr_blocker,
+   "error: socket %s is not cpr capable due to %s option",
+   chr->label, (s->tls_creds ? "TLS" : "websocket"));
+if (cpr_add_blocker(&s->cpr_blocker, errp, CPR_MODE_RESTART, 0)) {
+return;
+}
+}
+
 /* be isn't opened until we get a connection */
 *be_opened = false;
 
@@ -1403,6 +1446,8 @@ static void qmp_chardev_open_socket(Chardev *chr,
 return;
 }
 }
+
+load_char_socket_fd(chr, errp);
 }
 
 static void qemu_chr_parse_socket(QemuOpts *opts, ChardevBackend *backend,
diff --git a/include/chardev/char-socket.h b/include/chardev/char-socket.h
index 0708ca6..1c3abf7 100644
--- a/include/chardev/char-socket.h
+++ b/include/chardev/char-socket.h
@@ -78,6 +78,7 @@ struct SocketChardev {
 bool connect_err_reported;
 
 QIOTask *connect_task;
+Error *cpr_blocker;
 };
 typedef struct SocketChardev SocketChardev;
 
diff --git a/monitor/hmp.c b/monitor/hmp.c
index 15ca047..75e6739 100644
--- a/monitor/hmp.c
+++ b/monitor/hmp.c
@@ -1501,4 +1501,7 @@ void monitor_init_hmp(Chardev *chr, bool use_readline, 
Error **errp)
 qemu_chr_fe_set_handlers(&mon->common.chr, monitor_can_read, monitor_read,
  monitor_event, NULL, &mon->common, NULL, true);
 monitor_list_append(&mon->common);
+
+/* monitor cannot yet be preserved across cpr */
+chr->reopen_on_cpr = true;
 }
diff --git a/monitor/qmp.c b/monitor/qmp.c
index 092c527..0043459 100644
--- a/monitor/qmp.c
+++ b/monitor/qmp.c
@@ -535,4 +535,7 @@ void monitor_init_qmp(Chardev *chr, bool pretty, Error 
**errp)
  NULL, &mon->common, NULL, true);
 monitor_list_append(&mon->common);
 }
+
+/* Monitor cannot yet be preserved across cpr */
+chr->reopen_on_cpr = true;
 }
-- 
1.8.3.1

[PATCH V8 30/39] vfio-pci: recover from unmap-all-vaddr failure