some nits/comments/questions below, but the general direction/structure
already looks quite good I think!

On November 7, 2024 5:51 pm, Fiona Ebner wrote:
> The state of the VM's disk images at the time the backup is started is
> preserved via a snapshot-access block node. Old data is moved to the
> fleecing image when new guest writes come in. The snapshot-access
> block node, as well as the associated bitmap in case of incremental
> backup, will be made available to the external provider. They are
> exported via NBD and for 'nbd' mechanism, the NBD socket path is
> passed to the provider, while for 'block-device' mechanism, the NBD
> export is made accessible as a regular block device first and the
> bitmap information is made available via a $next_dirty_region->()
> function. For 'block-device', the 'nbdinfo' binary is required.
> 
> The provider can indicate that it wants to do an incremental backup by
> returning the bitmap ID that was used for a previous backup and it
> will then be told if the bitmap was newly created (either first backup
> or old bitmap was invalid) or if the bitmap can be reused.
> 
> The provider then reads the parts of the NBD or block device it needs,
> either the full disk for full backup, or the dirty parts according to
> the bitmap for incremental backup. The bitmap has to be respected,
> reads to other parts of the image will return an error. After backing
> up each part of the disk, it should be discarded in the export to
> avoid unnecessary space usage in the fleecing image (requires the
> storage underlying the fleecing image to support discard too).
> 
> Signed-off-by: Fiona Ebner <f.eb...@proxmox.com>
> ---
> 
> Changes in v3:
> * adapt to API changes, config files are now passed as raw
> 
>  PVE/VZDump/QemuServer.pm | 309 ++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 308 insertions(+), 1 deletion(-)
> 
> diff --git a/PVE/VZDump/QemuServer.pm b/PVE/VZDump/QemuServer.pm
> index b6dcd6cc..d0218c9b 100644
> --- a/PVE/VZDump/QemuServer.pm
> +++ b/PVE/VZDump/QemuServer.pm
> @@ -20,7 +20,7 @@ use PVE::QMPClient;
>  use PVE::Storage::Plugin;
>  use PVE::Storage::PBSPlugin;
>  use PVE::Storage;
> -use PVE::Tools;
> +use PVE::Tools qw(run_command);
>  use PVE::VZDump;
>  use PVE::Format qw(render_duration render_bytes);
>  
> @@ -277,6 +277,8 @@ sub archive {
>  
>      if ($self->{vzdump}->{opts}->{pbs}) {
>       $self->archive_pbs($task, $vmid);
> +    } elsif ($self->{vzdump}->{'backup-provider'}) {
> +     $self->archive_external($task, $vmid);
>      } else {
>       $self->archive_vma($task, $vmid, $filename, $comp);
>      }
> @@ -1149,6 +1151,23 @@ sub cleanup {
>  
>      # If VM was started only for backup, it is already stopped now.
>      if (PVE::QemuServer::Helpers::vm_running_locally($vmid)) {
> +     if ($task->{cleanup}->{'nbd-stop'}) {
> +         eval { PVE::QemuServer::QMPHelpers::nbd_stop($vmid); };
> +         $self->logerr($@) if $@;
> +     }
> +
> +     if (my $info = $task->{cleanup}->{'backup-access-teardown'}) {
> +         my $params = {
> +             'target-id' => $info->{'target-id'},
> +             timeout => 60,
> +             success => $info->{success} ? JSON::true : JSON::false,
> +         };
> +
> +         $self->loginfo("tearing down backup-access");
> +         eval { mon_cmd($vmid, "backup-access-teardown", $params->%*) };
> +         $self->logerr($@) if $@;
> +     }
> +
>       $detach_tpmstate_drive->($task, $vmid);
>       detach_fleecing_images($task->{disks}, $vmid) if 
> $task->{'use-fleecing'};
>      }
> @@ -1160,4 +1179,292 @@ sub cleanup {
>      }
>  }
>  
> +my sub block_device_backup_cleanup {
> +    my ($self, $paths, $cpids) = @_;
> +
> +    for my $path ($paths->@*) {
> +     eval { run_command(["qemu-nbd", "-d", $path ]); };
> +     $self->log('warn', "unable to disconnect NBD backup source '$path' - 
> $@") if $@;
> +    }
> +
> +    my $waited;
> +    my $wait_limit = 5;
> +    for ($waited = 0; $waited < $wait_limit && scalar(keys $cpids->%*); 
> $waited++) {
> +     while ((my $cpid = waitpid(-1, POSIX::WNOHANG)) > 0) {
> +         delete($cpids->{$cpid});
> +     }
> +     if ($waited == 0) {
> +         kill 15, $_ for keys $cpids->%*;
> +     }
> +     sleep 1;
> +    }
> +    if ($waited == $wait_limit && scalar(keys $cpids->%*)) {
> +     kill 9, $_ for keys $cpids->%*;
> +     sleep 1;
> +     while ((my $cpid = waitpid(-1, POSIX::WNOHANG)) > 0) {

this could be a bit dangerous, since we have an explicit list of cpids
we want to wait for, couldn't we just waitpid explicitly for them?

just wary of potential side-effects on things like hookscripts or future
features that also require forking ;)

> +         delete($cpids->{$cpid});
> +     }
> +     $self->log('warn', "unable to collect nbdinfo child process '$_'") for 
> keys $cpids->%*;
> +    }
> +}
> +
> +my sub block_device_backup_prepare {
> +    my ($self, $devicename, $size, $nbd_path, $bitmap_name, $count) = @_;

nit: $device_name for consistency's sake?

> +
> +    my $nbd_info_uri = "nbd+unix:///${devicename}?socket=${nbd_path}";
> +    my $qemu_nbd_uri = "nbd:unix:${nbd_path}:exportname=${devicename}";
> +
> +    my $cpid;
> +    my $error_fh;
> +    my $next_dirty_region;
> +
> +    # If there is no dirty bitmap, it can be treated as if there's a full 
> dirty one. The output of
> +    # nbdinfo is a list of tuples with offset, length, type, description. 
> The first bit of 'type' is
> +    # set when the bitmap is dirty, see QEMU's docs/interop/nbd.txt
> +    my $dirty_bitmap = [];
> +    if ($bitmap_name) {
> +     my $input = IO::File->new();
> +     my $info = IO::File->new();
> +     $error_fh = IO::File->new();
> +     my $nbdinfo_cmd = ["nbdinfo", $nbd_info_uri, 
> "--map=qemu:dirty-bitmap:${bitmap_name}"];
> +     $cpid = open3($input, $info, $error_fh, $nbdinfo_cmd->@*)
> +         or die "failed to spawn nbdinfo child - $!\n";
> +
> +     $next_dirty_region = sub {
> +         my ($offset, $length, $type);
> +         do {
> +             my $line = <$info>;
> +             return if !$line;
> +             die "unexpected output from nbdinfo - $line\n"
> +                 if $line !~ m/^\s*(\d+)\s*(\d+)\s*(\d+)/; # also untaints
> +             ($offset, $length, $type) = ($1, $2, $3);
> +         } while (($type & 0x1) == 0); # not dirty
> +         return ($offset, $length);
> +     };
> +    } else {
> +     my $done = 0;
> +     $next_dirty_region = sub {
> +         return if $done;
> +         $done = 1;
> +         return (0, $size);
> +     };
> +    }
> +
> +    my $blockdev = "/dev/nbd${count}";

what if that is already used/taken by somebody? I think we'd need logic
to find a free slot here..

> +
> +    eval {
> +     run_command(["qemu-nbd", "-c", $blockdev, $qemu_nbd_uri, 
> "--format=raw", "--discard=on"]);
> +    };
> +    if (my $err = $@) {
> +     my $cpids = {};
> +     $cpids->{$cpid} = 1 if $cpid;
> +     block_device_backup_cleanup($self, [$blockdev], $cpids);
> +     die $err;
> +    }
> +
> +    return ($blockdev, $next_dirty_region, $cpid);
> +}
> +
> +my sub backup_access_to_volume_info {
> +    my ($self, $backup_access_info, $mechanism, $nbd_path) = @_;
> +
> +    my $child_pids = {}; # used for nbdinfo calls
> +    my $count = 0; # counter for block devices, i.e. /dev/nbd${count}
> +    my $volumes = {};
> +
> +    for my $info ($backup_access_info->@*) {
> +     my $bitmap_status = 'none';
> +     my $bitmap_name;
> +     if (my $bitmap_action = $info->{'bitmap-action'}) {
> +         my $bitmap_action_to_status = {
> +             'not-used' => 'none',
> +             'not-used-removed' => 'none',
> +             'new' => 'new',
> +             'used' => 'reuse',
> +             'invalid' => 'new',
> +         };

nit: should we move this outside of the loop? it's a static map after
all.. (or maybe the perl interpreter is smart enough anyway ;))

> +
> +         $bitmap_status = $bitmap_action_to_status->{$bitmap_action}
> +             or die "got unexpected bitmap action '$bitmap_action'\n";
> +
> +         $bitmap_name = $info->{'bitmap-name'} or die "bitmap-name is not 
> present\n";
> +     }
> +
> +     my ($device, $size) = $info->@{qw(device size)};
> +
> +     $volumes->{$device}->{'bitmap-mode'} = $bitmap_status;
> +     $volumes->{$device}->{size} = $size;
> +
> +     if ($mechanism eq 'block-device') {
> +         my ($blockdev, $next_dirty_region, $child_pid) = 
> block_device_backup_prepare(
> +             $self, $device, $size, $nbd_path, $bitmap_name, $count);
> +         $count++;
> +         $child_pids->{$child_pid} = 1 if $child_pid;
> +         $volumes->{$device}->{path} = $blockdev;
> +         $volumes->{$device}->{'next-dirty-region'} = $next_dirty_region;
> +     } elsif ($mechanism eq 'nbd') {
> +         $volumes->{$device}->{'nbd-path'} = $nbd_path;
> +         $volumes->{$device}->{'bitmap-name'} = $bitmap_name;
> +     } else {
> +         die "internal error - unkown mechanism '$mechanism'";
> +     }
> +    }
> +
> +    return ($volumes, $child_pids);
> +}
> +
> +sub archive_external {
> +    my ($self, $task, $vmid) = @_;
> +
> +    my $guest_config = 
> PVE::Tools::file_get_contents("$task->{tmpdir}/qemu-server.conf");
> +    my $firewall_file = "$task->{tmpdir}/qemu-server.fw";
> +
> +    my $opts = $self->{vzdump}->{opts};
> +
> +    my $backup_provider = $self->{vzdump}->{'backup-provider'};
> +
> +    $self->loginfo("starting external backup via " . 
> $backup_provider->provider_name());
> +
> +    my $starttime = time();
> +
> +    # get list early so we die on unkown drive types before doing anything
> +    my $devlist = _get_task_devlist($task);
> +
> +    $self->enforce_vm_running_for_backup($vmid);
> +    $self->{qmeventd_fh} = PVE::QemuServer::register_qmeventd_handle($vmid);
> +
> +    eval {
> +     $SIG{INT} = $SIG{TERM} = $SIG{QUIT} = $SIG{HUP} = $SIG{PIPE} = sub {
> +         die "interrupted by signal\n";
> +     };
> +
> +     my $qemu_support = mon_cmd($vmid, "query-proxmox-support");
> +
> +     $attach_tpmstate_drive->($self, $task, $vmid);
> +
> +     my $is_template = 
> PVE::QemuConfig->is_template($self->{vmlist}->{$vmid});
> +
> +     my $fleecing = check_and_prepare_fleecing(
> +         $self, $task, $vmid, $opts->{fleecing}, $task->{disks}, 
> $is_template, $qemu_support, 1);
> +     die "cannot setup backup access without fleecing\n" if !$fleecing;
> +
> +     $task->{'use-fleecing'} = 1;
> +
> +     my $fs_frozen = $self->qga_fs_freeze($task, $vmid);

should we move this (A)

> +
> +     my $target_id = $opts->{storage};
> +
> +     my $params = {
> +         'target-id' => $target_id,
> +         devlist => $devlist,
> +         timeout => 60,
> +     };

and this (B)

> +
> +     my ($mechanism, $bitmap_name) = 
> $backup_provider->backup_get_mechanism($vmid, 'qemu');
> +     die "mechanism '$mechanism' requested by backup provider is not 
> supported for VMs\n"
> +         if $mechanism ne 'block-device' && $mechanism ne 'nbd';
> +
> +     if ($mechanism eq 'block-device') {
> +         # For mechanism 'block-device' the bitmap needs to be passed to the 
> provider. The bitmap
> +         # cannot be dumped via QMP and doing it via qemu-img is 
> experimental, so use nbdinfo.
> +         die "need 'nbdinfo' binary from package libnbd-bin\n" if !-e 
> "/usr/bin/nbdinfo";
> +
> +         # NOTE nbds_max won't change if module is already loaded
> +         run_command(["modprobe", "nbd", "nbds_max=128"]);

should this maybe be put into a modprobe snippet somewhere, and we just
verify here that nbd is available? not that we can currently reach 128
guest disks ;)

> +     }

down here (B)

> +
> +     if ($bitmap_name) {
> +         # prepend storage ID so different providers can never cause clashes
> +         $bitmap_name = "$opts->{storage}-" . $bitmap_name;
> +         $params->{'bitmap-name'} = $bitmap_name;

not related to this patch directly - if we do this for external
providers, do we also want to do it for different PBS targets maybe? :)

> +     }
> +
> +     $self->loginfo("setting up snapshot-access for backup");
> +

and down here (A)?

> +     my $backup_access_info = eval { mon_cmd($vmid, "backup-access-setup", 
> $params->%*) };
> +     my $qmperr = $@;
> +
> +     $task->{cleanup}->{'backup-access-teardown'} = { 'target-id' => 
> $target_id, success => 0 };

should we differentiate here between setup success or failure? if not,
should we move it directly before the setup call?

> +
> +     if ($fs_frozen) {
> +         $self->qga_fs_thaw($vmid);
> +     }
> +
> +     die $qmperr if $qmperr;
> +
> +     $self->resume_vm_after_job_start($task, $vmid);
> +
> +     my $bitmap_info = mon_cmd($vmid, 'query-pbs-bitmap-info');
> +     for my $info (sort { $a->{drive} cmp $b->{drive} } $bitmap_info->@*) {
> +         my $text = $bitmap_action_to_human->($self, $info);
> +         my $drive = $info->{drive};
> +         $drive =~ s/^drive-//; # for consistency
> +         $self->loginfo("$drive: dirty-bitmap status: $text");
> +     }
> +
> +     $self->loginfo("starting NBD server");
> +
> +     my $nbd_path = "/run/qemu-server/$vmid\_nbd.backup_access";
> +     mon_cmd(
> +         $vmid, "nbd-server-start", addr => { type => 'unix', data => { path 
> => $nbd_path } } );
> +     $task->{cleanup}->{'nbd-stop'} = 1;
> +
> +     for my $info ($backup_access_info->@*) {
> +         $self->loginfo("adding NBD export for $info->{device}");
> +
> +         my $export_params = {
> +             id => $info->{device},
> +             'node-name' => $info->{'node-name'},
> +             writable => JSON::true, # for discard
> +             type => "nbd",
> +             name => $info->{device}, # NBD export name
> +         };
> +
> +         if ($info->{'bitmap-name'}) {
> +             $export_params->{bitmaps} = [{
> +                 node => $info->{'bitmap-node-name'},
> +                 name => $info->{'bitmap-name'},
> +             }],
> +         }
> +
> +         mon_cmd($vmid, "block-export-add", $export_params->%*);
> +     }
> +
> +     my $child_pids = {}; # used for nbdinfo calls
> +     my $volumes = {};
> +
> +     eval {
> +         ($volumes, $child_pids) =
> +             backup_access_to_volume_info($self, $backup_access_info, 
> $mechanism, $nbd_path);

so this here forks child processes (via block_device_backup_prepare),
but it might fail halfway through after having forked X/N children, then
we don't have any information about the forked processes here (C)

> +
> +         my $param = {};
> +         $param->{'bandwidth-limit'} = $opts->{bwlimit} * 1024 if 
> $opts->{bwlimit};
> +         $param->{'firewall-config'} = 
> PVE::Tools::file_get_contents($firewall_file)
> +             if -e $firewall_file;
> +
> +         $backup_provider->backup_vm($vmid, $guest_config, $volumes, $param);
> +     };
> +     my $err = $@;
> +
> +     if ($mechanism eq 'block-device') {
> +         my $cleanup_paths = [map { $volumes->{$_}->{path} } keys 
> $volumes->%*];
> +         block_device_backup_cleanup($self, $cleanup_paths, $child_pids)

C: to do this cleanup here.. should we maybe record both cpids and
volumes as part of $self->{cleanup}, instead of returning them, so that
we can handle that case as well?

> +     }
> +
> +     die $err if $err;
> +    };
> +    my $err = $@;
> +
> +    if ($err) {
> +     $self->logerr($err);
> +     $self->resume_vm_after_job_start($task, $vmid);
> +    } else {
> +     $task->{size} = $backup_provider->backup_get_task_size($vmid);
> +     $task->{cleanup}->{'backup-access-teardown'}->{success} = 1;
> +    }
> +    $self->restore_vm_power_state($vmid);
> +
> +    die $err if $err;
> +}
> +
>  1;
> -- 
> 2.39.5
> 
> 
> 
> _______________________________________________
> pve-devel mailing list
> pve-devel@lists.proxmox.com
> https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel
> 
> 
> 


_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel

Reply via email to