From: Wolfgang Bumiller <w.bumil...@proxmox.com> Signed-off-by: Wolfgang Bumiller <w.bumil...@proxmox.com> --- src/PVE/LXC/CGroup.pm | 128 +++++++++++++++++++++++++++ src/PVE/LXC/Command.pm | 196 +++++++++++++++++++++++++++++++++++++++++ src/PVE/LXC/Makefile | 2 + 3 files changed, 326 insertions(+) create mode 100644 src/PVE/LXC/CGroup.pm create mode 100644 src/PVE/LXC/Command.pm
diff --git a/src/PVE/LXC/CGroup.pm b/src/PVE/LXC/CGroup.pm
new file mode 100644
index 0000000..7561fb2
--- /dev/null
+++ b/src/PVE/LXC/CGroup.pm
@@ -0,0 +1,128 @@
+# cgroup handler
+#
+# This package should deal with figuring out the right cgroup path for a
+# container (via the command socket), reading and writing cgroup values, and
+# handling cgroup v1 & v2 differences.
+#
+# Note that the long term plan is to have resource management functions instead
+# of dealing with cgroup files on the outside.
+
+package PVE::LXC::CGroup;
+
+use strict;
+use warnings;
+
+use PVE::LXC::Command;
+
+# We don't want to do a command socket round trip for every cgroup read/write,
+# so any cgroup function needs to have the container's path cached, so this
+# package has to be instantiated.
+#
+# LXC keeps separate paths by controller (although they're normally all the
+# same, in our case anyway), so we cache them by controller as well.
+sub new {
+    my ($class, $vmid) = @_;
+
+    my $self = { vmid => $vmid };
+
+    return bless $self, $class;
+}
+
+my $CPUSET_BASE = undef;
+# Find the cpuset cgroup controller's base directory (cached after first use).
+# Dies if it cannot be found in any of the known locations.
+# This is a function, not a method!
+sub cpuset_controller_path() {
+    if (!defined($CPUSET_BASE)) {
+        my $CPUSET_PATHS = [
+            # legacy cpuset cgroup:
+            ['/sys/fs/cgroup/cpuset', 'cpuset.effective_cpus'],
+            # pure cgroupv2 environment:
+            ['/sys/fs/cgroup', 'cpuset.cpus.effective'],
+            # hybrid, with cpuset moved to cgroupv2
+            ['/sys/fs/cgroup/unified', 'cpuset.cpus.effective'],
+        ];
+
+        my ($result) = grep { -f "$_->[0]/$_->[1]" } @$CPUSET_PATHS;
+        die "failed to find cpuset controller\n" if !defined($result);
+
+        $CPUSET_BASE = $result->[0];
+    }
+
+    return $CPUSET_BASE;
+}
+
+my $CGROUP_MODE = undef;
+# Figure out which cgroup mode we're operating under:
+#
+# Returns 1 if cgroupv1 controllers exist (hybrid or legacy mode), and 2 in a
+# cgroupv2-only environment.
+#
+# This is a function, not a method!
+sub cgroup_mode() {
+    if (!defined($CGROUP_MODE)) {
+        my ($v1, $v2) = PVE::LXC::get_cgroup_subsystems();
+        if (keys %$v1) {
+            # hybrid or legacy mode
+            $CGROUP_MODE = 1;
+        } elsif ($v2) {
+            $CGROUP_MODE = 2;
+        }
+    }
+
+    die "unknown cgroup mode\n" if !defined($CGROUP_MODE);
+    return $CGROUP_MODE;
+}
+
+# Get a subdirectory (without the cgroup mount point) for a controller.
+#
+# If `$controller` is `undef`, get the unified (cgroupv2) path.
+#
+# Note that in cgroup v2, lxc uses the activated controller names
+# (`cgroup.controllers` file) as list of controllers for the unified hierarchy,
+# so this returns a result when a `controller` is provided even when using
+# a pure cgroupv2 setup.
+my sub get_subdir {
+    my ($self, $controller, $limiting) = @_;
+
+    my $entry_name = $controller || 'unified';
+    my $entry = ($self->{controllers}->{$entry_name} //= {});
+
+    my $kind = $limiting ? 'limit' : 'ns';
+    my $path = $entry->{$kind};
+
+    return $path if defined $path;
+
+    $path = PVE::LXC::Command::get_cgroup_path(
+        $self->{vmid},
+        $controller,
+        $limiting,
+    ) or return undef;
+
+    # untaint:
+    if ($path =~ /\.\./) {
+        die "lxc returned suspicious path: '$path'\n";
+    }
+    ($path) = ($path =~ /^(.*)$/s);
+
+    $entry->{$kind} = $path;
+
+    return $path;
+}
+
+# Get a path for a controller.
+# Returns undef if no path is available (e.g. the container is not running).
+# `$controller` may be `undef`, see get_subdir above for details.
+sub get_path {
+    my ($self, $controller) = @_;
+
+    my $path = get_subdir($self, $controller)
+        or return undef;
+
+    # The main mount point we currently assume to be in a standard location.
+    return "/sys/fs/cgroup/$path" if cgroup_mode() == 2;
+    return "/sys/fs/cgroup/unified/$path" if !defined($controller);
+    return "/sys/fs/cgroup/$controller/$path";
+}
+
+1;
diff --git a/src/PVE/LXC/Command.pm b/src/PVE/LXC/Command.pm
new file mode 100644
index 0000000..2fd4e81
--- /dev/null
+++ b/src/PVE/LXC/Command.pm
@@ -0,0 +1,196 @@
+# LXC command socket client.
+#
+# For now this is only used to fetch the cgroup paths.
+# This can also be extended to replace a few more `lxc-*` CLI invocations.
+# (such as lxc-stop, info, freeze, unfreeze, or getting the init pid)
+
+package PVE::LXC::Command;
+
+use strict;
+use warnings;
+
+use IO::Socket::UNIX;
+use Socket qw(SOCK_STREAM SOL_SOCKET SO_PASSCRED);
+
+use base 'Exporter';
+
+use constant {
+    LXC_CMD_GET_CGROUP => 6,
+    LXC_CMD_GET_LIMITING_CGROUP => 19,
+};
+
+our @EXPORT_OK = qw(
+    raw_command_transaction
+    simple_command
+    get_cgroup_path
+);
+
+# Connect to a container's command socket; returns undef on ECONNREFUSED.
+my sub _get_command_socket($) {
+    my ($vmid) = @_;
+
+    my $sock = IO::Socket::UNIX->new(
+        Type => SOCK_STREAM(),
+        Peer => "\0/var/lib/lxc/$vmid/command",
+    );
+    if (!defined($sock)) {
+        return undef if $!{ECONNREFUSED};
+        die "failed to connect to command socket: $!\n";
+    }
+
+    # The documentation for this talks more about the receiving end, and it
+    # also *mostly* works without, but then the kernel *sometimes* fails to
+    # provide correct credentials.
+    setsockopt($sock, SOL_SOCKET, SO_PASSCRED, 1)
+        or die "failed to pass credentials to command socket: $!\n";
+
+    return $sock;
+}
+
+# Create an lxc_cmd_req struct.
+my sub _lxc_cmd_req($$) {
+    my ($cmd, $datalen) = @_;
+
+    # struct lxc_cmd_req {
+    #     lxc_cmd_t cmd;
+    #     int datalen;
+    #     const void *data;
+    # };
+    #
+    # Obviously the pointer makes no sense in the payload so we just use NULL.
+    my $packet = pack('i!i!L!', $cmd, $datalen, 0);
+
+    return $packet;
+}
+
+# Unpack an lxc_cmd_rsp into its result and payload length.
+my sub _unpack_lxc_cmd_rsp($) {
+    my ($packet) = @_;
+
+    #struct lxc_cmd_rsp {
+    #    int ret; /* 0 on success, -errno on failure */
+    #    int datalen;
+    #    void *data;
+    #};
+
+    # We drop the pointless pointer value.
+    my ($ret, $len, undef) = unpack("i!i!L!", $packet);
+
+    return ($ret, $len);
+}
+
+# Send a complete packet, dies on error or short write:
+my sub _do_send($$) {
+    my ($sock, $data) = @_;
+    my $sent = send($sock, $data, 0)
+        // die "failed to send to command socket: $!\n";
+    die "short write on command socket ($sent != ".length($data).")\n"
+        if $sent != length($data);
+}
+
+# Receive exactly `$len` bytes into `$$scalar`, dies on error or short read:
+my sub _do_recv($\$$) {
+    my ($sock, $scalar, $len) = @_;
+    defined(recv($sock, $$scalar, $len, 0))
+        or die "failed to read from command socket: $!\n";
+    die "short read on command socket ($len != ".length($$scalar).")\n"
+        if length($$scalar) != $len;
+}
+
+# Receive a response from an lxc command socket.
+#
+# Performs the return value check (negative errno values) and returns the
+# return value and payload in array context, or just the payload in scalar
+# context.
+my sub _recv_response($) {
+    my ($socket) = @_;
+
+    my $buf = pack('i!i!L!', 0, 0, 0); # struct lxc_cmd_rsp
+    _do_recv($socket, $buf, length($buf));
+
+    my ($res, $datalen) = _unpack_lxc_cmd_rsp($buf);
+    my $data;
+    _do_recv($socket, $data, $datalen)
+        if $datalen > 0;
+
+    if ($res < 0) {
+        $! = -$res;
+        die "command failed: $!\n";
+    }
+
+    return wantarray ? ($res, $data) : $data;
+}
+
+# Perform a command transaction: Send command & payload, receive and unpack the
+# response.
+# `$cmd` is the numeric LXC command id, `$data` an optional raw payload.
+# Returns ($ret, $data) in list context, just the data in scalar context.
+sub raw_command_transaction($$;$) {
+    my ($socket, $cmd, $data) = @_;
+
+    $data //= '';
+
+    my $req = _lxc_cmd_req($cmd, length($data));
+    _do_send($socket, $req);
+    _do_send($socket, $data) if length($data) > 0;
+
+    return _recv_response($socket);
+}
+
+# Perform a command transaction for a VMID where no command socket has been
+# established yet.
+#
+# Returns ($ret, $data):
+#   $ret: numeric return value (typically 0)
+#   $data: optional data returned for the command, if any, otherwise undef
+#
+# Returns undef if the container is not running, dies on errors.
+sub simple_command($$;$) {
+    my ($vmid, $cmd, $data) = @_;
+
+    my $socket = _get_command_socket($vmid)
+        or return undef;
+    return raw_command_transaction($socket, $cmd, $data);
+}
+
+# Retrieve the cgroup path for a running container.
+# If $limiting is set, get the payload path without the namespace subdirectory,
+# otherwise return the full namespaced path.
+#
+# Returns undef if the container is not running, dies on errors.
+sub get_cgroup_path($;$$) {
+    my ($vmid, $subsystem, $limiting) = @_;
+
+    # The subsystem is sent as a zero-terminated C string, or not at all if undef.
+    my ($res, $data) = simple_command(
+        $vmid,
+        $limiting ? LXC_CMD_GET_LIMITING_CGROUP : LXC_CMD_GET_CGROUP,
+        defined($subsystem) ? pack('Z*', $subsystem) : undef,
+    );
+    return undef if !defined $res;
+
+    # data is a zero-terminated string:
+    return unpack('Z*', $data);
+}
+
+# Retrieve the limiting cgroup path for a running container, i.e. the payload
+# path without the namespace subdirectory.
+# Equivalent to get_cgroup_path($vmid, $subsystem, 1).
+#
+# Returns undef if the container is not running, dies on errors.
+sub get_limiting_cgroup_path($;$) {
+    my ($vmid, $subsystem) = @_;
+
+    # The subsystem is sent as a zero-terminated C string, or not at all if undef.
+    my ($res, $data) = simple_command(
+        $vmid,
+        LXC_CMD_GET_LIMITING_CGROUP,
+        defined($subsystem) ? pack('Z*', $subsystem) : undef,
+    );
+    return undef if !defined $res;
+
+    # data is a zero-terminated string:
+    return unpack('Z*', $data);
+}
+
+1;
diff --git a/src/PVE/LXC/Makefile b/src/PVE/LXC/Makefile
index d889204..f4f4dc1 100644
--- a/src/PVE/LXC/Makefile
+++ b/src/PVE/LXC/Makefile
@@ -1,4 +1,6 @@
 SOURCES= \
+	CGroup.pm \
+	Command.pm \
 	Config.pm \
 	Create.pm \
 	Migrate.pm \
-- 
2.20.1

_______________________________________________
pve-devel mailing list
pve-devel@pve.proxmox.com
https://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel