vm configuration
----------------
hugepages: (any|2|1024)

any: try to allocate 1GB hugepages if possible, fall back to 2MB hugepages otherwise
2: use 2MB hugepages
1024: use 1GB hugepages (memory needs to be a multiple of 1GB in this case);
      see the example config below
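
A minimal example of a VM config using this option (hypothetical values, shown
as an excerpt of the usual /etc/pve/qemu-server/<vmid>.conf; numa must be
enabled, as the patch enforces when hugepages is set):

memory: 4096
numa: 1
hugepages: 1024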

optional host configuration for 1GB hugepages
---------------------------------------------
1GB hugepages can be allocated at boot if the user wants it.
Hugepages need to be contiguous, so it is sometimes not possible to reserve
them on the fly.

/etc/default/grub : GRUB_CMDLINE_LINUX_DEFAULT="quiet hugepagesz=1G hugepages=x"
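
After editing /etc/default/grub the bootloader config has to be regenerated and
the host rebooted before the pages exist. A quick sanity check could look like
this (sketch, assuming a Debian-based host):

update-grub
reboot
# after the reboot:
grep Huge /proc/meminfo
cat /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages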

Signed-off-by: Alexandre Derumier <aderum...@odiso.com>
---
 PVE/QemuServer.pm | 274 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 267 insertions(+), 7 deletions(-)

diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm
index fc064c5..9821a3c 100644
--- a/PVE/QemuServer.pm
+++ b/PVE/QemuServer.pm
@@ -319,6 +319,12 @@ EODESC
        description => "Enable/disable NUMA.",
        default => 0,
     },
+    hugepages => {
+       optional => 1,
+       type => 'string',
+       description => "Enable/disable hugepages memory.",
+       enum => [qw(any 2 1024)],
+    },
     vcpus => {
        optional => 1,
        type => 'integer',
@@ -1376,6 +1382,21 @@ sub machine_type_is_q35 {
     return $conf->{machine} && ($conf->{machine} =~ m/q35/) ? 1 : 0;
 }
 
+sub print_mem_object {
+    my ($conf, $id, $size) = @_;
+
+    if ($conf->{hugepages}) {
+
+       my $hugepages_size = hugepages_size($conf, $size);
+       my $path = hugepages_mount_path($hugepages_size);
+
+       return "memory-backend-file,id=$id,size=${size}M,mem-path=$path,share=on,prealloc=yes";
+    } else {
+       return "memory-backend-ram,id=$id,size=${size}M";
+    }
+
+}
+
 sub print_tabletdevice_full {
     my ($conf) = @_;
 
@@ -3115,6 +3136,8 @@ sub config_to_command {
        push @$cmd, '-m', $static_memory;
     }
 
+    die "numa needs to be enabled to use hugepages" if $conf->{hugepages} && !$conf->{numa};
+
     if ($conf->{numa}) {
 
        my $numa_totalmemory = undef;
@@ -3126,7 +3149,8 @@ sub config_to_command {
            die "missing numa node$i memory value\n" if !$numa->{memory};
            my $numa_memory = $numa->{memory};
            $numa_totalmemory += $numa_memory;
-           my $numa_object = "memory-backend-ram,id=ram-node$i,size=${numa_memory}M";
+
+           my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
 
            # cpus
            my $cpulists = $numa->{cpus};
@@ -3154,10 +3178,10 @@ sub config_to_command {
                # policy
                my $policy = $numa->{policy};
                die "you need to define a policy for hostnode $hostnodes\n" if !$policy;
-               $numa_object .= ",host-nodes=$hostnodes,policy=$policy";
+               $mem_object .= ",host-nodes=$hostnodes,policy=$policy";
            }
 
-           push @$cmd, '-object', $numa_object;
+           push @$cmd, '-object', $mem_object;
            push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
        }
 
@@ -3167,7 +3191,7 @@ sub config_to_command {
        #if no custom tology, we split memory and cores across numa nodes
        if(!$numa_totalmemory) {
 
-           my $numa_memory = ($static_memory / $sockets) . "M";
+           my $numa_memory = ($static_memory / $sockets);
 
            for (my $i = 0; $i < $sockets; $i++)  {
 
@@ -3176,7 +3200,9 @@ sub config_to_command {
                my $cpus = $cpustart;
                $cpus .= "-$cpuend" if $cpuend;
 
-               push @$cmd, '-object', "memory-backend-ram,size=$numa_memory,id=ram-node$i";
+               my $mem_object = print_mem_object($conf, "ram-node$i", $numa_memory);
+
+               push @$cmd, '-object', $mem_object;
                push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i";
            }
        }
@@ -3185,7 +3211,10 @@ sub config_to_command {
     if ($hotplug_features->{memory}) {
        foreach_dimm($conf, $vmid, $memory, $sockets, sub {
            my ($conf, $vmid, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
-           push @$cmd, "-object" , "memory-backend-ram,id=mem-$name,size=${dimm_size}M";
+
+           my $mem_object = print_mem_object($conf, "mem-$name", $dimm_size);
+
+           push @$cmd, "-object" , $mem_object;
            push @$cmd, "-device", "pc-dimm,id=$name,memdev=mem-$name,node=$numanode";
 
            #if dimm_memory is not aligned to dimm map
@@ -3837,7 +3866,19 @@ sub qemu_memory_hotplug {
 
                return if $current_size <= $conf->{memory};
 
-               eval { vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", props => { size => int($dimm_size*1024*1024) } ) };
+               if ($conf->{hugepages}) {
+
+                   my $hugepages_size = hugepages_size($conf, $dimm_size);
+                   my $path = hugepages_mount_path($hugepages_size);
+                   my $hugepages_topology->{$hugepages_size}->{$numanode} = hugepages_nr($dimm_size, $hugepages_size);
+                   hugepages_allocate($hugepages_topology);
+
+                   eval { vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-file", id => "mem-$name", props => {
+                                            size => int($dimm_size*1024*1024), 'mem-path' => $path, share => JSON::true, prealloc => JSON::true } ) };
+               } else {
+                   eval { vm_mon_cmd($vmid, "object-add", 'qom-type' => "memory-backend-ram", id => "mem-$name", props => { size => int($dimm_size*1024*1024) } ) };
+               }
+
                if (my $err = $@) {
                    eval { qemu_objectdel($vmid, "mem-$name"); };
                    die $err;
@@ -4548,6 +4589,12 @@ sub vm_start {
          }
         }
 
+       if ($conf->{hugepages}) {
+           my $hugepages_topology = hugepages_topology($conf);
+           hugepages_mount();
+           hugepages_allocate($hugepages_topology);
+       }
+
        PVE::Storage::activate_volumes($storecfg, $vollist);
 
        if (!check_running($vmid, 1) && -d "/sys/fs/cgroup/systemd/qemu.slice/$vmid.scope") {
@@ -4727,6 +4774,11 @@ sub vm_stop_cleanup {
            unlink "/var/run/qemu-server/${vmid}.$ext";
        }
 
+       if($conf->{hugepages}) {
+           my $hugepages_topology = hugepages_topology($conf);
+           hugepages_deallocate($hugepages_topology);
+       }
+
        vmconfig_apply_pending($vmid, $conf, $storecfg) if $apply_pending_changes;
     };
     warn $@ if $@; # avoid errors - just warn
@@ -6191,6 +6243,214 @@ sub scsihw_infos {
     return ($maxdev, $controller, $controller_prefix);
 }
 
+sub hugepages_mount {
+
+   my $mountdata = PVE::ProcFSTools::parse_proc_mounts();
+
+   foreach my $size (qw(2048 1048576)) {
+       return if (! -d "/sys/kernel/mm/hugepages/hugepages-${size}kB");
+
+       my $path = "/run/hugepages/kvm/${size}kB";
+
+       my $found = grep {
+           $_->[2] =~ /^hugetlbfs/ &&
+           $_->[1] eq $path
+       } @$mountdata;
+
+       if (!$found) {
+
+           File::Path::make_path($path) if (!-d $path);
+           my $cmd = ['/bin/mount', '-t', 'hugetlbfs', '-o', "pagesize=${size}k", 'hugetlbfs', $path];
+           run_command($cmd, errmsg => "hugepage mount error");
+       }
+   }
+}
+
+sub hugepages_mount_path {
+   my ($size) = @_;
+
+   $size = $size * 1024;
+   return "/run/hugepages/kvm/${size}kB";
+
+}
+
+sub hugepages_nr {
+  my ($size, $hugepages_size) = @_;
+
+  return $size / $hugepages_size;
+}
+
+sub hugepages_size {
+   my ($conf, $size) = @_;
+
+   die "hugepages option is not enabled" if !$conf->{hugepages};
+
+   if ($conf->{hugepages} eq 'any') {
+
+       #try to use 1GB if available && memory size is matching
+       if (-d "/sys/kernel/mm/hugepages/hugepages-1048576kB" && ($size % 1024 == 0)) {
+           return 1024;
+       } else {
+           return 2;
+       }
+
+   } else {
+
+       my $hugepagesize = $conf->{hugepages} * 1024 . "kB";
+
+       if (! -d "/sys/kernel/mm/hugepages/hugepages-$hugepagesize") {
+               die "your system doesn't support hugepages of $hugepagesize";
+       }
+       die "memory size ${size}MB is not a multiple of the $hugepagesize hugepage size" if ($size % $conf->{hugepages}) != 0;
+       return $conf->{hugepages};
+   }
+
+}
+
+sub hugepages_topology {
+    my ($conf) = @_;
+
+    my $hugepages_topology = {};
+
+    return if !$conf->{numa};
+
+    my $defaults = load_defaults();
+    my $memory = $conf->{memory} || $defaults->{memory};
+    my $static_memory = 0;
+    my $sockets = 1;
+    $sockets = $conf->{smp} if $conf->{smp}; # old style - no longer used
+    $sockets = $conf->{sockets} if $conf->{sockets};
+    my $numa_custom_topology = undef;
+    my $hotplug_features = parse_hotplug_features(defined($conf->{hotplug}) ? $conf->{hotplug} : '1');
+
+    if ($hotplug_features->{memory}) {
+        $static_memory = $STATICMEM;
+    } else {
+        $static_memory = $memory;
+    }
+
+    #custom numa topology
+    for (my $i = 0; $i < $MAX_NUMA; $i++) {
+       next if !$conf->{"numa$i"};
+       my $numa = parse_numa($conf->{"numa$i"});
+       next if !$numa;
+
+       $numa_custom_topology = 1;
+       my $numa_memory = $numa->{memory};
+
+        my $hugepages_size = hugepages_size($conf, $numa_memory);
+        $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size);
+
+    }
+
+    #if no custom numa topology, we split memory and cores across numa nodes
+    if(!$numa_custom_topology) {
+
+       my $numa_memory = ($static_memory / $sockets);
+
+       for (my $i = 0; $i < $sockets; $i++)  {
+
+           my $hugepages_size = hugepages_size($conf, $numa_memory);
+           $hugepages_topology->{$hugepages_size}->{$i} += hugepages_nr($numa_memory, $hugepages_size);
+       }
+    }
+
+    if ($hotplug_features->{memory}) {
+       foreach_dimm($conf, undef, $memory, $sockets, sub {
+           my ($conf, undef, $name, $dimm_size, $numanode, $current_size, $memory) = @_;
+
+           my $hugepages_size = hugepages_size($conf, $dimm_size);
+           $hugepages_topology->{$hugepages_size}->{$numanode} += hugepages_nr($dimm_size, $hugepages_size);
+       });
+    }
+
+    return $hugepages_topology;
+}
+
+sub hugepages_allocate {
+    my ($hugepages_topology) = @_;
+
+    #read host hugepages
+    my $hugepages_host_topology = {};
+
+    dir_glob_foreach("/sys/devices/system/node/", 'node(\d+)', sub {
+       my ($nodepath, $numanode) = @_;
+
+       dir_glob_foreach("/sys/devices/system/node/$nodepath/hugepages/", 'hugepages\-(\d+)kB', sub {
+           my ($hugepages_path, $hugepages_size) = @_;
+
+           $hugepages_size = $hugepages_size / 1024;
+           my $hugepages_nr = PVE::Tools::file_read_firstline("/sys/devices/system/node/$nodepath/hugepages/$hugepages_path/nr_hugepages");
+           $hugepages_host_topology->{$hugepages_size}->{$numanode} = $hugepages_nr;
+        });
+    });
+
+    #allocate new hugepages if needed
+    foreach my $size (sort keys %$hugepages_topology) {
+
+       my $nodes = $hugepages_topology->{$size};
+
+       foreach my $numanode (keys %$nodes) {
+
+           my $hugepages_size = $size * 1024;
+           my $hugepages_requested = $hugepages_topology->{$size}->{$numanode};
+           my $path = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${hugepages_size}kB/";
+           my $hugepages_free = PVE::Tools::file_read_firstline($path."free_hugepages");
+           my $hugepages_nr = PVE::Tools::file_read_firstline($path."nr_hugepages");
+
+           if ($hugepages_requested > $hugepages_free) {
+               my $hugepages_needed = $hugepages_requested - $hugepages_free;
+               PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", $hugepages_nr + $hugepages_needed);
+               #verify that it is correctly allocated
+               $hugepages_free = PVE::Tools::file_read_firstline($path."free_hugepages");
+               if ($hugepages_free < $hugepages_requested) {
+                   #rollback to initial host config
+                   hugepages_reset($hugepages_host_topology);
+                   die "hugepage allocation failed";
+               }
+           }
+
+       }
+    }
+
+}
+
+sub hugepages_deallocate {
+    my ($hugepages_topology) = @_;
+
+    foreach my $size (sort keys %$hugepages_topology) {
+
+       my $nodes = $hugepages_topology->{$size};
+
+       foreach my $numanode (keys %$nodes) {
+
+           my $hugepages_size = $size * 1024;
+           my $hugepages_used = $hugepages_topology->{$size}->{$numanode};
+           my $path = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${hugepages_size}kB/";
+           my $hugepages_nr = PVE::Tools::file_read_firstline($path."nr_hugepages");
+
+           PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", ($hugepages_nr - $hugepages_used));
+       }
+    }
+}
+
+sub hugepages_reset {
+    my ($hugepages_topology) = @_;
+
+    foreach my $size (sort keys %$hugepages_topology) {
+
+       my $nodes = $hugepages_topology->{$size};
+       foreach my $numanode (keys %$nodes) {
+
+           my $hugepages_nr = $hugepages_topology->{$size}->{$numanode};
+           my $hugepages_size = $size * 1024;
+           my $path = "/sys/devices/system/node/node${numanode}/hugepages/hugepages-${hugepages_size}kB/";
+
+           PVE::ProcFSTools::write_proc_entry($path."nr_hugepages", $hugepages_nr);
+       }
+    }
+}
+
 # bash completion helper
 
 sub complete_backup_archives {
-- 
2.1.4
