= hirsute verification =
ubuntu@blanka:~/nvidia-dgx-2/tests$ cat /proc/version
Linux version 5.11.0-42-generic (buildd@lgw01-amd64-041) (gcc (Ubuntu 
10.3.0-1ubuntu1) 10.3.0, GNU ld (GNU Binutils for Ubuntu) 2.36.1) #46-Ubuntu 
SMP Fri Nov 26 12:04:17 UTC 2021
ubuntu@blanka:~/nvidia-dgx-2/tests$ ./nvidia-peermem-test.sh 
+ export DEBCONF_FRONTEND=noninteractive
+ DEBCONF_FRONTEND=noninteractive
+ export DEBIAN_PRIORITY=critical
+ DEBIAN_PRIORITY=critical
+ SERVER_IFACE=enp148s0
+ SERVER_IP=192.168.5.1/24
+ SERVER_IB_BDF=0000:4b:00.0
+ CLIENT_IFACE=enp18s0
+ CLIENT_IP=192.168.5.2/24
+ CLIENT_IB_BDF=0000:ba:00.0
+ trap cleanup EXIT
+ sudo service unattended-upgrades stop
+ install_cuda_perftest
+ local release
+ local components
+ dpkg-query -W -f '${Version}' perftest
+ grep -q '+cuda.1$'
+ return
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_0
++++ dirname 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0/infiniband/mlx5_0
+++ dirname 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0/infiniband
++ basename 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0
+ bdf=0000:0c:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_1
++++ dirname 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0/infiniband/mlx5_1
+++ dirname 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0/infiniband
++ basename 
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0
+ bdf=0000:12:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_2
++++ dirname 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0/infiniband/mlx5_2
+++ dirname 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0/infiniband
++ basename 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0
+ bdf=0000:4b:00.0
+ case "$bdf" in
++ basename /sys/class/infiniband/mlx5_2
+ server_ib_dev=mlx5_2
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_3
++++ dirname 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0/infiniband/mlx5_3
+++ dirname 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0/infiniband
++ basename 
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0
+ bdf=0000:54:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_4
++++ dirname 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/infiniband/mlx5_4
+++ dirname 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/infiniband
++ basename 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0
+ bdf=0000:8d:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_5
++++ dirname 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0/infiniband/mlx5_5
+++ dirname 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0/infiniband
++ basename 
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0
+ bdf=0000:94:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_6
++++ dirname 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0/infiniband/mlx5_6
+++ dirname 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0/infiniband
++ basename 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0
+ bdf=0000:ba:00.0
+ case "$bdf" in
++ basename /sys/class/infiniband/mlx5_6
+ client_ib_dev=mlx5_6
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_7
++++ dirname 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0/infiniband/mlx5_7
+++ dirname 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0/infiniband
++ basename 
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0
+ bdf=0000:cc:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_8
++++ dirname 
../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/infiniband/mlx5_8
+++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/infiniband
++ basename ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0
+ bdf=0000:e1:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_9
++++ dirname 
../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1/infiniband/mlx5_9
+++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1/infiniband
++ basename ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1
+ bdf=0000:e1:00.1
+ case "$bdf" in
+ '[' -z mlx5_6 ']'
+ '[' -z mlx5_2 ']'
+ sudo rdma system set netns exclusive
+ sudo ip netns add peermemclient
+ sudo rdma dev set mlx5_6 netns peermemclient
+ sudo ip netns exec peermemclient ip link set dev lo up
+ sudo ip link set netns peermemclient enp18s0
+ sudo ip netns exec peermemclient ip addr add dev enp18s0 192.168.5.2/24
+ sudo ip netns exec peermemclient ip link set dev enp18s0 up
+ sudo ip addr add dev enp148s0 192.168.5.1/24
+ sudo ip link set dev enp148s0 up
+ sudo modprobe ib_umad
+ sudo modprobe nvidia-peermem
+ sudo_apt install -y opensm
+ sudo --preserve-env=DEBCONF_FRONTEND,DEBIAN_PRIORITY apt install -y opensm
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
opensm is already the newest version (3.3.23-2).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
+ sudo service opensm start
+ use_cuda_needs_devid
+ ib_write_bw --help
+ grep use_cuda=
      --use_cuda=<cuda device id> Use CUDA specific device for GPUDirect RDMA 
testing
+ return 0
+ server_use_cuda_arg=--use_cuda=0
+ client_use_cuda_arg=--use_cuda=1
+ srvpid=7209
+ sleep 5
+ sudo ib_write_bw -a -d mlx5_2 --use_cuda=0

************************************
* Waiting for client to connect... *
************************************
+ sudo ip netns exec peermemclient ib_write_bw -a -d mlx5_6 192.168.5.1 
--use_cuda=1
initializing CUDA
initializing CUDA
Listing all CUDA devices in system:
CUDA device 0: PCIe address is 07:00
CUDA device 1: PCIe address is 0F:00
CUDA device 2: PCIe address is 47:00
CUDA device 3: PCIe address is 4E:00
CUDA device 4: PCIe address is 87:00
CUDA device 5: PCIe address is 90:00
CUDA device 6: PCIe address is B7:00
CUDA device 7: PCIe address is BD:00

Picking device No. 1
[pid = 7216, dev = 1] device name = [NVIDIA A100-SXM4-40GB]
creating CUDA Ctx
Listing all CUDA devices in system:
CUDA device 0: PCIe address is 07:00
CUDA device 1: PCIe address is 0F:00
CUDA device 2: PCIe address is 47:00
CUDA device 3: PCIe address is 4E:00
CUDA device 4: PCIe address is 87:00
CUDA device 5: PCIe address is 90:00
CUDA device 6: PCIe address is B7:00
CUDA device 7: PCIe address is BD:00

Picking device No. 0
[pid = 7211, dev = 0] device name = [NVIDIA A100-SXM4-40GB]
creating CUDA Ctx
making it the current CUDA Ctx
cuMemAlloc() of a 16777216 bytes GPU buffer
allocated GPU buffer address at 00007f0eba000000 pointer=0x7f0eba000000
---------------------------------------------------------------------------------------
                    RDMA_Write BW Test
 Dual-port       : OFF          Device         : mlx5_6
 Number of qps   : 1            Transport type : IB
 Connection type : RC           Using SRQ      : OFF
 PCIe relax order: ON
 ibv_wr* API     : ON
 TX depth        : 128
 CQ Moderation   : 100
 Mtu             : 4096[B]
 Link type       : IB
 Max inline data : 0[B]
 rdma_cm QPs     : OFF
 Data ex. method : Ethernet
---------------------------------------------------------------------------------------
making it the current CUDA Ctx
cuMemAlloc() of a 16777216 bytes GPU buffer
allocated GPU buffer address at 00007f682e000000 pointer=0x7f682e000000
---------------------------------------------------------------------------------------
                    RDMA_Write BW Test
 Dual-port       : OFF          Device         : mlx5_2
 Number of qps   : 1            Transport type : IB
 Connection type : RC           Using SRQ      : OFF
 PCIe relax order: ON
 ibv_wr* API     : ON
 CQ Moderation   : 100
 Mtu             : 4096[B]
 Link type       : IB
 Max inline data : 0[B]
 rdma_cm QPs     : OFF
 Data ex. method : Ethernet
---------------------------------------------------------------------------------------
 local address: LID 0x01 QPN 0x0107 PSN 0x90c1f2 RKey 0x17ecdc VAddr 
0x007f682e800000
 local address: LID 0x02 QPN 0x1883 PSN 0xa82bae RKey 0x17ece2 VAddr 
0x007f0eba800000
 remote address: LID 0x02 QPN 0x1883 PSN 0xa82bae RKey 0x17ece2 VAddr 
0x007f0eba800000
 remote address: LID 0x01 QPN 0x0107 PSN 0x90c1f2 RKey 0x17ecdc VAddr 
0x007f682e800000
---------------------------------------------------------------------------------------
 #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
---------------------------------------------------------------------------------------
 #bytes     #iterations    BW peak[MB/sec]    BW average[MB/sec]   MsgRate[Mpps]
Conflicting CPU frequency values detected: 1500.000000 != 3391.375000. CPU 
Frequency is not max.
 2          5000             4.11               4.10               2.151153
Conflicting CPU frequency values detected: 1500.000000 != 3345.763000. CPU 
Frequency is not max.
 4          5000             8.07               8.04               2.108648
Conflicting CPU frequency values detected: 1500.000000 != 3362.509000. CPU 
Frequency is not max.
 8          5000             16.13              16.13              2.113996
Conflicting CPU frequency values detected: 1500.000000 != 3335.048000. CPU 
Frequency is not max.
 16         5000             32.30              32.19              2.109436
Conflicting CPU frequency values detected: 1500.000000 != 3339.906000. CPU 
Frequency is not max.
 32         5000             64.41              64.38              2.109663
Conflicting CPU frequency values detected: 1500.000000 != 3333.100000. CPU 
Frequency is not max.
 64         5000             129.43             129.12             2.115557
Conflicting CPU frequency values detected: 1500.000000 != 3349.864000. CPU 
Frequency is not max.
 128        5000             257.89             257.16             2.106668
Conflicting CPU frequency values detected: 1500.000000 != 3350.294000. CPU 
Frequency is not max.
 256        5000             516.27             515.84             2.112864
Conflicting CPU frequency values detected: 1500.000000 != 3340.996000. CPU 
Frequency is not max.
 512        5000             1024.81            1024.72            2.098633
Conflicting CPU frequency values detected: 1500.000000 != 3356.251000. CPU 
Frequency is not max.
 1024       5000             2053.47            2053.08            2.102352
Conflicting CPU frequency values detected: 1500.000000 != 3339.107000. CPU 
Frequency is not max.
 2048       5000             3864.52            3720.22            1.904755
Conflicting CPU frequency values detected: 1500.000000 != 3355.693000. CPU 
Frequency is not max.
 4096       5000             4494.10            4083.37            1.045344
Conflicting CPU frequency values detected: 1500.000000 != 3342.793000. CPU 
Frequency is not max.
 8192       5000             4590.54            4425.60            0.566476
Conflicting CPU frequency values detected: 1500.000000 != 3351.159000. CPU 
Frequency is not max.
 16384      5000             4517.28            4279.27            0.273873
Conflicting CPU frequency values detected: 1500.000000 != 3314.743000. CPU 
Frequency is not max.
 32768      5000             4460.95            4387.03            0.140385
Conflicting CPU frequency values detected: 1500.000000 != 3305.732000. CPU 
Frequency is not max.
 65536      5000             4465.92            4408.98            0.070544
Conflicting CPU frequency values detected: 1500.000000 != 3310.266000. CPU 
Frequency is not max.
 131072     5000             4449.90            4422.93            0.035383
Conflicting CPU frequency values detected: 1500.000000 != 3364.586000. CPU 
Frequency is not max.
 262144     5000             4443.64            4439.50            0.017758
Conflicting CPU frequency values detected: 1500.000000 != 3325.738000. CPU 
Frequency is not max.
 524288     5000             4444.42            4441.08            0.008882
Conflicting CPU frequency values detected: 1500.000000 != 3391.764000. CPU 
Frequency is not max.
 1048576    5000             4453.77            4452.52            0.004453
Conflicting CPU frequency values detected: 1500.000000 != 3391.441000. CPU 
Frequency is not max.
 2097152    5000             4450.29            4449.44            0.002225
Conflicting CPU frequency values detected: 1500.000000 != 1958.593000. CPU 
Frequency is not max.
 4194304    5000             4452.98            4451.38            0.001113
Conflicting CPU frequency values detected: 1500.000000 != 2246.050000. CPU 
Frequency is not max.
 8388608    5000             4453.11            4452.79            0.000557
---------------------------------------------------------------------------------------
 8388608    5000             4453.11            4452.79            0.000557
---------------------------------------------------------------------------------------
deallocating RX GPU buffer 00007f682e000000
deallocating RX GPU buffer 00007f0eba000000
destroying current CUDA Ctx
destroying current CUDA Ctx
+ cleanup
+ '[' -n 7209 ']'
+ test -d /proc/7209
+ sudo kill 7209
kill: (7209): No such process
+ /bin/true
+ '[' -z '' ']'
+ sudo ip addr del dev enp148s0 192.168.5.1/24
+ sudo ip netns exec peermemclient ip addr del dev enp18s0 192.168.5.2/24
+ sudo ip netns delete peermemclient
ubuntu@blanka:~/nvidia-dgx-2/tests$ echo $?
0


** Tags removed: verification-needed-hirsute
** Tags added: verification-done-hirsute

-- 
You received this bug notification because you are a member of Kernel
Packages, which is subscribed to linux in Ubuntu.
https://bugs.launchpad.net/bugs/1947206

Title:
  Updates to ib_peer_memory requested by Nvidia

Status in linux package in Ubuntu:
  In Progress
Status in linux source package in Focal:
  In Progress
Status in linux source package in Hirsute:
  Fix Committed
Status in linux source package in Impish:
  Fix Committed

Bug description:
  [Impact]
  Nvidia notified me via private email that they'd discovered some issues with 
the ib_peer_memory patch we are carrying in hirsute/impish and sent me a patch 
intended to resolve them. My knowledge of these changes is limited to what is 
mentioned in the commit message:

  - Allow clients to opt out of unmap during invalidation
  - Fix some bugs in the sequencing of mlx5 MRs
  - Enable ATS for peer memory

  [Test Case]
  ib_write_bw from the perftest package, rebuilt with CUDA support, can be used 
as a smoke test of this feature. I'll attach a sample test script here. I've 
verified this test passes with the kernels in the archive, and continues to 
pass with the provided patch applied.

  [Fix]
  Nvidia has emailed me fixes for both trees. They are not currently available 
in a public tree elsewhere, though I'm told at some point they should end up in 
a branch here:
    https://git.kernel.org/pub/scm/linux/kernel/git/leon/linux-rdma.git/

  [What could go wrong]
  The only known consumers of ib_peer_memory are Nvidia GPU users making use of 
the GPU PeerDirect feature, where GPUs can share memory with one another over an 
InfiniBand network. Bugs here could cause problems (hangs, crashes, corruption) 
with such workloads.

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1947206/+subscriptions


-- 
Mailing list: https://launchpad.net/~kernel-packages
Post to     : kernel-packages@lists.launchpad.net
Unsubscribe : https://launchpad.net/~kernel-packages
More help   : https://help.launchpad.net/ListHelp

Reply via email to