-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/75145/#review226825
-----------------------------------------------------------


Ship it!




local edits:

```
diff --git a/src/slave/containerizer/device_manager/device_manager.cpp 
b/src/slave/containerizer/device_manager/device_manager.cpp
index 5ee1dba82..c245e4104 100644
--- a/src/slave/containerizer/device_manager/device_manager.cpp
+++ b/src/slave/containerizer/device_manager/device_manager.cpp
@@ -35,6 +35,8 @@
 #include "slave/state.hpp"
 #include "linux/cgroups2.hpp"
 
+using google::protobuf::RepeatedPtrField;
+
 using std::string;
 using std::vector;
 
@@ -238,94 +240,84 @@ public:
           cgroups_root, state.container_id(), false));
     }
 
-    const string device_manager_path = paths::getDevicesStatePath(meta_dir);
-    if (!os::exists(device_manager_path)) {
-      return Nothing();
+    const string checkpoint_path = paths::getDevicesStatePath(meta_dir);
+    if (!os::exists(checkpoint_path)) {
+      return Nothing(); // This happens on the first run.
     }
 
-    Result<CgroupDeviceAccessStates> devices_recovery_info =
-      state::read<CgroupDeviceAccessStates>(device_manager_path);
+    Result<CgroupDeviceAccessStates> device_states =
+      state::read<CgroupDeviceAccessStates>(checkpoint_path);
 
-    if (devices_recovery_info.isError()) {
-      return Failure(
-        "Failed to read device configuration info from '" +
-        device_manager_path + "': " + devices_recovery_info.error());
-    } else if (devices_recovery_info.isNone()) {
-      LOG(WARNING) << "The device info file at '" << device_manager_path
-                   << "' is empty";
+    if (device_states.isError()) {
+      return Failure("Failed to read device configuration info from"
+                     " '" + checkpoint_path + "': " + device_states.error());
+    } else if (device_states.isNone()) {
+      LOG(WARNING) << "The device info file at '" << checkpoint_path << "'"
+                   << " is empty";
       return Nothing();
     }
+    CHECK_SOME(device_states);
 
-    CHECK_SOME(devices_recovery_info);
     vector<string> recovered_cgroups = {};
-    foreach (
-        const auto& entry, devices_recovery_info->device_access_per_cgroup()) {
+    foreach (const auto& entry, device_states->device_access_per_cgroup()) {
       const string& cgroup = entry.first;
       const CgroupDeviceAccessState& state = entry.second;
 
-      if(!cgroups_to_recover.contains(cgroup)) {
-        LOG(WARNING) << "The cgroup '" << cgroup
-                     << "' was not found in the given containers to recover";
+      if (!cgroups_to_recover.contains(cgroup)) {
+        LOG(WARNING)
+          << "The cgroup '" << cgroup << "' from the device manager's"
+             " checkpointed state is not present in the expected cgroups of"
+             " the containerizer";
         continue;
       }
 
-      auto parse_and_insert =
-        [&](
-            const ::google::protobuf::RepeatedPtrField<::std::string>& list,
-            const bool allow) -> Try<Nothing>
-        {
-          foreach (const string& entry, list) {
-            Try<Entry> parsed_entry = Entry::parse(entry);
-            if (parsed_entry.isError()) {
-              return Error(
-                  "Failed to parse entry " + entry +
-                  " during recover for cgroup: " + cgroup);
-            }
-
-            auto it = device_access_per_cgroup.find(cgroup);
-            if (it != device_access_per_cgroup.end()) {
-              if (allow) {
-                it->second.allow_list.push_back(*parsed_entry);
-              } else {
-                it->second.deny_list.push_back(*parsed_entry);
-              }
-            } else {
-              auto result = device_access_per_cgroup.emplace(
-                  cgroup,
-                  CHECK_NOTERROR(DeviceManager::CgroupDeviceAccess::create(
-                      allow ? vector<Entry>{*parsed_entry} : vector<Entry>{},
-                      allow ? vector<Entry>{} : 
vector<Entry>{*parsed_entry})));
-              CHECK(result.second);
-            }
+      auto parse = [&](const RepeatedPtrField<string>& list)
+          -> Try<vector<Entry>>
+      {
+        vector<Entry> parsed_entries;
+        foreach (const string& entry, list) {
+          Try<Entry> parsed_entry = Entry::parse(entry);
+          if (parsed_entry.isError()) {
+            return Error("Failed to parse entry " + entry + " during recover"
+                         " for cgroup " + cgroup + ": " + 
parsed_entry.error());
           }
-          return Nothing();
-        };
+          parsed_entries.push_back(*parsed_entry);
+        }
+        return parsed_entries;
+      };
 
       // We return failure because we expect all data in the checkpoint file
       // to be valid.
-      Try<Nothing>
-        status = parse_and_insert(state.allow_list(), true);
-      if (status.isError()) {
-        return Failure(status.error());
+      Try<vector<Entry>> allow_entries = parse(state.allow_list());
+      if (allow_entries.isError()) {
+        return Failure(allow_entries.error());
       }
 
-      status = parse_and_insert(state.deny_list(), false);
-      if (status.isError()) {
-        return Failure(status.error());
+      Try<vector<Entry>> deny_entries = parse(state.deny_list());
+      if (deny_entries.isError()) {
+        return Failure(deny_entries.error());
       }
 
+      auto result = device_access_per_cgroup.emplace(
+          cgroup,
+          CHECK_NOTERROR(DeviceManager::CgroupDeviceAccess::create(
+              *allow_entries, *deny_entries)));
+
+      CHECK(result.second); // There should be a single insertion per cgroup.
+
       recovered_cgroups.push_back(cgroup);
     }
 
     foreach (const string& cgroup, recovered_cgroups) {
-      // Return failure as the checkpointed state should be valid, allowing us
-      // to generate and attach BPF programs. This is because the cgroup
-      // previously succeeded in doing so.
-      Try<Nothing> status = commit_device_access_changes(cgroup, false);
-      if (status.isError()) {
+      // Commit with checkpoint = false, since there's no need to 
re-checkpoint.
+      Try<Nothing> commit = commit_device_access_changes(cgroup, false);
+      if (commit.isError()) {
+        // Return failure as the checkpointed state should be valid, allowing 
us
+        // to generate and attach BPF programs. This is because the cgroup
+        // previously succeeded in doing so.
         return Failure(
-            "Failed to perform configuration of ebpf file for cgroup '" +
-            cgroup + "': " + status.error());
+            "Failed to perform configuration of ebpf file for cgroup"
+            " '" + cgroup + "': " + commit.error());
       }
     }
 
@@ -340,10 +332,13 @@ public:
 
     foreach(const string& cgroup, cgroups_to_recover) {
       if (!device_access_per_cgroup.contains(cgroup)) {
-        LOG(WARNING) << "Unable to recover state for requested cgroup: '"
-                     << cgroup + "'";
+        LOG(WARNING)
+          << "Unable to recover state for cgroup '" + cgroup + "' as requested"
+             " by the containerizer, because it was missing in the device"
+             " manager's checkpointed state";
       }
     }
+
     return Nothing();
   }
 
diff --git a/src/tests/device_manager_tests.cpp 
b/src/tests/device_manager_tests.cpp
index c791c7305..b6235e9e5 100644
--- a/src/tests/device_manager_tests.cpp
+++ b/src/tests/device_manager_tests.cpp
@@ -504,9 +504,8 @@ TEST(DeviceManagerCgroupDeviceAccessTest, 
IsAccessGrantedTest)
 }
 
 
-TEST_F(DeviceManagerTest, ROOT_DeviceManagerRecover_CanRecover)
+TEST_F(DeviceManagerTest, ROOT_Recover)
 {
-
   slave::Flags flags;
   flags.work_dir = *sandbox;
   flags.cgroups_root = TEST_CGROUPS_ROOT;
@@ -516,14 +515,12 @@ TEST_F(DeviceManagerTest, 
ROOT_DeviceManagerRecover_CanRecover)
 
   vector<devices::Entry> allow_list = {*devices::Entry::parse("c 1:3 w")};
 
-
   Future<Nothing> setup = dm->configure(
-    TEST_CGROUP_WITH_ROOT,
-    allow_list,
-    {});
+      TEST_CGROUP_WITH_ROOT,
+      allow_list,
+      {});
 
-  AWAIT_READY(setup);
-  ASSERT_FALSE(setup.isFailed());
+  AWAIT_ASSERT_READY(setup);
 
   Future<DeviceManager::CgroupDeviceAccess> dm_state =
     dm->state(TEST_CGROUP_WITH_ROOT);
@@ -542,18 +539,14 @@ TEST_F(DeviceManagerTest, 
ROOT_DeviceManagerRecover_CanRecover)
 
   Option<ContainerID> container_id =
     slave::containerizer::paths::cgroups2::containerId(
-      flags.cgroups_root, TEST_CGROUP);
+        flags.cgroups_root, TEST_CGROUP);
   ASSERT_SOME(container_id);
 
+
   Future<Nothing> recover = dm->recover({protobuf::slave::createContainerState(
-              None(),
-              None(),
-              *container_id,
-              -1,
-              *sandbox)});
-
-  AWAIT_READY(recover);
-  ASSERT_FALSE(recover.isFailed());
+      None(), None(), *container_id, -1, *sandbox)});
+
+  AWAIT_ASSERT_READY(recover);
 
   dm_state = dm->state(TEST_CGROUP_WITH_ROOT);
   AWAIT_ASSERT_READY(dm_state);

```


src/slave/containerizer/device_manager/device_manager.cpp
Lines 266-270 (patched)
<https://reviews.apache.org/r/75145/#comment315122>

    maybe another test for this case?


- Benjamin Mahler


On Aug. 8, 2024, 3:06 p.m., Jason Zhou wrote:
> 
> -----------------------------------------------------------
> This is an automatically generated e-mail. To reply, visit:
> https://reviews.apache.org/r/75145/
> -----------------------------------------------------------
> 
> (Updated Aug. 8, 2024, 3:06 p.m.)
> 
> 
> Review request for mesos and Benjamin Mahler.
> 
> 
> Repository: mesos
> 
> 
> Description
> -------
> 
> We currently do not have any method of recovering the device access
> states when the cgroups 2 isolator is atempting to recover containers.
> 
> We add a recovery state here that makes use of the protobuf checkpoint
> files to ensure that the previous device accesses of cgroups can be
> restored. It will be used by the cgroups 2 isolator.
> 
> 
> Diffs
> -----
> 
>   src/slave/containerizer/device_manager/device_manager.hpp 
> 853350f70e12b992ef311a35c509a5dce8f2301a 
>   src/slave/containerizer/device_manager/device_manager.cpp 
> a2b6b9cd68436b507d0de71d1e971a1faf178c49 
>   src/tests/device_manager_tests.cpp c4e9b8c58282b8d57b5ce88fefddd34c8ea30c77 
> 
> 
> Diff: https://reviews.apache.org/r/75145/diff/4/
> 
> 
> Testing
> -------
> 
> Test added to check the functionality of recover() with container state. 
> Tests pass, and the state was recovered.
> 
> 
> Thanks,
> 
> Jason Zhou
> 
>

Reply via email to