-----------------------------------------------------------
This is an automatically generated e-mail. To reply, visit:
https://reviews.apache.org/r/75145/#review226825
-----------------------------------------------------------
Ship it!
local edits:
```
diff --git a/src/slave/containerizer/device_manager/device_manager.cpp
b/src/slave/containerizer/device_manager/device_manager.cpp
index 5ee1dba82..c245e4104 100644
--- a/src/slave/containerizer/device_manager/device_manager.cpp
+++ b/src/slave/containerizer/device_manager/device_manager.cpp
@@ -35,6 +35,8 @@
#include "slave/state.hpp"
#include "linux/cgroups2.hpp"
+using google::protobuf::RepeatedPtrField;
+
using std::string;
using std::vector;
@@ -238,94 +240,84 @@ public:
cgroups_root, state.container_id(), false));
}
- const string device_manager_path = paths::getDevicesStatePath(meta_dir);
- if (!os::exists(device_manager_path)) {
- return Nothing();
+ const string checkpoint_path = paths::getDevicesStatePath(meta_dir);
+ if (!os::exists(checkpoint_path)) {
+ return Nothing(); // This happens on the first run.
}
- Result<CgroupDeviceAccessStates> devices_recovery_info =
- state::read<CgroupDeviceAccessStates>(device_manager_path);
+ Result<CgroupDeviceAccessStates> device_states =
+ state::read<CgroupDeviceAccessStates>(checkpoint_path);
- if (devices_recovery_info.isError()) {
- return Failure(
- "Failed to read device configuration info from '" +
- device_manager_path + "': " + devices_recovery_info.error());
- } else if (devices_recovery_info.isNone()) {
- LOG(WARNING) << "The device info file at '" << device_manager_path
- << "' is empty";
+ if (device_states.isError()) {
+ return Failure("Failed to read device configuration info from"
+ " '" + checkpoint_path + "': " + device_states.error());
+ } else if (device_states.isNone()) {
+ LOG(WARNING) << "The device info file at '" << checkpoint_path << "'"
+ << " is empty";
return Nothing();
}
+ CHECK_SOME(device_states);
- CHECK_SOME(devices_recovery_info);
vector<string> recovered_cgroups = {};
- foreach (
- const auto& entry, devices_recovery_info->device_access_per_cgroup()) {
+ foreach (const auto& entry, device_states->device_access_per_cgroup()) {
const string& cgroup = entry.first;
const CgroupDeviceAccessState& state = entry.second;
- if(!cgroups_to_recover.contains(cgroup)) {
- LOG(WARNING) << "The cgroup '" << cgroup
- << "' was not found in the given containers to recover";
+ if (!cgroups_to_recover.contains(cgroup)) {
+ LOG(WARNING)
+ << "The cgroup '" << cgroup << "' from the device manager's"
+ " checkpointed state is not present in the expected cgroups of"
+ " the containerizer";
continue;
}
- auto parse_and_insert =
- [&](
- const ::google::protobuf::RepeatedPtrField<::std::string>& list,
- const bool allow) -> Try<Nothing>
- {
- foreach (const string& entry, list) {
- Try<Entry> parsed_entry = Entry::parse(entry);
- if (parsed_entry.isError()) {
- return Error(
- "Failed to parse entry " + entry +
- " during recover for cgroup: " + cgroup);
- }
-
- auto it = device_access_per_cgroup.find(cgroup);
- if (it != device_access_per_cgroup.end()) {
- if (allow) {
- it->second.allow_list.push_back(*parsed_entry);
- } else {
- it->second.deny_list.push_back(*parsed_entry);
- }
- } else {
- auto result = device_access_per_cgroup.emplace(
- cgroup,
- CHECK_NOTERROR(DeviceManager::CgroupDeviceAccess::create(
- allow ? vector<Entry>{*parsed_entry} : vector<Entry>{},
- allow ? vector<Entry>{} :
vector<Entry>{*parsed_entry})));
- CHECK(result.second);
- }
+ auto parse = [&](const RepeatedPtrField<string>& list)
+ -> Try<vector<Entry>>
+ {
+ vector<Entry> parsed_entries;
+ foreach (const string& entry, list) {
+ Try<Entry> parsed_entry = Entry::parse(entry);
+ if (parsed_entry.isError()) {
+ return Error("Failed to parse entry " + entry + " during recover"
+ " for cgroup " + cgroup + ": " +
parsed_entry.error());
}
- return Nothing();
- };
+ parsed_entries.push_back(*parsed_entry);
+ }
+ return parsed_entries;
+ };
// We return failure because we expect all data in the checkpoint file
// to be valid.
- Try<Nothing>
- status = parse_and_insert(state.allow_list(), true);
- if (status.isError()) {
- return Failure(status.error());
+ Try<vector<Entry>> allow_entries = parse(state.allow_list());
+ if (allow_entries.isError()) {
+ return Failure(allow_entries.error());
}
- status = parse_and_insert(state.deny_list(), false);
- if (status.isError()) {
- return Failure(status.error());
+ Try<vector<Entry>> deny_entries = parse(state.deny_list());
+ if (deny_entries.isError()) {
+ return Failure(deny_entries.error());
}
+ auto result = device_access_per_cgroup.emplace(
+ cgroup,
+ CHECK_NOTERROR(DeviceManager::CgroupDeviceAccess::create(
+ *allow_entries, *deny_entries)));
+
+ CHECK(result.second); // There should be a single insertion per cgroup.
+
recovered_cgroups.push_back(cgroup);
}
foreach (const string& cgroup, recovered_cgroups) {
- // Return failure as the checkpointed state should be valid, allowing us
- // to generate and attach BPF programs. This is because the cgroup
- // previously succeeded in doing so.
- Try<Nothing> status = commit_device_access_changes(cgroup, false);
- if (status.isError()) {
+ // Commit with checkpoint = false, since there's no need to
re-checkpoint.
+ Try<Nothing> commit = commit_device_access_changes(cgroup, false);
+ if (commit.isError()) {
+ // Return failure as the checkpointed state should be valid, allowing
us
+ // to generate and attach BPF programs. This is because the cgroup
+ // previously succeeded in doing so.
return Failure(
- "Failed to perform configuration of ebpf file for cgroup '" +
- cgroup + "': " + status.error());
+ "Failed to perform configuration of ebpf file for cgroup"
+ " '" + cgroup + "': " + commit.error());
}
}
@@ -340,10 +332,13 @@ public:
foreach(const string& cgroup, cgroups_to_recover) {
if (!device_access_per_cgroup.contains(cgroup)) {
- LOG(WARNING) << "Unable to recover state for requested cgroup: '"
- << cgroup + "'";
+ LOG(WARNING)
+ << "Unable to recover state for cgroup '" + cgroup + "' as requested"
+ " by the containerizer, because it was missing in the device"
+ " manager's checkpointed state";
}
}
+
return Nothing();
}
diff --git a/src/tests/device_manager_tests.cpp
b/src/tests/device_manager_tests.cpp
index c791c7305..b6235e9e5 100644
--- a/src/tests/device_manager_tests.cpp
+++ b/src/tests/device_manager_tests.cpp
@@ -504,9 +504,8 @@ TEST(DeviceManagerCgroupDeviceAccessTest,
IsAccessGrantedTest)
}
-TEST_F(DeviceManagerTest, ROOT_DeviceManagerRecover_CanRecover)
+TEST_F(DeviceManagerTest, ROOT_Recover)
{
-
slave::Flags flags;
flags.work_dir = *sandbox;
flags.cgroups_root = TEST_CGROUPS_ROOT;
@@ -516,14 +515,12 @@ TEST_F(DeviceManagerTest,
ROOT_DeviceManagerRecover_CanRecover)
vector<devices::Entry> allow_list = {*devices::Entry::parse("c 1:3 w")};
-
Future<Nothing> setup = dm->configure(
- TEST_CGROUP_WITH_ROOT,
- allow_list,
- {});
+ TEST_CGROUP_WITH_ROOT,
+ allow_list,
+ {});
- AWAIT_READY(setup);
- ASSERT_FALSE(setup.isFailed());
+ AWAIT_ASSERT_READY(setup);
Future<DeviceManager::CgroupDeviceAccess> dm_state =
dm->state(TEST_CGROUP_WITH_ROOT);
@@ -542,18 +539,14 @@ TEST_F(DeviceManagerTest,
ROOT_DeviceManagerRecover_CanRecover)
Option<ContainerID> container_id =
slave::containerizer::paths::cgroups2::containerId(
- flags.cgroups_root, TEST_CGROUP);
+ flags.cgroups_root, TEST_CGROUP);
ASSERT_SOME(container_id);
+
Future<Nothing> recover = dm->recover({protobuf::slave::createContainerState(
- None(),
- None(),
- *container_id,
- -1,
- *sandbox)});
-
- AWAIT_READY(recover);
- ASSERT_FALSE(recover.isFailed());
+ None(), None(), *container_id, -1, *sandbox)});
+
+ AWAIT_ASSERT_READY(recover);
dm_state = dm->state(TEST_CGROUP_WITH_ROOT);
AWAIT_ASSERT_READY(dm_state);
```
src/slave/containerizer/device_manager/device_manager.cpp
Lines 266-270 (patched)
<https://reviews.apache.org/r/75145/#comment315122>
maybe another test for this case?
- Benjamin Mahler
On Aug. 8, 2024, 3:06 p.m., Jason Zhou wrote:
>
> -----------------------------------------------------------
> This is an automatically generated e-mail. To reply, visit:
> https://reviews.apache.org/r/75145/
> -----------------------------------------------------------
>
> (Updated Aug. 8, 2024, 3:06 p.m.)
>
>
> Review request for mesos and Benjamin Mahler.
>
>
> Repository: mesos
>
>
> Description
> -------
>
> We currently do not have any method of recovering the device access
> states when the cgroups 2 isolator is atempting to recover containers.
>
> We add a recovery state here that makes use of the protobuf checkpoint
> files to ensure that the previous device accesses of cgroups can be
> restored. It will be used by the cgroups 2 isolator.
>
>
> Diffs
> -----
>
> src/slave/containerizer/device_manager/device_manager.hpp
> 853350f70e12b992ef311a35c509a5dce8f2301a
> src/slave/containerizer/device_manager/device_manager.cpp
> a2b6b9cd68436b507d0de71d1e971a1faf178c49
> src/tests/device_manager_tests.cpp c4e9b8c58282b8d57b5ce88fefddd34c8ea30c77
>
>
> Diff: https://reviews.apache.org/r/75145/diff/4/
>
>
> Testing
> -------
>
> Test added to check the functionality of recover() with container state.
> Tests pass, and the state was recovered.
>
>
> Thanks,
>
> Jason Zhou
>
>