I believe this is not covering all cases, for instance it would break
adding rules to second level nested cgroup, if second level nested
cgroup is in "default deny" and it's parent is in "default deny" and
none of them have CGRP_VE_ROOT set. In parent there is allowing wildcard
rule and adding same allowing wildcard rule to child would fail as
verify_new_ex -> match_exception would return false.
If you have a node with kernel with this patch installed I would like to
play with it, so that it would be easier to review.
On 19.01.2023 15:52, Nikolay Borisov wrote:
In containerized environments there arise cases where we might want to
allow wildcard exceptions when the parent cg doesn't have such. This for
example arises when systemd services are being setup in containers. In
order to allow systemd to function we must allow it to write wildcard
(i.e b *:* rwm) rules in the child group. At the same time in order not
to break the fundamental invariant of the device cgroup hierarchy that
children cannot be more permissive than their parents instead of blindly
trusting those rules, simply skip them in the child cgroup and defer to
the parent's exceptions.
For example assume we have A/B, where A has default behavior 'deny' and
B was created afterwards and subsequently also has 'deny' default
behavior. With this patch it's possible to write "b *:* rwm" in B which
would also result in EPERM when trying to access any device that doesn't
contain an exception in A:
mkdir A
echo "a" > A/devices.deny
mkdir A/B
echo "c *:*" > A/B/devices.allow <-- allows to create the exception
but it's essentially a noop
echo "c 1:3 rw" > A < -- now trying to open /dev/nul (matching 1:3)
by a process in B would succeed.
Implementing this doesn't really break the main invariant that children
shouldn't have more access than their ancestors.
Signed-off-by: Nikolay Borisov <nikolay.bori...@virtuozzo.com>
---
Changes in v2:
* Patch is now self-contained.
security/device_cgroup.c | 55 +++++++++++++++++++++++++++++++---------
1 file changed, 43 insertions(+), 12 deletions(-)
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 2f6d5e0ffd00..0e5fdcc0cbff 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -61,6 +61,11 @@ struct dev_cgroup {
struct list_head propagate_pending;
};
+static bool is_wildcard_exception(struct dev_exception_item *ex)
+{
+ return ex->minor == ~0 || ex->major == ~0;
+}
+
static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state
*s)
{
return container_of(s, struct dev_cgroup, css);
@@ -434,10 +439,10 @@ static bool match_exception_partial(struct list_head
*exceptions, short type,
/**
* match_exception - iterates the exception list trying to match a rule
- * based on type, major, minor and access type. It is
- * considered a match if an exception is found that
- * will contain the entire range of provided parameters.
- * @exceptions: list of exceptions
+ * based on type, major, minor and access type. It is
+ * considered a match if an exception is found that
+ * will contain the entire range of provided parameters.
+ * @dev_cgroup: cgroup whose exceptions we are checking
* @type: device type (DEV_BLOCK or DEV_CHAR)
* @major: device file major number, ~0 to match all
* @minor: device file minor number, ~0 to match all
@@ -445,10 +450,13 @@ static bool match_exception_partial(struct list_head
*exceptions, short type,
*
* returns: true in case it matches an exception completely
*/
-static bool match_exception(struct list_head *exceptions, short type,
- u32 major, u32 minor, short access)
+static bool match_exception(struct dev_cgroup *dev_cgroup, short type,
+ u32 major, u32 minor, short access, bool
check_parent)
{
struct dev_exception_item *ex;
+ struct cgroup *cgrp = dev_cgroup->css.cgroup;
+ struct list_head *exceptions = &dev_cgroup->exceptions;
+ bool wildcard_skipped = false;
list_for_each_entry_rcu(ex, exceptions, list) {
short mismatched_bits;
@@ -462,6 +470,11 @@ static bool match_exception(struct list_head *exceptions,
short type,
continue;
if (ex->minor != ~0 && ex->minor != minor)
continue;
+ /* skip wildcard rule if we are child cg */
+ if (is_wildcard_exception(ex) && !test_bit(CGRP_VE_ROOT,
&cgrp->flags)) {
Comment does not correspond to what you do, so either comment is wrong
or what you do is wrong. Note: CGRP_VE_ROOT bit is only set on root
container cgroups when container is running. O maybe I'm not getting
"child cg" thing.
+ wildcard_skipped = true;
+ continue;
+ }
/* provided access cannot have more than the exception rule */
mismatched_bits = access & (~ex->access) & ~ACC_MOUNT;
@@ -475,9 +488,26 @@ static bool match_exception(struct list_head *exceptions,
short type,
return true;
}
+ /* we matched a wildcard rule, so let's check for a
+ * more specific one in the parent
+ */
Nit. Wrong comment style. Please read
https://www.kernel.org/doc/html/v4.10/process/coding-style.html#commenting
+ if (wildcard_skipped && check_parent) {
+ struct cgroup *parent = cgrp->parent;
+ struct dev_cgroup *devcg_parent = cgroup_to_devcgroup(parent);
+ if (devcg_parent->behavior == DEVCG_DEFAULT_ALLOW)
+ /* Can't match any of the exceptions, even partially */
+ return
!match_exception_partial(&devcg_parent->exceptions,
+ type, major, minor,
access);
+ else
+ /* Need to match completely one exception to be allowed
*/
+ return match_exception(devcg_parent, type, major,
+ minor, access, false);
+ }
+
return false;
}
+
/**
* verify_new_ex - verifies if a new exception is part of what is allowed
* by a dev cgroup based on the default policy +
@@ -527,9 +557,8 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
* be contained completely in an parent's exception to be
* allowed
*/
- match = match_exception(&dev_cgroup->exceptions, refex->type,
- refex->major, refex->minor,
- refex->access);
+ match = match_exception(dev_cgroup, refex->type, refex->major,
+ refex->minor, refex->access, false);
if (match)
/* parent has an exception that matches the proposed */
@@ -703,6 +732,7 @@ static inline bool has_children(struct dev_cgroup
*devcgroup)
return !list_empty(&cgrp->children);
}
+
/*
* Modify the exception list using allow/deny rules.
* CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD
@@ -862,8 +892,10 @@ static int devcgroup_update_access(struct dev_cgroup
*devcgroup,
break;
}
- if (!parent_has_perm(devcgroup, &ex))
+ if (!is_wildcard_exception(&ex) &&
+ !parent_has_perm(devcgroup, &ex))
return -EPERM;
+
rc = dev_exception_add(devcgroup, &ex);
break;
case DEVCG_DENY:
@@ -954,8 +986,7 @@ static int __devcgroup_check_permission(short type, u32
major, u32 minor,
type, major, minor, access);
else
/* Need to match completely one exception to be allowed */
- rc = match_exception(&dev_cgroup->exceptions, type, major,
- minor, access);
+ rc = match_exception(dev_cgroup, type, major, minor, access,
true);
rcu_read_unlock();
#ifdef CONFIG_VE
--
2.34.1
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel