Hi, ok, it seems we are getting close to submission for kernel inclusion. If no new comments arise, the only thing still missing from my side is documentation. VLAN is again included in this patch to show one use case.
Changes since last version:
- remove NETIF_F_STACKED, use dev->iflink instead; change VLAN to set dev->iflink properly
- rename IFF_CARRIER to IFF_LOWER_UP
- reject userspace-controlled DORMANT->UP transition if netif_dormant() is set
- add all operstate attributes to sysfs
- vlan_transfer_operstate() sets dormant first, then carrier
- call rfc2863_policy() also when the interface is admin down, to keep operstate current. This allows moving vlan_transfer_operstate() to the VLAN setup function. I couldn't get rid of the unconditional call to linkwatch_fire_event(), though
- small fixes

I've successfully tested stacking and userspace interaction, also with drivers that do not call netif_carrier_*().

Should we reset link_mode on dev_open(), so that the user can get back to a usable state with a down/up transition if the supplicant crashes for some reason, or should we depend on maintainers/distributors to update userspace for this?

Stefan
diff -X dontdiff -ur linux-2.6.14/include/linux/if.h linux-2.6.14-rfc2863/include/linux/if.h --- linux-2.6.14/include/linux/if.h 2005-11-02 11:07:32.000000000 +0100 +++ linux-2.6.14-rfc2863/include/linux/if.h 2005-11-30 21:15:24.000000000 +0100 @@ -33,7 +33,7 @@ #define IFF_LOOPBACK 0x8 /* is a loopback net */ #define IFF_POINTOPOINT 0x10 /* interface is has p-p link */ #define IFF_NOTRAILERS 0x20 /* avoid use of trailers */ -#define IFF_RUNNING 0x40 /* interface running and carrier ok */ +#define IFF_RUNNING 0x40 /* interface RFC2863 OPER_UP */ #define IFF_NOARP 0x80 /* no ARP protocol */ #define IFF_PROMISC 0x100 /* receive all packets */ #define IFF_ALLMULTI 0x200 /* receive all multicast packets*/ @@ -43,12 +43,16 @@ #define IFF_MULTICAST 0x1000 /* Supports multicast */ -#define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|IFF_MASTER|IFF_SLAVE|IFF_RUNNING) - #define IFF_PORTSEL 0x2000 /* can set media type */ #define IFF_AUTOMEDIA 0x4000 /* auto media select active */ #define IFF_DYNAMIC 0x8000 /* dialup device with changing addresses*/ +#define IFF_LOWER_UP 0x10000 /* driver signals L1 up */ +#define IFF_DORMANT 0x20000 /* driver signals dormant */ + +#define IFF_VOLATILE (IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|\ + IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT) + /* Private (from user) interface flags (netdevice->priv_flags). */ #define IFF_802_1Q_VLAN 0x1 /* 802.1Q VLAN device. */ #define IFF_EBRIDGE 0x2 /* Ethernet bridging device. */ @@ -80,6 +84,22 @@ #define IF_PROTO_FR_ETH_PVC 0x200B #define IF_PROTO_RAW 0x200C /* RAW Socket */ +/* RFC 2863 operational status */ +enum { + IF_OPER_UNKNOWN, + IF_OPER_NOTPRESENT, + IF_OPER_DOWN, + IF_OPER_LOWERLAYERDOWN, + IF_OPER_TESTING, + IF_OPER_DORMANT, + IF_OPER_UP, +}; + +/* link modes */ +enum { + IF_LINK_MODE_DEFAULT, + IF_LINK_MODE_DORMANT, /* limit upward transition to dormant */ +}; /* * Device mapping structure. 
diff -X dontdiff -ur linux-2.6.14/include/linux/netdevice.h linux-2.6.14-rfc2863/include/linux/netdevice.h --- linux-2.6.14/include/linux/netdevice.h 2005-11-02 11:08:10.000000000 +0100 +++ linux-2.6.14-rfc2863/include/linux/netdevice.h 2005-11-30 21:01:43.000000000 +0100 @@ -230,7 +230,8 @@ __LINK_STATE_SCHED, __LINK_STATE_NOCARRIER, __LINK_STATE_RX_SCHED, - __LINK_STATE_LINKWATCH_PENDING + __LINK_STATE_LINKWATCH_PENDING, + __LINK_STATE_DORMANT, }; @@ -334,11 +335,14 @@ */ - unsigned short flags; /* interface flags (a la BSD) */ + unsigned int flags; /* interface flags (a la BSD) */ unsigned short gflags; unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */ unsigned short padded; /* How much padding added by alloc_netdev() */ + unsigned char operstate; /* RFC2863 operstate */ + unsigned char link_mode; /* mapping policy to operstate */ + unsigned mtu; /* interface MTU value */ unsigned short type; /* interface hardware type */ unsigned short hard_header_len; /* hardware hdr length */ @@ -712,6 +716,10 @@ /* Carrier loss detection, dial on demand. The functions netif_carrier_on * and _off may be called from IRQ context, but it is caller * who is responsible for serialization of these calls. + * + * The name carrier is inappropriate, these functions should really be + * called netif_lowerlayer_*() because they represent the state of any + * kind of lower layer not just hardware media. 
*/ extern void linkwatch_fire_event(struct net_device *dev); @@ -727,6 +735,29 @@ extern void netif_carrier_off(struct net_device *dev); +static inline void netif_dormant_on(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_DORMANT, &dev->state)) + linkwatch_fire_event(dev); +} + +static inline void netif_dormant_off(struct net_device *dev) +{ + if (test_and_clear_bit(__LINK_STATE_DORMANT, &dev->state)) + linkwatch_fire_event(dev); +} + +static inline int netif_dormant(const struct net_device *dev) +{ + return test_bit(__LINK_STATE_DORMANT, &dev->state); +} + + +static inline int netif_oper_up(const struct net_device *dev) { + return (dev->operstate == IF_OPER_UP || + dev->operstate == IF_OPER_UNKNOWN /* backward compat */); +} + /* Hot-plugging. */ static inline int netif_device_present(struct net_device *dev) { diff -X dontdiff -ur linux-2.6.14/include/linux/rtnetlink.h linux-2.6.14-rfc2863/include/linux/rtnetlink.h --- linux-2.6.14/include/linux/rtnetlink.h 2005-11-02 11:08:11.000000000 +0100 +++ linux-2.6.14-rfc2863/include/linux/rtnetlink.h 2005-11-18 20:14:05.000000000 +0100 @@ -733,6 +733,8 @@ #define IFLA_MAP IFLA_MAP IFLA_WEIGHT, #define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE, + IFLA_LINKMODE, __IFLA_MAX }; diff -X dontdiff -ur linux-2.6.14/net/8021q/vlan.c linux-2.6.14-rfc2863/net/8021q/vlan.c --- linux-2.6.14/net/8021q/vlan.c 2005-11-02 11:07:35.000000000 +0100 +++ linux-2.6.14-rfc2863/net/8021q/vlan.c 2005-11-30 22:48:49.000000000 +0100 @@ -68,7 +68,7 @@ /* Bits of netdev state that are propagated from real device to virtual */ #define VLAN_LINK_STATE_MASK \ - ((1<<__LINK_STATE_PRESENT)|(1<<__LINK_STATE_NOCARRIER)) + ((1<<__LINK_STATE_PRESENT)|(1<<__LINK_STATE_NOCARRIER)|(1<<__LINK_STATE_DORMANT)) /* End of global variables definitions. 
*/ @@ -343,6 +343,26 @@ new_dev->do_ioctl = vlan_dev_ioctl; } +static void vlan_transfer_operstate(const struct net_device *dev, struct net_device *vlandev) +{ + /* Have to respect userspace enforced dormant state + * of real device, also must allow supplicant running + * on VLAN device + */ + if (dev->operstate == IF_OPER_DORMANT) + netif_dormant_on(vlandev); + else + netif_dormant_off(vlandev); + + if (netif_carrier_ok(dev)) { + if (!netif_carrier_ok(vlandev)) + netif_carrier_on(vlandev); + } else { + if (netif_carrier_ok(vlandev)) + netif_carrier_off(vlandev); + } +} + /* Attach a VLAN device to a mac address (ie Ethernet Card). * Returns the device that was created, or NULL if there was * an error of some kind. @@ -449,7 +469,7 @@ new_dev->flags = real_dev->flags; new_dev->flags &= ~IFF_UP; - new_dev->state = real_dev->state & VLAN_LINK_STATE_MASK; + new_dev->state = real_dev->state & ~(1<<__LINK_STATE_START); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. @@ -497,6 +517,10 @@ if (register_netdevice(new_dev)) goto out_free_newdev; + new_dev->iflink = real_dev->ifindex; + vlan_transfer_operstate(real_dev, new_dev); + linkwatch_fire_event(new_dev); /* _MUST_ call rfc2863_policy() */ + /* So, got the sucker initialized, now lets place * it into our local structure. 
*/ @@ -572,25 +596,12 @@ switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ - flgs = dev->state & VLAN_LINK_STATE_MASK; for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { vlandev = grp->vlan_devices[i]; if (!vlandev) continue; - if (netif_carrier_ok(dev)) { - if (!netif_carrier_ok(vlandev)) - netif_carrier_on(vlandev); - } else { - if (netif_carrier_ok(vlandev)) - netif_carrier_off(vlandev); - } - - if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) { - vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK) - | flgs; - netdev_state_change(vlandev); - } + vlan_transfer_operstate(dev, vlandev); } break; diff -X dontdiff -ur linux-2.6.14/net/core/dev.c linux-2.6.14-rfc2863/net/core/dev.c --- linux-2.6.14/net/core/dev.c 2005-11-06 17:35:22.000000000 +0100 +++ linux-2.6.14-rfc2863/net/core/dev.c 2005-11-30 21:15:50.000000000 +0100 @@ -2141,12 +2141,20 @@ flags = (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI | - IFF_RUNNING)) | + IFF_RUNNING | + IFF_LOWER_UP | + IFF_DORMANT)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); - if (netif_running(dev) && netif_carrier_ok(dev)) - flags |= IFF_RUNNING; + if (netif_running(dev)) { + if (netif_oper_up(dev)) + flags |= IFF_RUNNING; + if (netif_carrier_ok(dev)) + flags |= IFF_LOWER_UP; + if (netif_dormant(dev)) + flags |= IFF_DORMANT; + } return flags; } diff -X dontdiff -ur linux-2.6.14/net/core/link_watch.c linux-2.6.14-rfc2863/net/core/link_watch.c --- linux-2.6.14/net/core/link_watch.c 2005-06-17 21:48:29.000000000 +0200 +++ linux-2.6.14-rfc2863/net/core/link_watch.c 2005-11-30 21:13:53.000000000 +0100 @@ -49,6 +49,34 @@ /* Avoid kmalloc() for most systems */ static struct lw_event singleevent; +static inline unsigned char default_operstate(const struct net_device *dev) { + if (!netif_carrier_ok(dev)) + return dev->ifindex!=dev->iflink?IF_OPER_LOWERLAYERDOWN:IF_OPER_DOWN; + if (netif_dormant(dev)) return IF_OPER_DORMANT; + return IF_OPER_UP; +} + + +static void rfc2863_policy(struct net_device 
*dev) { + unsigned char operstate = default_operstate(dev); + + if (operstate == dev->operstate) return; + + switch(dev->link_mode) { + case IF_LINK_MODE_DORMANT: + if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; + break; + case IF_LINK_MODE_DEFAULT: + default: + break; + } + + write_lock_bh(&dev_base_lock); + dev->operstate = operstate; + write_unlock_bh(&dev_base_lock); +} + + /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) { @@ -74,6 +102,7 @@ */ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + rfc2863_policy(dev); if (dev->flags & IFF_UP) { if (netif_carrier_ok(dev)) { WARN_ON(dev->qdisc_sleeping == &noop_qdisc); diff -X dontdiff -ur linux-2.6.14/net/core/net-sysfs.c linux-2.6.14-rfc2863/net/core/net-sysfs.c --- linux-2.6.14/net/core/net-sysfs.c 2005-06-17 21:48:29.000000000 +0200 +++ linux-2.6.14-rfc2863/net/core/net-sysfs.c 2005-11-28 21:33:44.000000000 +0100 @@ -94,6 +94,7 @@ NETDEVICE_ATTR(ifindex, fmt_dec); NETDEVICE_ATTR(features, fmt_long_hex); NETDEVICE_ATTR(type, fmt_dec); +NETDEVICE_ATTR(link_mode, fmt_dec); /* use same locking rules as GIFHWADDR ioctl's */ static ssize_t format_addr(char *buf, const unsigned char *addr, int len) @@ -136,9 +137,44 @@ return -EINVAL; } +static ssize_t show_dormant(struct class_device *dev, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + if (netif_running(netdev)) { + return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); + } + return -EINVAL; +} + +static const char *operstates[] = { + "unknown", + NULL, /* notpresent, currently unused */ + "down", + "lowerlayerdown", + NULL, /* testing, currently unused */ + "dormant", + "up" +}; + +static ssize_t show_operstate(struct class_device *dev, char *buf) +{ + const struct net_device *netdev = to_net_dev(dev); + unsigned char operstate; + + read_lock(&dev_base_lock); + operstate = netdev->operstate; + if (!netif_running(netdev)) operstate = IF_OPER_DOWN; + read_unlock(&dev_base_lock); + + if (operstate >= 
sizeof(operstates)) return -EINVAL; /* should not happen */ + return sprintf(buf, "%s\n", operstates[operstate]); +} + static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); static CLASS_DEVICE_ATTR(broadcast, S_IRUGO, show_broadcast, NULL); static CLASS_DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); +static CLASS_DEVICE_ATTR(dormant, S_IRUGO, show_dormant, NULL); +static CLASS_DEVICE_ATTR(operstate, S_IRUGO, show_operstate, NULL); /* read-write attributes */ NETDEVICE_SHOW(mtu, fmt_dec); @@ -212,9 +248,12 @@ &class_device_attr_flags, &class_device_attr_weight, &class_device_attr_type, + &class_device_attr_link_mode, &class_device_attr_address, &class_device_attr_broadcast, &class_device_attr_carrier, + &class_device_attr_dormant, + &class_device_attr_operstate, NULL }; diff -X dontdiff -ur linux-2.6.14/net/core/rtnetlink.c linux-2.6.14-rfc2863/net/core/rtnetlink.c --- linux-2.6.14/net/core/rtnetlink.c 2005-11-02 11:08:12.000000000 +0100 +++ linux-2.6.14-rfc2863/net/core/rtnetlink.c 2005-11-30 22:13:02.000000000 +0100 @@ -178,6 +178,32 @@ } +static void set_operstate(struct net_device *dev, unsigned char transition) { + unsigned char operstate = dev->operstate; + ASSERT_RTNL(); + + switch(transition) { + case IF_OPER_UP: + if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_UNKNOWN) && + !netif_dormant(dev)) + operstate = IF_OPER_UP; + break; + case IF_OPER_DORMANT: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_DORMANT; + break; + } + + if (dev->operstate != operstate) { + write_lock_bh(&dev_base_lock); + dev->operstate = operstate; + write_unlock_bh(&dev_base_lock); + netdev_state_change(dev); + } +} + static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags) @@ -208,6 +234,13 @@ } if (1) { + u8 operstate = dev->operstate; + u8 link_mode = dev->link_mode; + RTA_PUT(skb, IFLA_OPERSTATE, sizeof(operstate), &operstate); + 
RTA_PUT(skb, IFLA_LINKMODE, sizeof(link_mode), &link_mode); + } + + if (1) { struct rtnl_link_ifmap map = { .mem_start = dev->mem_start, .mem_end = dev->mem_end, @@ -398,6 +431,22 @@ dev->weight = *((u32 *) RTA_DATA(ida[IFLA_WEIGHT - 1])); } + if (ida[IFLA_OPERSTATE - 1]) { + if (ida[IFLA_OPERSTATE - 1]->rta_len != RTA_LENGTH(sizeof(u8))) + goto out; + + set_operstate(dev, *((u8 *) RTA_DATA(ida[IFLA_OPERSTATE - 1]))); + } + + if (ida[IFLA_LINKMODE - 1]) { + if (ida[IFLA_LINKMODE - 1]->rta_len != RTA_LENGTH(sizeof(u8))) + goto out; + + write_lock_bh(&dev_base_lock); + dev->link_mode = *((u8 *) RTA_DATA(ida[IFLA_LINKMODE - 1])); + write_unlock_bh(&dev_base_lock); + } + if (ifm->ifi_index >= 0 && ida[IFLA_IFNAME - 1]) { char ifname[IFNAMSIZ];