Make sure the sw_flow_key structure and valid mask boundaries are always
machine word aligned. Optimize the flow compare and mask operations
using machine word size operations.

This patch is inspired by ideas and code from a patch submitted by Peter
Klausler titled "replace memcmp() with specialized comparator".
The original patch mentioned 7X speed up with this optimization.
However, the original patch only optimized for architectures
that support unaligned machine word access. This patch optimizes for
all architectures.

Signed-off-by: Andy Zhou <az...@nicira.com>
---
 datapath/flow.c |   50 +++++++++++++++++++++++++++++++++-----------------
 datapath/flow.h |   16 ++--------------
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/datapath/flow.c b/datapath/flow.c
index ab803ef..25eb8c1 100644
--- a/datapath/flow.c
+++ b/datapath/flow.c
@@ -54,8 +54,8 @@ static void update_range__(struct sw_flow_match *match,
                          size_t offset, size_t size, bool is_mask)
 {
        struct sw_flow_key_range *range = NULL;
-       size_t start = offset;
-       size_t end = offset + size;
+       size_t start = rounddown(offset, sizeof(long));
+       size_t end = roundup(offset + size, sizeof(long));
 
        if (!is_mask)
                range = &match->range;
@@ -351,16 +351,19 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
 void ovs_flow_key_mask(struct sw_flow_key *dst, const struct sw_flow_key *src,
                       const struct sw_flow_mask *mask)
 {
-       u8 *m = (u8 *)&mask->key + mask->range.start;
-       u8 *s = (u8 *)src + mask->range.start;
-       u8 *d = (u8 *)dst + mask->range.start;
+       const long *m = (long *)((u8 *)&mask->key + mask->range.start);
+       const long *s = (long *)((u8 *)src + mask->range.start);
+       long *d = (long *)dst;
        int i;
 
-       memset(dst, 0, sizeof(*dst));
-       for (i = 0; i < ovs_sw_flow_mask_size_roundup(mask); i++) {
-               *d = *s & *m;
-               d++, s++, m++;
-       }
+       /* 'dst' is rebuilt in three long-sized passes: zero the bytes
+        * below the masked range, AND the masked range of 'src' with the
+        * mask, then zero the tail.  range.start and range.end are
+        * already long-aligned (see update_range__()), so no pass reads
+        * or writes a partial word.
+        */
+       for (i = 0; i < mask->range.start;  i += sizeof(long))
+               *d++ = 0;
+
+       for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
+               *d++ = *s++ & *m++;
+
+       for (i = mask->range.end; i < sizeof(*dst); i += sizeof(long))
+               *d++ = 0;
 }
 
 #define TCP_FLAGS_OFFSET 13
@@ -977,8 +980,10 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, 
struct sw_flow_key *key)
 static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start,
                          int key_end)
 {
-       return jhash2((u32 *)((u8 *)key + key_start),
-                     DIV_ROUND_UP(key_end - key_start, sizeof(u32)), 0);
+       u8 *hash_bytes = (u8 *)key + key_start;
+       int hash_len = key_end - key_start;
+
+       /* jhash2() takes its length in u32 words, not bytes.  key_start
+        * and key_end are long-aligned, so hash_len is always a multiple
+        * of sizeof(u32) and the shift loses nothing.
+        */
+       return jhash2((u32 *)hash_bytes, hash_len >> 2, 0);
 }
 
 static int flow_key_start(const struct sw_flow_key *key)
@@ -986,14 +991,25 @@ static int flow_key_start(const struct sw_flow_key *key)
        if (key->tun_key.ipv4_dst)
                return 0;
        else
-               return offsetof(struct sw_flow_key, phy);
+               return rounddown(offsetof(struct sw_flow_key, phy),
+                                         sizeof(long));
 }
 
 static bool __cmp_key(const struct sw_flow_key *key1,
                const struct sw_flow_key *key2,  int key_start, int key_end)
 {
-       return !memcmp((u8 *)key1 + key_start,
-                       (u8 *)key2 + key_start, (key_end - key_start));
+       /* The byte offset must be added before the cast to long *:
+        * "(long *)(u8 *)key1 + key_start" would advance by key_start
+        * longs (key_start * sizeof(long) bytes), comparing the wrong
+        * region and reading past the end of the keys.
+        */
+       const long *cp1 = (long *)((u8 *)key1 + key_start);
+       const long *cp2 = (long *)((u8 *)key2 + key_start);
+       long diffs = 0;
+       int i;
+
+       BUG_ON(key_start != rounddown(key_start, sizeof(long)));
+       BUG_ON(key_end != roundup(key_end, sizeof(long)));
+
+       /* OR the XOR of every word pair; any mismatch leaves a bit set. */
+       for (i = key_start; i < key_end;  i += sizeof(long))
+               diffs |= *cp1++ ^ *cp2++;
+
+       return diffs == 0;
 }
 
 static bool __flow_cmp_key(const struct sw_flow *flow,
@@ -1952,7 +1968,7 @@ static bool ovs_sw_flow_mask_equal(const struct 
sw_flow_mask *a,
 
        return  (a->range.end == b->range.end)
                && (a->range.start == b->range.start)
-               && (memcmp(a_, b_, ovs_sw_flow_mask_actual_size(a)) == 0);
+               && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
 }
 
 struct sw_flow_mask *ovs_sw_flow_mask_find(const struct flow_table *tbl,
@@ -1989,5 +2005,5 @@ static void ovs_sw_flow_mask_set(struct sw_flow_mask 
*mask,
        u8 *m = (u8 *)&mask->key + range->start;
 
        mask->range = *range;
-       memset(m, val, ovs_sw_flow_mask_size_roundup(mask));
+       memset(m, val, range_n_bytes(range));
 }
diff --git a/datapath/flow.h b/datapath/flow.h
index c177f55..22bd630 100644
--- a/datapath/flow.h
+++ b/datapath/flow.h
@@ -127,7 +127,7 @@ struct sw_flow_key {
                        } nd;
                } ipv6;
        };
-};
+} __aligned(sizeof(long));
 
 struct sw_flow {
        struct rcu_head rcu;
@@ -151,7 +151,7 @@ struct sw_flow_key_range {
        size_t end;
 };
 
-static inline u16 ovs_sw_flow_key_range_actual_size(const struct 
sw_flow_key_range *range)
+static inline u16 range_n_bytes(const struct sw_flow_key_range *range)
 {
        return range->end - range->start;
 }
@@ -255,18 +255,6 @@ struct sw_flow_mask {
        struct sw_flow_key key;
 };
 
-static inline u16
-ovs_sw_flow_mask_actual_size(const struct sw_flow_mask *mask)
-{
-       return ovs_sw_flow_key_range_actual_size(&mask->range);
-}
-
-static inline u16
-ovs_sw_flow_mask_size_roundup(const struct sw_flow_mask *mask)
-{
-       return roundup(ovs_sw_flow_mask_actual_size(mask), sizeof(u32));
-}
-
 struct sw_flow_mask *ovs_sw_flow_mask_alloc(void);
 void ovs_sw_flow_mask_add_ref(struct sw_flow_mask *);
 void ovs_sw_flow_mask_del_ref(struct sw_flow_mask *, bool deferred);
-- 
1.7.9.5

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to