On Thu, Apr 16, 2015 at 02:26:59PM +0100, Pablo de Lara wrote:
> Jenkins hash function was developed originally in 1996,
> and was integrated in first versions of DPDK.
> The function has been improved in 2006,
> achieving up to 60% better performance, compared to the original one.
> 
> Check out: http://burtleburtle.net/bob/c/lookup3.c
> 
> This patch integrates that code in the rte_jhash library,
> adding also a new function rte_jhash_word2,
> that returns two different hash values, for a single key.
> 

Should the addition of the new functionality not be a separate patch from the
update to the existing code?
Also, do the new functions return the exact same values as the previous 
versions,
just faster?

> Signed-off-by: Pablo de Lara <pablo.de.lara.guarch at intel.com>
> ---
>  lib/librte_hash/rte_jhash.h |  407 
> ++++++++++++++++++++++++++++++++++++-------
>  1 files changed, 347 insertions(+), 60 deletions(-)
> 
> diff --git a/lib/librte_hash/rte_jhash.h b/lib/librte_hash/rte_jhash.h
> index a4bf5a1..3de006d 100644
> --- a/lib/librte_hash/rte_jhash.h
> +++ b/lib/librte_hash/rte_jhash.h
> @@ -1,7 +1,7 @@
>  /*-
>   *   BSD LICENSE
>   *
> - *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   Copyright(c) 2010-2015 Intel Corporation. All rights reserved.
>   *   All rights reserved.
>   *
>   *   Redistribution and use in source and binary forms, with or without
> @@ -45,38 +45,51 @@ extern "C" {
>  #endif
>  
>  #include <stdint.h>
> +#include <rte_byteorder.h>
>  
>  /* jhash.h: Jenkins hash support.
>   *
> - * Copyright (C) 1996 Bob Jenkins (bob_jenkins at burtleburtle.net)
> + * Copyright (C) 2006 Bob Jenkins (bob_jenkins at burtleburtle.net)
>   *
>   * http://burtleburtle.net/bob/hash/
>   *
>   * These are the credits from Bob's sources:
>   *
> - * lookup2.c, by Bob Jenkins, December 1996, Public Domain.
> - * hash(), hash2(), hash3, and mix() are externally useful functions.
> - * Routines to test the hash are included if SELF_TEST is defined.
> - * You can use this free for any purpose.  It has no warranty.
> + * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
> + *
> + * These are functions for producing 32-bit hashes for hash table lookup.
> + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
> + * are externally useful functions.  Routines to test the hash are included
> + * if SELF_TEST is defined.  You can use this free for any purpose.  It's in
> + * the public domain.  It has no warranty.
>   *
>   * $FreeBSD$
>   */
>  
> +#define rot(x, k) (((x)<<(k)) | ((x)>>(32-(k))))
> +
>  /** @internal Internal function. NOTE: Arguments are modified. */
>  #define __rte_jhash_mix(a, b, c) do { \
> -     a -= b; a -= c; a ^= (c>>13); \
> -     b -= c; b -= a; b ^= (a<<8); \
> -     c -= a; c -= b; c ^= (b>>13); \
> -     a -= b; a -= c; a ^= (c>>12); \
> -     b -= c; b -= a; b ^= (a<<16); \
> -     c -= a; c -= b; c ^= (b>>5); \
> -     a -= b; a -= c; a ^= (c>>3); \
> -     b -= c; b -= a; b ^= (a<<10); \
> -     c -= a; c -= b; c ^= (b>>15); \
> +     a -= c; a ^= rot(c, 4); c += b; \
> +     b -= a; b ^= rot(a, 6); a += c; \
> +     c -= b; c ^= rot(b, 8); b += a; \
> +     a -= c; a ^= rot(c, 16); c += b; \
> +     b -= a; b ^= rot(a, 19); a += c; \
> +     c -= b; c ^= rot(b, 4); b += a; \
> +} while (0)
> +
> +#define __rte_jhash_final(a, b, c) do { \
> +     c ^= b; c -= rot(b, 14); \
> +     a ^= c; a -= rot(c, 11); \
> +     b ^= a; b -= rot(a, 25); \
> +     c ^= b; c -= rot(b, 16); \
> +     a ^= c; a -= rot(c, 4);  \
> +     b ^= a; b -= rot(a, 14); \
> +     c ^= b; c -= rot(b, 24); \
>  } while (0)
>  
>  /** The golden ratio: an arbitrary value. */
> -#define RTE_JHASH_GOLDEN_RATIO      0x9e3779b9
> +#define RTE_JHASH_GOLDEN_RATIO      0xdeadbeef
>  
>  /**
>   * The most generic version, hashes an arbitrary sequence
> @@ -95,42 +108,256 @@ extern "C" {
>  static inline uint32_t
>  rte_jhash(const void *key, uint32_t length, uint32_t initval)
>  {
> -     uint32_t a, b, c, len;
> -     const uint8_t *k = (const uint8_t *)key;
> -     const uint32_t *k32 = (const uint32_t *)key;
> +     uint32_t a, b, c;
> +     union {
> +             const void *ptr;
> +             size_t i;
> +     } u;
>  
> -     len = length;
> -     a = b = RTE_JHASH_GOLDEN_RATIO;
> -     c = initval;
> +     /* Set up the internal state */
> +     a = b = c = RTE_JHASH_GOLDEN_RATIO + ((uint32_t)length) + initval;
>  
> -     while (len >= 12) {
> -             a += k32[0];
> -             b += k32[1];
> -             c += k32[2];
> +     u.ptr = key;
>  
> -             __rte_jhash_mix(a,b,c);
> +     if ((u.i & 0x3) == 0) {
> +             const uint32_t *k = (const uint32_t *)key;
>  
> -             k += (3 * sizeof(uint32_t)), k32 += 3;
> -             len -= (3 * sizeof(uint32_t));
> -     }
> +             while (length > 12) {
> +                     a += k[0];
> +                     b += k[1];
> +                     c += k[2];
>  
> -     c += length;
> -     switch (len) {
> -             case 11: c += ((uint32_t)k[10] << 24);
> -             case 10: c += ((uint32_t)k[9] << 16);
> -             case 9 : c += ((uint32_t)k[8] << 8);
> -             case 8 : b += ((uint32_t)k[7] << 24);
> -             case 7 : b += ((uint32_t)k[6] << 16);
> -             case 6 : b += ((uint32_t)k[5] << 8);
> -             case 5 : b += k[4];
> -             case 4 : a += ((uint32_t)k[3] << 24);
> -             case 3 : a += ((uint32_t)k[2] << 16);
> -             case 2 : a += ((uint32_t)k[1] << 8);
> -             case 1 : a += k[0];
> -             default: break;
> -     };
> +                     __rte_jhash_mix(a, b, c);
> +
> +                     k += 3;
> +                     length -= 12;
> +             }
> +
> +             switch (length) {
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +             case 12:
> +                     c += k[2]; b += k[1]; a += k[0]; break;
> +             case 11:
> +                     c += k[2]&0xffffff; b += k[1]; a += k[0]; break;
> +             case 10:
> +                     c += k[2]&0xffff; b += k[1]; a += k[0]; break;
> +             case 9:
> +                     c += k[2]&0xff; b += k[1]; a += k[0]; break;
> +             case 8:
> +                     b += k[1]; a += k[0]; break;
> +             case 7:
> +                     b += k[1]&0xffffff; a += k[0]; break;
> +             case 6:
> +                     b += k[1]&0xffff; a += k[0]; break;
> +             case 5:
> +                     b += k[1]&0xff; a += k[0]; break;
> +             case 4:
> +                     a += k[0]; break;
> +             case 3:
> +                     a += k[0]&0xffffff; break;
> +             case 2:
> +                     a += k[0]&0xffff; break;
> +             case 1:
> +                     a += k[0]&0xff; break;
> +#else
> +             case 12:
> +                     c += k[2]; b += k[1]; a += k[0]; break;
> +             case 11:
> +                     c += k[2]&0xffffff00; b += k[1]; a += k[0]; break;
> +             case 10:
> +                     c += k[2]&0xffff0000; b += k[1]; a += k[0]; break;
> +             case 9:
> +                     c += k[2]&0xff000000; b += k[1]; a += k[0]; break;
> +             case 8:
> +                     b += k[1]; a += k[0]; break;
> +             case 7:
> +                     b += k[1]&0xffffff00; a += k[0]; break;
> +             case 6:
> +                     b += k[1]&0xffff0000; a += k[0]; break;
> +             case 5:
> +                     b += k[1]&0xff000000; a += k[0]; break;
> +             case 4:
> +                     a += k[0]; break;
> +             case 3:
> +                     a += k[0]&0xffffff00; break;
> +             case 2:
> +                     a += k[0]&0xffff0000; break;
> +             case 1:
> +                     a += k[0]&0xff000000; break;
> +#endif

Only the constants seem different in this block. Can we get rid of the
#ifdefs using rte_XX_to_cpu() calls instead?

> +             /* zero length strings require no mixing */
> +             case 0:
> +                     return c;
> +             };
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +     } else if ((u.i & 0x1) == 0) {
> +             /* read 16-bit chunks */
> +             const uint16_t *k = (const uint16_t *)key;
> +             const uint8_t  *k8;
> +
> +             /* all but last block: aligned reads and different mixing */
> +             while (length > 12) {
> +                     a += k[0] + (((uint32_t)k[1])<<16);
> +                     b += k[2] + (((uint32_t)k[3])<<16);
> +                     c += k[4] + (((uint32_t)k[5])<<16);
> +
> +                     __rte_jhash_mix(a, b, c);
> +
> +                     k += 6;
> +                     length -= 12;
> +             }
> +
> +             /* handle the last (probably partial) block */
> +             k8 = (const uint8_t *)k;
> +             switch (length) {
> +             case 12:
> +                     c += k[4]+(((uint32_t)k[5])<<16);
> +                     b += k[2]+(((uint32_t)k[3])<<16);
> +                     a += k[0]+(((uint32_t)k[1])<<16);
> +                     break;
> +             case 11:
> +                     /* fall through */
> +                     c += ((uint32_t)k8[10])<<16;
> +             case 10:
> +                     c += k[4];
> +                     b += k[2]+(((uint32_t)k[3])<<16);
> +                     a += k[0]+(((uint32_t)k[1])<<16);
> +                     break;
> +             case 9:
> +                     /* fall through */
> +                     c += k8[8];
> +             case 8:
> +                     b += k[2]+(((uint32_t)k[3])<<16);
> +                     a += k[0]+(((uint32_t)k[1])<<16);
> +                     break;
> +             case 7:
> +                     /* fall through */
> +                     b += ((uint32_t)k8[6])<<16;
> +             case 6:
> +                     b += k[2];
> +                     a += k[0]+(((uint32_t)k[1])<<16);
> +                     break;
> +             case 5:
> +                     /* fall through */
> +                     b += k8[4];
> +             case 4:
> +                     a += k[0]+(((uint32_t)k[1])<<16);
> +                     break;
> +             case 3:
> +                     /* fall through */
> +                     a += ((uint32_t)k8[2])<<16;
> +             case 2:
> +                     a += k[0];
> +                     break;
> +             case 1:
> +                     a += k8[0];
> +                     break;
> +             case 0:
> +                     /* zero length requires no mixing */
> +                     return c;
> +             }
> +#endif

No else block for this ifdef?

> +     } else {
> +             const uint8_t *k = (const uint8_t *)key;
> +
> +             /* all but the last block: affect some 32 bits of (a, b, c) */
> +             while (length > 12) {
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +                     a += k[0];
> +                     a += ((uint32_t)k[1])<<8;
> +                     a += ((uint32_t)k[2])<<16;
> +                     a += ((uint32_t)k[3])<<24;
> +                     b += k[4];
> +                     b += ((uint32_t)k[5])<<8;
> +                     b += ((uint32_t)k[6])<<16;
> +                     b += ((uint32_t)k[7])<<24;
> +                     c += k[8];
> +                     c += ((uint32_t)k[9])<<8;
> +                     c += ((uint32_t)k[10])<<16;
> +                     c += ((uint32_t)k[11])<<24;
> +#else
> +                     a += ((uint32_t)k[0])<<24;
> +                     a += ((uint32_t)k[1])<<16;
> +                     a += ((uint32_t)k[2])<<8;
> +                     a += ((uint32_t)k[3]);
> +                     b += ((uint32_t)k[4])<<24;
> +                     b += ((uint32_t)k[5])<<16;
> +                     b += ((uint32_t)k[6])<<8;
> +                     b += ((uint32_t)k[7]);
> +                     c += ((uint32_t)k[8])<<32;
> +                     c += ((uint32_t)k[9])<<16;
> +                     c += ((uint32_t)k[10])<<8;
> +                     c += ((uint32_t)k[11]);
> +#endif

Maybe find a better way to shorten/remove this #ifdef also. E.g. shorter
ifdef defining macros for the different shift amounts, 0, 8, 16, 24.

> +
> +                     __rte_jhash_mix(a, b, c);
>  
> -     __rte_jhash_mix(a,b,c);
> +                     k += 12;
> +                     length -= 12;
> +             }
> +
> +             /* last block: affect all 32 bits of (c) */
> +             /* all the case statements fall through */
> +             switch (length) {
> +#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN
> +             case 12:
> +                     c += ((uint32_t)k[11])<<24;
> +             case 11:
> +                     c += ((uint32_t)k[10])<<16;
> +             case 10:
> +                     c += ((uint32_t)k[9])<<8;
> +             case 9:
> +                     c += k[8];
> +             case 8:
> +                     b += ((uint32_t)k[7])<<24;
> +             case 7:
> +                     b += ((uint32_t)k[6])<<16;
> +             case 6:
> +                     b += ((uint32_t)k[5])<<8;
> +             case 5:
> +                     b += k[4];
> +             case 4:
> +                     a += ((uint32_t)k[3])<<24;
> +             case 3:
> +                     a += ((uint32_t)k[2])<<16;
> +             case 2:
> +                     a += ((uint32_t)k[1])<<8;
> +             case 1:
> +                     a += k[0];
> +             break;
> +#else
> +             case 12:
> +                     c += k[11];
> +             case 11:
> +                     c += ((uint32_t)k[10])<<8;
> +             case 10:
> +                     c += ((uint32_t)k[9])<<16;
> +             case 9:
> +                     c += ((uint32_t)k[8])<<24;
> +             case 8:
> +                     b += k[7];
> +             case 7:
> +                     b += ((uint32_t)k[6])<<8;
> +             case 6:
> +                     b += ((uint32_t)k[5])<<16;
> +             case 5:
> +                     b += ((uint32_t)k[4])<<24;
> +             case 4:
> +                     a += k[3];
> +             case 3:
> +                     a += ((uint32_t)k[2])<<8;
> +             case 2:
> +                     a += ((uint32_t)k[1])<<16;
> +             case 1:
> +                     a += ((uint32_t)k[0])<<24;
> +             break;
> +#endif
> +             case 0:
> +                     return c;
> +             }
> +     }
> +
> +     __rte_jhash_final(a, b, c);
>  
>       return c;
>  }
> @@ -151,33 +378,93 @@ rte_jhash(const void *key, uint32_t length, uint32_t 
> initval)
>  static inline uint32_t
>  rte_jhash2(const uint32_t *k, uint32_t length, uint32_t initval)
>  {
> -     uint32_t a, b, c, len;
> +     uint32_t a, b, c;
>  
> -     a = b = RTE_JHASH_GOLDEN_RATIO;
> -     c = initval;
> -     len = length;
> +     /* Set up the internal state */
> +     a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length)<<2) + initval;
>  
> -     while (len >= 3) {
> +     /* Handle most of the key */
> +     while (length > 3) {
>               a += k[0];
>               b += k[1];
>               c += k[2];
> +
>               __rte_jhash_mix(a, b, c);
> -             k += 3; len -= 3;
> -     }
>  
> -     c += length * 4;
> +             k += 3;
> +             length -= 3;
> +     }
>  
> -     switch (len) {
> -             case 2 : b += k[1];
> -             case 1 : a += k[0];
> -             default: break;
> +     /* Handle the last 3 uint32_t's */
> +     switch (length) {
> +     case 3:
> +             c += k[2];
> +     case 2:
> +             b += k[1];
> +     case 1:
> +             a += k[0];
> +             __rte_jhash_final(a, b, c);
> +     /* case 0: nothing left to add */
> +     case 0:
> +             break;
>       };
>  
> -     __rte_jhash_mix(a,b,c);
> -
>       return c;
>  }
>  
> +/**
> + * Same as rte_jhash2, but take two seeds and return two uint32_ts.
> + * pc and pb must be non-null, and *pc and *pb must both be initialized
> + * with seeds. If you pass in (*pb)=0, the output (*pc) will be
> + * the same as the return value from rte_jhash.
> + *
> + * @param k
> + *   Key to calculate hash of.
> + * @param length
> + *   Length of key in units of 4 bytes.
> + * @param pc
> + *   IN: seed OUT: primary hash value.
> + * @param pc
> + *   IN: second seed OUT: secondary hash value.
> + */
> +static inline void
> +rte_jhash_word2(const uint32_t *k, uint32_t length, uint32_t *pc, uint32_t 
> *pb)
> +{
> +     uint32_t a, b, c;
> +
> +     /* Set up the internal state */
> +     a = b = c = RTE_JHASH_GOLDEN_RATIO + (((uint32_t)length)<<2) + *pc;
> +     c += *pb;
> +
> +     /* Handle most of the key */
> +     while (length > 3) {
> +             a += k[0];
> +             b += k[1];
> +             c += k[2];
> +
> +             __rte_jhash_mix(a, b, c);
> +
> +             k += 3;
> +             length -= 3;
> +     }
> +
> +     /* Handle the last 3 uint32_t's */
> +     switch (length) {
> +     case 3:
> +             c += k[2];
> +     case 2:
> +             b += k[1];
> +     case 1:
> +             a += k[0];
> +             __rte_jhash_final(a, b, c);
> +     /* case 0: nothing left to add */
> +     case 0:
> +             break;
> +     };
> +
> +     *pc = c;
> +     *pb = b;
> +}
>  
>  /**
>   * A special ultra-optimized versions that knows it is hashing exactly
> -- 
> 1.7.4.1
> 

Reply via email to