ruby-changes:4253

matz	2008-03-11 10:20:25 +0900 (Tue, 11 Mar 2008)

  New Revision: 15743

  Modified files:
    trunk/ChangeLog
    trunk/string.c

  Log:
    * string.c (hash): replaced by MurmurHash described in
      <http://murmurhash.googlepages.com/>.

  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=15743&r2=15742&diff_format=u
  http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=15743&r2=15742&diff_format=u

Index: ChangeLog
===================================================================
--- ChangeLog	(revision 15742)
+++ ChangeLog	(revision 15743)
@@ -1,3 +1,8 @@
+Tue Mar 11 10:19:10 2008  Yukihiro Matsumoto  <matz@r...>
+
+	* string.c (hash): replaced by MurmurHash described in
+	  <http://murmurhash.googlepages.com/>.
+
 Tue Mar 11 09:52:49 2008  Yukihiro Matsumoto  <matz@r...>
 
 	* string.c (rb_str_comparable): empty strings in any encoding are
Index: string.c
===================================================================
--- string.c	(revision 15742)
+++ string.c	(revision 15743)
@@ -1685,122 +1685,41 @@
     return rb_str_append(str1, str2);
 }
 
-typedef  unsigned int  ub4;   /* unsigned 4-byte quantities */
-typedef  unsigned char ub1;   /* unsigned 1-byte quantities */
+/* MurmurHash described in http://murmurhash.googlepages.com/ */
+unsigned int
+hash(const unsigned char * data, int len, unsigned int h)
+{
+    const unsigned int m = 0x7fd652ad;
+    const int r = 16;
 
-#define hashsize(n) ((ub4)1<<(n))
-#define hashmask(n) (hashsize(n)-1)
+    h += 0xdeadbeef;
 
-/*
---------------------------------------------------------------------
-mix -- mix 3 32-bit values reversibly.
-For every delta with one or two bits set, and the deltas of all three
-  high bits or all three low bits, whether the original value of a,b,c
-  is almost all zero or is uniformly distributed,
-* If mix() is run forward or backward, at least 32 bits in a,b,c
-  have at least 1/4 probability of changing.
-* If mix() is run forward, every bit of c will change between 1/3 and
-  2/3 of the time.  (Well, 22/100 and 78/100 for some 2-bit deltas.)
-mix() was built out of 36 single-cycle latency instructions in a 
-  structure that could supported 2x parallelism, like so:
-      a -= b; 
-      a -= c; x = (c>>13);
-      b -= c; a ^= x;
-      b -= a; x = (a<<8);
-      c -= a; b ^= x;
-      c -= b; x = (b>>13);
-      ...
-  Unfortunately, superscalar Pentiums and Sparcs can't take advantage 
-  of that parallelism.  They've also turned some of those single-cycle
-  latency instructions into multi-cycle latency instructions.  Still,
-  this is the fastest good hash I could find.  There were about 2^^68
-  to choose from.  I only looked at a billion or so.
---------------------------------------------------------------------
-*/
-#define mix(a,b,c) \
-{ \
-  a -= b; a -= c; a ^= (c>>13); \
-  b -= c; b -= a; b ^= (a<<8); \
-  c -= a; c -= b; c ^= (b>>13); \
-  a -= b; a -= c; a ^= (c>>12);  \
-  b -= c; b -= a; b ^= (a<<16); \
-  c -= a; c -= b; c ^= (b>>5); \
-  a -= b; a -= c; a ^= (c>>3);  \
-  b -= c; b -= a; b ^= (a<<10); \
-  c -= a; c -= b; c ^= (b>>15); \
-}
+    while(len >= 4) {
+	h += *(unsigned int *)data;
+	h *= m;
+	h ^= h >> r;
 
-/*
---------------------------------------------------------------------
-hash() -- hash a variable-length key into a 32-bit value
-  k       : the key (the unaligned variable-length array of bytes)
-  len     : the length of the key, counting by bytes
-  initval : can be any 4-byte value
-Returns a 32-bit value.  Every bit of the key affects every bit of
-the return value.  Every 1-bit and 2-bit delta achieves avalanche.
-About 6*len+35 instructions.
+	data += 4;
+	len -= 4;
+    }
 
-The best hash table sizes are powers of 2.  There is no need to do
-mod a prime (mod is sooo slow!).  If you need less than 32 bits,
-use a bitmask.  For example, if you need only 10 bits, do
-  h = (h & hashmask(10));
-In which case, the hash table should have hashsize(10) elements.
+    switch(len) {
+      case 3:
+	h += data[2] << 16;
+      case 2:
+	h += data[1] << 8;
+      case 1:
+	h += data[0];
+	h *= m;
+	h ^= h >> r;
+    };
 
-If you are hashing n strings (ub1 **)k, do it like this:
-  for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
+    h *= m;
+    h ^= h >> 10;
+    h *= m;
+    h ^= h >> 17;
 
-By Bob Jenkins, 1996.  bob_jenkins@b...  You may use this
-code any way you wish, private, educational, or commercial.  It's free.
-
-See http://burtleburtle.net/bob/hash/evahash.html
-Use for hash table lookup, or anything where one collision in 2^^32 is
-acceptable.  Do NOT use for cryptographic purposes.
---------------------------------------------------------------------
-*/
-
-static ub4
-hash(const ub1 *k, ub4 length, ub4 initval)
-    /* k: the key */
-    /* length: the length of the key */
-    /* initval: the previous hash, or an arbitrary value */
-{
-    register ub4 a,b,c,len;
-
-    /* Set up the internal state */
-    len = length;
-    a = b = 0x9e3779b9;  /* the golden ratio; an arbitrary value */
-    c = initval;         /* the previous hash value */
-
-   /*---------------------------------------- handle most of the key */
-    while (len >= 12) {
-	a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24));
-	b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24));
-	c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24));
-	mix(a,b,c);
-	k += 12; len -= 12;
-    }
-
-    /*------------------------------------- handle the last 11 bytes */
-    c += length;
-    switch(len)              /* all the case statements fall through */
-    {
-      case 11: c+=((ub4)k[10]<<24);
-      case 10: c+=((ub4)k[9]<<16);
-      case 9 : c+=((ub4)k[8]<<8);
-	/* the first byte of c is reserved for the length */
-      case 8 : b+=((ub4)k[7]<<24);
-      case 7 : b+=((ub4)k[6]<<16);
-      case 6 : b+=((ub4)k[5]<<8);
-      case 5 : b+=k[4];
-      case 4 : a+=((ub4)k[3]<<24);
-      case 3 : a+=((ub4)k[2]<<16);
-      case 2 : a+=((ub4)k[1]<<8);
-      case 1 : a+=k[0];
-	/* case 0: nothing left to add */
-    }
-    mix(a,b,c);
-    /*-------------------------------------------- report the result */
-    return c;
+    return h;
 }
 
 int

--
ML: ruby-changes@q...
Info: http://www.atdot.net/~ko1/quickml/