ruby-changes:4253
From: ko1@a...
Date: Tue, 11 Mar 2008 11:00:40 +0900 (JST)
Subject: [ruby-changes:4253] matz - Ruby:r15743 (trunk): * string.c (hash): replaced by MurmurHash described in <http://murmurhash.googlepages.com/>.
matz 2008-03-11 10:20:25 +0900 (Tue, 11 Mar 2008) New Revision: 15743 Modified files: trunk/ChangeLog trunk/string.c Log: * string.c (hash): replaced by MurmurHash described in <http://murmurhash.googlepages.com/>. http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/string.c?r1=15743&r2=15742&diff_format=u http://svn.ruby-lang.org/cgi-bin/viewvc.cgi/trunk/ChangeLog?r1=15743&r2=15742&diff_format=u Index: ChangeLog =================================================================== --- ChangeLog (revision 15742) +++ ChangeLog (revision 15743) @@ -1,3 +1,8 @@ +Tue Mar 11 10:19:10 2008 Yukihiro Matsumoto <matz@r...> + + * string.c (hash): replaced by MurmurHash described in + <http://murmurhash.googlepages.com/>. + Tue Mar 11 09:52:49 2008 Yukihiro Matsumoto <matz@r...> * string.c (rb_str_comparable): empty strings in any encoding are Index: string.c =================================================================== --- string.c (revision 15742) +++ string.c (revision 15743) @@ -1685,122 +1685,41 @@ return rb_str_append(str1, str2); } -typedef unsigned int ub4; /* unsigned 4-byte quantities */ -typedef unsigned char ub1; /* unsigned 1-byte quantities */ +/* MurmurHash described in http://murmurhash.googlepages.com/ */ +unsigned int +hash(const unsigned char * data, int len, unsigned int h) +{ + const unsigned int m = 0x7fd652ad; + const int r = 16; -#define hashsize(n) ((ub4)1<<(n)) -#define hashmask(n) (hashsize(n)-1) + h += 0xdeadbeef; -/* --------------------------------------------------------------------- -mix -- mix 3 32-bit values reversibly. -For every delta with one or two bits set, and the deltas of all three - high bits or all three low bits, whether the original value of a,b,c - is almost all zero or is uniformly distributed, -* If mix() is run forward or backward, at least 32 bits in a,b,c - have at least 1/4 probability of changing. -* If mix() is run forward, every bit of c will change between 1/3 and - 2/3 of the time. 
(Well, 22/100 and 78/100 for some 2-bit deltas.) -mix() was built out of 36 single-cycle latency instructions in a - structure that could supported 2x parallelism, like so: - a -= b; - a -= c; x = (c>>13); - b -= c; a ^= x; - b -= a; x = (a<<8); - c -= a; b ^= x; - c -= b; x = (b>>13); - ... - Unfortunately, superscalar Pentiums and Sparcs can't take advantage - of that parallelism. They've also turned some of those single-cycle - latency instructions into multi-cycle latency instructions. Still, - this is the fastest good hash I could find. There were about 2^^68 - to choose from. I only looked at a billion or so. --------------------------------------------------------------------- -*/ -#define mix(a,b,c) \ -{ \ - a -= b; a -= c; a ^= (c>>13); \ - b -= c; b -= a; b ^= (a<<8); \ - c -= a; c -= b; c ^= (b>>13); \ - a -= b; a -= c; a ^= (c>>12); \ - b -= c; b -= a; b ^= (a<<16); \ - c -= a; c -= b; c ^= (b>>5); \ - a -= b; a -= c; a ^= (c>>3); \ - b -= c; b -= a; b ^= (a<<10); \ - c -= a; c -= b; c ^= (b>>15); \ -} + while(len >= 4) { + h += *(unsigned int *)data; + h *= m; + h ^= h >> r; -/* --------------------------------------------------------------------- -hash() -- hash a variable-length key into a 32-bit value - k : the key (the unaligned variable-length array of bytes) - len : the length of the key, counting by bytes - initval : can be any 4-byte value -Returns a 32-bit value. Every bit of the key affects every bit of -the return value. Every 1-bit and 2-bit delta achieves avalanche. -About 6*len+35 instructions. + data += 4; + len -= 4; + } -The best hash table sizes are powers of 2. There is no need to do -mod a prime (mod is sooo slow!). If you need less than 32 bits, -use a bitmask. For example, if you need only 10 bits, do - h = (h & hashmask(10)); -In which case, the hash table should have hashsize(10) elements. 
+ switch(len) { + case 3: + h += data[2] << 16; + case 2: + h += data[1] << 8; + case 1: + h += data[0]; + h *= m; + h ^= h >> r; + }; -If you are hashing n strings (ub1 **)k, do it like this: - for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h); + h *= m; + h ^= h >> 10; + h *= m; + h ^= h >> 17; -By Bob Jenkins, 1996. bob_jenkins@b... You may use this -code any way you wish, private, educational, or commercial. It's free. - -See http://burtleburtle.net/bob/hash/evahash.html -Use for hash table lookup, or anything where one collision in 2^^32 is -acceptable. Do NOT use for cryptographic purposes. --------------------------------------------------------------------- -*/ - -static ub4 -hash(const ub1 *k, ub4 length, ub4 initval) - /* k: the key */ - /* length: the length of the key */ - /* initval: the previous hash, or an arbitrary value */ -{ - register ub4 a,b,c,len; - - /* Set up the internal state */ - len = length; - a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ - c = initval; /* the previous hash value */ - - /*---------------------------------------- handle most of the key */ - while (len >= 12) { - a += (k[0] +((ub4)k[1]<<8) +((ub4)k[2]<<16) +((ub4)k[3]<<24)); - b += (k[4] +((ub4)k[5]<<8) +((ub4)k[6]<<16) +((ub4)k[7]<<24)); - c += (k[8] +((ub4)k[9]<<8) +((ub4)k[10]<<16)+((ub4)k[11]<<24)); - mix(a,b,c); - k += 12; len -= 12; - } - - /*------------------------------------- handle the last 11 bytes */ - c += length; - switch(len) /* all the case statements fall through */ - { - case 11: c+=((ub4)k[10]<<24); - case 10: c+=((ub4)k[9]<<16); - case 9 : c+=((ub4)k[8]<<8); - /* the first byte of c is reserved for the length */ - case 8 : b+=((ub4)k[7]<<24); - case 7 : b+=((ub4)k[6]<<16); - case 6 : b+=((ub4)k[5]<<8); - case 5 : b+=k[4]; - case 4 : a+=((ub4)k[3]<<24); - case 3 : a+=((ub4)k[2]<<16); - case 2 : a+=((ub4)k[1]<<8); - case 1 : a+=k[0]; - /* case 0: nothing left to add */ - } - mix(a,b,c); - /*-------------------------------------------- 
report the result */ - return c; + return h; } int -- ML: ruby-changes@q... Info: http://www.atdot.net/~ko1/quickml/