Path: utzoo!utgpu!watserv1!watmath!att!rutgers!aramis.rutgers.edu!paul.rutgers.edu!gaynor From: gaynor@paul.rutgers.edu (Silver) Newsgroups: comp.lang.c Subject: Re: hash function for text in C Message-ID: Date: 18 Oct 90 05:13:09 GMT References: <1990Oct16.184951.513@watdragon.waterloo.edu> Organization: Rutgers Univ., New Brunswick, N.J. Lines: 124 You must supply definitions of uint8 (an 8-bit unsigned int), uint16 (similarly), and uint32 (likewise). ----------------------------------- hash.h ------------------------------------ /* This is an implementation of Peter K. Pearson's hash algorithm as */ /* presented in "Fast Hashing of Variable-Length Text Strings", ACM 6/90. */ /* The conclusion of his article is cited below. */ /* */ /* The main advantages of the hashing function presented here are: */ /* 1. No restriction is placed on the length of the text string. */ /* 2. The length of the text string does not need to be known beforehand. */ /* 3. Very little arithmetic is performed on each character being hashed. */ /* 4. Similar strings are not likely to collide. */ /* 5. Minimal, perfect hashing functions can be built in this form. */ /* [6. It's average distribution is quite random.] */ /* */ /* Its principal disadvantages are: */ /* 1. Output value ranges that are not powers of 2 are somwehat more */ /* complicated to provide. */ /* 2. More auxiliary memory (the 256-byte table T) is required by this */ /* hashing function than by many traditional functions. */ /* [3. I blew 50 cents copying the article and then had to type it in.] */ /* */ /* Elaborating on advantage 5 above, by fatootsing with the permutation */ /* table T, one can make hash8 perform perfect minimal hashes. The author */ /* provides the following guidelines for construction of such a table: */ /* */ /* [C[N] are the characters of the string being hashed and h[N] is the Nth */ /* iteration of the loop.] */ /* */ /* 1. A table T was consructed by pseudorandom permutation of the integers */ /* (0 ... 255). */ /* 2. One by one, the desired values were assigned to the words in the */ /* list. Each assignment was effected by exchanging two elemnts in the */ /* table. */ /* 3. For each word, the first candidate considered for exchange was */ /* T[h[n-1] ^ C[n]], the last table element referenced in the */ /* computation of the hash function for that word. */ /* 4. A table element could not be exchanged if it was referenced during */ /* the hashing of a previously assgined word or if it was referenced */ /* earlier in the hashing of the same word. */ /* 5. If the necessary exchange was forbidden by Rule 4, attention was */ /* shifted to the previously referenced table element, */ /* T[[h[n-2]] ^ C[n-1]]. */ /* */ /* The procedure is not always successful. For example, using the ascii */ /* character codes, if the word "a" hashes to 0 and the word "i" hashes to */ /* 15, it turns out that the word "in" must hash to 0. Initial attempts */ /* to map Knuth's 31 words onto the integers (0 ... 30) failed for exactly */ /* this reason. The shift to the range (1 ... 31) was an ad hoc tactic to */ /* circumvent this problem. */ /* Hash s into an 8, 16, or 32 bit unsigned integer. The larger-sized */ /* values are generated by applying hash8 in parallel. */ extern uint8 hash8(uint8 * s); extern uint16 hash16(uint8 * s); extern uint32 hash32(uint8 * s); /* The hash permutation table. */ extern uint32 * hashpermutation; ----------------------------------- hash.c ------------------------------------ #include "hash.h" static uint8 permutation[256] = { 51, 140, 233, 27, 118, 125, 170, 138, 119, 132, 174, 97, 25, 110, 1, 14, 65, 36, 40, 188, 73, 173, 7, 30, 68, 56, 169, 234, 107, 177, 197, 87, 28, 210, 186, 67, 2, 15, 115, 48, 223, 148, 211, 57, 190, 104, 213, 49, 144, 172, 147, 124, 157, 238, 167, 183, 78, 75, 58, 22, 70, 103, 181, 12, 254, 41, 198, 168, 46, 79, 241, 156, 83, 128, 66, 60, 86, 141, 161, 176, 221, 54, 192, 252, 116, 95, 206, 35, 88, 133, 154, 250, 237, 253, 85, 178, 93, 159, 155, 42, 9, 89, 3, 61, 201, 158, 106, 82, 240, 255, 218, 102, 189, 8, 33, 4, 145, 16, 150, 26, 99, 100, 195, 175, 34, 50, 80, 166, 194, 195, 164, 29, 134, 105, 55, 143, 122, 130, 245, 208, 72, 77, 64, 121, 139, 232, 191, 108, 228, 137, 59, 74, 11, 126, 171, 5, 242, 101, 239, 193, 112, 113, 98, 21, 207, 225, 151, 251, 92, 91, 17, 127, 20, 81, 24, 6, 43, 196, 204, 247, 212, 224, 220, 94, 32, 13, 187, 199, 214, 18, 226, 84, 71, 231, 165, 19, 202, 217, 90, 129, 136, 153, 182, 111, 244, 45, 236, 249, 109, 47, 180, 205, 215, 160, 53, 162, 114, 246, 179, 62, 227, 96, 142, 230, 184, 146, 117, 39, 69, 37, 23, 63, 52, 216, 0, 135, 149, 31, 38, 44, 209, 120, 76, 203, 229, 123, 131, 152, 10, 219, 243, 248, 235, 222, 200, 163 }; uint8 hash8(register uint8 * s) {register static uint8 h; if (*s) {h = permutation[*s]; while (*++s) h ^= permutation[h ^ *s]; return(h);} else {return((uint8)0);} uint16 hash16(register uint8 * s); {register uint8 h1; register uint8 h2; if (*s) {h1 = permutation[*s]; h2 = permutation[*s + 1]; while (*++s) {h1 ^= permutation[h1 ^ *s]; h2 ^= permutation[h2 ^ *s];} return((uint16)((h1<<8) & h2));} else {return((uint16)0);}} uint32 hash32(register uint8 * s); {register uint8 h1; register uint8 h2; register uint8 h3; register uint8 h4; if (*s) {h1 = permutation[*s] h2 = permutation[*s + 1]; h2 = permutation[*s + 2]; h2 = permutation[*s + 3]; while (*++s) {h1 ^= permutation[h1 ^ *s]; h2 ^= permutation[h2 ^ *s]; h3 ^= permutation[h3 ^ *s]; h4 ^= permutation[h4 ^ *s];} return((uint32)((h1<<24) & (h2<<16) & (h3<<8) & h4));} else {return((uint32)0);}}