Path: utzoo!attcan!uunet!munnari.oz.au!csc.anu.oz.au!csc3.anu.oz.au!anucsd!neccan!peter From: peter@neccan.oz (Peter Miller) Newsgroups: comp.lang.c Subject: Re: count of bits in a long Keywords: C Message-ID: <826@neccan.oz> Date: 5 Oct 90 02:17:33 GMT Organization: NEC Information Systems Australia, Canberra Lines: 336 /* * A while back I posted a solution to the "count of bits set in a * long" question, suggesting that HACKMEM 169 was a very fast way to do it. * This program demonstrates the technique. * * The HACKMEM 169 method uses a modulo to sum the digits. * A number of people have asked me "how can a division be faster than a loop?" * This program demonstrates how. The modulo is a special case, * and can be unwound into a tight loop if your machine does not * have a fast divide opcode (although many modern machines have a 1 cycle * divide opcode, except for we-really-believe-our-press-hype risc * manufacturers). * * This program was run on a 386/20 under unix with the following results: * Func sec/iter scaled * nbits 0.00018310547 1.000 hackmem 169 * nbits1 0.0006980896 3.812 hackmem 169 no % operator * nbits2 0.0016822815 9.188 count lsb's loop * nbits3 0.0037384033 20.417 * nbits4 0.0034370422 18.771 * nbits5 0.0037765503 20.625 * nbits6 0.0035247803 19.250 */ #include #include #include #define SIZEOF(a) (sizeof(a)/sizeof(a[0])) /* * This function is used to calibrate the timing mechanism. * This way we can subtract the loop and call overheads. */ int calibrate(n) register unsigned long n; { return 0; } /* * This function counts the number of bits in a long. * It is limited to 63 bit longs, but a minor mod can cope with 511 bits. * * It is so magic, an explanation is required: * Consider a 3 bit number as being * 4a+2b+c * if we shift it right 1 bit, we have * 2a+b * subtracting this from the original gives * 2a+b+c * if we shift the original 2 bits right we get * a * and so with another subtraction we have * a+b+c * which is the number of bits in the original number. * Suitable masking allows the sums of the octal digits in a * 32 bit bumber to appear in each octal digit. This isn't much help * unless we can get all of them summed together. * This can be done by modulo arithmetic (sum the digits in a number by molulo * the base of the number minus one) the old "cating out nines" trick they * taught in school before calculators were invented. * Now, using mod 7 wont help us, because our number will very likely have * more than 7 bits set. So add the octal digits together to get base64 * digits, and use modulo 63. * (Those of you with 64 bit machines need to add 3 octal digits together to * get base512 digits, and use mod 511.) * * This is HACKMEM 169, as used in X11 sources. */ int nbits(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); return ((tmp + (tmp >> 3)) & 030707070707) % 63; } /* * This is the same as the above, but does not use the % operator. * Most modern machines have clockless division, and so the modulo is as * expensive as, say, an addition. */ int nbits1(n) register unsigned long n; { register unsigned long tmp; tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111); tmp = (tmp + (tmp >> 3)) & 030707070707; while (tmp > 63) tmp = (tmp & 63) + (tmp >> 6); return tmp; } /* * This function counts the bits in a long. * * It removes the lsb and counting the number of times round the loop. * The expression (n & -n) yields the lsb of a number, * but it only works on 2's compliment machines. */ int nbits2(n) register unsigned long n; { register int count; for (count = 0; n; n -= (n & -n)) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number left and testing the top bit. * On many machines shift is expensive, so it uses a cheap addition instead. */ int nbits3(n) register unsigned long n; { register int count; for (count = 0; n; n += n) if (n & ~(~(unsigned long)0 >> 1)) count++; return count; } /* * This function counts the bits in a long. * * It works by shifting the number down and testing the bottom bit. */ int nbits4(n) register unsigned long n; { register int count; for (count = 0; n; n >>= 1) if (n & 1) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the second most intuitively obvious method, * and is independent of the number of bits in the long. */ int nbits5(n) register unsigned long n; { register int count; register unsigned long mask; count = 0; for (mask = 1; mask; mask += mask) if (n & mask) count++; return count; } /* * This function counts the bits in a long. * * It works by masking each bit. * This is the most intuitively obvious method, * but how do you a priori know how many bits in the long? * (except for ''sizeof(long) * CHAR_BITS'' expression) */ int nbits6(n) register unsigned long n; { register int count; register int bit; count = 0; for (bit = 0; bit < 32; ++bit) if (n & ((unsigned long)1 << bit)) count++; return count; } /* * build a long random number. * The standard rand() returns a 15bit number. */ unsigned long bigrand() { return ( ((unsigned long)(rand() & 0xFFF) << 24) | ((unsigned long)(rand() & 0xFFF) << 12) | (unsigned long)(rand() & 0xFFF) ); } /* * Run the test many times. * You will need a running time in the 10s of seconds * for an accurate result. * * To give a fair comparison, the random number generator * is seeded the same for each measurement. * * Return value is seconds per iteration. */ #define REPEAT (1L<<18) double measure(func) int (*func)(); { long j; struct tms start; struct tms finish; srand(1); times(&start); for (j = 0; j < REPEAT; ++j) func(bigrand()); times(&finish); return ((finish.tms_utime - start.tms_utime) / (double)REPEAT); } struct table { char *name; int (*func)(); double measurement; int trial; }; struct table table[] = { { "nbits", nbits }, { "nbits1", nbits1 }, { "nbits2", nbits2 }, { "nbits3", nbits3 }, { "nbits4", nbits4 }, { "nbits5", nbits3 }, { "nbits6", nbits4 }, }; main() { double calibration; double best; int j, k, m; /* * run a few tests to make sure they all agree */ srand(getpid()); for (j = 0; j < 10; ++j) { unsigned long n; int bad; n = bigrand(); for (k = 0; k < SIZEOF(table); ++k) table[k].trial = table[k].func(n); bad = 0; for (k = 0; k < SIZEOF(table); ++k) for (m = 0; m < SIZEOF(table); ++m) if (table[k].trial != table[m].trial) bad = 1; if (bad) { printf("wrong:\t%08lX", n); for (k = 0; k < SIZEOF(table); ++k) printf("\t%d", table[k].trial); printf("\n"); } } /* * calibrate the timing mechanism */ calibration = measure(calibrate); /* * time them all, keeping track of the best (smallest) */ for (j = 0; j < SIZEOF(table); ++j) { table[j].measurement = measure(table[j].func) - calibration; if (!j || table[j].measurement < best) best = table[j].measurement; } for (j = 0; j < SIZEOF(table); ++j) { printf ( "%s\t%10.8g\t%6.3f\n", table[j].name, table[j].measurement, table[j].measurement / best ); } exit(0); } Regards Peter Miller UUCP uunet!munnari!pdact.pd.necisa.oz!pmiller ARPA pmiller%pdact.pd.necisa.oz@australia /\/\* CSNET pmiller%pdact.pd.necisa.oz@au ACSnet pmiller@pdact.pd.necisa.oz Disclaimer: The views expressed here are personal and do not necessarily reflect the view of my employer or the views or my colleagues. D