Path: utzoo!utgpu!news-server.csri.toronto.edu!cs.utexas.edu!uunet!timbuk!cs.umn.edu!quest!zeno!gene
From: gene@zeno.mn.org (Gene H. Olson)
Newsgroups: comp.lang.c
Subject: Re: count of bits in a long
Keywords: C
Message-ID: <1990Oct14.183932.14482@zeno.mn.org>
Date: 14 Oct 90 18:39:32 GMT
References: <826@neccan.oz> <26881@mimsy.umd.edu>
Organization: Smartware Consulting
Lines: 527

chris@mimsy.umd.edu (Chris Torek) writes:
>In article <826@neccan.oz> peter@neccan.oz (Peter Miller) sends out a
>program to time various bit-counting functions.  I decided to `improve'
>the program a bit :-) and add a number of additional functions.  I have
>run the resulting program (which I will append below) on a number of
>architectures.  The `mod' version is never the fastest of the hackmem
>variants.  The fastest function is always table lookup, except on the
>Sun-4.

I also posted an "improved" version of this program that got
horribly erratic and odd results.

Looking at all this closely, it seems the erratic results on this
"test" are due two 3 problems:

1)	The original program had a bug in the procedure table
	which tested the wrong procedures.  Both Chris and I
	fixed this.

2)	The original program neglected to return a value in
	procedure bigrand().  On the Sun4, this means all the
	"random" data was zero.  This kinda skewed the results.

	I just fixed this by putting in the return().

3)	The bigrand() procedure was called every invokation of
	the test procedure.  Since for the fast routines it takes
	*MUCH* longer to generate these numbers than to count the
	bits, the noise level is extreme.

	I just fixed this by creating an array once, and then
	referencing it.  The test runs about 6 times faster now,
	and it is much more consistent.

So the bottom line is that most of the results posted so far are
largely nonsense.  So, in the spirit of throwing good net bandwidth
after bad,  I am posting the blasted thing again.   I hope this
discussion is useful to *someone*.

Earlier I had added a test now called "suminplace" to the list
which I believed was faster than any other way of doing it.
(Except maybe a 16-bit lookup table).  It turns out, that
an 8-bit lookup table beats it.

Blast the torpedos, I still like my method best.  Maybe
it will run faster on another machine or another compiler.
It certainly is smaller.

>>>>> SparcStation results:

function                   time          ratio
hackmemmod               0.00093555450  16.082
hackmemloop              0.00013732910   2.361
hackmemunrolled          9.0599060e-05   1.557
rmlsbsub                 0.00032043457   5.508
rmlsbmask                0.00026893616   4.623
testlsb                  0.00064945221  11.164
testmsb                  0.00065517426  11.262
testsignandshift         0.00065326691  11.230
testeachbit              0.00067424774  11.590
testeachbit1shl          0.00086975098  14.951
tableshift               5.8174133e-05   1.000
tableuchar               8.2015991e-05   1.410
suminplace               6.6757202e-05   1.148

>>>>> 386 results:

function                   time          ratio
hackmemmod               0.00022506714   1.553
hackmemloop              0.00039672852   2.737
hackmemunrolled          0.00025939941   1.789
rmlsbsub                 0.00074005127   5.105
rmlsbmask                0.00067138672   4.632
testlsb                  0.0016098022   11.105
testmsb                  0.0015449524   10.658
testsignandshift         0.0015525818   10.711
testeachbit              0.0015563965   10.737
testeachbit1shl          0.0020790100   14.342
tableshift               0.00014877319   1.026
tableuchar               0.00014495850   1.000
suminplace               0.00014877319   1.026


Perhaps Chris will find it worth his time to run the thing
on his list of machines.

-------------------- Yet another posting ----------------------

#include <stdio.h>
#include <sys/types.h>

#ifdef FD_SETSIZE
# define USE_GETRUSAGE
# include <sys/time.h>
# include <sys/resource.h>
#else
# include <sys/times.h>
#endif

#define SIZEOF(a) (sizeof(a)/sizeof(a[0]))

#define NUMRAND	16384
unsigned long rr[NUMRAND] ;


/*
 * This function is used to calibrate the timing mechanism.
 * This way we can subtract the loop and call overheads.
 */
int
calibrate(n)
	register unsigned long n;
{
	return 0;
}


/*
 * This function counts the number of bits in a long.
 * It is limited to 63 bit longs, but a minor mod can cope with 511 bits.
 *
 * It is so magic, an explanation is required:
 * Consider a 3 bit number as being
 *	4a+2b+c
 * if we shift it right 1 bit, we have
 *	2a+b
 * subtracting this from the original gives
 *	2a+b+c
 * if we shift the original 2 bits right we get
 *	a
 * and so with another subtraction we have
 *	a+b+c
 * which is the number of bits in the original number.
 * Suitable masking allows the sums of the octal digits in a
 * 32 bit bumber to appear in each octal digit.  This isn't much help
 * unless we can get all of them summed together.
 * This can be done by modulo arithmetic (sum the digits in a number by molulo
 * the base of the number minus one) the old "cating out nines" trick they
 * taught in school before calculators were invented.
 * Now, using mod 7 wont help us, because our number will very likely have
 * more than 7 bits set.  So add the octal digits together to get base64
 * digits, and use modulo 63.
 * (Those of you with 64 bit machines need to add 3 octal digits together to
 * get base512 digits, and use mod 511.)
 *
 * This is HACKMEM 169, as used in X11 sources.
 */
int
t0_hackmemmod(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	return ((tmp + (tmp >> 3)) & 030707070707) % 63;
}


/*
 * This is the same as the above, but does not use the % operator.
 * Most modern machines have clockless division, and so the modulo is as
 * expensive as, say, an addition.
 */
int
t1_hackmemloop(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	while (tmp > 63)
		tmp = (tmp & 63) + (tmp >> 6);
	return tmp;
}

/*
 * Above, without using while loop.  It takes at most 5 iterations of the
 * loop, so we just do all 5 in-line.  The final result is never 63
 * (this is assumed above as well).
 */
int
t2_hackmemunrolled(n)
	register unsigned long n;
{
	register unsigned long tmp;

	tmp = n - ((n >> 1) & 033333333333) - ((n >> 2) & 011111111111);
	tmp = (tmp + (tmp >> 3)) & 030707070707;
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	tmp = (tmp & 63) + (tmp >> 6);
	return (tmp);
}


/*
 * This function counts the bits in a long.
 *
 * It removes the lsb and counting the number of times round the loop.
 * The expression (n & -n) yields the lsb of a number,
 * but it only works on 2's compliment machines.
 */
int
t3_rmlsbsub(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n -= (n & -n))
		count++;
	return count;
}

int
t4_rmlsbmask(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; count++)
		n &= n - 1;	/* take away lsb */
	return (count);
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number down and testing the bottom bit.
 */
int
t5_testlsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n >>= 1)
		if (n & 1)
			count++;
	return count;
}

/*
 * This function counts the bits in a long.
 *
 * It works by shifting the number left and testing the top bit.
 * On many machines shift is expensive, so it uses a cheap addition instead.
 */
int
t6_testmsb(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n += n)
		if (n & ~(~(unsigned long)0 >> 1))
			count++;
	return count;
}

int
t7_testsignandshift(n)
	register unsigned long n;
{
	register int count;

	for (count = 0; n; n <<= 1)
		if ((long)n < 0)
			count++;
	return (count);
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the second most intuitively obvious method,
 * and is independent of the number of bits in the long.
 */
int
t8_testeachbit(n)
	register unsigned long n;
{
	register int count;
	register unsigned long mask;

	count = 0;
	for (mask = 1; mask; mask += mask)
		if (n & mask)
			count++;
	return count;
}


/*
 * This function counts the bits in a long.
 *
 * It works by masking each bit.
 * This is the most intuitively obvious method,
 * but how do you a priori know how many bits in the long?
 * (except for ''sizeof(long) * CHAR_BITS'' expression)
 */
int
t9_testeachbit1shl(n)
	register unsigned long n;
{
	register int count;
	register int bit;

	count = 0;
	for (bit = 0; bit < 32; ++bit)
		if (n & ((unsigned long)1 << bit))
			count++;
	return count;
}

static char nbits[256] = {
	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};

int
tA_tableshift(n)
	register unsigned long n;
{

	return (nbits[n & 0xff] + nbits[(n >> 8) & 0xff] +
		nbits[(n >> 16) & 0xff] + nbits[n >> 24]);
}

int
tB_tableuchar(n)
	unsigned long n;
{
	register unsigned char *p = (unsigned char *)&n;

	return (nbits[p[0]] + nbits[p[1]] + nbits[p[2]] + nbits[p[3]]);
}


int
tC_suminplace(n)
	register unsigned long n ;
{
	n = ((n >> 1) & 0x55555555) + (n & 0x55555555) ;
	n = ((n >> 2) & 0x33333333) + (n & 0x33333333) ;
	n = ((n >> 4) + n) & 0x0F0F0F0F ;
	n = ((n >> 8) + n) ;
	return ((n >> 16) + n) & 0xFF ;
}


/*
 * build a long random number.
 * The standard rand() returns at least a 15 bit number.
 * We use the top 9 of 15 (since the lower N bits of the usual rand()
 * repeat with a period of 2^N).
 */
unsigned long
bigrand()
{
#define randbits() ((unsigned long)((rand() >> 6) & 0777))
	register int r;

	r = randbits() << 27;
	r |= randbits() << 18;
	r |= randbits() << 9;
	r |= randbits();

	return(r) ;
}

/*
 * Run the test many times.
 * You will need a running time in the 10s of seconds
 * for an accurate result.
 *
 * To give a fair comparison, the random number generator
 * is seeded the same for each measurement.
 *
 * Return value is seconds per iteration.
 */
#if defined(mips) || defined(sparc)
#define REPEAT (1L<<20)
#else
#define REPEAT (1L<<18)
#endif

double
measure(func)
	register int	(*func)();
{
#ifdef USE_GETRUSAGE
	struct rusage ru0, ru1;
	register long j;

	(void) getrusage(RUSAGE_SELF, &ru0);
	for (j = 0; j < REPEAT; ++j)
		func(rr[j & (NUMRAND-1)]);
	(void) getrusage(RUSAGE_SELF, &ru1);
	ru1.ru_utime.tv_sec -= ru0.ru_utime.tv_sec;
	if ((ru1.ru_utime.tv_usec -= ru0.ru_utime.tv_usec) < 0) {
		ru1.ru_utime.tv_usec += 1000000;
		ru1.ru_utime.tv_sec--;
	}
	return ((ru1.ru_utime.tv_sec + (ru1.ru_utime.tv_usec / 1000000.0)) /
		(double)REPEAT);
#else
	register long	j;
	struct tms	start;
	struct tms	finish;

	times(&start);
	for (j = 0; j < REPEAT; ++j)
		func(rr[j & (NUMRAND-1)]);
	times(&finish);
	return ((finish.tms_utime - start.tms_utime) / (double)REPEAT);
#endif
}

struct table {
	char	*name;
	int	(*func)();
	double	measurement;
	int	trial;
};

struct table	table[] =
{
	{ "hackmemmod",		t0_hackmemmod },
	{ "hackmemloop", 	t1_hackmemloop },
	{ "hackmemunrolled",	t2_hackmemunrolled },
	{ "rmlsbsub", 		t3_rmlsbsub },
	{ "rmlsbmask",	 	t4_rmlsbmask },
	{ "testlsb",		t5_testlsb },
	{ "testmsb", 		t6_testmsb },
	{ "testsignandshift", 	t7_testsignandshift },
	{ "testeachbit", 	t8_testeachbit },
	{ "testeachbit1shl", 	t9_testeachbit1shl },
	{ "tableshift", 	tA_tableshift },
	{ "tableuchar", 	tB_tableuchar },
	{ "suminplace", 	tC_suminplace },
};

main(argc, argv)
	int argc;
	char **argv;
{
	double calibration, v, best;
	int j, k, m, verbose;

	verbose = argc > 1;
	/*
	 * run a few tests to make sure they all agree
	 */
	srand(getpid());
	for (j = 0; j < 10; ++j) {
		unsigned long n;
		int bad;

		n = bigrand();
		for (k = 0; k < SIZEOF(table); ++k)
			table[k].trial = table[k].func(n);
		bad = 0;
		for (k = 0; k < SIZEOF(table); ++k)
			for (m = 0; m < SIZEOF(table); ++m)
				if (table[k].trial != table[m].trial)
					bad = 1;
		if (bad) {
			printf("wrong: %08lX", n);
			for (k = 0; k < SIZEOF(table); ++k)
				printf(" %3d", table[k].trial);
			printf("\n");
		}
	}

	for (k = 0 ; k < NUMRAND ; k++) rr[k] = bigrand() ;

	/*
	 * calibrate the timing mechanism
	 */
	calibration = measure(calibrate);
	if (verbose)
		printf("calibration: %g\n", calibration);

	/*
	 * time them all, keeping track of the best (smallest)
	 */
	for (j = 0; j < SIZEOF(table); ++j) {
		v = measure(table[j].func) - calibration;
		if (verbose)
			printf("%s: %g\n", table[j].name, v);
		table[j].measurement = v;
		if (!j || v < best)
			best = v;
	}
	(void) printf("%-24s   %-14sratio\n", "function", "time");
	for (j = 0; j < SIZEOF(table); ++j) {
		(void) printf("%-24s %#10.8g  %6.3f\n",
		    table[j].name,
		    table[j].measurement,
		    table[j].measurement / best);
	}
	exit(0);
}
_________________________________________________________________________
   __ 
  /  )                 Gene H. Olson             uunet!digibd!zeno!gene
 / __  _ __  _         Smartware Consulting
(__/ _(/_//_(/_        Minneapolis, MN           (612) 824-9108