Path: utzoo!attcan!uunet!lll-winken!ames!xanth!mcnc!ecsvax!dukeac!bet From: bet@dukeac.UUCP (Bennett Todd) Newsgroups: comp.lang.c Subject: Re: Bit counting Message-ID: <1191@dukeac.UUCP> Date: 18 Jan 89 07:59:51 GMT References: <1208@ncar.ucar.edu> Reply-To: bet@dukeac.UUCP (Bennett Todd) Organization: Radiology, Duke Med. Center, Durham, NC Lines: 236 : This is a sharchive -- extract the files by running through sh. echo x - README sed 's/^X//' <<\Shar_Eof >README XI wanted to see for myself whether the byte-at-a-time table could perform Xcomparably to the "parallel addition" algorithm presented; so I wrote a Xlittle benchmark code to compare. I feel (and folks are welcome to disagree Xwith me here, obviously) that taking advantage of pointer type punning to Xconvert from shift-and-mask extractions to bump-a-pointer-and-dereference is Xlegitimate and (as far as I know!) portable, and not unreasonably ugly when Xyou are already wading down in the bit-bashing. X XI did this using GCC 1.31 on a Sun 3/50 with 4M under SunOS 3.5 (no suntools Xrunning). The machine wasn't quite idle; I had an rlogin to another machine Xwhere I was reading netnews. However, it was close enough to idle that I Xthink the user times are reasonable for comparison. I ran it several times Xand the runs didn't vary by as much as a second in the user time. 1000000 Xiterations didn't have quite enough separation to convince me for sure this Xwas a solid result, so I bumped it to 5000000. X XThe timing results came out: X XByte at a time table lookup: 75.0 user X Parallel addition: 82.7 user X XSo, given this coding style, and this compiler technology, on this sort of Xarchitecture, it appears that the byte-wise table lookup isn't particularly Xslower. It also isn't importantly faster. I personally find it more Xunderstandable. X XI compared the output for the first 1000 results (on the output-generating Xversion) and they were identical, so I hope this code is correctly computing Xthe desired function. It compiled with the options given in the Makefile X(optization cranked way up, fairly strict diagnostics) without error or Xwarning messages. X XNote that the generator program in the comment is really a one-shot; XI built it with a library I run against that performs system and library Xcall error checking for me, because I am extremely lazy. It should be clear Xhow to extract that into a stand-alone. X XFinally, this is my first try ever at coding a timing benchmark, so it is Xquite possible I have committed some classic benchmark blunder. I hope not. XThe separately-compiled "consume" procedure might not have been absolutely Xnecessary, but it kept me from having to examine assembler or figure out Xwhat a heavy-duty optimizing compiler might be capable of omitting. Shar_Eof echo x - Makefile sed 's/^X//' <<\Shar_Eof >Makefile X# X# Choose one of the following four OPTIONS definitions, which X# cover generating 1000-point validation versions on each algorithm, X# and generating 5000000-point silent versions for timing. X# X X# OPTIONS=-DLIM=5000000 -DBENCHMARK -DBYTE_ORIENTED_TABLE XOPTIONS=-DLIM=5000000 -DBENCHMARK -DPARALLEL_ADDITION X# OPTIONS=-DLIM=1000 -DVERIFY -DBYTE_ORIENTED_TABLE X# OPTIONS=-DLIM=1000 -DVERIFY -DPARALLEL_ADDITION X XOBJ=bench.o consume.o X XCC=gcc XCFLAGS=-O -g -Wall -Wwrite-strings -msoft-float -fstrength-reduce \ X -finline-functions -I/usr/local/include $(OPTIONS) X Xbench : $(OBJ) X $(CC) $(CFLAGS) -o $@ $(OBJ) X X$(OBJ) : Makefile Shar_Eof echo x - bench.c sed 's/^X//' <<\Shar_Eof >bench.c X#include X X#ifndef LIM X#define LIM 1000000 X#endif X Xvoid consume(int); Xint bitcount(int); X Xint main() { X int i; X X for (i=0; i X * X * char syntax_args[] = ""; X * X * int main(int argc, char **argv) { X * register unsigned i; X * X * progname = argv[0]; X * if (argc != 1) X * syntax(); X * X * for (i = 0; i<256; i++) { X * register unsigned tmp = i; X * register int count = 0; X * while (tmp != 0) { X * tmp ^= (tmp & -tmp); X * count++; X * } X * eprintf("%d%s", count, (i%32 == 31) ? ",\n" : ","); X * } X * return(0); X * } X */ X static char bits_per_byte[] = { X 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, X 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, X 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, X 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, X 1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, X 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, X 2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, X 3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8, X }; X X /* X * The following lines are the loop: X * X * for (counter = 0; counter < sizeof(int); counter++) X * count += bits_per_byte[*ptr++]; X * X * unrolled. X */ X count += bits_per_byte[*ptr++]; X count += bits_per_byte[*ptr++]; X count += bits_per_byte[*ptr++]; X count += bits_per_byte[*ptr++]; X return(count); X} X#endif /* BYTE_ORIENTED_TABLE */ X X#ifdef PARALLEL_ADDITION X/* X Module: l_bitcnt.c X Author: Richard E. K. Neitzel X Xrevision history X---------------- X1.0,9jan89,rekn written X X X Explanation: X This routine takes an integer and returns the number of bits it contains X which are set. It does this via 'parallel' addition, turning the integer X into sets of bit pairs of increasing size. WARNING! This assumes that X integers are 32 bits! X*/ X X/* X * Excised from netnews posting, and entry point renamed "bitcount"; X * changes made for ANSI C. X * X * In other words, this is Richard E. K. Neitzel's work, not mine, but X * I have monkeyed with it, so don't blame him. X * Bennett Todd X */ X Xint bitcount(int word) { X register int j, k; /* Our work area */ X X if (!word) /* Why bother? */ X return(0); X X j = word; X X k = j & 0x55555555; /* Clear every other bit */ X j >>= 1; /* Do again for cleared bits */ X j &= 0x55555555; X j += k; /* 1st pairs done */ X X k = j & 0x33333333; /* Clear every two bits */ X j >>= 2; X j &= 0x33333333; X j += k; /* 2nd pairs done */ X X k = j & 0xffff; /* Only need last 4 sets */ X j &= 0xffff0000; /* and top 4 here */ X j = j >> 16; /* ready - */ X j += k; /* add 3rd pairs */ X X k = j & 0xf0f; /* Clear bits */ X j >>= 4; /* Repeat */ X j &= 0xf0f; X j += k; /* add 4th pairs */ X X k = j & 0xff; /* Now add the 8 bit pairs */ X j >>= 8; X j += k; X X return(j); /* bye */ X} X#endif /* PARALLEL_ADDITION */ Shar_Eof echo x - consume.c sed 's/^X//' <<\Shar_Eof >consume.c X#include X Xint printf(const char *, ...); X Xvoid consume(int val) { X X#ifdef VERIFY X (void) printf("%d\n", val); X#endif /* VERIFY */ X X#ifdef BENCHMARK X /* Then we simply return.... */ X#endif /* BENCHMARK */ X X} Shar_Eof exit 0 -Bennett bet@orion.mc.duke.edu