Path: utzoo!utgpu!news-server.csri.toronto.edu!cs.utexas.edu!uunet!mcsun!ukc!axion!tharr!gtoal From: gtoal@tharr.UUCP (Graham Toal) Newsgroups: comp.lang.c Subject: Re: Sets in C (like Pascal) **source included** Keywords: sets, flexible, C Message-ID: <813@tharr.UUCP> Date: 29 Jun 90 05:55:16 GMT Reply-To: gtoal@tharr.UUCP (Graham Toal) Organization: Public access to Usenet in the UK Lines: 229 Archive-name: setdemo.c /* File: setdemo.c Author: Graham Toal Created: 27/06/90 Bryon Lape recently asked for C code to implement pascal-like sets. By coincidence I had written this just a few days earlier, as a proof-of-concept test program. Slight lack of comments but should be fathomable. The code in fact implements something stronger than pascal sets; it allows arbitrarily sized sets, and it lets you construct expressions out of those sets *without* using any scratch workspace to evaluate intermediates. (The real use of the code is in free text indexing, where the sets are lists of addresses of words in a large file, and we are performing queries on the file) (c) Graham Toal 1990. I retain copyright on this only to stop anyone else copyrighting it and restricting my use of my own code! Otherwise it is entirely in the public domain. Use it as you wish, commercially or otherwise. Note: the sets are stored as lists of elements; if the input data is always unique and sorted, the output data will be too; a simple proof by induction can show this. So when constructing sets as input to this code, remember to validate them. For example, list1 = [ 1 2 38 3456 100006 ] is ok, but list2 = [ 1 38 2 3456 100006 ] is bad (order), and list3 = [ 1 1 1 2 38 3456 100006 ] is also bad (not unique). */ /*#define DEBUG*/ #include #define TRUE (0==0) #define FALSE (0!=0) #define DATA int typedef struct PIPEstruct PIPE; typedef int (*UNARYFN)(PIPE *, DATA *); typedef int (*BINFN)(PIPE *, PIPE *, DATA *); /* easily extended to ternary functions for things such as range-checking */ #define MONO 1 #define BINARY 2 #define PENDING (EOF+1) #define UNDEF (PENDING+1) struct PIPEstruct { int fntype; UNARYFN monofun; BINFN bifun; struct PIPEstruct *op1, *op2; int state; /* EOF, PENDING, UNDEF */ DATA lookahead; int *array; /* These two represent data stream for now */ int index; /* DATA offset1, offset2; */ /* These two would be used in real life to point into a file of word indexes */ }; void unread(PIPE *p, DATA d) { p->state = PENDING; p->lookahead = d; } int and(PIPE *left, PIPE *right, DATA *value) { DATA d1, d2; /* Perform an AND operation on two streams */ for (;;) { if (!eval(left, &d1) || !eval(right, &d2)) return(FALSE); while ((d1 < d2) && eval(left, &d1)); while ((d2 < d1) && eval(right, &d2)); if (d1 == d2) {*value = d1; return(TRUE);} } } int or(PIPE *left, PIPE *right, DATA *value) { DATA d1, d2; /* Perform an OR operation on two streams */ if (!eval(left, &d1)) return(eval(right, value)); *value = d1; if (eval(right, &d2)) { if (d1 < d2) unread(right, d2); else if (d2 < d1) {unread(left, d1); *value = d2;} } return(TRUE); } int eval(PIPE *expr, DATA *value) { if (expr->state == PENDING) { expr->state = UNDEF; *value = expr->lookahead; return(TRUE); } switch (expr->fntype) { case MONO: return(expr->monofun(expr->op1, value)); case BINARY: return(expr->bifun(expr->op1, expr->op2, value)); } } int data(PIPE *d, DATA *value) { if (d->state == PENDING) { d->state = UNDEF; *value = d->lookahead; return(TRUE); } if (d->array[d->index] == -1) d->state = EOF; /* Hack for demo -- real code would compare offset1 to offset2 */ if (d->state == EOF) return(FALSE); *value = d->array[d->index++]; d->state = UNDEF; return(TRUE); } PIPE *make_data_stream(int O_List[]) { PIPE *tmp; tmp = (PIPE *)malloc(sizeof(PIPE)); tmp->fntype = MONO; tmp->monofun = data; tmp->array = O_List; tmp->index = 0; tmp->op1 = tmp; /* Actually op1 should be a structure containing array and index from above; this is just laziness */ tmp->state = UNDEF; return(tmp); } PIPE *make_pipe_binop(int (*fn)(PIPE *left, PIPE *right, DATA *value), PIPE *arg1, PIPE *arg2) { PIPE *tmp; tmp = (PIPE *)malloc(sizeof(PIPE)); tmp->bifun = fn; tmp->fntype = BINARY; tmp->op1 = arg1; tmp->op2 = arg2; tmp->state = UNDEF; return(tmp); } int main(int argc, char **argv) { /* Simulated Search Engine -- effectively performing set operations on arbitrarily large sets. */ static int O_List1[16] = { 1, 3, 5, 6, 7, 8, 10, 12, 14, 16, 17, 18, 20, 21, 22, -1}; static int O_List2[7] = {1, 2, 4, 5, 6, 7, -1}; static int O_List3[9] = {14, 15, 16, 17, 18, 19, 20, 21, -1}; static int O_List4[4] = {12, 13, 14, -1}; /* The problem is a free-text database lookup. We have an index which when presented with a keyword, returns a list of identifiers of records which contain that keyword. We wish to use and and or operations on several keywords, eg 'Give me all the records which contain "programmer" and "poor", or "manager" and "rich"' Here is a worked example of the problem. There is a very easy solution possible *iff* all the sets are kept in memory; however they can grow to be excessively large, so the chosen algorithm has to work on small sections of these sets at any one time. Conceptually the simplest way to do this is to define a mechanism where the operator is akin to a unix filter or process, receiving its input data from a stream, and writing its results to a stream. Then the expression evaluation becomes a dataflow problem of connecting these streams together, and reading the results coming out of the topmost filter. One minor implementation note: accessing the elements of each of these lists off CD Rom turn-about would cause lots of disk-head movement; we can optimise this to any degree necessary by having the bottom-level functions buffer their data in blocks in Ram. List1 <- 1 3 5 6 7 8 10 12 14 16 17 18 20 21 22 / OR / \ / List2 <- 1 2 4 5 6 7 Result <- AND \ List3 <- 12 13 14 15 16 17 18 19 \ / OR \ List4 <- 12 13 14 <- 1 3 4 5 6 7 8 10 12 14 16 17 18 20 21 22 / / Result <- AND \ \ <- 12 13 14 15 16 17 18 19 20 21 Result <- 12 14 16 17 18 20 21 */ PIPE *data1, *data2, *data3, *data4, *or1, *or2, *and1; DATA value; /* The Object-lists are simulated by a C array; in real life they would be found in a large index file. (They are actually found by following a 'trie' from a master index) */ data1 = make_data_stream(O_List1); data2 = make_data_stream(O_List2); data3 = make_data_stream(O_List3); data4 = make_data_stream(O_List4); /* Compile our expression */ or1 = make_pipe_binop(or, data1, data2); or2 = make_pipe_binop(or, data3, data4); and1 = make_pipe_binop(and, or1, or2); /* Perform the evaluation of the expression; if you want, you can store the results in a new list; often it is easier to do things with them one at a time as the members of the list are presented */ fprintf(stderr, "The result of list1|list2 & list3|list4 is:\n"); while (eval(and1, &value)) { fprintf(stderr, " %d", value); } fprintf(stderr, "\n"); }