Path: utzoo!utgpu!jarvis.csri.toronto.edu!mailrus!purdue!mentor.cc.purdue.edu!pur-ee!ea.ecn.purdue.edu!housel From: housel@en.ecn.purdue.edu (Peter S. Housel) Newsgroups: comp.os.minix Subject: A full, V7-Compatible expr(1) Message-ID: <13618@ea.ecn.purdue.edu> Date: 23 Jul 89 18:59:15 GMT Sender: housel@ea.ecn.purdue.edu Reply-To: housel@en.ecn.purdue.edu (Peter S. Housel) Organization: Purdue University Engineering Computer Network Lines: 792 I've been running a lot of shell scripts lately. One construct commonly used in many scripts is the colon operator of /usr/bin/expr, which does regular expression matching. The Minix version of expr(1) doesn't support this operator at all, and had problems with its support of the rest. Here is my replacement version. It implements everything described in the V7 manual page, and behaves exactly as the 4.3BSD version does. (I don't think this program has changed any since V7 - it would have broken a lot of scripts.) It also passes lint, and contains no cholesterol or artificial coloring. The regular expression code is my own, as it was not possible to use any of the commonly available regular expression matching libraries. (expr(1) uses the ed(1) regexp syntax, while the libraries implement extensions or improvements. The extraction operator also would be difficult to get right if another library were used.) It passes a modified version of Henry Spencer's regular expression test, tailored for the BSD expr. There is one extension over the original version - multiple \(...\) pairs are allowed. Don't use this feature if you expect your scripts to work on V7/BSD/SysV. (The only reason this feature has been addded is that I had done the design and part of the coding work before I realized it was nonstandard. It seemed a shame to rip out perfectly good code that increased the generality, didn't hurt anything and didn't add significantly to the size.) Bug reports appreciated. -Peter S. Housel- housel@ecn.purdue.edu ...!pur-ee!housel echo 'x - expr.c' sed 's/^X//' <<'**-expr.c-EOF-**' >expr.c X/* file: expr.c X * author: Peter S. Housel 7/4/89,7/7/89,7/19/89,7/20/89 X */ X X#include X#include X#include X Xextern char *malloc(/* size_t nbytes */); X Xstruct value X { X long numval; /* numeric value */ X int nf_valid; /* "numeric value field valid" flag */ X char *strval; /* string value */ X }; X X#define numresult(valp,number) \ X (((valp)->nf_valid = 1), \ X ((valp)->strval = NULL), \ X ((valp)->numval = (number))) X Xvoid invalid(/* char *err */); Xchar *strvalue(/* struct value *valp */); Xint numvalue(/* struct value *valp */); Xchar *strsave(/* char *string */); Xvoid expr1(/* struct value *valp */), X expr2(/* struct value *valp */), X expr3(/* struct value *valp */), X expr4(/* struct value *valp */), X expr5(/* struct value *valp */), X expr6(/* struct value *valp */), X expr7(/* struct value *valp */); Xvoid docolon(/* value *match, *pattern */); X Xchar *progname; Xchar **argp; Xchar NUMARG[] = "numeric argument required"; X Xmain(argc, argv) Xint argc; char *argv[]; X{ X struct value val0; X X progname = argv[0]; X X argp = &argv[1]; X X expr1(&val0); X X if(*argp != NULL) X invalid("syntax error"); X X (void) puts(strvalue(&val0)); X X exit(nullz(&val0)); X} X X/* Yet Another recursive descent parser. X */ Xvoid expr1(valp) Xstruct value *valp; X{ X struct value val1; X X expr2(valp); X X while(*argp != NULL) X { X if(strcmp(*argp, "|") == 0) X { X ++argp; X expr2(&val1); X if(nullz(valp)) X *valp = val1; X else X ; /* return the first arg (already in *valp) */ X } X else X break; X } X} X Xvoid expr2(valp) Xstruct value *valp; X{ X struct value val1; X X expr3(valp); X X while(*argp != NULL) X { X if(strcmp(*argp, "&") == 0) X { X ++argp; X expr3(&val1); X if(nullz(valp) && nullz(&val1)) X numresult(valp, 0); X else X ; /* return the first arg (already in *valp) */ X } X else X break; X } X} X X/* save source code lines but not object code, unfortunately. X */ X X#define RELOP(OP) \ X ++argp; \ X expr4(&val1); \ X if(numvalue(valp) && numvalue(&val1)) \ X numresult(valp, valp->numval OP val1.numval); \ X else \ X numresult(valp, strcmp(strvalue(valp), strvalue(&val1)) OP 0); X Xvoid expr3(valp) Xstruct value *valp; X{ X struct value val1; X X expr4(valp); X X while(*argp != NULL) X { X if(strcmp(*argp, "<") == 0) X { X RELOP( < ) X } X else if(strcmp(*argp, "<=") == 0) X { X RELOP( <= ) X } X else if(strcmp(*argp, "=") == 0) X { X RELOP( == ) X } X else if(strcmp(*argp, "!=") == 0) X { X RELOP( != ) X } X else if(strcmp(*argp, ">=") == 0) X { X RELOP( >= ) X } X else if(strcmp(*argp, ">") == 0) X { X RELOP( > ) X } X else X break; X } X} X X#define BINOP(NEXT,OP) \ X ++argp; \ X NEXT(&val1); \ X if(!numvalue(valp) || !numvalue(&val1)) \ X invalid(NUMARG); \ X else \ X numresult(valp, valp->numval OP val1.numval); \ X Xvoid expr4(valp) Xstruct value *valp; X{ X struct value val1; X X expr5(valp); X X while(*argp != NULL) X { X if(strcmp(*argp, "+") == 0) X { X BINOP(expr5, + ) X } X else if(strcmp(*argp, "-") == 0) X { X BINOP(expr5, - ) X } X else X break; X } X} X Xvoid expr5(valp) Xstruct value *valp; X{ X struct value val1; X X expr6(valp); X X while(*argp != NULL) X { X if(strcmp(*argp, "*") == 0) X { X BINOP(expr6, * ) X } X else if(strcmp(*argp, "/") == 0) X { X ++argp; X expr6(&val1); X if(!numvalue(valp) || !numvalue(&val1)) X invalid(NUMARG); X else X { X if(val1.numval == 0) X invalid("division by zero"); X numresult(valp, valp->numval / val1.numval); X } X } X else if(strcmp(*argp, "%") == 0) X { X ++argp; X expr6(&val1); X if(!numvalue(valp) || !numvalue(&val1)) X invalid(NUMARG); X else X { X if(val1.numval == 0) X invalid("division by zero"); X numresult(valp, valp->numval / val1.numval); X } X } X else X break; X } X} X Xvoid expr6(valp) Xstruct value *valp; X{ X struct value val1; X X expr7(valp); X X while(*argp != NULL) X { X if(strcmp(*argp, ":") == 0) X { X ++argp; X expr7(&val1); X#ifndef NOCOLON X docolon(valp, &val1); X#else X valp->nf_valid = 0; X valp->strval = NULL; X#endif X } X else X break; X } X} X Xvoid expr7(valp) Xstruct value *valp; X{ X if(*argp == NULL) X invalid("missing argument(s)"); X else if(strcmp(*argp, "(") == 0) X { X ++argp; X expr1(valp); X if(strcmp(*argp++, ")") != 0) X invalid("unbalanced parentheses"); X } X else X { X valp->nf_valid = 0; X valp->strval = *argp++; X } X} X X/* return 1 if the argument is zero (numeric) or null (string X */ Xint nullz(valp) Xstruct value *valp; X{ X if(numvalue(valp)) X return (valp->numval == 0); X X return (strlen(strvalue(valp)) == 0); X} X X/* return 1 if the argument is a valid number, insuring that the nf_valid X * and numval fields are set properly. Does the string-to-number X * conversion if nf_valid is false. X */ Xint numvalue(valp) Xstruct value *valp; X{ X register char *p; X int sign = 0, digits = 0; X unsigned long num = 0; X X if(valp->nf_valid) X return 1; X X if((p = valp->strval) == NULL) X return 0; X X if(*p == '-') X { X ++p; X sign = 1; X } X while(isdigit(*p)) X { X num = num * 10 + (*p++ - '0'); X digits = 1; X } X X if(!digits || *p != '\0') X return 0; X X valp->numval = sign ? -num : num; X valp->nf_valid = 1; X return 1; X} X X/* return the string value of the given argument. If there is only a X * numeric value, convert it to a string X */ Xchar *strvalue(valp) Xstruct value *valp; X{ X char numbuf[30]; X register char *p; X unsigned long num; X int sign = 0; X X if(!valp->nf_valid) X return (valp->strval != NULL) ? valp->strval : ""; X X p = numbuf + sizeof numbuf; X *--p = '\0'; X X if(valp->numval < 0) X { X num = -(valp->numval); X sign = 1; X } X else X num = valp->numval; X X do X { X *--p = '0' + (num % 10); X num /= 10; X } while(num); X X if(sign) X *--p = '-'; X X return (valp->strval = strsave(p)); X} X X/* save the given string in its own allocated memory and return a pointer X * to that memory. X */ Xchar *strsave(string) Xchar *string; X{ X char *p; X X if((p = malloc(strlen(string) + 1)) == NULL) X invalid("out of memory"); X X (void) strcpy(p, string); X X return p; X} X X/* print error message and exit. X */ Xvoid invalid(err) Xchar *err; X{ X (void) fputs(progname, stderr); X (void) fputs(": ", stderr); X (void) fputs(err, stderr); X (void) putc('\n', stderr); X exit(2); X} X X#ifndef NOCOLON X X#include X X#define RMIN (UCHAR_MAX-8) /* >= reserved as opcodes */ X#define RESC (UCHAR_MAX-8) /* for escaping opcodes */ X#define RDOT (UCHAR_MAX-7) /* match any character */ X#define ROPEN (UCHAR_MAX-6) /* opening \( */ X#define RCLOSE (UCHAR_MAX-5) /* closing \) */ X#define RSTAR (UCHAR_MAX-4) /* Kleene closure */ X#define RCLASS (UCHAR_MAX-3) /* character class */ X#define RBACK (UCHAR_MAX-2) /* \digit reference */ X#define REND (UCHAR_MAX-1) /* end of program */ X X#define RABEGIN 0x01 /* match anchored at BOL (^) */ X#define RAEND 0x02 /* match anchored at EOL ($) */ X#define RSELECT 0x04 /* \(...\) selection op used */ X X#define PROGLENGTH 1024 /* bytes reserved for r-programs */ X X#define CLASS_BYTES ((CHAR_MAX - CHAR_MIN) / CHAR_BIT) X Xunsigned char rprogram[PROGLENGTH]; /* regexp program storage */ Xunsigned int rflags = RABEGIN; /* regexp program context */ X Xchar *rbegins[10]; /* pointers to \( beginnings */ Xchar *rends[10]; /* pointers to \) endings */ Xint rlevel; /* \(...\) level */ X Xvoid rcomp(/* char *regexp */); Xvoid rmatch(/* char *str */); Xchar *rtry(/* char *str, unsigned char **pcp */); Xchar *tryone(/* char *str, unsigned char **pcp */); X X/* compile the regexp, match it against the string, and return the X * proper result (a string if \(...\) used, and the match length otherwise. X */ Xvoid docolon(match, pattern) Xstruct value *match, *pattern; X{ X rcomp(strvalue(pattern)); X X rmatch(strvalue(match)); X X if(rflags & RSELECT) X { X match->nf_valid = 0; X if(rends[0] == rbegins[0] || rends[1] == NULL) X { X match->strval = NULL; X } X else X { X *(rends[1]) = '\0'; /* semi-nasty */ X match->strval = rbegins[1]; X } X } X else X { X numresult(match, rends[0] - rbegins[0]); X } X} X X/* compile an ed(1)-syntax regular-expression into the rprogram[] array. X */ Xvoid rcomp(regexp) Xregister char *regexp; X{ X char c; /* current regexp character */ X char first, last; /* limits of character class */ X unsigned char *starable; /* last "starable" variable */ X unsigned char *rpc; /* pointer to next program storage byte */ X int negate; /* character class negated */ X int i; /* loop counter and such */ X int pstack[9]; /* \(...\) nesting stack */ X int pstackp = 0; /* stack pointer for nesting stack */ X int pclosed[10]; /* flags indicating \(...\) closed */ X X rpc = &rprogram[0]; X starable = NULL; X X for(i = 1; i < 10; ++i) X pclosed[i] = 0; X X if(*regexp == '^') X { X rflags |= RABEGIN; /* not needed, as it turns out */ X ++regexp; X } X X while((c = *regexp++)) X { X if((rpc - rprogram) >= PROGLENGTH - 2 - CLASS_BYTES) X invalid("regular expression program too long"); X X switch(c) X { X case '.': X starable = rpc; X *rpc++ = RDOT; X break; X case '\\': X if(isdigit(*regexp)) X { X if(!pclosed[*regexp - '0']) X invalid("reference to unclosed/nonexistent \\(...\\) pair"); X starable = NULL; X *rpc++ = RBACK; X *rpc++ = *regexp++ - '0'; X } X else if(*regexp == '(') X { X starable = NULL; X ++regexp; X rflags |= RSELECT; X if((i = ++rlevel) > 9) X invalid("too many \\(...\\) levels"); X pstack[pstackp++] = i; X *rpc++ = ROPEN; X *rpc++ = i; X break; X } X else if(*regexp == ')') X { X starable = NULL; X ++regexp; X if(pstackp == 0) X invalid("\\(...\\) pairs don't balance"); X i = pstack[--pstackp]; X *rpc++ = RCLOSE; X *rpc++ = i; X pclosed[i] = 1; X break; X } X else if((unsigned char) *regexp >= RMIN) X { X starable = rpc; X *rpc++ = RESC; X *rpc++ = *regexp++; X break; X } X else X { X starable = rpc; X *rpc++ = *regexp++; X break; X } X case '$': X if(*regexp == '\0') X { X rflags |= RAEND; X break; X } X else X { X starable = rpc; X *rpc++ = '$'; X break; X } X case '*': X if(starable == NULL) X { X starable = rpc; X *rpc++ = '*'; X break; X } X else X { X bcopy(starable, starable + 1, rpc - starable); X *starable = RSTAR; X starable = NULL; X ++rpc; X break; X } X case '[': X negate = 0; X starable = rpc; X *rpc++ = RCLASS; X if(*regexp == '^') X { X ++regexp; X negate = 1; X } X for(i = 0; i < CLASS_BYTES; ++i) X rpc[i] = 0; X do X { X first = *regexp++; X if(*regexp == '-' && regexp[1] != ']' X && regexp[1] > first) X { X ++regexp; X last = *regexp++; X for(i = first; i < last; ++i) X { X rpc[(i - CHAR_MIN) / CHAR_BIT] |= X 1 << ((i - CHAR_MIN) % CHAR_BIT); X } X } X else X { X rpc[(first - CHAR_MIN) / CHAR_BIT] |= X 1 << ((first - CHAR_MIN) % CHAR_BIT); X } X } while (*regexp && *regexp != ']'); X if(*regexp != ']') X invalid("unterminated character class"); X ++regexp; X if(negate) X for(i = 0; i < CLASS_BYTES; ++i, ++rpc) X *rpc = ~*rpc; X else X rpc += CLASS_BYTES; X break; X default: X if((unsigned char) c >= RMIN) X { X starable = rpc; X *rpc++ = RESC; X *rpc++ = c; X break; X } X else X { X starable = rpc; X *rpc++ = c; X break; X } X } X } X if(pstackp != 0) X invalid("\\(...\\) pairs don't balance"); X *rpc = REND; X} X X/* It turns out that expr regular expressions have an implicit X * '^' prepended, and therefore RABEGIN is always on. It seemed X * a waste to delete the code after discovering this, however. X */ Xvoid rmatch(str) Xchar *str; X{ X char *end; X unsigned char *rpc; X X rends[0] = rbegins[0] = str; X X if(rflags & RABEGIN) X { X rpc = &rprogram[0]; X if((end = rtry(str, &rpc)) != NULL) X rends[0] = end; X } X else X { X while(*str) X { X rpc = &rprogram[0]; X end = rtry(str, &rpc); X if(end != NULL && (end - str) > (rends[0] - rbegins[0])) X { X rbegins[0] = str; /* longest match wins */ X rends[0] = end; X } X ++str; X } X } X X} X X/* try to match str to program from *pcp on X*/ Xchar *rtry(str, pcp) Xchar *str; Xunsigned char **pcp; X{ X char *nstr; X X while(*str && **pcp != REND) X { X if((nstr = tryone(str, pcp)) == NULL) X return NULL; X str = nstr; X } X X while(**pcp == RCLOSE) X { X rends[*(*pcp + 1)] = str; X *pcp += 2; X } X X if(**pcp != REND) X return NULL; X X if((rflags & RAEND) && *str != '\0') X return NULL; X X return str; X} X X/* try to match one regular expression operator X */ Xchar *tryone(str, pcp) Xchar *str; Xunsigned char **pcp; X{ X char *ret = NULL; X unsigned char *npc; X char *p, *q; X Xagain: X switch(**pcp) X { X case RESC: X if(*str == *(*pcp + 1)) X ret = str + 1; X *pcp += 2; X break; X default: X if(*str == **pcp) X ret = str + 1; X *pcp += 1; X break; X case RDOT: X if(*str != '\0') X ret = str + 1; X *pcp += 1; X break; X case RCLASS: X if(*str != '\0' X && ((*pcp + 1)[(*str - CHAR_MIN) / CHAR_BIT] X & (1 << ((*str - CHAR_MIN) % CHAR_BIT)))) X { X ret = str + 1; X } X *pcp += CLASS_BYTES + 1; X break; X case ROPEN: X rbegins[*(*pcp + 1)] = str; X *pcp += 2; X goto again; X case RCLOSE: X rends[*(*pcp + 1)] = str; X *pcp += 2; X goto again; X case RBACK: X p = rbegins[*(*pcp + 1)]; X q = rends[*(*pcp + 1)]; X *pcp += 2; X while(p < q) X if(*p++ != *str++) X return NULL; X ret = str; X break; X case RSTAR: X *pcp += 1; X p = str; X while(npc = *pcp, tryone(p, &npc) != NULL) X ++p; X *pcp = npc; X while(p >= str X && (npc = *pcp, (ret = rtry(p, &npc)) == NULL)) X --p; X *pcp = npc; X break; X case REND: X ret = str; X } X X return ret; X} X#endif /* !NOCOLON */ **-expr.c-EOF-** exit 0