Relay-Version: version B 2.10 5/3/83; site utzoo.UUCP
Path: utzoo!mnetor!uunet!husc6!necntc!ima!johnl
From: johnl@ima.UUCP
Newsgroups: comp.compilers
Subject: Re: Lex/Yacc inputs for C and C pre-processor
Message-ID: <640@ima.ISC.COM>
Date: Thu, 6-Aug-87 05:18:33 EDT
Article-I.D.: ima.640
Posted: Thu Aug 6 05:18:33 1987
Date-Received: Thu, 13-Aug-87 02:34:29 EDT
References: <625@ima.ISC.COM>
Sender: johnl@ima.ISC.COM
Reply-To: decvax!utzoo!henry
Lines: 237
Approved: compilers@ima.UUCP

This isn't quite what was asked for, but still might be of general interest.
This is a lex program which tokenizes C source, with minor limitations as
described in the leading comment.  (In fact it does C++, unless you give it
the -C option that restricts it to ANSI C only.)  It's probably not useful as
a compiler front end; in particular, it accepts *exactly* the legal C
strings/numbers/etc. rather than accepting more general forms and giving
error messages for violations of the detailed rules.  It is, however, of
some use for things like statistical analysis of C programs.

				Henry Spencer @ U of Toronto Zoology
				{allegra,ihnp4,decvax,pyramid}!utzoo!henry
----------------
%{
/*
 * ctokens - print tokens of a C or C++ program
 *
 * Full ANSI C (draft of 1 Oct 1986) except:  no trigraphs; copes with
 * backslash-newline stripping only inside strings; does not understand
 * the context-dependent rule that makes <stdio.h> a single token
 * inside a #include.
 *
 * Except for newlines, any white-space character is printed as "\t".
 * It would be more sensible to make the white-space expression [ \t\v\f]+
 * instead of just [ \t\v\f], but our old lex has problems with that.
 *
 * Note that this program uses one (sigh) undocumented feature of Unix lex:
 * the ability to override the choice of input stream by assigning to yyin.
 * Avoiding this requires reimplementing lex's input functions, which is a
 * pain because getc/ungetc isn't good enough.
*
 * $Log$
 */

/*
 * Headers needed by the code below: stdio for FILE/printf/fprintf/EOF,
 * sys/types + sys/stat for fstat() and S_IFMT/S_IFDIR, string for strcmp().
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>

/* String equality, with a cheap first-character test before strcmp(). */
#define	STREQ(a, b)	(*(a) == *(b) && strcmp((a), (b)) == 0)

/*
 * End-of-input test for values returned by input().  The original code
 * tested only for '\0'; EOF is accepted as well so that a scanner
 * generator whose input() returns EOF at end of file cannot make the
 * comment-skipping loops spin forever.
 */
#define	ATEOF(ch)	((ch) == '\0' || (ch) == EOF)

#ifndef lint
static char RCSid[] = "$Header$";
#endif

int debug = 0;			/* bumped by -d; not otherwise used here */
char *progname;			/* program name for diagnostics */

/*
 * error() is not defined in this file; it is expected to come from a
 * local support library.  NOTE(review): main() assumes it does not
 * return after reporting -- confirm against that library.
 */
extern void error(), exit();
#ifdef UTZOOERR
extern char *mkprogname();
#else
#define	mkprogname(a)	(a)
#endif

/* Print the current token on a line of its own. */
#define	PRINTIT	printf("%s\n", yytext)

int cflag = 0;			/* C only. */
%}

EXP	([eE][+-]?[0-9]+)
FS	[flFL]
IS	([uU][lL]?|[lL][uU]?)

%%

[_a-zA-Z][_a-zA-Z0-9]*	{ PRINTIT;	/* identifier */ }

[0-9]+"."[0-9]*{EXP}?{FS}?	|
"."[0-9]+{EXP}?{FS}?	|
[0-9]+{EXP}{FS}?	|
[1-9][0-9]*{IS}?	|
0[0-7]*{IS}?	|
0[xX][0-9a-fA-F]+{IS}?	{ PRINTIT;	/* number */ }

\'([^'\\\n]|\\(['"?\\abfnrtv]|[0-7]{1,3}|[xX][0-9a-fA-F]{1,3}))+\'	{
		PRINTIT;	/* character constant */
	}

\"([^"\\\n]|\\(['"?\\abfnrtv\n]|[0-7]{1,3}|[xX][0-9a-fA-F]{1,3}))*\"	{
		/* string -- remove backslashed newlines */
		register char *p;

		for (p = yytext; *p != '\0'; p++)
			if (*p == '\\' && *(p+1) == '\n')
				p++;	/* skip the pair: '\\' here, '\n' by the loop */
			else
				putchar(*p);
		putchar('\n');
	}

[-()&*+~!/%<>^|,.=;:{}?#]	|
"["	|
"]"	|
"->"	|
"++"	|
"--"	|
"<<"	|
">>"	|
"<="	|
">="	|
"=="	|
"!="	|
"&&"	|
"||"	|
"##"	|
"..."	|
[-*/%+&^|]"="	|
"<<="	|
">>="	{ PRINTIT;	/* misc. tokens */ }

"::"	{
		/* C++ only: with -C, fall back to the next-best match. */
		if (cflag) {
			REJECT;
		} else
			PRINTIT;
	}

\n	printf("\\n\n");

[ \t\v\f]	printf("\\t\n");

"/*"	{
		/*
		 * Skip a comment by hand.  Its text is not printed; only
		 * the first 10 newlines inside it are echoed as "\n",
		 * followed by "..." once the 10th has been seen.
		 */
		register int ch;
		register int nnl = 0;

		printf("/* ");
		for (;;) {
			ch = input();
			if (ATEOF(ch)) {
				fprintf(stderr, "unterminated comment!\n");
				exit(1);	/* failure, not success */
			}
			if (ch == '*') {
				ch = input();
				if (ch == '/')
					break;
				/*
				 * Not the end of the comment: push the
				 * character back so that "**" followed by
				 * "/" is still recognized.  An end-of-input
				 * value is not pushed back; the next
				 * input() call reports it.
				 */
				if (!ATEOF(ch))
					unput(ch);
			} else if (ch == '\n') {
				nnl++;
				if (nnl <= 10)
					printf("\\n");
				if (nnl == 10)
					printf("...");
			}
		}
		printf(" */\n");
	}

"//"	{
		/* C++ comment to end of line; with -C it is two "/" tokens. */
		register int ch;

		if (cflag) {
			REJECT;
		} else {
			printf("//\n");
			while ((ch = input()) != '\n')
				if (ATEOF(ch)) {
					fprintf(stderr, "unterminated comment!\n");
					exit(1);	/* failure, not success */
				}
			/* Leave the newline for the \n rule to report. */
			unput(ch);
		}
	}

.	printf("%c ???\n", yytext[0]);

%%

/*
 - main - parse arguments and handle options
 *
 * Options: -C restricts the scanner to C (no "::" or "//"), -d bumps the
 * debug counter.  With no file operands standard input is scanned; an
 * operand of "-" also means standard input.  Exits 2 on a usage error and
 * 0 after all inputs have been processed.
 */
int
main(argc, argv)
int argc;
char *argv[];
{
	int c;
	int errflg = 0;
	FILE *in;
	struct stat statbuf;
	extern int optind;
	extern int getopt();
	/*
	 * efopen() is not defined in this file.  NOTE(review): the code
	 * uses its result unchecked, so it presumably never returns NULL
	 * -- confirm against the support library.
	 */
	extern FILE *efopen();
	void process();

	progname = mkprogname(argv[0]);

	while ((c = getopt(argc, argv, "dC")) != EOF)
		switch (c) {
		case 'C':	/* C only, no C++. */
			cflag = 1;
			break;
		case 'd':	/* Debugging. */
			debug++;
			break;
		case '?':
		default:
			errflg++;
			break;
		}
	if (errflg) {
		fprintf(stderr, "usage: %s [-C] [file] ...\n", progname);
		exit(2);
	}

	if (optind >= argc)
		process(stdin, "stdin");
	else
		for (; optind < argc; optind++)
			if (STREQ(argv[optind], "-"))
				process(stdin, "-");
			else {
				in = efopen(argv[optind], "r");
				if (fstat(fileno(in), &statbuf) < 0)
					error("can't fstat `%s'", argv[optind]);
				/* Refuse to scan a directory. */
				if ((statbuf.st_mode & S_IFMT) == S_IFDIR)
					error("`%s' is directory!", argv[optind]);
				process(in, argv[optind]);
				(void) fclose(in);
			}
	exit(0);
	/* NOTREACHED */
}

/*
 * process - process input file
 *
 * Points the scanner at `in' by assigning to yyin, then runs yylex()
 * until it returns.  `inname' is accepted for the caller's benefit but
 * is not used.
 */
void
process(in, inname)
FILE *in;
char *inname;
{
	yyin = in;
	(void) yylex();
}
--
Send compilers articles to ima!compilers or, in a pinch, to Levine@YALE.ARPA
Plausible paths are { ihnp4 | decvax | cbosgd | harvard | yale | cca}!ima
Please send responses to the originator of the message -- I cannot forward
mail accidentally sent back to compilers.  Meta-mail to ima!compilers-request