Xref: utzoo comp.unix.questions:28957 comp.unix.programmer:1154 comp.compilers:1736 Path: utzoo!utgpu!news-server.csri.toronto.edu!bonnie.concordia.ca!nstn.ns.ca!news.cs.indiana.edu!samsung!spool.mu.edu!snorkelwacker.mit.edu!paperboy!hsdndev!spdcc!iecc!compilers-sender From: enag@ifi.uio.no (Erik Naggum) Newsgroups: comp.unix.questions,comp.unix.programmer,comp.compilers Subject: Re: You, too, can look at strings. Keywords: lex, code, C Message-ID: Date: 22 Feb 91 22:36:38 GMT References: <1991Feb12.144738.11530@lgc.com> <1991Feb20.150204.3815@lgc.com> Sender: compilers-sender@iecc.cambridge.ma.us Reply-To: Erik Naggum Organization: Naggum Software, Oslo, Norway Lines: 314 Approved: compilers@iecc.cambridge.ma.us In-Reply-To: cl@lgc.com's message of 20 Feb 91 15:02:04 GMT Hi, Cameron! In article <1991Feb20.150204.3815@lgc.com> you write: > 7. One reader wrote that he'd send a finite-state machine which > models C syntax as soon as he found his copy. I haven't heard > from him since. I'll pass it along when it arrives. Hmmm, it must've lost in the fight for survival across the net, which may explain why I never heard from you, again, either. :-) Here's the code, with added table lookup optimization and support for those who wish to scan the preprocessor directives since the first issue. This was some work to get right, so I haven't placed it in the public domain. Some readers of these newsgroups may frown upon that, but if you don't like it the way I did it, or my reasons for copyrighting my work, roll your own. I'd appreciate credit if you glean principles of operation from me. Otherwise, people are free to use this for non-commercial purposes. (If you wish to use this in production code, please drop me a line.) This is all original code. The intent is that if the C compiler finds a string, this will find it, too. ---------------------------------------------------------------------- /* * State machine to trivially parse C code and find strings. * * Copyright Erik Naggum, 1991. Contact: * * Permission to use this code for any non-commercial purpose is * hereby granted, provided that the above copyright notice and * this permission clause are included unchanged. Please inform * the author if you plan to use this code in an article or * otherwise publish it. If you change the code, indicate which * sections are yours and which are mine in an unambiguous manner. */ /* * This code is written with portability in mind, following POSIX.1 [1] * and the ANSI C standard [2]. No system dependent features are used. * * [1] IEEE 1003.1-1990, or ISO 9945-1:1990 * [2] ANSI X3.159(1989), or ISO 9989:1990 */ /* * Usage: [file ...] */ #include /* * NOTE: * If you wish to find strings also in the preprocessor lines, * define the PREPROC_STRINGS macro when compiling. The code is * intended to be used on preprocessed output, as ANSI C macros * may contain ## (pasting) and # (stringifying) operators, which * are both hard to handle without extensive support for the * preprocessor, much like the preprocessor itself... * * It's decidedly non-trivial to make this decision run-time. */ /* * Names of states. Order is not significant. */ typedef enum { Code, #if !defined (PREPROC_STRINGS) New_Line, Preprocessor, #endif String, String_Escaped, Char, Char_Escaped, Enter_Comment, Comment, Exit_Comment, Line_Input, Line_Escape, } state; struct transition; /* * Prototype declarations. */ int enter_string (char); int exit_string (char); int pass (char); int passesc (char); int rescan (char); int write_char (char); void engine (FILE *); void process(struct transition *, state *, char); /* * Magic code which matches any character when scanning transition table */ #define ANY -1 /* * Distance value for Code state (with and without PREPROC_STRINGS defined) */ #if !defined(PREPROC_STRINGS) # define CODE_DIST 5 #else # define CODE_DIST 4 #endif /* * NOTES: * (1) FSA may be optimized by sorting states by frequency of occurrence * (2) All states must have a match for all characters (use ANY to fill) * (3) Don't change the `rescan' dispositions */ struct transition { state this_state; /* state in which condition applies */ int condition; /* transition condition (match) */ state next_state; /* new state if condition true */ int (*disposition)(char); /* action before next state */ int distance; /* optimized table search support */ } code_transitions[] = { Code, '"', String, enter_string, CODE_DIST, Code, '\'', Char, NULL, 0, Code, '/', Enter_Comment, NULL, 0, #if !defined(PREPROC_STRINGS) Code, '\n', New_Line, NULL, 0, #endif Code, ANY, Code, NULL, 0, Comment, '*', Exit_Comment, NULL, 2, Comment, ANY, Comment, NULL, 0, #if !defined (PREPROC_STRINGS) New_Line, '#', Preprocessor, NULL, 2, New_Line, ANY, Code, rescan, 0, Preprocessor, '\n', Code, NULL, 2, Preprocessor, ANY, Preprocessor, NULL, 0, #endif String, '"', Code, exit_string, 4, String, '\n', Code, exit_string, 0, String, '\\', String_Escaped, write_char, 0, String, ANY, String, write_char, 0, String_Escaped, ANY, String, write_char, 1, Char, '\'', Code, NULL, 4, Char, '\\', Char_Escaped, NULL, 0, Char, '\n', Code, NULL, 0, Char, ANY, Char, NULL, 0, Char_Escaped, ANY, Code, NULL, 1, Enter_Comment, '*', Comment, NULL, 2, Enter_Comment, ANY, Code, rescan, 0, Exit_Comment, '/', Code, NULL, 2, Exit_Comment, ANY, Comment, NULL, 0, }; /* * Separate FSA for the input line handling, to avoid multiplicity of * states in the code_transitions. Also reflects ANSI C translation * phase 2. (Phase 1 is assumed to be empty.) */ struct transition line_transitions[] = { Line_Input, '\\', Line_Escape, NULL, 2, Line_Input, ANY, Line_Input, pass, 0, Line_Escape, '\n', Line_Input, NULL, 2, Line_Escape, ANY, Line_Input, passesc, 0, }; static state line_state; static state code_state; static char *input_file; /* * Process arguments (file names) and options (none at present) */ void main (int argc, char *argv[]) { int argp; /* * Not very elegant way to force standard input in the absence of * arguments. Suggestions for improvements are welcome. */ static char *no_files[] = { "", "-", NULL }; if (argc <= 1) argv = no_files; for (argp = 1; argv[argp]; argp++) { FILE *input; input_file = argv[argp]; if (strcmp (input_file, "-") == 0) { engine (stdin); continue; } input = fopen (input_file, "r"); if (input == NULL) { perror (input_file); continue; } engine (input); fclose (input); } exit (0); } /* * Initialize FSA and process input file */ void engine (FILE *input) { int c; line_state = Line_Input; #if !defined(PREPROC_STRINGS) code_state = New_Line; #else code_state = Code; #endif while ((c = fgetc(input)) != EOF) process (line_transitions, &line_state, c); } /* * Line handler - pass character to code handler */ int pass (char c) { process (code_transitions, &code_state, c); return 0; } /* * Line handler - pass escape when not followed by newline, * then rescan the input character. */ int passesc (char c) { process (code_transitions, &code_state, '\\'); return 1; } /* * Cause current character to be reused in new state */ int rescan (char c) { return 1; } /* * We have entered a string -- print file name */ int enter_string (char c) { printf ("%s:\t%c", input_file, c); return 0; } /* * The string is terminated -- finish string processing */ int exit_string (char c) { if (c == '\n') printf ("\" (unterminated)\n"); else printf ("%c\n", c); return 0; } /* * In string -- print each character out */ int write_char (char c) { putchar (c); return 0; } /* * The heart of the Finite State Automaton * * Note the rescan support. This code would have been cleaner without * it, but other code would suffer. Better to have it one place, only. * */ void process (struct transition *transitions, state *in_state, char c) { struct transition *work; do { work = transitions; while (work -> this_state != *in_state) work += work -> distance; while (work -> condition != c && work -> condition != ANY) ++ work; *in_state = work -> next_state; if (work -> disposition == NULL) break; } while (work -> disposition (c)); } ---------------------------------------------------------------------- Enjoy! Bug fixes, if any, will be appreciated. -- [Erik Naggum] Naggum Software, Oslo, Norway -- Send compilers articles to compilers@iecc.cambridge.ma.us or {ima | spdcc | world}!iecc!compilers. Meta-mail to compilers-request. Brought to you by Super Global Mega Corp .com