"The C Programming Language", 2nd edition, Kernighan and Ritchie

Answer to Exercise 1-23, page 34

Solutions by

Rick Dearman

Ben Pfaff (two solutions)

Lew Pitcher

Gregory Pietsch

Chris Torek

Chris Mears


Critique by

Rick Litherland

(see end of page)



Write a program to remove all comments from a C program. Don't forget to handle quoted strings and character constants properly. C comments do not nest.


This was the first exercise to be posted as a fun "competition" on comp.lang.c, on 1 June 2000. As a result, there was a small flurry of submissions. Not all of them are completely working solutions. See the very end of this page for a test program which breaks most of them. :-)


Category 0 Solutions

From Rick Dearman

Now handles "/* comment in string */" correctly, but does not remove the comment from
  return /* comment inside return statement */ 0;


/******************************************************
"Write a program to remove all comments from a C program. 
Don't forget to handle quoted strings and character 
constants properly. C comments do not nest."

Author: Rick Dearman (rick@ricken.demon.co.uk) 
******************************************************/

#include <stdio.h>

#define MAXLINE 1000 /* max input line size */
char line[MAXLINE]; /*current input line*/

int getline(void);  /* taken from the KnR book. */


/*
 * main: copy standard input to standard output one line at a time
 * (lines come from getline() in the global buffer `line`), skipping the
 * text between comment delimiters.
 *
 * Two flags persist from one line to the next:
 *   in_comment - nonzero while inside a comment
 *   in_quote   - nonzero once a double quote has been seen
 *
 * NOTE(review): in_quote is set to 1 at the first '"' and nothing below
 * ever clears it, so from that character to the end of the input every
 * character is echoed unchanged, comments included.  The quote test also
 * runs while in_comment is set, and character constants and backslash
 * escapes are not examined at all.
 */
int
main()
{
  int in_comment,len;
  int in_quote;
  int t;
  
  in_comment = in_quote = t = 0;
  while ((len = getline()) > 0 )
    {
      t=0;
      /* t indexes the current line; every path below advances it */
      while(t < len)
        {
	  if( line[t] == '"')
		in_quote = 1;

	  if( ! in_quote )
	  {
		/* comment opener: step over both characters */
          	if( line[t] == '/' && line[t+1] == '*')
            	{
              		t=t+2;
              		in_comment = 1;
            	}
		/* comment closer: tested right after the opener test, so an
		   empty comment is opened and closed in one pass */
          	if( line[t] == '*' && line[t+1] == '/')
            	{
              		t=t+2;
              		in_comment = 0;
            	}
		/* inside a comment: drop the character; otherwise echo it */
          	if(in_comment == 1)
           	 {
              		t++;
            	}
          	else
            	{
              		printf ("%c", line[t]);
              		t++;
            	}
	  } 
	  else
	  {
	      /* a quote has been seen: echo verbatim */
              printf ("%c", line[t]);
              t++;
	  }
        }
    }
  return 0;
}


/* getline: read the next piece of input into the global buffer `line`.
   At most MAXLINE-1 characters are stored, the newline included, so a
   longer line is handed back in several pieces.  The buffer is always
   '\0'-terminated.  Returns the number of characters stored; 0 means
   end of input. */
int getline(void)
{
  extern char line[];
  int ch, n;

  n = 0;
  while (n < MAXLINE-1 && (ch = getchar()) != EOF && ch != '\n')
    {
      line[n] = ch;
      ++n;
    }
  /* keep the newline so the caller can echo it */
  if (ch == '\n')
    {
      line[n] = ch;
      ++n;
    }
  line[n] = '\0';
  return n;
}




From Ben Pfaff

This version includes a bug fix for code such as var/'\2' (a slash immediately followed by a quote).

/* K&R2 1-23: Write a program to remove all comments from a C program.
   Don't forget to handle quoted strings and character constants
   properly.  C comments do not nest.

   This solution does not deal with other special cases, such as
   trigraphs, line continuation with \, or <> quoting on #include,
   since these aren't mentioned up 'til then in K&R2.  Perhaps this is
   cheating.

   Note that this program contains both comments and quoted strings of
   text that looks like comments, so running it on itself is a
   reasonable test.  It also contains examples of a comment that ends
   in a star and a comment preceded by a slash.  Note that the latter
   will break C99 compilers and C89 compilers with // comment
   extensions.

   Interface: The C source file is read from stdin and the
   comment-less output is written to stdout. **/

#include <stdio.h>

/* Copy standard input to standard output with C comments removed.

   A character-at-a-time state machine.  A slash in program text is held
   back (state SLASH) until the next character shows whether it opens a
   comment; text inside string literals and character constants is copied
   verbatim, including anything that looks like a comment.  Comments are
   deleted outright; nothing is written in their place.

   Returns 0. */
int
main(void)
{
#define PROGRAM 0
#define SLASH 1
#define COMMENT 2
#define STAR 3
#define QUOTE 4
#define LITERAL 5

    /* State machine's current state, one of the above values. */
    int state;

    /* If state == QUOTE or LITERAL, then ' or ".  Otherwise, unused. */
    int quote;

    /* Input character. */
    int c;

    state = PROGRAM;
    quote = 0;
    while ((c = getchar()) != EOF) {
        /* The following cases are in guesstimated order from most common
           to least common. */
        if (state == PROGRAM || state == SLASH) {
            if (state == SLASH) {
                /* Program text following a slash. */
                if (c == '*')
                    state = COMMENT;
                else {
                    /* The held-back slash was ordinary text.  Emit it and
                       let the PROGRAM test below examine c, which may be
                       a quote or another slash. */
                    putchar('/');
                    state = PROGRAM;
                }
            }

            if (state == PROGRAM) {
                /* Program text. */
                if (c == '\'' || c == '"') {
                    quote = c;
                    state = QUOTE;
                    putchar(c);
                }
                else if (c == "/*"[0])
                    state = SLASH;
                else
                    putchar(c);
            }
        }
        else if (state == COMMENT) {
            /* Comment. */
            if (c == "/*"[1])
                state = STAR;
        }
        else if (state == QUOTE) {
            /* Within quoted string or character constant. */
            putchar(c);
            if (c == '\\')
                state = LITERAL;
            else if (c == quote)
                state = PROGRAM;
        }
        else if (state == STAR) {
            /* Comment following a star. */
            if (c == '/')
                state = PROGRAM;
            else if (c != '*')
                state = COMMENT;
        }
        else /* state == LITERAL */ {
            /* Within quoted string or character constant, following \. */
            putchar(c);
            state = QUOTE;
        }
    }

    /* Input ended on a held-back slash: write it.  The argument is a
       deliberate self-test, a slash followed directly by an empty
       comment.  It reads as '/' / 1 under C89 rules and does not compile
       where // starts a comment. */
    if (state == SLASH)
        putchar('/' //**/
                1);

    return 0;
}

/* 
   Local variables:
   compile-command: "checkergcc -W -Wall -ansi -pedantic knr123-0.c -o knr123-0"
   End: 
*/






From Lew Pitcher

/* Lew Pitcher <lpitcher@yesic.com> */

/*/
** derem - remove C comments
**
** (attempt to solve K&R Exercise 1-22)
**
** As I only have v1 copy of K&R, I cannot
** be sure what is covered in K&R ANSI chapter 1.
** So, I restrict myself to the components covered
** in K&R v1 chapter 1, but modified for requisite ANSI
** features (int main() and return value).
**
** Components covered in v1 K&R chapter 1 include:
**  while (), for (), if () else
**  getchar(), putchar(), EOF
**  character constants, character escapes
**  strings
**  array subscripting
**
** Not directly covered are
**  string subscripting ( "/*"[0] )
**  initializers ( int state = PROGRAM; )
**/

/*/*/

#include <stdio.h>

#define	PROGRAM		0
#define	BEGIN_COMMENT	1
#define	COMMENT		2
#define	END_COMMENT	3
#define	QUOTE		4

/*
** main - copy standard input to standard output, dropping C comments.
**
** A '/' in program text is held back (BEGIN_COMMENT) until the next
** character shows whether it opens a comment.  If it does not, the
** slash is written and that next character is examined as ordinary
** program text, so a quote or a second slash directly after a slash
** is handled like any other.  Text inside string literals and
** character constants is copied untouched.
**
** Returns 0.
*/
int main(void)
{
	int this_char, quote_char;
	int state;

	state = PROGRAM;
	quote_char = 0;		/* only meaningful while state == QUOTE */

	while ((this_char = getchar()) != EOF)
	{
		if (state == PROGRAM || state == BEGIN_COMMENT)
		{
			if (state == BEGIN_COMMENT)
			{
				if (this_char == '*')
					state = COMMENT;
				else
				{
					putchar('/');	/* the held-back '/' was plain text */
					state = PROGRAM;	/* this_char is examined below */
				}
			}

			if (state == PROGRAM)
			{
				if (this_char == '/')
					state = BEGIN_COMMENT;
				else if ((this_char == '"') || (this_char == '\''))
				{
					state = QUOTE;
					putchar(quote_char = this_char);
				}
				else	putchar(this_char);
			}
		}
		else if (state == QUOTE)
		{
			putchar(this_char);
			if (this_char == '\\')
			{
				/* escaped character: copy it without looking at it,
				** but never write EOF as if it were a character
				*/
				this_char = getchar();
				if (this_char != EOF)
					putchar(this_char);
			}
			else if (this_char == quote_char)
				state = PROGRAM;
		}
		else if (state == COMMENT)
		{
			if (this_char == '*')
				state = END_COMMENT;
		}
		else if (state == END_COMMENT)
		{
			if (this_char == '/')
				state = PROGRAM;
			else if (this_char != '*')	/* stuttered */
				state = COMMENT;
		}
	}

	/* input ended on a held-back slash: it was program text */
	if (state == BEGIN_COMMENT)
		putchar('/');

	return 0;
}




From Gregory Pietsch

/* Gregory Pietsch <gkp1@flash.net> */

#include <stdio.h>

/* State-transition table, read by main() four characters at a time:
 *
 *   [0] current state, '0'..'7'
 *   [1] input character to match; '.' matches any character
 *   [2] action: '0' echo the input character,
 *               '/' write a slash and then the input character,
 *               ' ' write a single blank,
 *               anything else (here '!') write nothing
 *   [3] next state
 *
 * The first rule that matches is used, so each state's '.' catch-all
 * comes last.  Reading the rules, the states are: 0 program text,
 * 1 after '/', 2 inside a comment, 3 inside a comment after '*',
 * 4 inside a string, 5 inside a character constant, 6 and 7 after a
 * backslash inside a string / character constant.
 */
char p[] =
"0/!10\"040\'050.001/011*!21\"/41\'/51./02*!32.!23/ "
"03*!33.!24\"004\\064.045\'005\\075.056.047.05";

/* Table-driven comment stripper: for each input character, find the
   first rule in p[] whose state matches the current one and whose
   character matches the input (or is the wildcard '.'), perform the
   rule's output action, and move to the rule's next state. */
int main()
{
    int c, i, d;
    char s, n;

    s = '0';
    while ((c = getchar()) != EOF) {
        d = 0;
        i = 0;
        while (p[i] != '\0' && d == 0) {
            if (p[i] == s && (p[i+1] == c || p[i+1] == '.')) {
                if (p[i+2] == '0')
                    putchar(c);
                else if (p[i+2] == '/') {
                    putchar('/');
                    putchar(c);
                }
                else if (p[i+2] == ' ')
                    putchar(' ');
                n = p[i+3];
                d = 1;
            }
            i = i + 4;
        }
        s = n;
    }
    return 0;
}





Category 1 Solutions

From Ben Pfaff (again)

This version has the var/'\2' bug fix.

/* K&R2 1-23: Write a program to remove all comments from a C program.
   Don't forget to handle quoted strings and character constants
   properly.  C comments do not nest.

   This solution does not deal with other special cases, such as
   trigraphs, line continuation with \, or <> quoting on #include,
   since these aren't mentioned up 'til then in K&R2.  Perhaps this is
   cheating.

   Note that this program contains both comments and quoted strings of
   text that looks like comments, so running it on itself is a
   reasonable test.  It also contains examples of a comment that ends
   in a star and a comment preceded by a slash.  Note that the latter
   will break C99 compilers and C89 compilers with // comment
   extensions.

   Interface: The C source file is read from stdin and the
   comment-less output is written to stdout. **/

#include <stdio.h>
#include <stdlib.h>

/* Copy standard input to standard output with C comments removed.

   A character-at-a-time state machine.  A slash in program text is held
   back (state SLASH) until the next character shows whether it opens a
   comment.  Each comment is replaced by a single blank, written when its
   closing star-slash is seen, so the tokens on either side of it stay
   separate.  Text inside string literals and character constants is
   copied verbatim.

   Returns 0. */
int
main(void)
{
    /* State machine's current state. */
    enum {
        PROGRAM,
        SLASH,
        COMMENT,
        STAR,
        QUOTE,
        LITERAL
    } state;

    /* If state == QUOTE or LITERAL, then ' or ".  Otherwise, unused. */
    int quote;

    state = PROGRAM;
    quote = 0;
    for (;;) {
        int c = getchar();
        if (c == EOF) {
            /* Input ended on a held-back slash: write it.  The argument
               is a deliberate self-test, a slash followed directly by an
               empty comment and a slash followed directly by a quote.
               It evaluates to '/' under C89 rules and does not compile
               where // starts a comment. */
            if (state == SLASH)
                putchar('/' //**/
                        1 / 1 /'\1');
            break;
        }

        switch (state) {
        case SLASH:
            /* Program text following a slash. */
            if (c == "/*"[1]) {
                state = COMMENT;
                break;
            }
            putchar('/');
            state = PROGRAM;
            /* Fall through. */

        case PROGRAM:
            /* Program text. */
            if (c == '\'' || c == '"') {
                quote = c;
                state = QUOTE;
                putchar(c);
            }
            else if (c == "/*"[0])
                state = SLASH;
            else
                putchar(c);
            break;

        case COMMENT:
            /* Comment. */
            if (c == '*')
                state = STAR;
            break;

        case STAR:
            /* Comment following a star. */
            if (c == '/') {
                /* End of comment: stand in for it with one blank. */
                state = PROGRAM;
                putchar(' ');
            }
            else if (c != '*')
                state = COMMENT;
            break;

        case QUOTE:
            /* Within quoted string or character constant. */
            putchar(c);
            if (c == '\\')
                state = LITERAL;
            else if (c == quote)
                state = PROGRAM;
            break;

        case LITERAL:
            /* Within quoted string or character constant, following \. */
            putchar(c);
            state = QUOTE;
            break;

        default:
            /* Not reachable while state holds one of the values above. */
            abort();
        }
    }

    return 0;
}

/* 
   Local variables:
   compile-command: "checkergcc -W -Wall -ansi -pedantic knr123.c -o knr123"
   End: 
*/





From Chris Torek

/* torek@elf.bsdi.com (Chris Torek) */

/*
"Write a program to remove all comments from a C program. Don't forget
to handle quoted strings and character constants properly. C comments do
not nest."

Well, what the heck.  I mailed this a day or two ago, but here is
the posted version.  I modified the problem a bit: it removes
comments from full ANSI C89 or C99 programs, handling trigraphs
and \-newline sequences.  It attempts to preserve any trigraphs in
the output, even while examining them in the "C code" as their
translated characters.  (I am not sure why I bothered doing all of
them, when only ??/ matters here.)  It keeps output line numbers in
sync with input line numbers, so that if the output is compiled,
any error messages will refer back to the proper input source line.

Lightly tested.
*/

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * This flag controls whether we do trigraph processing.
 */
int	trigraphs = 1;

/*
 * This flag controls whether a comment becomes "whitespace" (ANSI C)
 * or "nothing at all" (some pre-ANSI K&R C compilers).
 */
int	whitespace = 1;

/*
 * This flag controls whether we do C89 or C99.  (C99 also handles C++.)
 */
int	c99;

/*
 * These are global so that options() can get at them, and for later
 * error messages if needed.
 */
const char *inname, *outname;

int options(const char *, char **);
void usage(void);

void	process(FILE *, FILE *);

#ifdef __GNUC__
void	panic(const char *) __attribute__((noreturn));
#else
void	panic(const char *);
#endif

/*
 * Parse the command line, open the input and output (standard input
 * and standard output by default), run process() over them, and exit.
 *
 * Exits with EXIT_FAILURE if a file cannot be opened, or if a read or
 * write error is detected on either stream once processing is done
 * (including a failure reported by fclose() on the output, which is
 * where buffered data is finally written); EXIT_SUCCESS otherwise.
 */
int main(int argc, char **argv) {
	int i, status, failed;
	FILE *in, *out;

	for (i = 1; i < argc; i++) {
		if (argv[i][0] == '-')
			/* options() reports how many following args it used */
			i += options(argv[i] + 1, argv + i + 1);
		else if (inname == NULL)
			inname = argv[i];
		else
			usage();
	}
	if (inname != NULL) {
		if ((in = fopen(inname, "r")) == NULL) {
			fprintf(stderr, "cannot open %s for reading\n", inname);
			exit(EXIT_FAILURE);
		}
	} else {
		inname = "stdin";
		in = stdin;
	}
	if (outname != NULL) {
		if ((out = fopen(outname, "w")) == NULL) {
			fprintf(stderr, "cannot open %s for writing\n",
			    outname);
			exit(EXIT_FAILURE);
		}
	} else {
		outname = "stdout";
		out = stdout;
	}
	process(in, out);

	status = EXIT_SUCCESS;
	if (ferror(in)) {
		fprintf(stderr, "error reading %s\n", inname);
		status = EXIT_FAILURE;
	}
	fclose(in);

	/* The error flag must be read before fclose(); the stream cannot
	 * be queried afterwards. */
	failed = ferror(out);
	if (fclose(out) != 0)
		failed = 1;
	if (failed) {
		fprintf(stderr, "error writing %s\n", outname);
		status = EXIT_FAILURE;
	}
	exit(status);
}

/*
 * Handle one cluster of option letters (the text after a '-').
 *
 *	o	set outname: to the rest of this cluster if there is any
 *		(-ofoo), otherwise to the next argument (-o foo)
 *	t	turn trigraph processing off
 *	w	turn comment-to-whitespace replacement off
 *	9	select C99 mode
 *
 * Anything else, or an 'o' with no name available, calls usage().
 *
 * moreargs points at the arguments following this cluster.  The return
 * value is how many of them were consumed, so the caller can skip them.
 */
int options(const char *afterdash, char **moreargs) {
	int consumed = 0;
	int opt;

	for (;;) {
		opt = *afterdash++;
		if (opt == '\0')
			break;

		switch (opt) {
		case 'o':
			if (*afterdash != '\0') {
				/* name is glued on; it ends the cluster */
				outname = afterdash;
				afterdash = "";
			} else if (moreargs[consumed] != NULL) {
				outname = moreargs[consumed];
				consumed++;
			} else {
				usage();
			}
			break;
		case 't':
			trigraphs = 0;
			break;
		case 'w':
			whitespace = 0;
			break;
		case '9':
			c99 = 1;
			break;
		default:
			usage();
			break;
		}
	}
	return consumed;
}

/* Print the command synopsis on stderr and exit with failure status. */
void usage(void) {
	fputs("usage: uncomment [-9tw] [-o outfile] [infile]\n", stderr);
	exit(EXIT_FAILURE);	/* ??? */
}

/*
 * States, level 0:
 *	normal
 *	trigraph processing: Q1 Q2 (for ??x)
 *
 * States, level 1:
 *	backslash-newline processing: BACK (seen \, may consume NL)
 *
 * States, level 2:
 *	normal
 *	character constant: CC (seen '), CCBACK (seen \ inside CC)
 *	string constant: SC, SCBACK
 *	comment: SLASH, COMM, COMMSTAR (for /, in-comment, & seen-star)
 *	C99: SLASHSLASH
 */

/* Level 0: trigraph recognition, used by getl0char(). */
enum l0state {
	L0_NORMAL,		/* no '?' pending */
	L0_Q1, L0_Q2		/* one '?' seen / two '?'s seen */
};
/* Level 1: backslash-newline handling, used by process(). */
enum l1state {
	L1_NORMAL,
	L1_BACK			/* seen a backslash; a newline next is eaten */
};
/* Level 2: the C-level scanner in process(). */
enum l2state {
	L2_NORMAL,			/* ordinary program text */
	L2_CC, L2_CCBACK,		/* in character constant / after \ in one */
	L2_SC, L2_SCBACK,		/* in string constant / after \ in one */
	L2_SLASH, L2_COMM, L2_COMMSTAR,	/* seen /, in comment, seen * in comment */
	L2_SLASHSLASH			/* in a //-comment (C99 mode) */
};

/*
 * Input-side state shared by process(), getl0char(), pushback() and warn().
 */
struct state {
	FILE *in;		/* stream that getl0char() reads */
	enum l0state l0state;	/* trigraph state carried between getl0char() calls */
	int npushback;		/* number of entries in use in the two arrays below */
	char pushback[4];	/* pushed-back characters; process() pops the newest first */
	char pushorig[4];	/* nonzero => trigraph pushback */
	int lastgetc;		/* previous value read with getc(); a '\n' here bumps lineno on the next read */
	int lineno;		/* current input line, starting at 1; printed by warn() */
};

/*
 * Put *sp into its starting condition for reading from `in`: line 1,
 * nothing read yet, nothing pushed back, no trigraph in progress.
 */
static void state0(struct state *sp, FILE *in) {
	sp->lineno = 1;
	sp->lastgetc = 0;
	sp->npushback = 0;
	sp->l0state = L0_NORMAL;
	sp->in = in;
}

/*
 * Push character c back onto sp's pushback stack, along with origc
 * (nonzero if c was spelled as a trigraph ending in origc).  The stack
 * must not already be full.
 */
static void pushback(struct state *sp, int c, char origc) {
	int slot;

	assert(sp->npushback < sizeof sp->pushback);
	slot = sp->npushback;
	sp->pushback[slot] = c;
	sp->pushorig[slot] = origc;
	sp->npushback = slot + 1;
}

/*
 * Get a character, doing trigraph processing.  Set *origc to 0 for normal
 * characters, or the actual input character pre-trigraph-mapping
 * for trigraph input.
 *
 * As a side effect, this can wind up getting up to 3 characters, maybe
 * stuffing two of them into the pushback buffer sp->pushback[].  It also
 * bumps sp->lineno when a previously-read newline has been passed over.
 *
 * Returns the (possibly mapped) character, or EOF at end of input.  If
 * the input ends while one or two '?'s are still being held as a
 * possible trigraph, they are delivered as ordinary '?'s before EOF is
 * reported.
 */
static int getl0char(struct state *sp, char *origc) {
	int c, newc;
	enum l0state state;

	state = sp->l0state;
	*origc = 0;
	while ((c = getc(sp->in)) != EOF) {
		if (sp->lastgetc == '\n')
			sp->lineno++;
		sp->lastgetc = c;
		switch (state) {

		case L0_NORMAL:
			/* ? => get another character; otherwise we are ok */
			if (c == '?') {
				state = L0_Q1;
				continue;
			}
			assert(sp->l0state == L0_NORMAL);
			return c;

		case L0_Q1:
			/* ?? => get another character */
			if (c == '?') {
				state = L0_Q2;
				continue;
			}
			/* ?X => return ?, look at X later */
			pushback(sp, c, 0);
			sp->l0state = L0_NORMAL;
			return '?';

		case L0_Q2:
			/*
			 * ??X, where X is trigraph => map
			 * ??X, where X is non-trigraph => tricky
			 * ??? => also tricky
			 */
			switch (c) {
			case '=':
				newc = '#';
				break;
			case '(':
				newc = '[';
				break;
			case '/':
				newc = '\\';
				break;
			case ')':
				newc = ']';
				break;
			case '\'':
				newc = '^';
				break;
			case '<':
				newc = '{';
				break;
			case '!':
				newc = '|';
				break;
			case '>':
				newc = '}';
				break;
			case '?':
				/*
				 * This one is slightly tricky.  Three '?'s
				 * mean that the '?' we read two characters
				 * ago gets returned, and the two remaining
				 * '?'s leave us in Q2 state.
				 */
				sp->l0state = L0_Q2;
				return '?';
			default:
				/*
				 * This one returns the first ?, leaves
				 * the second ? to be re-examined, and
				 * leaves the last character to be re-examined.
				 * In any case we are back in "normal" state.
				 */
				pushback(sp, c, 0);
				pushback(sp, '?', 0);
				sp->l0state = L0_NORMAL;
				return '?';
			}
			/* mapped a trigraph char -- return new char */
			*origc = c;
			sp->l0state = L0_NORMAL;
			return newc;

		default:
			panic("getl0char state");
		}
	}
	sp->lastgetc = EOF;

	/*
	 * End of input.  Any '?'s we were holding cannot become a
	 * trigraph now, so hand them over as plain characters: one
	 * directly, and in the Q2 case a second via the pushback buffer.
	 * The caller then comes back here and getc() reports EOF again.
	 */
	sp->l0state = L0_NORMAL;
	if (state == L0_Q1)
		return '?';
	if (state == L0_Q2) {
		pushback(sp, '?', 0);
		return '?';
	}
	return EOF;
}

void warn(struct state *, const char *);

/*
 * Copy `in` to `out` with comments removed.
 *
 * Each character passes through three layers: trigraph mapping
 * (getl0char(), skipped when the global `trigraphs` is zero), then
 * backslash-newline removal, then the C-level scanner that tracks
 * character constants, string constants and comments.  Newlines that
 * are swallowed (inside comments or after a backslash) are counted in
 * pendnls and written out later, so output line numbers stay in step
 * with the input.  A comment is replaced by one blank when the global
 * `whitespace` is set and the comment swallowed no newline.  A
 * character that arrived as a trigraph is written back as a trigraph.
 *
 * Problems such as an unterminated constant or comment are reported
 * through warn(); processing continues.
 */
void process(FILE *in, FILE *out) {
	enum l1state l1state = L1_NORMAL;
	enum l2state l2state = L2_NORMAL;
	int c, pendnls;
	char origc, backc;
	struct state state;

	state0(&state, in);
	pendnls = 0;
	backc = 0;		/* defeat gcc warning */

	/*
	 * Slight sort-of-bug: files ending in \ cause two "final" getc()s.
	 */
	do {
		if (state.npushback) {
			c = state.pushback[--state.npushback];
			origc = state.pushorig[state.npushback];
		} else if (trigraphs) {
			c = getl0char(&state, &origc);
		} else {
			c = getc(in);
			origc = 0;
			if (state.lastgetc == '\n')
				state.lineno++;
			state.lastgetc = c;
		}

		/*
		 * Do backslash-newline processing.
		 */
		switch (l1state) {

		case L1_NORMAL:
			if (c == '\\') {
				l1state = L1_BACK;
				backc = origc;
				continue;
			}
			break;

		case L1_BACK:
			/*
			 * If backc is nonzero here, the backslash that
			 * got us into this state was spelled ??/ --
			 * if we eat a newline (and hence the backslash),
			 * we forget that the eaten newline was spelled
			 * this way.  This is sort of a bug, but so it goes.
			 */
			l1state = L1_NORMAL;
			if (c == '\n') {
				pendnls++;
				continue;
			}
			/*
			 * Not a line splice: look at this character again
			 * later and deliver the backslash now.
			 */
			if (c != EOF)
				pushback(&state, c, origc);
			c = '\\';
			origc = backc;
			break;

		default:
			panic("bad l1state");
		}

		/*
		 * Now ready to do "C proper" processing.
		 */
#define	SYNCLINES()	while (pendnls) putc('\n', out), pendnls--
#define	OUTPUT(ch, tri) ((tri) ? fprintf(out, "??%c", tri) : putc(ch, out))
#define	COPY()		OUTPUT(c, origc)

		switch (l2state) {
		case L2_NORMAL:
			switch (c) {
			case '\'':
				l2state = L2_CC;
				break;
			case '"':
				l2state = L2_SC;
				break;
			case '/':
				/* hold the slash until we see what follows */
				l2state = L2_SLASH;
				continue;
			default:
				break;
			}
			SYNCLINES();
			if (c != EOF)
				COPY();
			break;

		case L2_CC:
			switch (c) {
			case EOF:
				warn(&state, "EOF in character constant");
				break;
			case '\n':
				warn(&state, "newline in character constant");
				break;
			case '\\':
				l2state = L2_CCBACK;
				break;
			case '\'':
				l2state = L2_NORMAL;
				break;
			default:
				break;
			}
			if (c != EOF)
				COPY();
			break;

		case L2_CCBACK:
			switch (c) {
			case EOF:
				warn(&state, "EOF in character constant");
				break;
			case '\n':
				warn(&state, "newline in character constant");
				break;
			default:
				break;
			}
			l2state = L2_CC;
			if (c != EOF)
				COPY();
			break;

		case L2_SC:	/* much like CC */
			switch (c) {
			case EOF:
				warn(&state, "EOF in string constant");
				break;
			case '\n':
				warn(&state, "newline in string constant");
				break;
			case '\\':
				l2state = L2_SCBACK;
				break;
			case '"':
				l2state = L2_NORMAL;
				break;
			default:
				break;
			}
			if (c != EOF)
				COPY();
			break;

		case L2_SCBACK:
			switch (c) {
			case EOF:
				warn(&state, "EOF in string constant");
				break;
			case '\n':
				warn(&state, "newline in string constant");
				break;
			default:
				break;
			}
			l2state = L2_SC;
			if (c != EOF)
				COPY();
			break;

		case L2_SLASH:
			if (c == '*')
				l2state = L2_COMM;
			else if (c99 && c == '/')
				l2state = L2_SLASHSLASH;
			else {
				/* the held slash was ordinary program text */
				SYNCLINES();
				OUTPUT('/', 0);
				/*
				 * Another slash: stay here, it is now the
				 * one being held.  Anything else is copied,
				 * and a quote must also open its constant,
				 * just as it would in L2_NORMAL.
				 */
				if (c != '/') {
					if (c != EOF)
						COPY();
					if (c == '\'')
						l2state = L2_CC;
					else if (c == '"')
						l2state = L2_SC;
					else
						l2state = L2_NORMAL;
				}
			}
			break;

		case L2_COMM:
			switch (c) {
			case '*':
				l2state = L2_COMMSTAR;
				break;
			case '\n':
				pendnls++;
				break;
			case EOF:
				warn(&state, "EOF inside comment");
				break;
			}
			break;

		case L2_COMMSTAR:
			switch (c) {
			case '/':
				l2state = L2_NORMAL;
				/*
				 * If comments become whitespace,
				 * and we have no pending newlines,
				 * must emit a blank here.
				 *
				 * The comment text is now all eaten.
				 */
				if (whitespace && pendnls == 0)
					putc(' ', out);
				SYNCLINES();
				break;
			case '*':
				/* stay in L2_COMMSTAR state */
				break;
			case EOF:
				warn(&state, "EOF inside comment");
				break;
			case '\n':
				pendnls++;
				/* FALLTHROUGH */
			default:
				l2state = L2_COMM;
			}
			break;

		case L2_SLASHSLASH:
			switch (c) {
			case EOF:
				/* ??? do we really care? */
				warn(&state, "EOF inside //-comment");
				break;
			case '\n':
				l2state = L2_NORMAL;
				pendnls++;	/* cheesy, but... */
				SYNCLINES();
				/* FALLTHROUGH */
			default:
				break;
			}
			break;

		default:
			panic("bad l2state");
		}
	} while (c != EOF);
	SYNCLINES();
}

/*
 * Report msg on stderr, tagged with the input name and the line number
 * currently recorded in *sp.
 */
void warn(struct state *sp, const char *msg) {
	int line = sp->lineno;

	fprintf(stderr, "uncomment: %s(%d): %s\n", inname, line, msg);
}

/*
 * Internal-error exit: print msg on stderr and abort.  Does not return.
 */
void panic(const char *msg) {
	fputs("panic: ", stderr);
	fputs(msg, stderr);
	fputc('\n', stderr);
	abort();
	exit(EXIT_FAILURE);
}



From Chris Mears

Here's Chris's updated version, without the bugs (says he). :-)

/*
 * C comment stripper.
 *
 * Strips comments from C or C++ code.
 */

#include <stdio.h>

/* What kind of text cstrip() is currently inside: ordinary code, a string
 * literal, a character constant, a block comment or a line comment. */
enum state_t { normal, string, character, block_comment, line_comment};

/* What cstrip() saw most recently that may affect the next character:
 * nothing of interest, a backslash, a slash, a star, one '?', two '?'s,
 * or the trigraph that stands for a backslash ('?', '?', '/'). */
enum token_t { none, backslash, slash, star, tri1, tri2, tri_backslash};

/* Nonzero when characters read in state s belong in the output, i.e.
 * when s is anything but one of the two comment states. */
static int print_mode(enum state_t s)
{
        switch (s) {
        case normal:
        case string:
        case character:
                return 1;
        default:
                return 0;
        }
}

/*
 * Copy infile to outfile, leaving out block comments and line comments.
 *
 * Does nothing if either stream is null or if both are the same stream.
 *
 * `state` says what kind of text we are inside; `token` remembers the
 * previous significant character.  A '/' met in normal code is not
 * written straight away: it is remembered as token == slash and written
 * by whichever case handles the next character, unless that character
 * turns it into the start of a comment.
 *
 * A block comment is replaced by a newline if it contained one, by a
 * blank otherwise.  A line comment is dropped up to, but not including,
 * the newline that ends it.
 *
 * NOTE(review): nothing is written after the read loop, so a '/' still
 * held back when EOF arrives appears to be lost -- confirm.
 */
void cstrip(FILE *infile, FILE *outfile)
{
        int ch;
        int comment_newline = 0;
        enum state_t state = normal;
        enum token_t token = none;
        enum token_t last_token = none;
        
        if (!infile || !outfile || (infile == outfile)) {
                return;
        }


        while ((ch = fgetc(infile)) != EOF) {
                switch (ch) {
                case '/':
                        if (token == tri2) {
                                /* third character of the backslash trigraph */
                                token = tri_backslash;
                                if (print_mode(state))
                                        fputc(ch, outfile);
                        } else if (state == string || state == character) {
                                fputc(ch, outfile);
                                token = slash;
                        } else if (state == block_comment && token == star) {
                                /* star then slash: the block comment ends */
                                state = normal;
                                token = none;

                                /* Replace block comments with whitespace. */
                                if (comment_newline) {
                                        fputc('\n', outfile);
                                } else {
                                        fputc(' ', outfile);
                                }
                        } else if (state == normal && token == slash) {
                                /* second slash in a row: a line comment starts */
                                state = line_comment;
                                token = slash;
                        } else {
                                /* hold the slash; the next character decides */
                                token = slash;
                        }
                        
                        break;

                case '\\':
                        if (state == normal && token == slash)
                                fputc('/', outfile);
                        if (print_mode(state))
                                fputc(ch, outfile);

                        if (token == backslash || token == tri_backslash) {
                                /* escaped backslash: the pair is complete */
                                token = none;
                        } else {
                                /* remember what came before, in case a
                                 * newline follows (see case '\n') */
                                last_token = token;
                                token = backslash;
                        }
                                
                        break;

                case '"':
                        if (state == normal && token == slash)
                                fputc('/', outfile);
                        /* an unescaped double quote opens or closes a string */
                        if (state == string && token != backslash)
                                state = normal;
                        else if (state == normal && token != backslash)
                                state = string;

                        if (print_mode(state))
                                fputc(ch, outfile);

                        token = none;
                        
                        break;

                case '\'':
                        if (state == normal && token == slash)
                                fputc('/', outfile);
                        /* an unescaped single quote opens or closes a
                         * character constant */
                        if (state == character && token != backslash)
                                state = normal;
                        else if (state == normal && token != backslash)
                                state = character;

                        if (print_mode(state))
                                fputc(ch, outfile);

                        token = none;
                        
                        break;

                case '\n':
                        /* This test is independent of the others. */
                        if (state == block_comment)
                                comment_newline = 1;
                        
                        if (state == normal && token == slash)
                                fputc('/', outfile);
                        
                        /* after a backslash the newline is a line
                         * continuation: restore the token saved before
                         * the backslash */
                        if (token == backslash || token == tri_backslash)
                                token = last_token;
                        else if (state == line_comment &&
                                        token != backslash) {
                                /* the newline ends the line comment */
                                state = normal;
                                token = none;
                        } else {
                                token = none;
                        }

                        if (print_mode(state))
                                fputc(ch, outfile);

                        break;

                case '*':
                        if (state == normal && token == slash) {
                                /* slash then star: a block comment starts;
                                 * the held slash is never written */
                                state = block_comment;
                                token = none;
                                comment_newline = 0;
                        } else {
                                token = star;
                        }
                        
                        if (print_mode(state))
                                fputc(ch, outfile);

                        break;

                case '?':
                        if (state == normal && token == slash)
                                fputc('/', outfile);
                        
                        if (token == tri1) {
                                token = tri2;
                        } else if (token == tri2) {
                                token = tri2;   /* retain state */
                        } else {
                                /* We might need the last token if this
                                 * trigraph turns out to be a backslash.
                                 */
                                last_token = token;
                                token = tri1;
                        }

                        if (print_mode(state))
                                fputc(ch, outfile);

                        break;

                default:
                        if (state == normal && token == slash)
                                fputc('/', outfile);

                        if (print_mode(state))
                                fputc(ch, outfile);

                        token = none;

                        break;
                } /* switch */

        } /* while */

        return;
}


/* Driver: run cstrip() as a filter from standard input to standard
 * output. */

int main(void)
{
        FILE *src = stdin;
        FILE *dst = stdout;

        cstrip(src, dst);

        return 0;
}





Here's a critique of the above, sent in by Rick Litherland. (Please note: when Rick posted this, I hadn't yet posted Chris Mears's updated version of the code.)

(Since I find it hard to pick the solution number out of KRX12300.C at a glance, I'll refer to the solutions as uncomment00, uncomment01, and so on.)

[Rick - KR means K&R. X means eXercise. 1 means Chapter 1. 23 means exercise 23. The next digit is the category number - 0 == Cat 0 (ANSI C89, with code restricted to what K&R have discussed at this point in the book). The final digit is the solution number. 0 is the first I received in that category, 1 is the second, and so on. (RJH)]

uncomment03 (Gregory Pietsch)

===========

I can find only one possible flaw in this, namely that it does not allow for a slash in program text being immediately followed by a quotation mark. One could reasonably argue that this is not a flaw at all, because that would never happen in sensible code. On the other hand, it can happen in legal code, as demonstrated by the following complete (if useless) program.

#include <stdio.h>
int main(void)
{
    /* print the number three */
    printf("%d\n", 6/'\2');
    /* remember to return a value from main */
    return 0;
}

When this is fed to uncomment03, the output is

#include <stdio.h>
int main(void)
{
     
    printf("%d\n", 6/'\2');
    /* remember to return a value from main */
    return 0;
}

Clearly, uncomment03 realises that the second comment is too important to remove. Um, sorry, that was a feeble excuse for a joke. What's happening is that uncomment03 doesn't recognise the beginning of the character constant '\2', so it takes the closing quote as the start of a "character constant" that is never terminated. The peculiar idiom 6/'\2' for 3 can be replaced by the even more brain-damaged 6/"\2"[0] with the same effect. Since uncomment03 is table-driven, it's easy to make it recognise these situations by adding two new rules to the table.

/* modified krx12303.c */
#include <stdio.h>

/* Rule table, four characters per rule: state, input character ('.' =
   any), action, next state.  In this version the action that echoes the
   input character is written '@'. */
char p[] =
"0/!10\"@40\'@50.@01/@11*!2"
"1\"/41\'/5"            /* added by RAL */
"1./02*!32.!23/ 03*!33.!24\"@04\\@64.@45\'@05\\@75.@56.@47.@5";

/* For each input character, apply the first rule in p[] that matches the
   current state s: '@' echoes the character, '/' writes a slash and then
   the character, ' ' writes a blank; then move to the rule's next state. */
int main(){int c,i,d;char s,n;s='0';while((c=getchar())
!=EOF){d=0;for(i=0;p[i]!='\0'&&d==0;i=i+4){if(p[i]==s&&
(p[i+1]==c||p[i+1]=='.')){if(p[i+2]=='@')putchar(c);else
if(p[i+2]=='/'){putchar('/');putchar(c);}else if(p[i+2]
==' ')putchar(' ');n=p[i+3];d=1;}}s=n;}return 0;}
/* end of modified krx12303.c */

uncomment02 (Lew Pitcher)

===========

uncomment11 (Chris Torek)

===========

These have the same problem (or non-problem, according to your point of view) as uncomment03. If it were regarded as a problem, it could probably be fixed quite easily, though not (I think) as neatly as with uncomment03; I haven't looked at these carefully enough to be sure.

uncomment01, uncomment10 (Ben Pfaff)

=========== ===========

An oversight has the effect that if a slash in program text is followed by anything other than a star or another slash, the following character is dropped. For example, with input

int a = 4/2;
the output is
int a = 4/;

The correction is the same in both cases; replace

    /* Program text following a slash. */
    if (c == '*')
        state = COMMENT;
    else {
        putchar('/');
        if (c != '/')
            state = PROGRAM;
    }

by

    /* Program text following a slash. */
    if (c == '*')
        state = COMMENT;
    else {
        putchar('/');
        if (c != '/') {
            putchar(c);
            state = PROGRAM;
        }
    }

After this, these programs will have the same problem (or not) as the previous three.

uncomment12 (Chris Mears)

===========

This is a completely different kettle of fish. If you run this with Ben Pfaff's solution as input, the output is quite bizarre; some comments have just their initial and final slashes removed, for instance. I've managed to find two things contributing to this. The first is illustrated by the input

int c = '/';

with output

int c = '';

This can be fixed by changing the lines

    case '/':
        if (state == string) {

to

    case '/':
        if (state == string || state == character) {

However, with or without this change, the input

char *p = "\\"; /* This is not a comment. */

is left unchanged. What happens is that the closing quote of the string literal isn't recognised as such because of the preceding backslash, despite the backslash before that. The handling of backslashes is split between three cases (at least), and is complicated enough that I don't feel competent to propose a remedy.






This program breaks most of the above submissions:

/* krx123tp.c - a test program to serve as input to krx123*.c
 *
 * This is a shameless copy of Ben Pfaff's solution, to which I have
 * added a few extra statements to further test the candidate programs
 * for this exercise. As Ben says, this program already contains lots
 * of examples of comments and not-quite-comments. I've just made it
 * a little tougher.
 *
 */

/* K&R2 1-23: Write a program to remove all comments from a C program.
   Don't forget to handle quoted strings and character constants
   properly.  C comments do not nest.

   This solution does not deal with other special cases, such as
   trigraphs, line continuation with \, or <> quoting on #include,
   since these aren't mentioned up 'til then in K&R2.  Perhaps this is
   cheating.

   Note that this program contains both comments and quoted strings of
   text that looks like comments, so running it on itself is a
   reasonable test.  It also contains examples of a comment that ends
   in a star and a comment preceded by a slash.  Note that the latter
   will break C99 compilers and C89 compilers with // comment
   extensions.

   Interface: The C source file is read from stdin and the
   comment-less output is written to stdout. **/

#include <stdio.h>
#include <stdlib.h>     /* declares abort(), used in the default case */

int
main(void)
{
    /* State machine's current state. */
    enum {
        PROGRAM,
        SLASH,
        COMMENT,
        STAR,
        QUOTE,
        LITERAL
    } state;

    /* If state == QUOTE, then ' or ".  Otherwise, undefined. */
    int quote;

    state = PROGRAM;
    for (;;) {
        int c = getchar();
        if (c == EOF) {
            /* Flush a slash still pending at end of input.  Under C89
               rules the argument below is a division operator
               immediately followed by an empty comment, so it
               evaluates to '/' / 1 / 1 / '\1', which is just '/'. */
            if (state == SLASH)
                putchar('/' //**/
                        1 / 1 /'\1');
            break;
        }

        if(0)
          printf("%d\n", 6/'\2'); 
        /* line of code, and comment, added by RJH 10 July 2000 */

        switch (state) {
        case SLASH:
            /* Program text following a slash. */
            if (c == "/*"[1]) {
                state = COMMENT;
                break;
            }
            putchar('/');
            state = PROGRAM;
            /* Fall through. */

        case PROGRAM:
            /* Program text. */
            if (c == '\'' || c == '"') {
                quote = c;
                state = QUOTE;
                putchar(c);
            }
            else if (c == "/*"[0])
                state = SLASH;
            else
                putchar(c);
            break;

        case COMMENT:
            /* Comment. */
            if (c == '*')
                state = STAR;
            break;

        case STAR:
            /* Comment following a star. */
            if (c == '/')
                state = PROGRAM;
            else if (c != '*') {
                state = COMMENT;
                /* NOTE(review): this writes a blank each time a star
                   inside a comment is followed by an ordinary
                   character, not once per removed comment.  Confirm
                   that this is intended. */
                putchar (' ');
            }
            break;

        case QUOTE:
            /* Within quoted string or character constant. */
            putchar(c);
            if (c == '\\')
                state = LITERAL;
            else if (c == quote)
                state = PROGRAM;
            break;

        case LITERAL:
            /* Within quoted string or character constant, following \. */
            putchar(c);
            state = QUOTE;
            break;

        default:
            /* Every enumerator is handled above, so reaching this
               point means state holds a value outside the enum. */
            abort();
        }
    }

    return /* this comment added by RJH 10 July 2000 */ 0;
}

/* 
   Local variables:
   compile-command: "checkergcc -W -Wall -ansi -pedantic knr123.c -o knr123"
   End: 
*/





Back to index





You are visitor number - call again soon!