"The C Programming Language", 2nd edition, Kernighan and Ritchie

Answer to Exercise 6-3, page 143

Solution by Richard Heathfield



Bug (noticed by John W Krahn) fixed 11 June 2002. The noise word list was broken because it contained out-of-order data. I fixed this, and made the program more generally useful, by performing all string comparisons without regard to case.

Write a cross-referencer that prints a list of all words in a document, and, for each word, a list of the line numbers on which it occurs. Remove noise words like "the", "and," and so on.

/* Write a cross-referencer program that prints a list of all words in a
 * document, and, for each word, a list of the line numbers on which it
 * occurs. Remove noise words like "the", "and," and so on.
 */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* no such thing as strdup, so let's write one
 *
 * supplementary question: why did I call this function dupstr,
 * rather than strdup?
 *
 */

char *dupstr(char *s)
{
  char *p = NULL;

  if(s != NULL)
  {
    p = malloc(strlen(s) + 1);
    if(p)
    {
      strcpy(p, s);
    }
  }

  return p;
}

/* case-insensitive string comparison */
int i_strcmp(const char *s, const char *t)
{
  int diff = 0;
  char cs = 0;
  char ct = 0;

  while(diff == 0 && *s != '\0' && *t != '\0')
  {
    cs = tolower((unsigned char)*s);
    ct = tolower((unsigned char)*t);
    if(cs < ct)
    {
      diff = -1;
    }
    else if(cs > ct)
    {
      diff = 1;
    }
    ++s;
    ++t;
  }

  if(diff == 0 && *s != *t)
  {
    /* the shorter string comes lexicographically sooner */
    if(*s == '\0')
    {
      diff = -1;
    }
    else
    {
      diff = 1;
    }
  }

  return diff;
}


struct linelist
{
  struct linelist *next;
  int line;
};

struct wordtree
{
  char *word;
  struct linelist *firstline;
  struct wordtree *left;
  struct wordtree *right;
};

void printlist(struct linelist *list)
{
  if(list != NULL)
  {
    printlist(list->next);
    printf("%6d ", list->line);
  }
}

void printtree(struct wordtree *node)
{
  if(node != NULL)
  {
    printtree(node->left);
    printf("%18s  ", node->word);
    printlist(node->firstline);
    printf("\n");
    printtree(node->right);
  }
}

struct linelist *addlink(int line)
{
  struct linelist *new = malloc(sizeof *new);
  if(new != NULL)
  {
    new->line = line;
    new->next = NULL;
  }

  return new;
}

void deletelist(struct linelist *listnode)
{
  if(listnode != NULL)
  {
    deletelist(listnode->next);
    free(listnode);
  }
}

void deleteword(struct wordtree **node)
{
  struct wordtree *temp = NULL;
  if(node != NULL)
  {
    if(*node != '\0')
    {
      if((*node)->right != NULL)
      {
        temp = *node;
        deleteword(&temp->right);
      }
      if((*node)->left != NULL)
      {
        temp = *node;
        deleteword(&temp->left);
      }
      if((*node)->word != NULL)
      {
        free((*node)->word);
      }
      if((*node)->firstline != NULL)
      {
        deletelist((*node)->firstline);
      }
      free(*node);
      *node = NULL;
    }
  }
}

struct wordtree *addword(struct wordtree **node, char *word, int line)
{
  struct wordtree *wordloc = NULL;
  struct linelist *newline = NULL;
  struct wordtree *temp = NULL;
  int diff = 0;

  if(node != NULL && word != NULL)
  {
    if(NULL == *node)
    {
      *node = malloc(sizeof **node);
      if(NULL != *node)
      {
        (*node)->left = NULL;
        (*node)->right = NULL;
        (*node)->word = dupstr(word);
        if((*node)->word != NULL)
        {
          (*node)->firstline = addlink(line);
          if((*node)->firstline != NULL)
          {
            wordloc = *node;
          }
        }
      }
    }
    else
    {
      diff = i_strcmp((*node)->word, word);
      if(0 == diff)
      {
        /* we have seen this word before! add this line number to
         * the front of the line number list. Adding to the end
         * would keep them in the right order, but would take
         * longer. By continually adding them to the front, we
         * take less time, but we pay for it at the end by having
         * to go to the end of the list and working backwards.
         * Recursion makes this less painful than it might have been.
         */
        newline = addlink(line);
        if(newline != NULL)
        {
          wordloc = *node;
          newline->next = (*node)->firstline;
          (*node)->firstline = newline;
        }
      }
      else if(0 < diff)
      {
        temp = *node;
        wordloc = addword(&temp->left, word, line);
      }
      else
      {
        temp = *node;
        wordloc = addword(&temp->right, word, line);
      }
    }
  }

  if(wordloc == NULL)
  {
    deleteword(node);
  }

  return wordloc;
}

/* We can't use strchr because it's not yet been discussed, so we'll
 * write our own instead.
 */
char *char_in_string(char *s, int c)
{
  char *p = NULL;

  /* if there's no data, we'll stop */
  if(s != NULL)
  {
    if(c != '\0')
    {
      while(*s != '\0' && *s != c)
      {
        ++s;
      }
      if(*s == c)
      {
        p = s;
      }
    }
  }

  return p;
}

/* We can't use strtok because it hasn't been discussed in the text
 * yet, so we'll write our own.
 * To minimise hassle at the user end, let's modify the user's pointer
 * to s, so that we can just call this thing in a simple loop.
 */
char *tokenise(char **s, char *delims)
{
  char *p = NULL;
  char *q = NULL;

  if(s != NULL && *s != '\0' && delims != NULL)
  {
    /* pass over leading delimiters */
    while(NULL != char_in_string(delims, **s))
    {
      ++*s;
    }
    if(**s != '\0')
    {
      q = *s + 1;
      p = *s;
      while(*q != '\0' && NULL == char_in_string(delims, *q))
      {
        ++q;
      }

      *s = q + (*q != '\0');
      *q = '\0';
    }
  }

  return p;
}


/* return zero if this word is not a noise word,
 * or non-zero if it is a noise word
 */

int NoiseWord(char *s)
{
  int found = 0;
  int giveup = 0;

  char *list[] =
  {
    "a",
    "an",
    "and",
    "be",
    "but",
    "by",
    "he",
    "I",
    "is",
    "it",
    "off",
    "on",
    "she",
    "so",
    "the",
    "they",
    "you"
  };
  int top = sizeof list / sizeof list[0] - 1;

  int bottom = 0;

  int guess = top / 2;

  int diff = 0;

  if(s != NULL)
  {
    while(!found && !giveup)
    {
      diff = i_strcmp(list[guess], s);
      if(0 == diff)
      {
        found = 1;
      }
      else if(0 < diff)
      {
        top = guess - 1;
      }
      else
      {
        bottom = guess + 1;
      }
      if(top < bottom)
      {
        giveup = 1;
      }
      else
      {
        guess = (top + bottom) / 2;
      }
    }
  }

  return found;
}

/*
 * Argh! We can't use fgets()! It's not discussed until page 164.
 * Oh well... time to roll our own again...
 */

char *GetLine(char *s, int n, FILE *fp)
{
  int c = 0;
  int done = 0;
  char *p = s;

  while(!done && --n > 0 && (c = getc(fp)) != EOF)
  {
    if((*p++ = c) == '\n')
    {
      done = 1;
    }
  }

  *p = '\0';

  if(EOF == c && p == s)
  {
    p = NULL;
  }
  else
  {
    p = s;
  }

  return p;
}

/*
 * Ideally, we'd use a clever GetLine function which expanded its
 * buffer dynamically to cope with large lines. Since we can't use
 * realloc, and because other solutions would require quite hefty
 * engineering, we'll adopt a simple solution - a big buffer.
 *
 * Note: making the buffer static will help matters on some
 * primitive systems which don't reserve much storage for
 * automatic variables, and shouldn't break anything anywhere.
 *
 */

#define MAXLINE 8192

int main(void)
{
  static char buffer[MAXLINE] = {0};
  char *s = NULL;
  char *word = NULL;
  int line = 0;
  int giveup = 0;
  struct wordtree *tree = NULL;

  char *delims = " \t\n\r\a\f\v!\"%^&*()_=+{}[]\\|/,.<>:;#~?";

  while(!giveup && GetLine(buffer, sizeof buffer, stdin) != NULL)
  {
    ++line;
    s = buffer;
    while(!giveup && (word = tokenise(&s, delims)) != NULL)
    {
      if(!NoiseWord(word))
      {
        if(NULL == addword(&tree, word, line))
        {
          printf("Error adding data into memory. Giving up.\n");
          giveup = 1;
        }
      }
    } 
  }

  if(!giveup)
  {
    printf("%18s  Line Numbers\n", "Word");
    printtree(tree);
  }

  deleteword(&tree);

  return 0;
}





Back to index





You are visitor number - call again soon!