Processo de varredura (análise léxica) de um arquivo de C++, no qual o conteúdo é separado em palavras reservadas, identificadores, números, operadores e comentários.

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Names of the file that is scanned and of the file the token listing is
 * written to. They are given as bare tokens and are turned into string
 * literals with STR() at the point where they are passed to fopen().
 */
#define SOURCE_FILENAME src.txt
#define OUTPUT_FILENAME out.txt

/* Size, in bytes, of the buffer each input line is read into with fgets(). */
#define LINE_MAX_LEN 500
/* Maximum number of characters of a lexeme, not counting the terminator. */
#define TOKEN_MAX_LEN 50

/* Classification assigned to a lexeme by get_toktype(). */
enum toktype {
  UNKNOW     = 0, /* matched none of the classifiers below */
  OPERATOR   = 1, /* one of the operator spellings known to is_operator() */
  NUMBER     = 2, /* numeral accepted by is_numeral() */
  IDENTIFIER = 3, /* name accepted by is_identifier() */
  RESWORD    = 4, /* reserved word listed in is_resword() */
  COMMOPEN   = 5, /* the lexeme "{" */
  COMMCLOSE  = 6  /* the lexeme "}" */
};

/* A single token: the lexeme text together with its classification. */
struct token {
  char lex[TOKEN_MAX_LEN + 1]; /* lexeme text; one extra byte for the terminator */
  enum toktype type;           /* classification, filled in from get_toktype() */
};

/*
 * STR(s) expands to a string literal holding the macro-expanded text of s.
 * Two levels are required: `#` stringifies its operand without expanding it,
 * so STR() first expands its argument and then hands the result to the
 * helper that applies `#`.
 *
 * The helper is deliberately not named with a leading underscore followed by
 * an uppercase letter, because such identifiers are reserved for the
 * implementation.
 */
#define STR_QUOTE(s) #s
#define STR(s) STR_QUOTE(s)

/*
 * Scans the next token starting at `cs`. Returns a token allocated with
 * malloc(), or NULL when no further lexeme is found. `*lastc` is set to the
 * last character that was consumed from `cs`.
 */
struct token *get_tok(const char *cs, const char **lastc);

/*
 * Reads the source file line by line and writes a token listing to the
 * output file: for every input line the line number is written, followed by
 * one "<kind>: <lexeme>" entry per token found on that line.
 *
 * A comment (opened by a COMMOPEN token) is written as a single entry holding
 * the text from the opening delimiter up to the closing one. Comment state is
 * not carried from one line to the next: a comment that is still open at the
 * end of a line is reported up to the end of that line.
 *
 * Returns EXIT_SUCCESS on success, or -1 when a file cannot be opened or the
 * output cannot be written.
 */
int main(int argc, char *argv[])
{
  (void)argc;

  FILE *src_fd = fopen(STR(SOURCE_FILENAME), "r");
  if (src_fd == NULL) {
    perror(argv[0]);
    return -1;
  }

  FILE *outp_fd = fopen(STR(OUTPUT_FILENAME), "w");
  if (outp_fd == NULL) {
    perror(argv[0]);
    fclose(src_fd);
    return -1;
  }

  int line_num = 0;
  char line[LINE_MAX_LEN];

  while (fgets(line, sizeof line, src_fd) != NULL) {
    fprintf(outp_fd, "%d\n", ++line_num);

    const char *tok_firstc_ptr = line;
    const char *tok_lastc_ptr;
    struct token *tok;

    while ((tok = get_tok(tok_firstc_ptr, &tok_lastc_ptr)) != NULL) {
      if (tok->type == COMMOPEN) {
        /* The opening delimiter is the last character of this token. */
        const char *comm_open = tok_lastc_ptr;
        int closed = 0;

        free(tok);

        /* Discard tokens until the comment is closed or the line ends. */
        for (tok_firstc_ptr = tok_lastc_ptr + 1;
             (tok = get_tok(tok_firstc_ptr, &tok_lastc_ptr)) != NULL;
             tok_firstc_ptr = tok_lastc_ptr + 1) {
          closed = (tok->type == COMMCLOSE);
          free(tok);
          if (closed)
            break;
        }

        /*
         * comm_end is one past the last character to print. When the comment
         * was closed that is just after the closing delimiter. Otherwise
         * get_tok() stopped on the string terminator, which must not be
         * written out; the line break kept by fgets() is dropped as well.
         */
        const char *comm_end = closed ? tok_lastc_ptr + 1 : tok_lastc_ptr;

        if (!closed)
          while (comm_end > comm_open + 1 &&
                 (comm_end[-1] == '\n' || comm_end[-1] == '\r'))
            comm_end--;

        fprintf(outp_fd, "%17s: ", "comentario");
        fwrite(comm_open, 1, (size_t)(comm_end - comm_open), outp_fd);
        fputc('\n', outp_fd);

        /* Nothing is left on this line; never step past the terminator. */
        if (!closed)
          break;

        tok_firstc_ptr = tok_lastc_ptr + 1;
        continue;
      }

      const char *type_name;

      switch (tok->type) {
      case NUMBER:
        type_name = "numeral";
        break;
      case OPERATOR:
        type_name = "operador";
        break;
      case IDENTIFIER:
        type_name = "identificador";
        break;
      case RESWORD:
        type_name = "palavra reservada";
        break;
      case UNKNOW:
      default:
        type_name = "desconhecido";
        break;
      }

      fprintf(outp_fd, "%17s: %s\n", type_name, tok->lex);

      /* The token was allocated by get_tok() and is owned by this loop. */
      free(tok);
      tok_firstc_ptr = tok_lastc_ptr + 1;
    }
  }

  fclose(src_fd);

  /* fclose() flushes buffered output, so its result has to be checked too. */
  int out_failed = (ferror(outp_fd) != 0);
  if (fclose(outp_fd) != 0)
    out_failed = 1;

  if (out_failed) {
    perror(argv[0]);
    return -1;
  }

  return EXIT_SUCCESS;
}

/*
 * Copies the next lexeme found in `cs` into `lex` (a buffer of `lex_maxlen`
 * bytes) and returns the number of characters consumed from `cs`; returns 0
 * when the arguments are invalid or `cs` is empty.
 */
int get_lex(char *lex, size_t lex_maxlen, const char *cs);
/* Classifies a lexeme; returns UNKNOW when no classifier accepts it. */
enum toktype get_toktype(const char *lex);

/*
 * Scans the next token of the string `cs`.
 *
 * cs:    string to scan; leading white space is skipped by get_lex().
 * lastc: receives a pointer to the last character consumed from `cs`, so the
 *        caller can resume scanning at `*lastc + 1`. When `cs` is already
 *        empty, `*lastc` is set to `cs` itself.
 *
 * Returns a token allocated with malloc() (the caller is responsible for
 * releasing it with free()), or NULL when either argument is NULL or no
 * lexeme is left in `cs`. The program is terminated if the allocation fails.
 */
struct token *get_tok(const char *cs, const char **lastc)
{
  if (cs == NULL || lastc == NULL)
    return NULL;

  char tmp_lex[TOKEN_MAX_LEN + 1];
  int ccnt = get_lex(tmp_lex, sizeof tmp_lex, cs); /* characters consumed */

  if (ccnt == 0) {
    *lastc = cs;
    return NULL;
  }

  *lastc = &cs[ccnt - 1];

  /* Characters were consumed but only white space was found: no lexeme. */
  if (tmp_lex[0] == '\0')
    return NULL;

  struct token *tok = malloc(sizeof *tok);
  if (tok == NULL) {
    perror("gettok()");
    exit(EXIT_FAILURE);
  }

  /*
   * strncpy() does not terminate the destination when the source fills it,
   * so copy at most sizeof - 1 characters and terminate explicitly.
   */
  strncpy(tok->lex, tmp_lex, sizeof tok->lex - 1);
  tok->lex[sizeof tok->lex - 1] = '\0';
  tok->type = get_toktype(tok->lex);
  return tok;
}

/*
 * Extracts the next lexeme from the string `cs`.
 *
 * lex:        destination buffer; receives the lexeme as a terminated string.
 * lex_maxlen: size of `lex` in bytes. At least 2 is required (one character
 *             plus the terminator); at most lex_maxlen - 1 characters are
 *             stored, and the remainder of an over-long lexeme is left in
 *             `cs` for the next call.
 * cs:         string to scan.
 *
 * Leading white space is skipped. The first remaining character selects the
 * kind of lexeme that is collected:
 *   - a digit, or '.' followed by a digit: a run of digits and '.';
 *   - a letter or '_': a run of letters, digits and '_';
 *   - '{' or '}': that single character, so that comment delimiters are never
 *     glued to neighbouring punctuation;
 *   - any other punctuation character: a run of punctuation characters,
 *     stopping before '{' or '}';
 *   - anything else: that single character.
 *
 * Returns the number of characters consumed from `cs`, skipped white space
 * included. Returns 0 when an argument is invalid or `cs` is empty. When only
 * white space was left, `lex` is set to the empty string and the returned
 * count includes the string terminator, because the count alone cannot tell
 * the caller that no lexeme was found.
 */
int get_lex(char *lex, size_t lex_maxlen, const char *cs)
{
  if (lex == NULL || lex_maxlen < 2 || cs == NULL || cs[0] == '\0')
    return 0;

  const char *strp = cs; /* next character of the source string */

  /* White space is not significant. */
  while (isspace((unsigned char)*strp))
    strp++;

  lex[0] = *strp++;
  if (lex[0] == '\0')
    return (int)(strp - cs);

  /* From here on the string does contain a lexeme. */
  const unsigned char first = (unsigned char)lex[0];
  const size_t cap = lex_maxlen - 1; /* characters that fit before the terminator */
  size_t len = 1;

  if (isdigit(first) || (first == '.' && isdigit((unsigned char)*strp))) {
    /* Numeral. */
    while (len < cap && (isdigit((unsigned char)*strp) || *strp == '.'))
      lex[len++] = *strp++;
  } else if (isalpha(first) || first == '_') {
    /* Reserved word or identifier. */
    while (len < cap && (isalnum((unsigned char)*strp) || *strp == '_'))
      lex[len++] = *strp++;
  } else if (first != '{' && first != '}' && ispunct(first)) {
    /*
     * Operator: ispunct() accepts printable characters that are neither
     * white space nor alphanumeric.
     */
    while (len < cap && *strp != '{' && *strp != '}' &&
           ispunct((unsigned char)*strp))
      lex[len++] = *strp++;
  }

  lex[len] = '\0';
  return (int)(strp - cs);
}

/*
 * Lexeme classifiers used by get_toktype(). Each one returns 1 when `lex`
 * belongs to its category and 0 otherwise. The names must match the
 * definitions exactly so that every call is made through a prototype.
 */
int is_numeral(const char *lex);
int is_operator(const char *lex);
int is_resword(const char *lex);
int is_identifier(const char *lex);
int is_commopen(const char *lex);
int is_commclose(const char *lex);

enum toktype get_toktype(const char *lex)
{
  if (is_numeral(lex))
    return NUMBER;
  if (is_operator(lex))
    return OPERATOR;
  if (is_resword(lex))
    return RESWORD;
  if (is_identifier(lex))
    return IDENTIFIER;
  if (is_commopen(lex))
    return COMMOPEN;
  if (is_commclose(lex))
    return COMMCLOSE;

  return UNKNOW;
}

/*
 * Returns 1 when `lex` is exactly the comment opener "{", otherwise 0
 * (also 0 for a NULL pointer).
 */
int is_commopen(const char *lex)
{
  return lex != NULL && lex[0] == '{' && lex[1] == '\0';
}

/*
 * Returns 1 when `lex` is exactly the comment closer "}", otherwise 0
 * (also 0 for a NULL pointer).
 */
int is_commclose(const char *lex)
{
  return lex != NULL && lex[0] == '}' && lex[1] == '\0';
}

/*
 * Returns 1 when `lex` is a numeral, otherwise 0 (also 0 for NULL).
 *
 * A numeral is an optional run of digits, optionally followed by '.' and a
 * further optional run of digits, with nothing after it. At least one digit
 * must be present overall, so "12", "1.5", ".5" and "5." are numerals while
 * "" and "." are not.
 */
int is_numeral(const char *lex)
{
  if (lex == NULL)
    return 0;

  size_t ndigits = 0;

  /* Integer part. */
  while (isdigit((unsigned char)*lex)) {
    lex++;
    ndigits++;
  }

  /* Optional fractional part. */
  if (*lex == '.') {
    lex++;
    while (isdigit((unsigned char)*lex)) {
      lex++;
      ndigits++;
    }
  }

  /* The whole lexeme must have been consumed. */
  return ndigits > 0 && *lex == '\0';
}

/*
 * Returns 1 when `lex` is spelled exactly like one of the known operators,
 * otherwise 0 (also 0 for a NULL pointer).
 */
int is_operator(const char *lex)
{
  static const char *const known_ops[] = {
    "+", "-", "*", "/",
    "<>", "=", ">", "<", ">=", "<=",
    ":=",
    ";"
  };
  const size_t n_ops = sizeof known_ops / sizeof known_ops[0];

  if (lex == NULL)
    return 0;

  for (size_t k = 0; k < n_ops; k++) {
    if (strcmp(lex, known_ops[k]) == 0)
      return 1;
  }

  return 0;
}

/*
 * Returns 1 when `lex` is exactly one of the reserved words, otherwise 0
 * (also 0 for a NULL pointer). The comparison is case-sensitive.
 */
int is_resword(const char *lex)
{
  /* Built once rather than on every call. */
  static const char *const l_reswords[] = {
    "read", "write",
    "begin", "end",
    "if", "else", "then",
    "for", "while", "do", "until", "repeat"
  };

  /*
   * Cheap rejection of anything that does not start with a letter. The cast
   * is required: passing a negative char value to isalpha() is undefined.
   */
  if (lex == NULL || !isalpha((unsigned char)*lex))
    return 0;

  for (size_t i = 0; i < sizeof l_reswords / sizeof l_reswords[0]; i++)
    if (strcmp(lex, l_reswords[i]) == 0)
      return 1;

  return 0;
}

/*
 * Returns 1 when `lex` is an identifier, otherwise 0 (also 0 for NULL).
 *
 * An identifier starts with a letter, or with '_' immediately followed by a
 * letter, continues with letters, digits and '_' only, and is not a reserved
 * word. Underscores are accepted inside the name because get_lex() collects
 * them as part of the same lexeme.
 */
int is_identifier(const char *lex)
{
  if (lex == NULL)
    return 0;

  /* ctype functions require values in the unsigned char range. */
  const unsigned char *p = (const unsigned char *)lex;

  if (!(isalpha(p[0]) || (p[0] == '_' && isalpha(p[1]))))
    return 0;

  if (is_resword(lex))
    return 0;

  /* The first character was validated above; scan the rest of the name. */
  for (p++; isalnum(*p) || *p == '_'; p++)
    ;

  /* Any character left over makes the lexeme something else. */
  return *p == '\0';
}