Notes about Bison





Introduction

Bison is the GNU version of Yacc (from Berkeley). Yacc means "Yet Another C Compiler". It was designed to help the implementation of compilers. Bison takes a grammar as input and generates a C code that represents the syntax parser for the given grammar.

Please note that Bison is not a lexical parser. Bison uses Flex (the GNU version of Lex from Berkeley) in order to perform the lexical parsing.

The following example shows how to use Bison in order to write a parser for boolean expression. This analyzer of boolean expression supports:

  • Operators OR, AND, XOR, NOT.
  • Priorities between operators.
  • Parenthesis (any level of nested parenthesis).
  • This parser detects all syntaxic error.

The output of the analyzer is the RPN (Revers Polish Notation, used by HP calculators) representation of the given boolean expression.

For example:

INPUT BOOLEAN EXPRESSION: "b1 OR b2 AND b3 OR (b4 AND b5) XOR NOT b6"

OUTPUT (RPN representation):
b1
b2
b3
and()
or()
b4
b5
and()
b6
not()
xor()
or()
  

The Bison grammar

This file defines the syntax of a boolean expression.

bool.y



%{
   #include <stdio.h>
   #include <string.h>


   #define BOOLEAN_ANALIZER

   #include "boolean_analyzer.h"

   /* ----------------------------------------------------- */
   /* Error handling */
   /* ----------------------------------------------------- */

   #define MAX_ERROR 2048
   char boolean_parsing_error[MAX_ERROR];

   void boolerror (char *m)
   {
     strncpy (
               boolean_parsing_error,
               m,
               strlen(m)+1 <= MAX_ERROR ? strlen(m)+1 : MAX_ERROR-1
             );
     boolean_parsing_error[MAX_ERROR-1]=0;
   }

   /* ----------------------------------------------------- */
   /* The following code is used to interface with the in- */
   /* -memory lexical parser. */
   /* ----------------------------------------------------- */

   extern int boollex();
   int boolparse(void);
   extern void init_lexical_parser(char *src);
   extern void close_lexical_parser();

   static List *list;

   /* ----------------------------------------------------- */
   /* The boolean expression analyser itsef */
   /* ----------------------------------------------------- */

   int boolean_spliter(char *src, List *l)
   {
     int res;

     /* --------------------------------------------------- */
     /* Initialize the linked list */
     /* --------------------------------------------------- */

     list = l;

     /* --------------------------------------------------- */
     /* Parse in memory buffer pointed by 'src' */
     /* --------------------------------------------------- */

     init_lexical_parser(src);
     res = boolparse();
     close_lexical_parser();

     return res;
   }


%}

                
                Defines the type of the token returned by the lexical parser (that is Flex). This is a C union.
                


%union {
         char* mot;
         enum operator_name operatorval;
         enum delimitor_name delimitorval;
       }

                
                Defines the type of each token.
                




%token <mot> MOT
%token <operatorval> AND
%token <operatorval> OR
%token <operatorval> XOR
%token <operatorval> NOT
%token <delimitorval> OPEN_BRAKET
%token <delimitorval> CLOSE_BRAKET

%%


expression:
    expression OR terme {
                          if (add_element (list, operator, or, NULL))
                          { return 1; }
                        }
  | terme
  ;

terme:
    terme AND facteur {
                        if (add_element (list, operator, and, NULL))
                        { return 1; }
                      }
  | terme XOR facteur {
                        if (add_element (list, operator, xor, NULL))
                        { return 1; }
                      }
  | facteur
  ;

facteur:
    OPEN_BRAKET expression CLOSE_BRAKET {}
  | NOT facteur {
                  if (add_element (list, operator, not, NULL))
                  { return 1; }
                }
  | MOT {
           if (add_element (list, word, undef, $1)) { return 1; }
         }
  ;


%%

The lexical parser

Bison calls the lexical parser to cut the input stream into tockens. The file "bool.bison.h" is generated by Bison.

bool.lex

%{
   #include <stdlib.h>
   #include "grammar_types.h"
   #include "bool.bison.h"

   int boolwrap() { return 1; }

   /* ----------------------------------------------------- */
   /* The following code is used to parse in-memory data */
   /* -> init_lexical_parser: must be called before the */
   /* grammatical analyzer. */
   /* -> close_lexical_parser: must be called after the */
   /* end of the grammatical analyzer. */
   /* ----------------------------------------------------- */

   static YY_BUFFER_STATE buf_state;
   void init_lexical_parser(char *src) { buf_state = yy_scan_string(src); }
   void close_lexical_parser() { yy_delete_buffer(buf_state); }

   /* ----------------------------------------------------- */
   /* Allocatd memory for words */
   /* ----------------------------------------------------- */

   char *found_mot;
%}

SPACES [\t ]+
LEX_MOT [^\t\n\(\) ]+
LEX_AND "AND"
LEX_OR "OR"
LEX_XOR "XOR"
LEX_NOT "NOT"
LEX_OPEN_BRAKET \(
LEX_CLOSE_BRAKET \)

%%

{LEX_AND} {
            boollval.operatorval = and;
            return AND;
          }

{LEX_OR} {
            boollval.operatorval = or;
            return OR;
          }

{LEX_XOR} {
            boollval.operatorval = xor;
            return XOR;
          }

{LEX_NOT} {
            boollval.operatorval = not;
            return NOT;
          }

{LEX_OPEN_BRAKET} {
                    boollval.delimitorval = open_braket;
                    return OPEN_BRAKET;
                  }

{LEX_CLOSE_BRAKET} {
                     boollval.delimitorval = close_braket;
                     return CLOSE_BRAKET;
                   }

{LEX_MOT} {
            found_mot = (char*)malloc(sizeof(char) * (strlen(yytext)+1));
            if (found_mot == NULL) { return 1; }
            strcpy (found_mot, yytext);
            boollval.mot = found_mot;
            return MOT;
          }

{SPACES} { }

\n { return 0; }

%%

Other C codes

The file "linked_list.c" contains code that interfaces the Bison parser with a linked list (used to save the RPN representation).

Linked list.c

/*! \file linked_list.c
    Linked list management for the boolean interpretor.
 */

#include <stdlib.h>
#include "linked_list.h"

/*! \brief User provided function used to free one element of the linked list.
    \param d Pointer to the element to free.
 */

static void free_data (void *d)
{
  struct list_data *data;

  data = (struct list_data*)d;
  if (data->type == word) { free (data->word); }

  free (data);
}

/*! \brief User provided function used to compare two elements of the linked
           list.
    \param d1 Pointer to the first element to compare.
    \warning This function is required by the linked list API. But it is not
             used by the interpretor.
    \param d2 Pointer to the second element to compare.
    \return The function returns 1 if the elements pointed by d1 and d2 are
            identical. It returns 0 otherwise.
 */

static int compare_element (const void* d1, const void* d2) { return 0; }

/*! \brief Initialize a given linked list so it can be used to save the result of the boolean analysis.
    \param l Pointer to the linked list to initialize.
 */

void init_lined_list (List *l)
{ list_init (l, free_data, compare_element); }

/*! \example test_bool.c */

/*! \brief Free all memory allocated for a linked list.
    \param l Pointer to the linked list to destroy.
 */

void destroy_linked_list (List *l)
{ list_destroy (l); }

/*! \example test_bool.c */

/*! \brief Add an element to the linked list.
    \param l Pointer to the linked list that will receive the new element.
    \param type Type of the new element. It could be:
           <UL>
              <li>word: for a word.
              <li>operator: for an operator.
           </UL>
    \param op If the value of <i>type</i> is <i>operator</i>, then the value
              of <i>op</i> could be:
              <UL>
                <li>and: for the operator <i>AND</i>.
                <li>or: for the operator <i>OR</i>.
              </UL>
              If the value of <i>type</i> is <i>word</i>, then the value
              of <i>op</i> should be set to <i>undef</i>.
    \param w If the value of <i>type</i> is <i>word</i>, then the pointer <i>
             w</i> should point to a zero terminated string of characters that
             represents a word. If the value of <i>type</i> is <i>operator</i>
             then the pointer <i>w</i> should be set to NULL.
    \return Upon sucessful completion the function returns 0, otherwise it
            returns 1 (this means that the program is running out of memory).
 */

int add_element (
                   List *l,
                   enum expr_elem type,
                   enum operator_name op,
                   char *w
                )
{
  struct list_data *d;

  /* --------------------------------------------------- */
  /* Allocate memory */
  /* --------------------------------------------------- */

  d = (struct list_data*)malloc(sizeof(struct list_data));
  if (d == NULL) { return 1; }

  /* --------------------------------------------------- */
  /* Initialize data */
  /* --------------------------------------------------- */

  d->type = type;

  switch (type)
  {
    case word: {
                 d->word = w;
                 d->operator = undef;
               }; break;

    case operator: {
                     d->word = NULL;
                     d->operator = op;
                   }; break;
  }

  /* --------------------------------------------------- */
  /* Insert data to the end of the linked list */
  /* --------------------------------------------------- */

  if (list_ins_next (l, list_tail(l), d) == -1)
  {
    free (d);
    return 1;
  }

  return 0;
}

/*! \example test_bool.c */

This simple program allows you to test the boolean analyzer.

test_bool.c

/* ----------------------------------------------------- */
/* This program illustrates the use of the boolean ex- */
/* -pression analyzer. */
/* ----------------------------------------------------- */

#include <stdio.h>
#include <stdlib.h>
#include "boolean_analyzer.h"

/* ----------------------------------------------------- */
/* This is the in-memory grammatical parser */
/* ----------------------------------------------------- */

#define NUMBER_OF_TEST 5

char *expr[] = {
                 "b1 OR b2 AND b3 OR (b4 AND b5) XOR NOT b6",
                 "b1 OR b2 AND b3 OR (b4 AND b5)",
                 "b1 OR b2 AND",
                 "b1 OR b2 AND b3 OR ((b4 AND b5) OR b1)",
"(b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1)) AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) XOR NOT test AND b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) OR NOT b1 OR b2 AND b3 OR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) XOR ((b4 AND b5) OR b1) AND NOT toto OR gros_test_de_la_mort XOR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 XOR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 XOR ((b4 AND b5) OR b1) AND b1 OR b2 AND b3 AND ((b4 AND b5) OR b1) AND b1 OR b2 AND b3" };

int main (int argc, char *argv[])
{
  int res, i;
  List list;
  ListElmt *element;

  for (i=0; i<NUMBER_OF_TEST; i++)
  {

    /* --------------------------------------------------- */
    /* Initialize the linked list */
    /* --------------------------------------------------- */

    init_lined_list (&list);

    /* --------------------------------------------------- */
    /* Call the boolean expression analyzer */
    /* --------------------------------------------------- */

    fprintf (stdout, "\n--> %s:\n", expr[i]);
    fflush (stdout);

    res = boolean_spliter(expr[i], &list);

    if (res == 1)
    {
      fprintf (stdout, "\nParsing error: %s\n", boolean_parsing_error);
      fflush (stdout);
      destroy_linked_list (&list);
      continue;
    }

    /* --------------------------------------------------- */
    /* Now print the RPN conversion */
    /* --------------------------------------------------- */

    element = list_head (&list);

    while (element != NULL)
    {
      switch (GET_TYPE(element))
      {
        case word: { fprintf (stdout, "\n%s", GET_WORD(element)); }; break;
        case operator: {
                         switch (GET_OPERATOR(element))
                         {
                           case or: { fprintf (stdout, "\nor()"); }; break;
                           case and: { fprintf (stdout, "\nand()"); }; break;
                           case xor: { fprintf (stdout, "\nxor()"); }; break;
                           case not: { fprintf (stdout, "\nnot()"); }; break;
                           case undef: { fprintf (stdout, "\nundef"); }; break;
                         }
                       }; break;
      }

      element = list_next(element);
    }

    fprintf (stdout, "\n");

    /* --------------------------------------------------- */
    /* Destroy the linked list */
    /* --------------------------------------------------- */

    destroy_linked_list (&list);
    fflush (stdout);
  }

  return 0;
}

Include files

boolean_analyzer.h

/*! \file boolean_analyzer.h
    This file defines the interface of the boolean expression analyzer.
 */

#ifndef BOOLEAN_ANALYSER_HEADER

#include "grammar_types.h"
#include "linked_list.h"

#define MAX_ERROR 2048

#ifndef BOOLEAN_ANALIZER

/*! \brief If the parser finds an sintaxic error, then the array boolean_parsing_error contains a zero
           terminated sequence of characters that describes the error.
 */

extern char boolean_parsing_error[MAX_ERROR];

/*! \brief The boolean analyser itself.
    \param src Pointer to a zero terminated string of characters that represnts the boolean expression.
    \param l Pointer to a linked list that will be used to store the RPN representation of the boolean
           expression.
    \return Upon successfull completion, the function returns the value 0. Otherwise, the function returns
            the value 1. If the returned value is 1, then the array boolean_parsing_error contains a description
            of the error.
 */

int boolean_spliter(char *src, List *l);

/*! \example test_bool.c
    This simple test program shows how to use the analyser of boolean expressions.
 */

#endif

#define BOOLEAN_ANALYSER_HEADER
#endif

grammar_types.h

/*! \file grammar_types.h
    This file defines all data types used for the BISON grammar parser.
 */

#ifndef OPRATOR_HEADER_FILE

  /*! \brief Defines the type of data used by the BISON parser to assign a
             type to all opreator tokens.
   */

  enum operator_name {
                       /*! \brief Operator <i>AND</i>. */
                       and,
                       /*! \brief Operator <i>OR</i>. */
                       or,
                       /*! \brief Operator <i>XOR</i>. */
                       xor,
                       /*! \brief Operator <i>NOT</i>. */
                       not,
                       /*! \brief This value is used to qualify a token which
                                  does not represent an operator (that is, a
                                  word). */
                       undef
                     };

  /*! \brief Defines the type of data used by the BISON parser to assign a
             type to the tockens associated with the characters <i>(</i> and
             <i>)</i>.
   */

  enum delimitor_name {
                        /*! \brief Character <i>(</i>. */
                        open_braket,
                        /*! \brief Character <i>)</i>. */
                        close_braket
                      };

#define OPRATOR_HEADER_FILE
#endif

linked_list.h

/*! \file linked_list.h
    This file defines all data types used to store elements into the
    linked list.
 */

#ifndef LINKED_LIST_HEADER

  #include "grammar_types.h"
  #include "list.h"

  /*! \brief Defines the content of an element. An element may contain:
             <UL>
                <li>A word.
                <li>An operator.
             </UL>
   */

  enum expr_elem {
                   /*! \brief This type defines an element that contains a
                              word. */
                   word,

                   /*! \brief This type defines an element that contains an
                              operator. */
                   operator
                 };

  /*! \brief Defines the structure of an element of the linked list. */

  struct list_data {
                     /*! \brief If the element is a word, then this value points
                                to a zero terminated string of characters that
                                represents the word. Otherwise, it should be set
                                to the value NULL. */
                     char *word;

                     /*! \brief If the element is an operator, then this value
                                could be:
                                <UL>
                                  <li>or
                                  <li>and
                                  <li>xor
                                  <li>not
                                </UL>
                                Otherwise, it should be set to <i>undef</i>. */
                     enum operator_name operator;

                     /*! \brief Define the type of the element. If could be a
                                word or an opreator. */
                     enum expr_elem type;
                   };

  int add_element (
                    List *l,
                    enum expr_elem type,
                    enum operator_name op,
                    char *w
                 );
  void init_lined_list (List *l);
  void destroy_linked_list (List *l);

   /*! \brief This macro returns the type of data stored into an element.
       \param e Pointer to the element of the linked list.
       \return The macro returns one of the following values:
               <UL>
                 <li>word (see <i>enum expr_elem</i>)
                 <li>operator (see <i>enum expr_elem</i>)
               </UL>
     */

   #define GET_TYPE(e) (((struct list_data*)list_data(e))->type)

   /*! \brief This macro returns the word stored into an element.
       \param e Pointer to the element of the linked list.
       \return The macro returns a pointer to a zero terminated string of
               characters that represents the word.
       \warning You should call GET_TYPE(e) first.
   */

   #define GET_WORD(e) (((struct list_data*)list_data(e))->word)

   /*! \brief This macro returns the operator stored into an element.
       \param e Pointer to the element of the linked list.
       \return The macro returns one of the following values:
               <UL>
                 <li>or (see <i>enum operator_name</i>)
                 <li>and (see <i>enum operator_name</i>)
                 <li>xor (see <i>enum operator_name</i>)
                 <li>not (see <i>enum operator_name</i>)
                 <li>undef (see <i>enum operator_name</i>)
               </UL>
       \warning You should call GET_TYPE(e) first.
   */
   #define GET_OPERATOR(e) (((struct list_data*)list_data(e))->operator)

#define LINKED_LIST_HEADER
#endif

Makefile

This Makfile is used to build the boolean analyser.

Makefile

# -----------------------------------------------
# Makefile for the boolean interpretor
# -----------------------------------------------



include ../Makefile.conf


PARSER_PREFIX = bool

# -----------------------------------------------
# Tree organization
# -----------------------------------------------

SRC_DIR = ./src
INCLUDE_DIR = ./includes
FLEX_DIR = ./flex
BISON_DIR = ./bison
TMP_DIR = ./tmp
TEST_DIR = ./tests
BIN_DIR = ./bin
LIB_DIR = ./lib
DOC_DIR = ./doc

# -----------------------------------------------
# Local C compiler options
# -----------------------------------------------

LOCAL_CCFLAGS = -L${LIB_DIR} -I${INCLUDE_DIR} -I${TMP_DIR} ${CCFLAGS}

# -----------------------------------------------
# Building the grammar analizer
# -----------------------------------------------

${TMP_DIR}/bool.bison.h ${TMP_DIR}/bool.bison.c: ${BISON_DIR}/bool.y
        ${YACC} -p ${PARSER_PREFIX} -o ${TMP_DIR}/bool.bison.c -d ${BISON_DIR}/bool.y

${TMP_DIR}/bool.bison.o: ${TMP_DIR}/bool.bison.c \
                         ${TMP_DIR}/bool.bison.h \
                         ${INCLUDE_DIR}/grammar_types.h
        ${CC} ${LOCAL_CCFLAGS} -c ${TMP_DIR}/bool.bison.c

# -----------------------------------------------
# Building the lexical parser
# -----------------------------------------------

${TMP_DIR}/bool.lex.c: ${FLEX_DIR}/bool.lex
        ${LEX} ${LEX_FLAGS} -P${PARSER_PREFIX} -o${@} ${FLEX_DIR}/bool.lex

${TMP_DIR}/bool.lex.o: ${TMP_DIR}/bool.lex.c \
                       ${TMP_DIR}/bool.bison.h
        ${CC} ${LOCAL_CCFLAGS} -c ${TMP_DIR}/bool.lex.c

# -----------------------------------------------
# Linked list management
# -----------------------------------------------

${TMP_DIR}/linked_list.o: ${SRC_DIR}/linked_list.c \
                          ${INCLUDE_DIR}/linked_list.h
        ${CC} ${LOCAL_CCFLAGS} -c ${SRC_DIR}/linked_list.c

# -----------------------------------------------
# Build the library
# -----------------------------------------------

${LIB_DIR}/libbool.a: ${TMP_DIR}/bool.lex.o \
                      ${TMP_DIR}/bool.bison.o \
                      ${TMP_DIR}/linked_list.o
        ${AR} ${ARFLAGS} ${LIB_DIR}/libbool.a ${TMP_DIR}/bool.lex.o ${TMP_DIR}/bool.bison.o ${TMP_DIR}/linked_list.o

# -----------------------------------------------
# Build the top level application
# -----------------------------------------------

${BIN_DIR}/bool.test: ${TEST_DIR}/test_bool.c \
                      ${LIB_DIR}/libbool.a
        ${CC} ${LOCAL_CCFLAGS} ${TEST_DIR}/test_bool.c -lbool -lfl -lmy_list

# -----------------------------------------------
# Build rules
# -----------------------------------------------

test: ${BIN_DIR}/bool.test

all: test

html:
        ${DOXYGEN} doxygen.conf

clean:
        rm -f ${TMP_DIR}/*
        rm -f doxygen.war