Common Text Transformation Library: grammar tutorial

Theoretical computer science provides an interesting look at solving general computation problems: if input to the problem is described in one formal language, and a problem solution is described in another, then the problem can by solved by an algorithm which implements translation between the two languages. Unfortunately, there are problems that computers cannot easily solve, for example, see traveling salesman problem. At the same time, many problems can be solved by formal language syntax recognition and subsequent translation to a different language. For example, the following program, arithmetics.cpp, translates an arithmetic expression into its pseudo-assembly language equivalent.

sample program

// sample code: arithmetics.cpp
// demonstrates stateful cttl parser

//#define NDEBUG    // must appear before assert.h is included to stop assertions from being compiled 
//#define CTTL_TRACE_EVERYTHING

#include <iostream>
#include "cttl/cttl.h"

using namespace cttl;

struct parser {

    /*  Arithmetic expression grammar production rules in EBNF form:
     *
     * <expr> --> <term> ( '+' <term> | '-' <term> )*
     * <term> --> <factor> ( '*'  <factor> | '/'  <factor> )*
     * <factor> --> <prime> | '(' <expr> ')' | '-' <factor> | '+' <factor>
     * <prime> --> ( '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' )+
     * 
     */

    int ip_cnt;

    parser( int ip_cnt_ = 0 )
        :   ip_cnt( ip_cnt_ )
    {
    }

    size_t rule_factor( const_edge< policy_space<> >& universe_ )
    {
        return (

            (
                isdigit & rule( *this, &parser::numeric_literal )
            )
            |
            (
                '('
                +
                rule( *this, &parser::rule_expr )
                +
                ')'
            )
            |
            (
                '-' + ( rule( *this, &parser::rule_factor ) & rule( *this, &parser::unary_minus ) )
            )
            |
            (
                '+' + rule( *this, &parser::rule_factor )   // unary plus is simply ignored
            )
            |
            rule( *this, &parser::parse_error )

        ).match( universe_ )
        ;
    }

    size_t rule_term( const_edge< policy_space<> >& universe_ )
    {
        return (

            rule( *this, &parser::rule_factor )
            +
            *
            (
                (
                    ( '*' + rule( *this, &parser::rule_factor ) )
                    &
                    rule( *this, &parser::multiply )
                )
                |
                (
                    ( '/' + rule( *this, &parser::rule_factor ) )
                    &
                    rule( *this, &parser::divide )
                )
            )

        ).match( universe_ )
        ;
    }


    size_t rule_expr( const_edge< policy_space<> >& universe_ )
    {
        return (

            rule( *this, &parser::rule_term )
            +
            *(
                (
                    ( '+' + rule( *this, &parser::rule_term ) )
                    &
                    rule( *this, &parser::add )
                )
                |
                (
                    ( '-' + rule( *this, &parser::rule_term ) )
                    &
                    rule( *this, &parser::subtract )
                )
            )

        ).match( universe_ )
        ;
    }


    size_t numeric_literal( const_edge<>& edge_ )
    {
        std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t PUSH \t " << edge_.text() << std::endl;
        return edge_.second.offset();
    }

    size_t divide( const_edge<>& edge_ )
    {
        std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R2" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t DIV \t R2, R1 \t; R2 = R2 " << edge_.text() << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t PUSH \t R2" << std::endl;
        return edge_.second.offset();
    }

    size_t multiply( const_edge<>& edge_ )
    {
        std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R2" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t MUL \t R2, R1 \t; R2 = R2 " << edge_.text() << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t PUSH \t R2" << std::endl;
        return edge_.second.offset();
    }

    size_t add( const_edge<>& edge_ )
    {
        std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R2" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t ADD \t R2, R1 \t; R2 = R2 " << edge_.text() << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t PUSH \t R2" << std::endl;
        return edge_.second.offset();
    }

    size_t subtract( const_edge<>& edge_ )
    {
        std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R2" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t SUB \t R2, R1 \t; R2 = R2 " << edge_.text() << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t PUSH \t R2" << std::endl;
        return edge_.second.offset();
    }

    size_t unary_minus( const_edge<>& edge_ )
    {
        std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t NEG \t R1 \t\t; R1 = -(" << edge_.text() << ")" << std::endl;
        std::cout << "\t" << ++ip_cnt << "\t PUSH \t R1" << std::endl;
        return edge_.second.offset();
    }

    size_t parse_error( const_edge< policy_space<> >& edge_ ) const
    {
        std::cout
            << "*** syntax error ***"
            << std::endl
            << edge_.parent().text()
            << std::endl
            ;

        for ( size_t pos = 0; pos < edge_.first.offset(); ++pos )
            std::cout << '\x20';

        std::cout << "^-- at position " << edge_.first.offset() << std::endl;
        return std::string::npos;
    }

};


int main(int argc, char* argv[])
{
    if ( argc == 1 ) {
        std::cout << "\t usage: enter arithmetic expression to parse" << std::endl;
        return 1;
    }

    // construct input string from arguments on the command line:
    input<> inp( &argv[ 1 ], ' ' );

    // construct universe to be parsed
    const_edge< policy_space<> > universe( new_edge( inp ) );

    // construct the parser:
    parser arithmetic_parser;

    // evaluate arithmetic expression:
    if ( arithmetic_parser.rule_expr( universe ) != std::string::npos ) {
        if( universe.length() ) {
            std::cout
                << std::endl
                << "*** error: parser terminated: ***"
                << std::endl
                << inp.text()
                << std::endl
                ;
            for ( size_t pos = 0; pos < universe.first.offset(); ++pos )
                std::cout << '\x20';

            std::cout << "^-- at position " << universe.first.offset() << std::endl;
        }

    } else
        std::cout << "*** parser terminated" << std::endl;

    return 0;
}

program output

>cttl 2*(3+-4)
        1        ;
        2        PUSH    2
        3        ;
        4        PUSH    3
        5        ;
        6        PUSH    4
        7        ;
        8        POP     R1
        9        NEG     R1             ; R1 = -(4)
        10       PUSH    R1
        11       ;
        12       POP     R1
        13       POP     R2
        14       ADD     R2, R1         ; R2 = R2 +-4
        15       PUSH    R2
        16       ;
        17       POP     R1
        18       POP     R2
        19       MUL     R2, R1         ; R2 = R2 *(3+-4)
        20       PUSH    R2

>

program organization

Program arguments (except argv[0]) are passed to the constructor of the input class. To assemble user input, the constructor concatenates individual arguments into a string of text using space delimiter.

To parse the input, main() function invokes the rule_expr() function, designated to be the starting rule of the grammar.

The parser has three functions representing the grammar: rule_factor(), rule_term(), and rule_expr(). Other parser functions encapsulate various semantic actions invoked while the grammar is parsed. For example, numeric_literal() is invoked when parser encounters a number in the input. Divide(), multiply(), add(), and subtract() implement semantic actions to handle binary operators. Unary_minus() handles the unary minus operator (note that unary plus is simply ignored). Finally, if evaluation of the arithmetic expression fails, semantic action parse_error() is invoked. The error handler prints error message on the screen and returns std::string::npos to the lexer to indicate an error condition.

Note that all parser member functions have the same signature (parse_error() is also declared const):

Each semantic action updates parser state by incrementing instruction counter variable ip_cnt. The error handler function, parse_error(), does not modify parser state, so it is declared const.

All functions receive reference to the cttl::const_edge object, which represents the universe to be consumed. The "const_edge" name suggests that parser should not attempt to modify the universe.

The topic of writing unambiguous grammars is beyond the scope of our tutorial. Often, there is a good chance that EBNF (or BNF) grammar for the language in question already exists (see, for example, basic english grammar.) In such case it might be a good idea to reuse existing grammar instead of writing your own.

It is difficult to underestimate the role of the grammar in our sample program. The structure of rules provides correct arithmetic operator precedence and associativity, as well as parenthesis handling, allowing users to group sub-expressions and modify default precedence of arithmetic operators. The sample program implements these features exclusively by means of a formally described EBNF grammar. The grammar declaratively adds desired functionality to our program, with no need to write any additional code.

input grammar

Sometimes part of the program that parses the input becomes one of the most difficult components of the application. The code can be tough to understand and maintain. Shifting implementation to practical use of formal grammars could simplify lexical analysis of the input to a great extent.

Step 1: identify character entities and replace them with CTTL lexeme functions.

Individual terminal symbols, such as '+', '-', '*', '/', '(', and ')', could be replaced with lexemes symbol('+'), symbol('-'), etc. The library provides overloaded operators for built-in types on the left and right hand side of the CTTL expression. Therefore, our tutorial program can use shorthand notation, where terminal symbols '+', '-', '*', '/', '(', ')' represent themselves.

Step 2: create parser object and add signatures of production rule functions.

Because arithmetic expression parser maintains its state during grammar evaluation, the easiest way to organize production rules is to write them as member functions of the parser structure:

Step 3: write production rules corresponding to EBNF rules.

When writing grammar rules in CTTL, we begin with the original EBNF production notation. For example, right hand side of the production rule <expr>,

To support recursion required by EBNF grammars, CTTL library defines a family of cttl::rule() function adaptors. The adaptors provide mechanism to make function calls from grammar expressions at run time. At compile time, cttl::rule() functions capture references to the production rule functions, represented by global, static, or class member functions invoked as ordinary functions.

To accommodate grammar expression inside parser::rule_expr(), we invoke one of the grammar evaluation methods (in our case, match() method), and return the evaluation result back to the caller. Function rule_expr() takes single argument, parseable universe, passed as a reference to the cttl::const_edge object:

At this point member function rule_expr() can recognize grammar corresponding to the encapsulated production rule expression.

sample semantic actions

When evaluation of rule_expr() succeeds, we would like to call parser::add() and parser::subtract(). To do so, we modify rule_expr() as follows:

struct parser {

    size_t rule_expr( const_edge<>& universe_ )
    {
        return (

            rule( *this, &parser::rule_term )
            +
            *(
                (
                    ( '+' + rule( *this, &parser::rule_term ) )
                    &
                    rule( *this, &parser::add )
                )
                |
                (
                    ( '-' + rule( *this, &parser::rule_term ) )
                    &
                    rule( *this, &parser::subtract )
                )
            )

        ).match( universe_ )
        ;
    }

    size_t add( const_edge<>& edge_ ) { /*...*/ }
    size_t subtract( const_edge<>& edge_ ) { /*...*/ }

    //...
};

Remaining grammar rules, rule_factor() and rule_term(), are written in a similar way. The structure of rule_term() is similar to rule_expr(). Relationship between rule_expr() and rule_term() adds support for arithmetic operator precedence to our program.

Rule rule_factor implements a sequence of choices: it matches an integer number, a sub-expression enclosed in parenthesis, or an unary plus or minus followed by the rule_factor itself. If neither of these choices resolves successfully, parse_error() function is invoked. The error handler prints informative error message and returns std::string::npos to cancel further grammar evaluation.

Permission to copy, use, modify, sell and distribute this document is granted provided this copyright notice appears in all copies. This document is provided "as is" without express or implied warranty, and with no claim as to its suitability for any purpose.

grammar tutorial

abstract