CTTL on
SourceForge |
Download
Latest |
Documentation
Index |
Library
News |
CVS
Repository |
Other
Links |
Theoretical computer science
provides an interesting look at solving
general computation
problems: if input to the problem is described in one
formal language,
and a problem solution is described in another,
then the problem can by solved by an algorithm which implements translation
between the two languages.
Unfortunately, there are problems that computers cannot easily solve, for example,
see traveling salesman problem.
At the same time, many problems can be solved by formal language syntax recognition
and subsequent translation to a different language. For example, the following program,
arithmetics.cpp
,
translates an arithmetic expression into its pseudo-assembly language equivalent.
// sample code: arithmetics.cpp // demonstrates stateful cttl parser //#define NDEBUG // must appear before assert.h is included to stop assertions from being compiled //#define CTTL_TRACE_EVERYTHING #include <iostream> #include "cttl/cttl.h" using namespace cttl; struct parser { /* Arithmetic expression grammar production rules in EBNF form: * * <expr> --> <term> ( '+' <term> | '-' <term> )* * <term> --> <factor> ( '*' <factor> | '/' <factor> )* * <factor> --> <prime> | '(' <expr> ')' | '-' <factor> | '+' <factor> * <prime> --> ( '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' )+ * */ int ip_cnt; parser( int ip_cnt_ = 0 ) : ip_cnt( ip_cnt_ ) { } size_t rule_factor( const_edge< policy_space<> >& universe_ ) { return ( ( isdigit & rule( *this, &parser::numeric_literal ) ) | ( '(' + rule( *this, &parser::rule_expr ) + ')' ) | ( '-' + ( rule( *this, &parser::rule_factor ) & rule( *this, &parser::unary_minus ) ) ) | ( '+' + rule( *this, &parser::rule_factor ) // unary plus is simply ignored ) | rule( *this, &parser::parse_error ) ).match( universe_ ) ; } size_t rule_term( const_edge< policy_space<> >& universe_ ) { return ( rule( *this, &parser::rule_factor ) + * ( ( ( '*' + rule( *this, &parser::rule_factor ) ) & rule( *this, &parser::multiply ) ) | ( ( '/' + rule( *this, &parser::rule_factor ) ) & rule( *this, &parser::divide ) ) ) ).match( universe_ ) ; } size_t rule_expr( const_edge< policy_space<> >& universe_ ) { return ( rule( *this, &parser::rule_term ) + *( ( ( '+' + rule( *this, &parser::rule_term ) ) & rule( *this, &parser::add ) ) | ( ( '-' + rule( *this, &parser::rule_term ) ) & rule( *this, &parser::subtract ) ) ) ).match( universe_ ) ; } size_t numeric_literal( const_edge<>& edge_ ) { std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl; std::cout << "\t" << ++ip_cnt << "\t PUSH \t " << edge_.text() << std::endl; return edge_.second.offset(); } size_t divide( const_edge<>& edge_ ) { std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R2" << std::endl; std::cout << "\t" << ++ip_cnt << "\t DIV \t R2, R1 \t; R2 = R2 " << edge_.text() << std::endl; std::cout << "\t" << ++ip_cnt << "\t PUSH \t R2" << std::endl; return edge_.second.offset(); } size_t multiply( const_edge<>& edge_ ) { std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R2" << std::endl; std::cout << "\t" << ++ip_cnt << "\t MUL \t R2, R1 \t; R2 = R2 " << edge_.text() << std::endl; std::cout << "\t" << ++ip_cnt << "\t PUSH \t R2" << std::endl; return edge_.second.offset(); } size_t add( const_edge<>& edge_ ) { std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R2" << std::endl; std::cout << "\t" << ++ip_cnt << "\t ADD \t R2, R1 \t; R2 = R2 " << edge_.text() << std::endl; std::cout << "\t" << ++ip_cnt << "\t PUSH \t R2" << std::endl; return edge_.second.offset(); } size_t subtract( const_edge<>& edge_ ) { std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R2" << std::endl; std::cout << "\t" << ++ip_cnt << "\t SUB \t R2, R1 \t; R2 = R2 " << edge_.text() << std::endl; std::cout << "\t" << ++ip_cnt << "\t PUSH \t R2" << std::endl; return edge_.second.offset(); } size_t unary_minus( const_edge<>& edge_ ) { std::cout << "\t" << ++ip_cnt << "\t ;" << std::endl; std::cout << "\t" << ++ip_cnt << "\t POP \t R1" << std::endl; std::cout << "\t" << ++ip_cnt << "\t NEG \t R1 \t\t; R1 = -(" << edge_.text() << ")" << std::endl; std::cout << "\t" << ++ip_cnt << "\t PUSH \t R1" << std::endl; return edge_.second.offset(); } size_t parse_error( const_edge< policy_space<> >& edge_ ) const { std::cout << "*** syntax error ***" << std::endl << edge_.parent().text() << std::endl ; for ( size_t pos = 0; pos < edge_.first.offset(); ++pos ) std::cout << '\x20'; std::cout << "^-- at position " << edge_.first.offset() << std::endl; return std::string::npos; } }; int main(int argc, char* argv[]) { if ( argc == 1 ) { std::cout << "\t usage: enter arithmetic expression to parse" << std::endl; return 1; } // construct input string from arguments on the command line: input<> inp( &argv[ 1 ], ' ' ); // construct universe to be parsed const_edge< policy_space<> > universe( new_edge( inp ) ); // construct the parser: parser arithmetic_parser; // evaluate arithmetic expression: if ( arithmetic_parser.rule_expr( universe ) != std::string::npos ) { if( universe.length() ) { std::cout << std::endl << "*** error: parser terminated: ***" << std::endl << inp.text() << std::endl ; for ( size_t pos = 0; pos < universe.first.offset(); ++pos ) std::cout << '\x20'; std::cout << "^-- at position " << universe.first.offset() << std::endl; } } else std::cout << "*** parser terminated" << std::endl; return 0; }
>cttl 2*(3+-4) 1 ; 2 PUSH 2 3 ; 4 PUSH 3 5 ; 6 PUSH 4 7 ; 8 POP R1 9 NEG R1 ; R1 = -(4) 10 PUSH R1 11 ; 12 POP R1 13 POP R2 14 ADD R2, R1 ; R2 = R2 +-4 15 PUSH R2 16 ; 17 POP R1 18 POP R2 19 MUL R2, R1 ; R2 = R2 *(3+-4) 20 PUSH R2 >
The main() function instantiates cttl::input, parseable universe, and the parser:
// construct input string from arguments on the command line: input<> inp( &argv[ 1 ], ' ' ); // construct universe to be parsed const_edge<> universe( new_edge( inp ) ); // construct the parser: parser arithmetic_parser;
Program arguments (except argv[0]) are passed to the constructor of the input class. To assemble user input, the constructor concatenates individual arguments into a string of text using space delimiter.
To parse the input, main() function invokes the rule_expr() function, designated to be the starting rule of the grammar.
The parser has three functions representing the grammar: rule_factor(), rule_term(), and rule_expr(). Other parser functions encapsulate various semantic actions invoked while the grammar is parsed. For example, numeric_literal() is invoked when parser encounters a number in the input. Divide(), multiply(), add(), and subtract() implement semantic actions to handle binary operators. Unary_minus() handles the unary minus operator (note that unary plus is simply ignored). Finally, if evaluation of the arithmetic expression fails, semantic action parse_error() is invoked. The error handler prints error message on the screen and returns std::string::npos to the lexer to indicate an error condition.
Note that all parser member functions have the same signature (parse_error() is also declared const):
size_t r( const_edge<>& );
Each semantic action updates parser state by incrementing instruction counter variable ip_cnt. The error handler function, parse_error(), does not modify parser state, so it is declared const.
All functions receive reference to the cttl::const_edge object, which represents the universe to be consumed. The "const_edge" name suggests that parser should not attempt to modify the universe.
The topic of writing unambiguous grammars is beyond the scope of our tutorial. Often, there is a good chance that EBNF (or BNF) grammar for the language in question already exists (see, for example, basic english grammar.) In such case it might be a good idea to reuse existing grammar instead of writing your own.
It is difficult to underestimate the role of the grammar in our sample program. The structure of rules provides correct arithmetic operator precedence and associativity, as well as parenthesis handling, allowing users to group sub-expressions and modify default precedence of arithmetic operators. The sample program implements these features exclusively by means of a formally described EBNF grammar. The grammar declaratively adds desired functionality to our program, with no need to write any additional code.
Sometimes part of the program that parses the input becomes one of the most difficult components of the application. The code can be tough to understand and maintain. Shifting implementation to practical use of formal grammars could simplify lexical analysis of the input to a great extent.
EBNF production rules for the parser of arithmetic expressions are:
<expr> --> <term> ( '+' <term> | '-' <term> )* <term> --> <factor> ( '*' <factor> | '/' <factor> )* <factor> --> <prime> | '(' <expr> ')' | '-' <factor> | '+' <factor> <prime> --> ( '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' )+The library offers facilities to rewrite these grammar rules as C++ expressions. The goal of our tutorial is to demonstrate how this can be done in a progression of small steps.
Individual terminal symbols, such as '+', '-', '*', '/', '(', and ')', could be replaced with lexemes symbol('+'), symbol('-'), etc. The library provides overloaded operators for built-in types on the left and right hand side of the CTTL expression. Therefore, our tutorial program can use shorthand notation, where terminal symbols '+', '-', '*', '/', '(', ')' represent themselves.
Regular expression on the right side of the <prime> production rule
( '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' )+can be replaced by CTTL lexeme entity(isdigit), which matches entities containing one or more digit characters. This replacement eliminates the need for a separately defined production rule named <prime>.
Because arithmetic expression parser maintains its state during grammar evaluation, the easiest way to organize production rules is to write them as member functions of the parser structure:
struct parser { size_t rule_factor( const_edge<>& universe_ ) { /*...*/ } size_t rule_term( const_edge<>& universe_ ) { /*...*/ } size_t rule_expr( const_edge<>& universe_ ) { /*...*/ } };
When writing grammar
rules
in CTTL, we begin with the original
EBNF production notation. For example, right hand side of the production rule
<expr>
,
<term> ( '+' <term> | '-' <term> )*can be rewritten as C++ expression
rule( *this, &parser::rule_term ) + *( ( '+' + rule( *this, &parser::rule_term ) ) | ( '-' + rule( *this, &parser::rule_term ) ) )In C++ we added binary sequence operators
'+'
to connect symbols. Unary '*'
operator,
Kleene star,
has moved to the front of the expression to which it applies.
Few extra parenthesis were added to the expression to supersede default
C++ operator precedence.
To support recursion required by EBNF grammars, CTTL library defines a family of cttl::rule()
function adaptors. The adaptors provide mechanism to make function calls from grammar expressions at run time. At compile time, cttl::rule()
functions capture references to the production rule functions, represented by global, static, or class member functions invoked as ordinary functions.
To accommodate grammar expression inside parser::rule_expr()
, we invoke one of the grammar evaluation methods (in our case, match()
method), and return the evaluation result back to the caller. Function rule_expr()
takes single argument, parseable universe, passed as a reference to the cttl::const_edge
object:
size_t rule_expr( const_edge<>& universe_ ) { return ( rule( *this, &parser::rule_term ) + *( ( '+' + rule( *this, &parser::rule_term ) ) | ( '-' + rule( *this, &parser::rule_term ) ) ) ).match( universe_ ) ; }
At this point member function rule_expr()
can recognize grammar corresponding to the encapsulated production rule expression.
When evaluation of rule_expr()
succeeds, we would like to call parser::add()
and parser::subtract()
. To do so, we modify rule_expr()
as follows:
struct parser { size_t rule_expr( const_edge<>& universe_ ) { return ( rule( *this, &parser::rule_term ) + *( ( ( '+' + rule( *this, &parser::rule_term ) ) & rule( *this, &parser::add ) ) | ( ( '-' + rule( *this, &parser::rule_term ) ) & rule( *this, &parser::subtract ) ) ) ).match( universe_ ) ; } size_t add( const_edge<>& edge_ ) { /*...*/ } size_t subtract( const_edge<>& edge_ ) { /*...*/ } //... };
Remaining grammar rules, rule_factor()
and rule_term()
, are written in a similar way. The structure of rule_term()
is similar to rule_expr()
. Relationship between rule_expr()
and rule_term()
adds support for arithmetic operator precedence to our program.
Rule rule_factor
implements a sequence of choices: it matches an integer number, a sub-expression enclosed in parenthesis, or an unary plus or minus followed by the rule_factor
itself. If neither of these choices resolves successfully, parse_error()
function is invoked. The error handler prints informative error message and returns std::string::npos
to cancel further grammar evaluation.
Permission to copy, use, modify, sell and distribute this document is granted provided this copyright notice appears in all copies. This document is provided "as is" without express or implied warranty, and with no claim as to its suitability for any purpose.