Recursive Descent Parser

simparse.cc

/*
 * Expression grammar:
 *   Asst --> Id := Expr ;
 *   Expr --> Term { [ + | - ] Term }*
 *   Term --> Fact { [ * | / ] Fact }*
 *   Fact --> Id | Lit | ( Expr )
 *
 * The program reads standard input and attempts to parse it as an Asst.  It
 * then prints a representation of the input structure.  If you type the input,
 * make sure to issue an EOF (^D in unix, ^Z in windows shell) at the end.
 *
 * Define SIMPARSE_NO_MAIN before compiling to leave out main(), so that the
 * scanner and parser can be linked into another driver.
 */
#include <ctype.h>
#include <cstdlib>
#include <string>
#include <iostream>

using namespace std;

/******************************************************************************
 *********                    General Utilities                         *******
 ******************************************************************************/

/* Whine and expire: print msg on standard error, then exit with status 10.
   Never returns. */
void die(const string &msg)
{
    cerr << msg << endl;
    exit(10);
}

/* Print some spaces to a stream.  This class and the operator function
   allow stuff like:
       s << spaces(5) << "more stuff";
   A zero or negative count prints nothing. */
class spaces {
private:
    int _nsp;
    friend ostream &operator <<(ostream &strm, const spaces &);
public:
    spaces(int nsp): _nsp(nsp) { }
};

ostream &operator <<(ostream &st, const spaces &sp)
{
    // Count upward so that a negative count simply terminates at once.
    for(int n = 0; n < sp._nsp; ++n)
        st << " ";
    return st;
}

/******************************************************************************
 *********              Base Class Of (Almost) Everything              ********
 ******************************************************************************/

/* ParserObject is the base class for most objects created by the system.
   It includes a framework for printing the objects with indenting.  Each
   class derived from ParserObject must implement pr() to print its content.
   pr() should print sp() at the front of each line, except when it prints
   internal components (those go through print(), which indents them two
   further columns).  This maintains the indenting. */

/* Abstract base class for stuff used by the parser. */
class ParserObject {
private:
    // Current indentation, shared by every object.  It starts at -2 so that
    // the outermost print() call prints at column 0.
    static int indent;
protected:
    virtual void pr(ostream &s) const = 0;
    spaces sp() const { return spaces(indent); }
public:
    // Objects are handled through base pointers, so deletion must be virtual.
    virtual ~ParserObject() { }

    // Print, indented from the left as indicated.
    void print(ostream &s) const
    {
        indent += 2;
        pr(s);
        indent -= 2;
    }
};
int ParserObject::indent = -2;

ostream &operator <<(ostream &st, const ParserObject &po)
{
    po.print(st);
    return st;
}

/******************************************************************************
 *********                          Scanner                            ********
 ******************************************************************************/

/* A scanner (called a tokenizer at fancier places than this one) breaks the
   input stream up into significant pieces.  Each token is represented by a
   Token object.  The possible tokens are: */

/* These are code numbers for each token type. */
enum tok_code {
    tok_id,     // An identifier, L(L|D)*
    tok_lit,    // A literal, meaning an unsigned integer number.
    tok_left,   // A left paren.
    tok_right,  // A right paren
    tok_splat,  // An asterisk
    tok_slash,  // A slash
    tok_plus,   // A plus sign
    tok_minus,  // A minus sign or dash
    tok_asst,   // An assignment symbol, :=
    tok_semi,   // A semicolon.
    tok_EOF     // END-of-file.
};

/* String names to correspond to the token codes (indexed by tok_code). */
const char *codenames[] = {
    "tok_id", "tok_lit", "tok_left", "tok_right", "tok_splat", "tok_slash",
    "tok_plus", "tok_minus", "tok_asst", "tok_semi", "tok_EOF"
};

/* A token has a code number and the string it actually represents. */
class Token: public ParserObject {
public:
    tok_code code;
    string text;
    Token(tok_code c, string t): code(c), text(t) { }

    // Prints as codename(text), with no leading indentation.
    void pr(ostream &s) const
    {
        s << codenames[code] << "(" << text << ")";
    }
};

/* Character classes for the scanner.  The argument is a value as returned
   by istream::peek(), i.e. already in unsigned char range. */
static bool is_lit_char(int c) { return isdigit(c) != 0; }
static bool is_id_char(int c) { return isalpha(c) != 0 || isdigit(c) != 0; }

/* Collect a run of characters: `first` (already consumed from the stream)
   followed by every upcoming character that `accept` likes.  The stream is
   only peeked at until a character is known to belong to the run, so the
   terminating character is left unread and hitting end-of-file simply ends
   the run. */
static string scan_run(istream &in, char first, bool (*accept)(int))
{
    string text(1, first);
    for(;;) {
        const int c = in.peek();
        if(c == istream::traits_type::eof() || !accept(c))
            break;
        text += static_cast<char>(in.get());
    }
    return text;
}

/*
 * Ad hoc scanner.  Returns the next token read from `in`, or a tok_EOF token
 * with text "$EOF" when no more input can be read.  Calls die() on a
 * character that cannot start a token.
 */
Token next_tok(istream &in)
{
    // The >> will skip leading blanks.
    char inch;
    if(!(in >> inch))
        return Token(tok_EOF, "$EOF");

    // The <ctype.h> classifiers need a value in unsigned char range.
    const int uc = static_cast<unsigned char>(inch);

    // A literal: a run of digits.
    if(isdigit(uc))
        return Token(tok_lit, scan_run(in, inch, is_lit_char));

    // An identifier: a letter followed by letters and digits.
    if(isalpha(uc))
        return Token(tok_id, scan_run(in, inch, is_id_char));

    // Everything else is punctuation.
    switch(inch) {
    case '(': return Token(tok_left, "(");
    case ')': return Token(tok_right, ")");
    case '*': return Token(tok_splat, "*");
    case '/': return Token(tok_slash, "/");
    case '+': return Token(tok_plus, "+");
    case '-': return Token(tok_minus, "-");
    case ';': return Token(tok_semi, ";");
    case ':': {
        // A colon is only legal as the first half of ":=".  When it is
        // followed by something else, that something is what gets
        // reported; at end of input the colon itself is reported.
        char follow = inch;
        if(in.get(follow) && follow == '=')
            return Token(tok_asst, ":=");
        die(string("Bad input character ") + follow);
        break;
    }
    default:
        die(string("Bad input character ") + inch);
    }

    // Not reached: die() does not return.
    return Token(tok_EOF, "$EOF");
}

/******************************************************************************
 *********                          Parser                             ********
 ******************************************************************************/

/* These are the objects returned by the parsing functions. */
class Asst;     // Holds the parts of an assignment statement.
class Expr;     // Base class for any type of expression.
class Binary;   // Expression which is the result of a binary op.
class Variable; // Expression which is a variable.
class Value;    // Expression which is a constant.

// An assignment statement is a target and an expression.
class Asst: public ParserObject {
public:
    Variable *target;
    Expr *source;
    Asst(Variable *t, Expr *e): target(t), source(e) { }
    void pr(ostream &s) const;
};

// An expression is an abstract class that doesn't really do much.
class Expr: public ParserObject { };

// Binary holds two operands and an operator.
class Binary: public Expr {
public:
    Token *oper;
    Expr *term1, *term2;
    Binary(Token *o, Expr *t1, Expr *t2): oper(o), term1(t1), term2(t2) { }
    void pr(ostream &s) const;
};

// Variable just wraps the token and inherits from Expr
class Variable: public Expr {
public:
    Token *var;
    Variable(Token *v): var(v) { }
    void pr(ostream &s) const;
};

// Value is also such a wrapper.
class Value: public Expr {
public:
    Token *var;
    Value(Token *v): var(v) { }
    void pr(ostream &s) const;
};

// Print functions for the parser return types.  Sub-objects are written
// with <<, which goes through ParserObject::print() and so indents them.
void Asst::pr(ostream &s) const
{
    s << sp() << "[ Asst: " << endl;
    s << *target << endl;
    s << *source << endl;
    s << sp() << "]";
}

void Binary::pr(ostream &s) const
{
    s << sp() << "[ Binary, op = " << *oper << ":" << endl;
    s << *term1 << endl;
    s << *term2 << endl;
    s << sp() << "]";
}

void Variable::pr(ostream &s) const
{
    s << sp() << "[ Variable: " << *var << "]";
}

void Value::pr(ostream &s) const
{
    s << sp() << "[ Value: " << *var << "]";
}

// Here is the parser class.  It takes an input stream in the constructor,
// which it turns into a tree.  The print method will output the parse
// tree.  The tree nodes are allocated with new and are never freed.
class Parser {
private:
    Token curr_tok;     // Current token value.
    Asst *root;         // Root of parse tree.
    istream &in;        // Input stream.

    // Functions to find each non-terminal in the input.  Note: Each
    // finding function expects curr_tok to contain the first token
    // of whatever they are to find, and must make sure it contains
    // the next token after when they return.
    Asst *asst();
    Expr *expr();
    Expr *term();
    Expr *fact();

    // These find a variable or value in the input.  Their main purpose
    // is just to wrap the token object from the scanner in a Variable
    // or Value object which is derived from Expr.  They have the same
    // behavior regarding next_tok.
    Variable *var();
    Value *val();

public:
    Parser(istream &);          // Parse the stream.
    void print(ostream &);      // Print the tree.
};

// Start it up.  The first token is read in the initializer list (Token has
// no default constructor), so curr_tok is loaded before asst() is called.
// Initializers are listed in member declaration order.
Parser::Parser(istream &strm): curr_tok(next_tok(strm)), root(0), in(strm)
{
    root = asst();
}

//*********************************
//***** The Parsing Functions *****
//*********************************

// Look for the variable assigned, the :=, the expression, and the ;.
// Wrap 'em up in a nice little Asst-shaped box and return them.
Asst *Parser::asst()
{
    // First thing we should find is a variable, else bail.
    Variable *v = var();

    if(curr_tok.code != tok_asst)
        die("Parse error expecting :=, got " + curr_tok.text);
    curr_tok = next_tok(in);

    // Now recur to find the expression.
    Expr *e = expr();

    // And make sure the expression is followed by a semicolon.  If so,
    // we've found all the stuff and we return the containing object.
    if(curr_tok.code != tok_semi)
        die("Parse error expecting ;, got " + curr_tok.text);
    curr_tok = next_tok(in);

    return new Asst(v, e);
}

// Look for the Term.  If it's followed by + or -, consume it and look
// for another term.  Repeat.  The operators associate to the left.
Expr *Parser::expr()
{
    // The value of t will be our return value, but the loop will
    // change it if we enter.
    Expr *t = term();

    // Repeat while we see an appropriate op, which must be
    // followed by another term.
    while(curr_tok.code == tok_plus || curr_tok.code == tok_minus) {
        // Find the rest of it.
        Token *op = new Token(curr_tok);
        curr_tok = next_tok(in);
        Expr *t2 = term();

        // Bolt 'em together, and get ready for the next iteration.
        t = new Binary(op, t, t2);
    }

    return t;
}

// This is almost exactly like Parser::expr(), and they could be combined
// (though the parameterization would be a killer).  Complicated enough
// as it stands.
Expr *Parser::term()
{
    // The value of t will be our return value, but the loop will
    // change it if we enter.
    Expr *t = fact();

    // Get 'em all.
    while(curr_tok.code == tok_splat || curr_tok.code == tok_slash) {
        // Find the rest of it.
        Token *op = new Token(curr_tok);
        curr_tok = next_tok(in);
        Expr *t2 = fact();

        // Bolt 'em together, and get ready for the next iteration.
        t = new Binary(op, t, t2);
    }

    return t;
}

// A factor is one of three things.  Look at the terminals to see what.
Expr *Parser::fact()
{
    // Must be an identifier, a literal, or start with (.
    if(curr_tok.code == tok_id)
        return var();           // The variable is it.
    if(curr_tok.code == tok_lit)
        return val();           // The constant value is it.
    if(curr_tok.code != tok_left)
        // It doesn't start with (, so it's wrong.
        die("Expecting (, found " + curr_tok.text);

    // Consume the (, match the contained expression, then match and
    // consume the closing ).  If all that succeeds, return the
    // expression.
    curr_tok = next_tok(in);
    Expr *e = expr();
    if(curr_tok.code != tok_right)
        die("Expecting ), found " + curr_tok.text);
    curr_tok = next_tok(in);

    return e;
}

// Just see if we have the appropriate token.  If so, consume it and
// place it in a wrapper object to return.
Variable *Parser::var()
{
    if(curr_tok.code != tok_id)
        die("Expecting id, got " + curr_tok.text);
    Variable *v = new Variable(new Token(curr_tok));
    curr_tok = next_tok(in);
    return v;
}

Value *Parser::val()
{
    if(curr_tok.code != tok_lit)
        die("Expecting literal, got " + curr_tok.text);
    Value *v = new Value(new Token(curr_tok));
    curr_tok = next_tok(in);
    return v;
}

// Print from the parser prints the tree on s.  If tokens remain after the
// assignment, a notice is written as well; note that the notice always
// goes to standard output, whatever s is.
void Parser::print(ostream &s)
{
    s << *root << endl;
    if(curr_tok.code != tok_EOF)
        cout << "Notice: Extra input after asst." << endl;
}

#ifndef SIMPARSE_NO_MAIN
// Read and print.
int main()
{
    Parser p(cin);
    p.print(cout);
    return 0;
}
#endif