diff --git a/docs/tutorial/LangImpl1.html b/docs/tutorial/LangImpl1.html index 49ad1818d8b..a699ed6f5e3 100644 --- a/docs/tutorial/LangImpl1.html +++ b/docs/tutorial/LangImpl1.html @@ -56,7 +56,7 @@ which looks like this:
# Compute the x'th fibonacci number. def fib(x) - if x < 3 then + if x < 3 then 1 else fib(x-1)+fib(x-2) @@ -241,8 +241,8 @@ this code:With this, we have the complete lexer for the basic Kaleidoscope language. Next we'll build a simple parser that uses this to -build an Abstract Syntax Tree. If you prefer, you can jump to the main tutorial index page. +build an Abstract Syntax Tree. When we have that, we'll include a driver +so that you can use the lexer and parser together.
diff --git a/docs/tutorial/LangImpl2.html b/docs/tutorial/LangImpl2.html new file mode 100644 index 00000000000..c153aa2c973 --- /dev/null +++ b/docs/tutorial/LangImpl2.html @@ -0,0 +1,1177 @@ + + + + +Kaleidoscope: Implementing a Parser and AST + + + + + + + +Kaleidoscope: Implementing a Parser and AST+ + + + + + + ++ ++ + + + + +Welcome to part 2 of the "Implementing a language with +LLVM" tutorial. This chapter shows you how to use the Lexer built in Chapter 1 to build a full parser for +our Kaleidoscope language and build an Abstract Syntax +Tree (AST).
+ +The parser we will build uses a combination of Recursive Descent +Parsing and Operator-Precedence +Parsing to parse the Kaleidoscope language (the later for binary expression +and the former for everything else). Before we get to parsing though, lets talk +about the output of the parser: the Abstract Syntax Tree.
+ ++ ++ + + + + +The AST for a program captures its behavior in a way that it is easy for +later stages of the compiler (e.g. code generation) to interpret. We basically +want one object for each construct in the language, and the AST should closely +model the language. In Kaleidoscope, we have expressions, a prototype, and a +function object. We'll start with expressions first:
+ +++ ++/// ExprAST - Base class for all expression nodes. +class ExprAST { +public: + virtual ~ExprAST() {} +}; + +/// NumberExprAST - Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; +public: + NumberExprAST(double val) : Val(val) {} +}; ++The code above shows the definition of the base ExprAST class and one +subclass which we use for numeric literals. The important thing about this is +that the NumberExprAST class captures the numeric value of the literal in the +class, so that later phases of the compiler can know what it is.
+ +Right now we only create the AST, so there are no useful accessor methods on +them. It would be very easy to add a virtual method to pretty print the code, +for example. Here are the other expression AST node definitions that we'll use +in the basic form of the Kaleidoscope language. +
+ +++ ++/// VariableExprAST - Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string Name; +public: + VariableExprAST(const std::string &name) : Name(name) {} +}; + +/// BinaryExprAST - Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char Op; + ExprAST *LHS, *RHS; +public: + BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) + : Op(op), LHS(lhs), RHS(rhs) {} +}; + +/// CallExprAST - Expression class for function calls. +class CallExprAST : public ExprAST { + std::string Callee; + std::vector<ExprAST*> Args; +public: + CallExprAST(const std::string &callee, std::vector<ExprAST*> &args) + : Callee(callee), Args(args) {} +}; ++This is all (intentially) rather straight-forward: variables capture the +variable name, binary operators capture their opcode (e.g. '+'), and calls +capture a function name and list of argument expressions. One thing that is +nice about our AST is that it captures the language features without talking +about the syntax of the language. Note that there is no discussion about +precedence of binary operators, lexical structure etc.
+ +For our basic language, these are all of the expression nodes we'll define. +because it doesn't have conditional control flow, it isn't turing complete: +we'll fix that in a later installment. The two things we need next are a way +to talk about the interface to a function, and a way to talk about functions +themselves:
+ +++ ++/// PrototypeAST - This class represents the "prototype" for a function, +/// which captures its argument names as well as if it is an operator. +class PrototypeAST { + std::string Name; + std::vector<std::string> Args; +public: + PrototypeAST(const std::string &name, const std::vector<std::string> &args) + : Name(name), Args(args) {} +}; + +/// FunctionAST - This class represents a function definition itself. +class FunctionAST { + PrototypeAST *Proto; + ExprAST *Body; +public: + FunctionAST(PrototypeAST *proto, ExprAST *body) + : Proto(proto), Body(body) {} +}; ++In Kaleidoscope, functions are typed with just a count of their arguments. +Since all values are double precision floating point, this fact doesn't need to +be captured anywhere. In a more aggressive and realistic language, the +"ExprAST" class would probably have a type field.
+ +With this scaffolding, we can now talk about parsing expressions and function +bodies in Kaleidoscope.
+ ++ ++ + + + + +Now that we have an AST to build, we need to define the parser code to build +it. The idea here is that we want to parse something like "x+y" (which is +returned as three tokens by the lexer) into an AST that could be generated with +calls like this:
+ +++ ++ ExprAST *X = new VariableExprAST("x"); + ExprAST *Y = new VariableExprAST("y"); + ExprAST *Result = new BinaryExprAST('+', X, Y); ++In order to do this, we'll start by defining some basic helper routines:
+ +++ ++/// CurTok/getNextToken - Provide a simple token buffer. CurTok is the current +/// token the parser it looking at. getNextToken reads another token from the +/// lexer and updates CurTok with its results. +static int CurTok; +static int getNextToken() { + return CurTok = gettok(); +} +++This implements a simple token buffer around the lexer. This allows +us to look one token ahead at what the lexer is returning. Every function in +our lexer will assume that CurTok is the current token that needs to be +parsed.
+ +Again, we define +these with global variables: it would be better design to wrap the entire parser +in a class and use instance variables for these. +
+ +++ ++ +/// Error* - These are little helper functions for error handling. +ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;} +PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; } +FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; } +++The Error routines are simple helper routines that our parser will use +to handle errors. The error recovery in our parser will not be the best and +are not particular user-friendly, but it will be enough for our tutorial. These +routines make it easier to handle errors in routines that have various return +types: they always return null.
+ +With these basic helper functions implemented, we can implement the first +piece of our grammar: we'll start with numeric literals.
+ ++ ++ + + + + +We start with numeric literals, because they are the simplest to process. +For each production in our grammar, we'll define a function which parses that +production. For numeric literals, we have: +
+ +++ ++/// numberexpr ::= number +static ExprAST *ParseNumberExpr() { + ExprAST *Result = new NumberExprAST(NumVal); + getNextToken(); // consume the number + return Result; +} ++This routine is very simple: it expects to be called when the current token +is a tok_number token. It takes the current number value, creates +a NumberExprAST node, advances the lexer to the next token, then +returns.
+ +There are some interesting aspects of this. The most important one is that +this routine eats all of the tokens that correspond to the production, and +returns the lexer buffer with the next token (which is not part of the grammar +production) ready to go. This is a fairly standard way to go for recursive +descent parsers. For a better example, the parenthesis operator is defined like +this:
+ +++ ++/// parenexpr ::= '(' expression ')' +static ExprAST *ParseParenExpr() { + getNextToken(); // eat (. + ExprAST *V = ParseExpression(); + if (!V) return 0; + + if (CurTok != ')') + return Error("expected ')'"); + getNextToken(); // eat ). + return V; +} ++This function illustrates a number of interesting things about the parser: +1) it shows how we use the Error routines. When called, this function expects +that the current token is a '(' token, but after parsing the subexpression, it +is possible that there is not a ')' waiting. For example, if the user types in +"(4 x" instead of "(4)", the parser should emit an error. Because errors can +occur, the parser needs a way to indicate that they happened: in our parser, we +return null on an error.
+ +Another interesting aspect of this function is that it uses recursion by +calling ParseExpression (we will soon see that ParseExpression can call +ParseParenExpr). This is powerful because it allows us to handle +recursive grammars, and keeps each production very simple. Note that +parenthesis do not cause construction of AST nodes themselves. While we could +do this, the most important role of parens are to guide the parser and provide +grouping. Once the parser constructs the AST, parens are not needed.
+ +The next simple production is for handling variable references and function +calls:
+ +++ ++/// identifierexpr +/// ::= identifer +/// ::= identifer '(' expression* ')' +static ExprAST *ParseIdentifierExpr() { + std::string IdName = IdentifierStr; + + getNextToken(); // eat identifer. + + if (CurTok != '(') // Simple variable ref. + return new VariableExprAST(IdName); + + // Call. + getNextToken(); // eat ( + std::vector<ExprAST*> Args; + while (1) { + ExprAST *Arg = ParseExpression(); + if (!Arg) return 0; + Args.push_back(Arg); + + if (CurTok == ')') break; + + if (CurTok != ',') + return Error("Expected ')'"); + getNextToken(); + } + + // Eat the ')'. + getNextToken(); + + return new CallExprAST(IdName, Args); +} ++This routine follows the same style as the other routines (it expects to be +called if the current token is a tok_identifier token). It also has +recursion and error handling. One interesting aspect of this is that it uses +look-ahead to determine if the current identifier is a stand alone +variable reference or if it is a function call expression. It handles this by +checking to see if the token after the identifier is a '(' token, and constructs +either a VariableExprAST or CallExprAST node as appropriate. +
+ +Now that we have all of our simple expression parsing logic in place, we can +define a helper function to wrap them up in a class. We call this class of +expressions "primary" expressions, for reasons that will become more clear +later. In order to parse a primary expression, we need to determine what sort +of expression it is:
+ +++ ++/// primary +/// ::= identifierexpr +/// ::= numberexpr +/// ::= parenexpr +static ExprAST *ParsePrimary() { + switch (CurTok) { + default: return Error("unknown token when expecting an expression"); + case tok_identifier: return ParseIdentifierExpr(); + case tok_number: return ParseNumberExpr(); + case '(': return ParseParenExpr(); + } +} ++Now that you see the definition of this function, it makes it more obvious +why we can assume the state of CurTok in the various functions. This uses +look-ahead to determine which sort of expression is being inspected, and parses +it with a function call.
+ +Now that basic expressions are handled, we need to handle binary expressions, +which are a bit more complex.
+ ++ ++ + + + + +Binary expressions are significantly harder to parse because they are often +ambiguous. For example, when given the string "x+y*z", the parser can choose +to parse it as either "(x+y)*z" or "x+(y*z)". With common definitions from +mathematics, we expect the later parse, because "*" (multiplication) has +higher precedence than "+" (addition).
+ +There are many ways to handle this, but an elegant and efficient way is to +use Operator-Precedence +Parsing. This parsing technique uses the precedence of binary operators to +guide recursion. To start with, we need a table of precedences:
+ +++ ++/// BinopPrecedence - This holds the precedence for each binary operator that is +/// defined. +static std::map<char, int> BinopPrecedence; + +/// GetTokPrecedence - Get the precedence of the pending binary operator token. +static int GetTokPrecedence() { + if (!isascii(CurTok)) + return -1; + + // Make sure it's a declared binop. + int TokPrec = BinopPrecedence[CurTok]; + if (TokPrec <= 0) return -1; + return TokPrec; +} + +int main() { + // Install standard binary operators. + // 1 is lowest precedence. + BinopPrecedence['<'] = 10; + BinopPrecedence['+'] = 20; + BinopPrecedence['-'] = 20; + BinopPrecedence['*'] = 40; // highest. + ... +} ++For the basic form of Kaleidoscope, we will only support 4 binary operators +(this can obviously be extended by you, the reader). The +GetTokPrecedence function returns the precedence for the current token, +or -1 if the token is not a binary operator. Having a map makes it easy to add +new operators and makes it clear that the algorithm doesn't depend on the +specific operators involved, but it would be easy enough to eliminate the map +and do the comparisons in the GetTokPrecedence function.
+ +With the helper above defined, we can now start parsing binary expressions. +The basic idea of operator precedence parsing is to break down an expression +with potentially ambiguous binary operators into pieces. Consider for example +the expression "a+b+(c+d)*e*f+g". Operator precedence parsing considers this +as a stream of primary expressions separated by binary operators. As such, +it will first parse the leading primary expression "a", then it will see the +pairs [+, b] [+, (c+d)] [*, e] [*, f] and [+, g]. Note that because parentheses +are primary expressions that the binary expression parser doesn't need to worry +about nested subexpressions like (c+d) at all. +
+ ++To start, an expression is a primary expression potentially followed by a +sequence of [binop,primaryexpr] pairs:
+ +++ ++/// expression +/// ::= primary binoprhs +/// +static ExprAST *ParseExpression() { + ExprAST *LHS = ParsePrimary(); + if (!LHS) return 0; + + return ParseBinOpRHS(0, LHS); +} ++ParseBinOpRHS is the function that parses the sequence of pairs for +us. It takes a precedence and a pointer to an expression for the part parsed +so far. Note that "x" is a perfectly valid expression: As such, "binoprhs" is +allowed to be empty, in which case it returns the expression that is passed into +it. In our example above, the code passes the expression for "a" into +ParseBinOpRHS and the current token is "+".
+ +The precedence value passed into ParseBinOpRHS indicates the +minimal operator precedence that the function is allowed to eat. For +example, if the current pair stream is [+, x] and ParseBinOpRHS is +passed in a precedence of 40, it will not consume any tokens (because the +precedence of '+' is only 20). With this in mind, ParseBinOpRHS starts +with:
+ +++ ++/// binoprhs +/// ::= ('+' primary)* +static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) { + // If this is a binop, find its precedence. + while (1) { + int TokPrec = GetTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (TokPrec < ExprPrec) + return LHS; ++This code gets the precedence of the current token and checks to see if if is +too low. Because we defined invalid tokens to have a precedence of -1, this +check implicitly knows that the pair-stream ends when the token stream runs out +of binary operators. If this check succeeds, we know that the token is a binary +operator and that it will be included in this expression:
+ +++ ++ // Okay, we know this is a binop. + int BinOp = CurTok; + getNextToken(); // eat binop + + // Parse the primary expression after the binary operator. + ExprAST *RHS = ParsePrimary(); + if (!RHS) return 0; ++As such, this code eats (and remembers) the binary operator and then parses +the following primary expression. This builds up the whole pair, the first of +which is [+, b] for the running example.
+ +Now that we parsed the left-hand side of an expression and one pair of the +RHS sequence, we have to decide which way the expression associates. In +particular, we could have "(a+b) binop unparsed" or "a + (b binop unparsed)". +To determine this, we look ahead at "binop" to determine its precedence and +compare it to BinOp's precedence (which is '+' in this case):
+ +++ ++ // If BinOp binds less tightly with RHS than the operator after RHS, let + // the pending operator take RHS as its LHS. + int NextPrec = GetTokPrecedence(); + if (TokPrec < NextPrec) { ++If the precedence of the binop to the right of "RHS" is lower or equal to the +precedence of our current operator, then we know that the parentheses associate +as "(a+b) binop ...". In our example, since the next operator is "+" and so is +our current one, we know that they have the same precedence. In this case we'll +create the AST node for "a+b", and then continue parsing:
+ +++ ++ ... if body omitted ... + } + + // Merge LHS/RHS. + LHS = new BinaryExprAST(BinOp, LHS, RHS); + } // loop around to the top of the while loop. +} ++In our example above, this will turn "a+b+" into "(a+b)" and execute the next +iteration of the loop, with "+" as the current token. The code above will eat +and remember it and parse "(c+d)" as the primary expression, which makes the +current pair be [+, (c+d)]. It will then enter the 'if' above with "*" as the +binop to the right of the primary. In this case, the precedence of "*" is +higher than the precedence of "+" so the if condition will be entered.
+ +The critical question left here is "how can the if condition parse the right +hand side in full"? In particular, to build the AST correctly for our example, +it needs to get all of "(c+d)*e*f" as the RHS expression variable. The code to +do this is surprisingly simple (code from the above two blocks duplicated for +context):
+ +++ ++ // If BinOp binds less tightly with RHS than the operator after RHS, let + // the pending operator take RHS as its LHS. + int NextPrec = GetTokPrecedence(); + if (TokPrec < NextPrec) { + RHS = ParseBinOpRHS(TokPrec+1, RHS); + if (RHS == 0) return 0; + } + // Merge LHS/RHS. + LHS = new BinaryExprAST(BinOp, LHS, RHS); + } // loop around to the top of the while loop. +} ++At this point, we know that the binary operator to the RHS of our primary +has higher precedence than the binop we are currently parsing. As such, we know +that any sequence of pairs whose operators are all higher precedence than "+" +should be parsed together and returned as "RHS". To do this, we recursively +invoke the ParseBinOpRHS function specifying "TokPrec+1" as the minimum +precedence required for it to continue. In our example above, this will cause +it to return the AST node for "(c+d)*e*f" as RHS, which is then set as the RHS +of the '+' expression.
+ +Finally, on the next iteration of the while loop, the "+g" piece is parsed. +and added to the AST. With this little bit of code (14 non-trivial lines), we +correctly handle fully general binary expression parsing in a very elegant way. +
+ +This wraps up handling of expressions. At this point, we can point the +parser at an arbitrary token stream and build an expression from them, stopping +at the first token that is not part of the expression. Next up we need to +handle function definitions etc.
+ ++ ++ + + + + ++The first basic thing missing is that of function prototypes. In Kaleidoscope, +these are used both for 'extern' function declarations as well as function body +definitions. The code to do this is straight-forward and not very interesting +(once you've survived expressions): +
+ +++ ++/// prototype +/// ::= id '(' id* ')' +static PrototypeAST *ParsePrototype() { + if (CurTok != tok_identifier) + return ErrorP("Expected function name in prototype"); + + std::string FnName = IdentifierStr; + getNextToken(); + + if (CurTok != '(') + return ErrorP("Expected '(' in prototype"); + + std::vector<std::string> ArgNames; + while (getNextToken() == tok_identifier) + ArgNames.push_back(IdentifierStr); + if (CurTok != ')') + return ErrorP("Expected ')' in prototype"); + + // success. + getNextToken(); // eat ')'. + + return new PrototypeAST(FnName, ArgNames); +} ++Given this, a function definition is very simple, just a prototype plus +and expression to implement the body:
+ +++ ++/// definition ::= 'def' prototype expression +static FunctionAST *ParseDefinition() { + getNextToken(); // eat def. + PrototypeAST *Proto = ParsePrototype(); + if (Proto == 0) return 0; + + if (ExprAST *E = ParseExpression()) + return new FunctionAST(Proto, E); + return 0; +} ++In addition, we support 'extern' to declare functions like 'sin' and 'cos' as +well as to support forward declaration of user functions. 'externs' are just +prototypes with no body:
+ +++ ++/// external ::= 'extern' prototype +static PrototypeAST *ParseExtern() { + getNextToken(); // eat extern. + return ParsePrototype(); +} ++Finally, we'll also let the user type in arbitrary top-level expressions and +evaluate them on the fly. We will handle this by defining anonymous nullary +(zero argument) functions for them:
+ +++ ++/// toplevelexpr ::= expression +static FunctionAST *ParseTopLevelExpr() { + if (ExprAST *E = ParseExpression()) { + // Make an anonymous proto. + PrototypeAST *Proto = new PrototypeAST("", std::vector<std::string>()); + return new FunctionAST(Proto, E); + } + return 0; +} ++Now that we have all the pieces, lets build a little driver that will let us +actually execute this code we've built!
+ ++ ++ + + + + +The driver for this simply invokes all of the parsing pieces with a top-level +dispatch loop. There isn't much interesting here, so I'll just include the +top-level loop. See below for full code in the "Top-Level +Parsing" section.
+ +++ ++/// top ::= definition | external | expression | ';' +static void MainLoop() { + while (1) { + fprintf(stderr, "ready> "); + switch (CurTok) { + case tok_eof: return; + case ';': getNextToken(); break; // ignore top level semicolons. + case tok_def: HandleDefinition(); break; + case tok_extern: HandleExtern(); break; + default: HandleTopLevelExpression(); break; + } + } +} ++The most interesting part of this is that we ignore top-level semi colons. +Why is this do you ask? The basic reason is that if you type "4 + 5" at the +command line, the parser doesn't know that that is the end of what you will +type. For example, on the next line you could type "def foo..." in which case +4+5 is the end of a top-level expression. Alternatively you could type "* 6", +which would continue the expression. Having top-level semicolons allows you to +type "4+5;" and the parser will know you are done.
+ ++ ++ + +With just under 400 lines of commented code, we fully defined our minimal +language, including a lexer, parser and AST builder. With this done, the +executable will validate code and tell us if it is gramatically invalid. For +example, here is a sample interaction:
+ +++ ++$ ./a.out +ready> def foo(x y) x+foo(y, 4.0); +ready> Parsed an function definition. +ready> def foo(x y) x+y y; +ready> Parsed an function definition. +ready> Parsed a top-level expr +ready> def foo(x y) x+y ); +ready> Parsed an function definition. +ready> Error: unknown token when expecting an expression +ready> extern sin(a); +ready> Parsed an extern +ready> ^D +$ +++Here is the full code. Note that it is fully self-contained: you don't even +need LLVM for this. In the next installment, we +will describe how to generate LLVM IR from the AST.
+ ++++// To build this: +// g++ -g toy.cpp +// ./a.out + +#include <cstdio> +#include <string> +#include < +#include <vector> + +//===----------------------------------------------------------------------===// +// Lexer +//===----------------------------------------------------------------------===// + +// The lexer returns tokens [0-255] if it is an unknown character, otherwise one +// of these for known things. +enum Token { + tok_eof = -1, + + // commands + tok_def = -2, tok_extern = -3, + + // primary + tok_identifier = -4, tok_number = -5, +}; + +static std::string IdentifierStr; // Filled in if tok_identifier +static double NumVal; // Filled in if tok_number + +/// gettok - Return the next token from standard input. +static int gettok() { + static int LastChar = ' '; + + // Skip any whitespace. + while (isspace(LastChar)) + LastChar = getchar(); + + if (isalpha(LastChar)) { // identifier: [a-zA-Z][a-zA-Z0-9]* + IdentifierStr = LastChar; + while (isalnum((LastChar = getchar()))) + IdentifierStr += LastChar; + + if (IdentifierStr == "def") return tok_def; + if (IdentifierStr == "extern") return tok_extern; + return tok_identifier; + } + + if (isdigit(LastChar) || LastChar == '.') { // Number: [0-9.]+ + std::string NumStr; + do { + NumStr += LastChar; + LastChar = getchar(); + } while (isdigit(LastChar) || LastChar == '.'); + + NumVal = strtod(NumStr.c_str(), 0); + return tok_number; + } + + if (LastChar == '#') { + // Comment until end of line. + do LastChar = getchar(); + while (LastChar != EOF && LastChar != '\n' & LastChar != '\r'); + + if (LastChar != EOF) + return gettok(); + } + + // Check for end of file. Don't eat the EOF. + if (LastChar == EOF) + return tok_eof; + + // Otherwise, just return the character as its ascii value. + int ThisChar = LastChar; + LastChar = getchar(); + return ThisChar; +} + +//===----------------------------------------------------------------------===// +// Abstract Syntax Tree (aka Parse Tree) +//===----------------------------------------------------------------------===// + +/// ExprAST - Base class for all expression nodes. +class ExprAST { +public: + virtual ~ExprAST() {} +}; + +/// NumberExprAST - Expression class for numeric literals like "1.0". +class NumberExprAST : public ExprAST { + double Val; +public: + NumberExprAST(double val) : Val(val) {} +}; + +/// VariableExprAST - Expression class for referencing a variable, like "a". +class VariableExprAST : public ExprAST { + std::string Name; +public: + VariableExprAST(const std::string &name) : Name(name) {} +}; + +/// BinaryExprAST - Expression class for a binary operator. +class BinaryExprAST : public ExprAST { + char Op; + ExprAST *LHS, *RHS; +public: + BinaryExprAST(char op, ExprAST *lhs, ExprAST *rhs) + : Op(op), LHS(lhs), RHS(rhs) {} +}; + +/// CallExprAST - Expression class for function calls. +class CallExprAST : public ExprAST { + std::string Callee; + std::vector<ExprAST*> Args; +public: + CallExprAST(const std::string &callee, std::vector<ExprAST*> &args) + : Callee(callee), Args(args) {} +}; + +/// PrototypeAST - This class represents the "prototype" for a function, +/// which captures its argument names as well as if it is an operator. +class PrototypeAST { + std::string Name; + std::vector< Args; +public: + PrototypeAST(const std::string &name, const std::vector<std::string> &args) + : Name(name), Args(args) {} + +}; + +/// FunctionAST - This class represents a function definition itself. +class FunctionAST { + PrototypeAST *Proto; + ExprAST *Body; +public: + FunctionAST(PrototypeAST *proto, ExprAST *body) + : Proto(proto), Body(body) {} + +}; + +//===----------------------------------------------------------------------===// +// Parser +//===----------------------------------------------------------------------===// + +/// CurTok/getNextToken - Provide a simple token buffer. CurTok is the current +/// token the parser it looking at. getNextToken reads another token from the +/// lexer and updates CurTok with its results. +static int CurTok; +static int getNextToken() { + return CurTok = gettok(); +} + +/// BinopPrecedence - This holds the precedence for each binary operator that is +/// defined. +static std::map<char, int> BinopPrecedence; + +/// GetTokPrecedence - Get the precedence of the pending binary operator token. +static int GetTokPrecedence() { + if (!isascii(CurTok)) + return -1; + + // Make sure it's a declared binop. + int TokPrec = BinopPrecedence[CurTok]; + if (TokPrec <= 0) return -1; + return TokPrec; +} + +/// Error* - These are little helper functions for error handling. +ExprAST *Error(const char *Str) { fprintf(stderr, "Error: %s\n", Str);return 0;} +PrototypeAST *ErrorP(const char *Str) { Error(Str); return 0; } +FunctionAST *ErrorF(const char *Str) { Error(Str); return 0; } + +static ExprAST *ParseExpression(); + +/// identifierexpr +/// ::= identifer +/// ::= identifer '(' expression* ')' +static ExprAST *ParseIdentifierExpr() { + std::string IdName = IdentifierStr; + + getNextToken(); // eat identifer. + + if (CurTok != '(') // Simple variable ref. + return new VariableExprAST(IdName); + + // Call. + getNextToken(); // eat ( + std::vector<ExprAST*> Args; + while (1) { + ExprAST *Arg = ParseExpression(); + if (!Arg) return 0; + Args.push_back(Arg); + + if (CurTok == ')') break; + + if (CurTok != ',') + return Error("Expected ')'"); + getNextToken(); + } + + // Eat the ')'. + getNextToken(); + + return new CallExprAST(IdName, Args); +} + +/// numberexpr ::= number +static ExprAST *ParseNumberExpr() { + ExprAST *Result = new NumberExprAST(NumVal); + getNextToken(); // consume the number + return Result; +} + +/// parenexpr ::= '(' expression ')' +static ExprAST *ParseParenExpr() { + getNextToken(); // eat (. + ExprAST *V = ParseExpression(); + if (!V) return 0; + + if (CurTok != ')') + return Error("expected ')'"); + getNextToken(); // eat ). + return V; +} + +/// primary +/// ::= identifierexpr +/// ::= numberexpr +/// ::= parenexpr +static ExprAST *ParsePrimary() { + switch (CurTok) { + default: return Error("unknown token when expecting an expression"); + case tok_identifier: return ParseIdentifierExpr(); + case tok_number: return ParseNumberExpr(); + case '(': return ParseParenExpr(); + } +} + +/// binoprhs +/// ::= ('+' primary)* +static ExprAST *ParseBinOpRHS(int ExprPrec, ExprAST *LHS) { + // If this is a binop, find its precedence. + while (1) { + int TokPrec = GetTokPrecedence(); + + // If this is a binop that binds at least as tightly as the current binop, + // consume it, otherwise we are done. + if (TokPrec < ExprPrec) + return LHS; + + // Okay, we know this is a binop. + int BinOp = CurTok; + getNextToken(); // eat binop + + // Parse the primary expression after the binary operator. + ExprAST *RHS = ParsePrimary(); + if (!RHS) return 0; + + // If BinOp binds less tightly with RHS than the operator after RHS, let + // the pending operator take RHS as its LHS. + int NextPrec = GetTokPrecedence(); + if (TokPrec < NextPrec) { + RHS = ParseBinOpRHS(TokPrec+1, RHS); + if (RHS == 0) return 0; + } + + // Merge LHS/RHS. + LHS = new BinaryExprAST(BinOp, LHS, RHS); + } +} + +/// expression +/// ::= primary binoprhs +/// +static ExprAST *ParseExpression() { + ExprAST *LHS = ParsePrimary(); + if (!LHS) return 0; + + return ParseBinOpRHS(0, LHS); +} + +/// prototype +/// ::= id '(' id* ')' +static PrototypeAST *ParsePrototype() { + if (CurTok != tok_identifier) + return ErrorP("Expected function name in prototype"); + + std::string FnName = IdentifierStr; + getNextToken(); + + if (CurTok != '(') + return ErrorP("Expected '(' in prototype"); + + std::vector<std::string> ArgNames; + while (getNextToken() == tok_identifier) + ArgNames.push_back(IdentifierStr); + if (CurTok != ')') + return ErrorP("Expected ')' in prototype"); + + // success. + getNextToken(); // eat ')'. + + return new PrototypeAST(FnName, ArgNames); +} + +/// definition ::= 'def' prototype expression +static FunctionAST *ParseDefinition() { + getNextToken(); // eat def. + PrototypeAST *Proto = ParsePrototype(); + if (Proto == 0) return 0; + + if (ExprAST *E = ParseExpression()) + return new FunctionAST(Proto, E); + return 0; +} + +/// toplevelexpr ::= expression +static FunctionAST *ParseTopLevelExpr() { + if (ExprAST *E = ParseExpression()) { + // Make an anonymous proto. + PrototypeAST *Proto = new PrototypeAST("", std::vector<()); + return new FunctionAST(Proto, E); + } + return 0; +} + +/// external ::= 'extern' prototype +static PrototypeAST *ParseExtern() { + getNextToken(); // eat extern. + return ParsePrototype(); +} + +//===----------------------------------------------------------------------===// +// Top-Level parsing +//===----------------------------------------------------------------------===// + +static void HandleDefinition() { + if (FunctionAST *F = ParseDefinition()) { + fprintf(stderr, "Parsed a function definition.\n"); + } else { + // Skip token for error recovery. + getNextToken(); + } +} + +static void HandleExtern() { + if (PrototypeAST *P = ParseExtern()) { + fprintf(stderr, "Parsed an extern\n"); + } else { + // Skip token for error recovery. + getNextToken(); + } +} + +static void HandleTopLevelExpression() { + // Evaluate a top level expression into an anonymous function. + if (FunctionAST *F = ParseTopLevelExpr()) { + fprintf(stderr, "Parsed a top-level expr\n"); + } else { + // Skip token for error recovery. + getNextToken(); + } +} + +/// top ::= definition | external | expression | ';' +static void MainLoop() { + while (1) { + fprintf(stderr, "ready> "); + switch (CurTok) { + case tok_eof: return; + case ';': getNextToken(); break; // ignore top level semicolons. + case tok_def: HandleDefinition(); break; + case tok_extern: HandleExtern(); break; + default: HandleTopLevelExpression(); break; + } + } +} + +//===----------------------------------------------------------------------===// +// Main driver code. +//===----------------------------------------------------------------------===// + +int main() { + // Install standard binary operators. + // 1 is lowest precedence. + BinopPrecedence['<'] = 10; + BinopPrecedence['+'] = 20; + BinopPrecedence['-'] = 20; + BinopPrecedence['*'] = 40; // highest. + + // Prime the first token. + fprintf(stderr, "ready> "); + getNextToken(); + + MainLoop(); + return 0; +} ++
+ + + + + Chris Lattner
+ The LLVM Compiler Infrastructure
+ Last modified: $Date: 2007-10-17 11:05:13 -0700 (Wed, 17 Oct 2007) $ + + + diff --git a/docs/tutorial/index.html b/docs/tutorial/index.html index acaee03367b..6991adfe6f3 100644 --- a/docs/tutorial/index.html +++ b/docs/tutorial/index.html @@ -28,7 +28,7 @@