Vesta: /vesta/vestasys.org/vesta/eval/80/src/Lex.C Source File

00001 // Copyright (C) 2001, Compaq Computer Corporation
00002 // 
00003 // This file is part of Vesta.
00004 // 
00005 // Vesta is free software; you can redistribute it and/or
00006 // modify it under the terms of the GNU Lesser General Public
00007 // License as published by the Free Software Foundation; either
00008 // version 2.1 of the License, or (at your option) any later version.
00009 // 
00010 // Vesta is distributed in the hope that it will be useful,
00011 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00012 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00013 // Lesser General Public License for more details.
00014 // 
00015 // You should have received a copy of the GNU Lesser General Public
00016 // License along with Vesta; if not, write to the Free Software
00017 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00018 
00019 /* File: Lex.C                                                 */
00020 /* Last modified on Mon May 16 13:57:01 EDT 2005 by ken@xorian.net         */
00021 /*      modified on Wed Apr 27 10:01:05 EDT 2005 by irina.furman@intel.com */
00022 /*      modified on Mon Aug 12 10:18:53 EDT 2002 by kcschalk@shr.intel.com */
00023 /*      modified on Wed Apr 12 18:42:44 PDT 2000 by yuanyu     */
00024 /*      modified on Sat Feb 12 17:53:22 PST 2000 by mann       */
00025 /*      modified on Fri Oct 31 10:11:35 PST 1997 by heydon     */
00026 /*      modified on Fri Jan 26 12:43:17 PST 1996 by horning    */
00027 /*      modified on Wed Apr 27 13:53:14 PDT 1994 by hanna      */
00028 
00029 #include "ModelState.H"
00030 #include "Lex.H"
00031 #include "Location.H"
00032 #include "Expr.H"
00033 #include "Val.H"
00034 #include "Err.H"
00035 #include <Table.H>
00036 #include <iostream>
00037 
00038 using std::istream;
00039 
00040 // Extern global variables:
00041 SrcLoc *noLoc = NEW(SrcLoc);
00042 
00043 // Lex global variables:
00044 istream *lexIn;
00045 char lookAhead1, lookAhead2;
00046 int lookaheads;
00047 int lineNumber, charNumber;
00048 
00049 static TokenClass CharMap[300];
00050 static Table<Text,TokenClass>::Default ReservedWords(64);
00051 static SrcLoc *currentLoc = NEW(SrcLoc);
00052 
00053 Token::Token()
00054 : bytesLength(128), length(0), loc(noLoc) {
00055   bytes = NEW_PTRFREE_ARRAY(char, 128);
00056   bytes[length] = 0;
00057 }
00058 
00059 void Token::AppendChar(char c) {
00060   bytes[length++] = c;
00061   if (length == bytesLength) {
00062     bytesLength = bytesLength * 2;
00063     char *newBytes = NEW_PTRFREE_ARRAY(char, bytesLength);
00064     memcpy(newBytes, bytes, length);
00065     delete[] bytes;
00066     bytes = newBytes;
00067   }
00068 }
00069 
00070 void Token::TokenAssign(Token& tk) {
00071   this->tclass = tk.tclass;
00072   this->expr = tk.expr;
00073   this->loc = tk.loc;
00074   char* tempBytes = this->bytes;
00075   int tempLength = this->length, tempBytesLength = this->bytesLength;
00076   this->bytes = tk.bytes;
00077   this->length = tk.length;
00078   this->bytesLength = tk.bytesLength;
00079   tk.bytes = tempBytes;
00080   tk.length = this->length;
00081   tk.bytesLength = tempBytesLength;
00082 }
00083 
00084 char GetChar() {
00085   char res;
00086   switch (lookaheads) {
00087   case 0:
00088     res = (char)lexIn->get(); 
00089     if (res == '\n') {
00090       lineNumber++;
00091       charNumber = 0;
00092     }
00093     else
00094       charNumber++;
00095     break;
00096   case 1:
00097     res = lookAhead1; 
00098     lookaheads--;
00099     break;
00100   case 2:
00101     res = lookAhead1; 
00102     lookAhead1 = lookAhead2; 
00103     lookaheads--;
00104     break;
00105   default:
00106     outputMu.lock();    
00107     InternalError("GetChar");
00108     outputMu.unlock();    
00109   }
00110   return res;
00111 }
00112     
00113 inline static void UngetChar(char c) {
00114   switch (lookaheads) {
00115   case 0:
00116     lookAhead1 = c; 
00117     lookaheads++;
00118     break;
00119   case 1:
00120     lookAhead2 = lookAhead1; 
00121     lookAhead1 = c; 
00122     lookaheads++;
00123     break;
00124   default:
00125     outputMu.lock();
00126     InternalError("UngetChar");
00127     outputMu.unlock();    
00128   }
00129 }
00130 
00131 static char SkipWhitespace() {
00132   while (true) {
00133     char c = GetChar();
00134     switch (c) {
00135     case ' ':  case '\t': case '\f': case '\n': case '\r':
00136       break;
00137     default:
00138       return c;
00139     }
00140   }
00141 }
00142 
00143 // Nested Slash-Star comments are not permitted.
00144 void Token::ScanComment(char c) {
00145   bool done = false;
00146 
00147   // assert((c == '*') || (c == '/'))
00148   if (c == '*') {
00149     while (!done) {
00150       c = GetChar();
00151       switch (c) {
00152         case '/':
00153           c = GetChar();
00154           if (c == '*') {
00155             SrcLoc loc(lineNumber, charNumber, currentLoc->file, currentLoc->shortId);
00156             outputMu.lock();
00157             Error("Nested /* comment.\n", &loc);
00158             outputMu.unlock();      
00159             throw "\nParsing terminated.\n";
00160           }
00161           else
00162             UngetChar(c);
00163           break;
00164         case '*':
00165           c = GetChar();
00166           if (c == '/')
00167             done = true;
00168           else
00169             UngetChar(c);
00170           break;
00171         case ((char) EOF):
00172           {
00173             SrcLoc loc(lineNumber, charNumber, currentLoc->file, currentLoc->shortId);
00174             outputMu.lock();        
00175             Error("Unterminated comment. EOF in ScanComment.\n", &loc);
00176             outputMu.unlock();      
00177             throw "\nParsing terminated.\n";
00178           }
00179         default:
00180           break;
00181         }
00182     }
00183   }
00184   else {
00185     while (c != '\n' && c != ((char) EOF))
00186       c = GetChar();
00187   }
00188 }
00189 
00190 // Nested pragmas are not permitted.
00191 bool Token::ScanPragma() {
00192   bool done = false;
00193   char c = SkipWhitespace();
00194 
00195   while (!done) {
00196     switch (c) {
00197     case '*':
00198       {
00199         char c1 = GetChar();
00200         char c2 = GetChar();
00201         // Actual pragma
00202         if ((c1 == '*') && (c2 == '/'))
00203           done = true;
00204         // Comment that starts with "/**"
00205         else if(c1 == '/')
00206           {
00207             // Discard any colleced bytes
00208             StartBytes();
00209             // Put the character following the comment back.
00210             UngetChar(c2);
00211             // Indicate that this wasn't a pragma.
00212             return false;
00213           }
00214         else {
00215           AppendChar(c);
00216           c = c1;
00217           UngetChar(c2);
00218         }
00219         break;
00220       }
00221     case ((char) EOF):
00222       outputMu.lock();      
00223       Error("Unterminated pragma.\n", currentLoc);
00224       outputMu.unlock();      
00225       throw "\nParsing terminated.\n";
00226     default:
00227       AppendChar(c);
00228       c = GetChar();
00229       break;
00230     }
00231   }
00232   // Remove white space at the tail.
00233   int index = length - 1;
00234   while (index >= 0) {
00235     switch (bytes[index]) {
00236     case ' ':  case '\t': case '\f': case '\n':
00237       index--;
00238       break;
00239     default:
00240       length = index + 1;
00241       index = -1;
00242       break;
00243     }
00244   }
00245   EndBytes();
00246   this->tclass = TkPragma;
00247   this->loc = currentLoc->Copy();
00248 
00249   // Indicate that this was a pragma.
00250   return true;
00251 }
00252 
00253 void Token::ScanIdNumber(char c) {
00254   AppendChar(c);
00255   while (true) {
00256     c = GetChar();
00257     if ((c >= 'a') && (c <= 'z') ||
00258         (c >= 'A') && (c <= 'Z') ||
00259         (c >= '0') && (c <= '9') ||
00260         (c == '_') ||
00261         (c == '.'))
00262       AppendChar(c);
00263     else {
00264       UngetChar(c);
00265       break;
00266     }
00267   }
00268   EndBytes();
00269   this->loc = currentLoc->Copy();
00270   Text id(Bytes());
00271   if (!ReservedWords.Get(id, tclass)) {
00272     // if it is not reserved, it is either an identifier or a number.
00273     Basics::int32 n = 0;
00274     unsigned int idx = 0, base = 10;
00275     if (id[0] == '0') {
00276       if (id[1] == 'x' || id[1] == 'X') {
00277         idx = 2;
00278         base = 16;
00279       } else {
00280         idx = 1;
00281         base = 8;
00282       }
00283     }
00284     bool isNumber = true;
00285     for (int i = idx; i < id.Length(); i++) {
00286       char ch = id[i];
00287       switch (ch) {
00288       case '0': case '1': case '2': case '3': case '4': case '5':
00289       case '6': case '7':
00290         n = base * n + (ch - '0');
00291         break;
00292       case '8': case '9':
00293         if (base == 8)
00294           isNumber = false;
00295         else
00296           n = base * n + (ch - '0');
00297         break;
00298       case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
00299         if (base != 16)
00300           isNumber = false;
00301         else 
00302           n = base * n + (10 + ch - 'a');
00303         break;
00304       case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
00305         if (base != 16)
00306           isNumber = false;
00307         else
00308           n = base * n + (10 + ch - 'A');
00309         break;
00310       default:
00311         isNumber = false;
00312         break;
00313       }
00314       if (n < 0) {
00315         outputMu.lock();        
00316         Error("Integer too large.\n", currentLoc);
00317         outputMu.unlock();      
00318         throw "\nParsing terminated.\n";
00319       }
00320       if (!isNumber) break;
00321     }
00322     // if it is not a number, it is treated as an identifier.
00323     if (isNumber) {
00324       tclass = TkNumber;
00325       expr = NEW_CONSTR(ConstantEC, (NEW_CONSTR(IntegerVC, (n)), this->loc));
00326     }
00327     else {
00328       tclass = TkId;
00329       expr = NEW_CONSTR(NameEC, (id, this->loc));
00330     }
00331   }
00332 }
00333 
00334 void Token::ScanText() {
00335   char c;
00336   int val, j;
00337   bool done = false;
00338 
00339   while (!done) {
00340     c = GetChar();
00341     switch (c) {
00342     case '"':
00343       done = true;
00344       break;
00345     case '\\':
00346       c = GetChar();
00347       switch (c) {
00348       case 'n':  AppendChar('\n'); break;
00349       case 't':  AppendChar('\t'); break;
00350       case 'v':  AppendChar('\v'); break;
00351       case 'b':  AppendChar('\b'); break;
00352       case 'r':  AppendChar('\r'); break;
00353       case 'f':  AppendChar('\f'); break;
00354       case 'a':  AppendChar('\a'); break;
00355       case '\\': AppendChar('\\'); break;
00356       case '"':  AppendChar('\"'); break;
00357       case 'x': case 'X':
00358         val = 0;
00359         c = GetChar();
00360         for (j = 0; j < 2; j++) {
00361           switch (c) {
00362           case '0': case '1': case '2': case '3': case '4': case '5':
00363           case '6': case '7': case '8': case '9':
00364             val = 16 * val + c - '0';                
00365             c = GetChar();
00366             break;
00367           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
00368             val = 16 * val + 10 + c - 'A';
00369             c = GetChar();
00370             break;
00371           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
00372             val = 16 * val + 10 + c - 'a';
00373             c = GetChar();
00374             break;
00375           default:
00376             if (j == 0) {
00377               UngetChar(c);
00378               outputMu.lock();        
00379               Error("Illegal escape character in text.\n", currentLoc);
00380               outputMu.unlock();              
00381               throw "\nParsing terminated.\n";
00382             }
00383             goto xend;
00384           }
00385         }
00386   xend: UngetChar(c);
00387         AppendChar((unsigned char)val);
00388         break;
00389       default:
00390         val = 0;
00391         for (j = 0; j < 3; j++) {
00392           switch (c) {
00393           case '0': case '1': case '2': case '3': case '4': case '5':
00394           case '6': case '7':
00395             val = 8 * val + c - '0';                
00396             c = GetChar();
00397             break;
00398           default:
00399             if (j == 0) {
00400               UngetChar(c);
00401               outputMu.lock();        
00402               Error("Illegal escape character in text.\n", currentLoc);
00403               outputMu.unlock();
00404               throw "\nParsing terminated.\n";
00405             }
00406             goto oend;
00407           }
00408         }
00409   oend: UngetChar(c);
00410         AppendChar((unsigned char)val);
00411         break;
00412       }
00413       break;
00414     case ((char) EOF): case '\n':  // error trap
00415       outputMu.lock();      
00416       Error("Text not terminated at end of line or file.\n", currentLoc);
00417       outputMu.unlock();      
00418       throw("\nParsing terminated.\n");
00419     default:
00420       AppendChar(c);
00421       break;
00422     }
00423   }
00424   EndBytes();
00425   this->tclass = TkString;
00426   this->loc = currentLoc->Copy();
00427   expr = NEW_CONSTR(ConstantEC, 
00428                     (NEW_CONSTR(TextVC, (Text(Bytes()))), this->loc));
00429 }
00430 
00431 void Token::Next() {
00432   while (true) {
00433     char c = SkipWhitespace();
00434     currentLoc->line = lineNumber;
00435     currentLoc->character = charNumber;
00436     StartBytes();
00437     switch (c) {
00438     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
00439     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
00440     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
00441     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
00442     case 'y': case 'z':
00443     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
00444     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
00445     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
00446     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
00447     case 'Y': case 'Z':
00448     case '.': case '_':
00449     case '0': case '1': case '2': case '3': case '4': case '5':
00450     case '6': case '7': case '8': case '9':
00451       this->ScanIdNumber(c);
00452       return;
00453     case '"':
00454       this->ScanText();
00455       return;
00456     case '/':
00457       {
00458         char c1 = GetChar();
00459         if (c1 == '/') {
00460           this->ScanComment(c);
00461           break;
00462         }
00463         else if (c1 == '*') {
00464           char c2 = GetChar();
00465           if (c2 == '*') {
00466             if(this->ScanPragma())
00467               {
00468                 return;
00469               }
00470             else
00471               {
00472                 break;
00473               }
00474           }
00475           else {
00476             UngetChar(c2);
00477             this->ScanComment(c1);
00478             break;
00479           }
00480         }
00481         tclass = TkSlash;
00482         this->AppendChar(c);
00483         UngetChar(c1);
00484         this->EndBytes();
00485         this->loc = currentLoc->Copy();
00486         return;
00487       }
00488     case '<':
00489       {
00490         this->AppendChar(c);
00491         c = GetChar();
00492         if (c == '=') {
00493           this->AppendChar(c);
00494           tclass = TkLessEq;
00495         }
00496         else {
00497           UngetChar(c);
00498           tclass = TkLess;
00499         }
00500         this->EndBytes();
00501         this->loc = currentLoc->Copy();
00502         return;
00503       }
00504     case '&': case '|': case '=': case '!': case '>': case '+':
00505       this->AppendChar(c);
00506       tclass = CharMap[c];
00507       c = GetChar();
00508       switch(c) {
00509       case '&': case '|': case '=': case '>': case '+':
00510         this->AppendChar(c);
00511         this->EndBytes();
00512         if (ReservedWords.Get(Text(bytes, (void*)1), tclass)) {
00513           this->loc = currentLoc->Copy();
00514           return;
00515         }
00516         this->UnAppendChar();
00517         // NB: No break here:
00518       default:
00519         UngetChar(c);
00520         this->EndBytes();
00521         this->loc = currentLoc->Copy();
00522         return;
00523       }
00524     case '\\': case ':': case ',': case '$': case '-': case '%': case '?':
00525     case ';': case '*':
00526     case '[': case ']': case '{': case '}': case '(': case ')':
00527       tclass = CharMap[c];
00528       this->AppendChar(c);
00529       this->EndBytes();
00530       this->loc = currentLoc->Copy();
00531       return;
00532     case ((char) EOF):
00533       tclass = TkEOF;
00534       return;
00535     default:
00536       outputMu.lock();      
00537       Error(Text("Bad character `") + c + "'.\n", currentLoc);
00538       outputMu.unlock();      
00539       throw("\nParsing terminated.\n");
00540     }
00541   }
00542 }
00543 
00544 void Token::LexFlush() {
00545   while (true) {
00546     int c = lexIn->get();
00547     if (c == EOF) break;
00548   }
00549 }
00550 
00551 void Token::Init(const Text& model, ShortId sid) {
00552   tclass = TkErr;
00553   expr   = NULL;
00554   currentLoc->Init(model, sid);
00555   this->loc = currentLoc;
00556 }
00557 
00558 /* These names should agree with the enumerated values of the "TokenClass"
00559    enumeration type. */
00560 Text TokenNames[TkIllegal+1] =
00561   { "binding", "bool", "do", "else", "ERR", "FALSE", "files",
00562     "foreach", "from", "function", "if", "in", "import", "int",
00563     "list", "return", "text", "then", "TRUE", "type", "value",
00564 
00565     "Id", "Number", "String",
00566 
00567     "And", "EqEq", "NotEq", "GreaterEq", "Implies", "LessEq", "Or",
00568     "PlusPlus",
00569 
00570     "BackSlash", "Bang", "Colon", "Comma", "Dollar", "Equal", "Greater",
00571     "Less", "Minus", "Percent", "Plus", "Query", "Semicolon", "Slash",
00572     "Star", "Underscore",
00573 
00574     "LBrace", "RBrace", "LBracket", "RBracket", "LParen", "RParen",
00575     
00576     "Pragma",
00577 
00578     "End of File", "Illegal Token"
00579  };
00580 
00581 void LexInit() {
00582   /* define the single character tokens: */
00583   CharMap['\\']= TkBackSlash;
00584   CharMap['!'] = TkBang;
00585   CharMap[':'] = TkColon;
00586   CharMap[','] = TkComma;
00587   CharMap['$'] = TkDollar;
00588   CharMap['='] = TkEqual;
00589   CharMap['>'] = TkGreater;
00590   CharMap['<'] = TkLess;
00591   CharMap['-'] = TkMinus;
00592   CharMap['%'] = TkPercent;
00593   CharMap['+'] = TkPlus;
00594   CharMap['?'] = TkQuery;
00595   CharMap[';'] = TkSemicolon;
00596   CharMap['/'] = TkSlash;
00597   CharMap['*'] = TkStar;
00598   CharMap['_'] = TkUnderscore;
00599   CharMap['{'] = TkLBrace;
00600   CharMap['}'] = TkRBrace;
00601   CharMap['['] = TkLBracket;
00602   CharMap[']'] = TkRBracket;
00603   CharMap['('] = TkLParen;
00604   CharMap[')'] = TkRParen;
00605 
00606   ReservedWords.Put("binding",  TkBinding);
00607   ReservedWords.Put("bool",     TkBool);
00608   ReservedWords.Put("do",       TkDo);
00609   ReservedWords.Put("else",     TkElse);
00610   ReservedWords.Put("ERR",      TkErr);
00611   ReservedWords.Put("FALSE",    TkFalse);
00612   ReservedWords.Put("files",    TkFiles);
00613   ReservedWords.Put("foreach",  TkForeach);
00614   ReservedWords.Put("from",     TkFrom);
00615   ReservedWords.Put("function", TkFunction);
00616   ReservedWords.Put("if",       TkIf);
00617   ReservedWords.Put("in",       TkIn);
00618   ReservedWords.Put("import",   TkImport);
00619   ReservedWords.Put("int",      TkInt);
00620   ReservedWords.Put("list",     TkList);
00621   ReservedWords.Put("return",   TkReturn);
00622   ReservedWords.Put("text",     TkText);
00623   ReservedWords.Put("then",     TkThen);
00624   ReservedWords.Put("TRUE",     TkTrue);
00625   ReservedWords.Put("type",     TkType);
00626   ReservedWords.Put("value",    TkValue);
00627   ReservedWords.Put("&&",       TkAnd);
00628   ReservedWords.Put("==",       TkEqEq);
00629   ReservedWords.Put("!=",       TkNotEq);
00630   ReservedWords.Put(">=",       TkGreaterEq);
00631   ReservedWords.Put("=>",       TkImplies);
00632   ReservedWords.Put("<=",       TkLessEq);
00633   ReservedWords.Put("||",       TkOr);
00634   ReservedWords.Put("++",       TkPlusPlus);
00635   ReservedWords.Put("