001 /* StreamTokenizer.java -- parses streams of characters into tokens 002 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003 Free Software Foundation 003 004 This file is part of GNU Classpath. 005 006 GNU Classpath is free software; you can redistribute it and/or modify 007 it under the terms of the GNU General Public License as published by 008 the Free Software Foundation; either version 2, or (at your option) 009 any later version. 010 011 GNU Classpath is distributed in the hope that it will be useful, but 012 WITHOUT ANY WARRANTY; without even the implied warranty of 013 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014 General Public License for more details. 015 016 You should have received a copy of the GNU General Public License 017 along with GNU Classpath; see the file COPYING. If not, write to the 018 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 019 02110-1301 USA. 020 021 Linking this library statically or dynamically with other modules is 022 making a combined work based on this library. Thus, the terms and 023 conditions of the GNU General Public License cover the whole 024 combination. 025 026 As a special exception, the copyright holders of this library give you 027 permission to link this library with independent modules to produce an 028 executable, regardless of the license terms of these independent 029 modules, and to copy and distribute the resulting executable under 030 terms of your choice, provided that you also meet, for each linked 031 independent module, the terms and conditions of the license of that 032 module. An independent module is a module which is not derived from 033 or based on this library. If you modify this library, you may extend 034 this exception to your version of the library, but you are not 035 obligated to do so. If you do not wish to do so, delete this 036 exception statement from your version. */ 037 038 package java.io; 039 040 /** 041 * This class parses streams of characters into tokens. There are a 042 * million-zillion flags that can be set to control the parsing, as 043 * described under the various method headings. 044 * 045 * @author Warren Levy (warrenl@cygnus.com) 046 * @date October 25, 1998. 047 */ 048 /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3 049 * "The Java Language Specification", ISBN 0-201-63451-1 050 * plus online API docs for JDK 1.2 beta from http://www.javasoft.com. 051 * Status: Believed complete and correct. 052 */ 053 054 public class StreamTokenizer 055 { 056 /** A constant indicating that the end of the stream has been read. */ 057 public static final int TT_EOF = -1; 058 059 /** A constant indicating that the end of the line has been read. */ 060 public static final int TT_EOL = '\n'; 061 062 /** A constant indicating that a number token has been read. */ 063 public static final int TT_NUMBER = -2; 064 065 /** A constant indicating that a word token has been read. */ 066 public static final int TT_WORD = -3; 067 068 /** A constant indicating that no tokens have been read yet. */ 069 private static final int TT_NONE = -4; 070 071 /** 072 * Contains the type of the token read resulting from a call to nextToken 073 * The rules are as follows: 074 * <ul> 075 * <li>For a token consisting of a single ordinary character, this is the 076 * value of that character.</li> 077 * <li>For a quoted string, this is the value of the quote character</li> 078 * <li>For a word, this is TT_WORD</li> 079 * <li>For a number, this is TT_NUMBER</li> 080 * <li>For the end of the line, this is TT_EOL</li> 081 * <li>For the end of the stream, this is TT_EOF</li> 082 * </ul> 083 */ 084 public int ttype = TT_NONE; 085 086 /** The String associated with word and string tokens. */ 087 public String sval; 088 089 /** The numeric value associated with number tokens. */ 090 public double nval; 091 092 /* Indicates whether end-of-line is recognized as a token. */ 093 private boolean eolSignificant = false; 094 095 /* Indicates whether word tokens are automatically made lower case. */ 096 private boolean lowerCase = false; 097 098 /* Indicates whether C++ style comments are recognized and skipped. */ 099 private boolean slashSlash = false; 100 101 /* Indicates whether C style comments are recognized and skipped. */ 102 private boolean slashStar = false; 103 104 /* Attribute tables of each byte from 0x00 to 0xFF. */ 105 private boolean[] whitespace = new boolean[256]; 106 private boolean[] alphabetic = new boolean[256]; 107 private boolean[] numeric = new boolean[256]; 108 private boolean[] quote = new boolean[256]; 109 private boolean[] comment = new boolean[256]; 110 111 /* The Reader associated with this class. */ 112 private PushbackReader in; 113 114 /* Indicates if a token has been pushed back. */ 115 private boolean pushedBack = false; 116 117 /* Contains the current line number of the reader. */ 118 private int lineNumber = 1; 119 120 /** 121 * This method reads bytes from an <code>InputStream</code> and tokenizes 122 * them. For details on how this method operates by default, see 123 * <code>StreamTokenizer(Reader)</code>. 124 * 125 * @param is The <code>InputStream</code> to read from 126 * 127 * @deprecated Since JDK 1.1. 128 */ 129 public StreamTokenizer(InputStream is) 130 { 131 this(new InputStreamReader(is)); 132 } 133 134 /** 135 * This method initializes a new <code>StreamTokenizer</code> to read 136 * characters from a <code>Reader</code> and parse them. The char values 137 * have their hight bits masked so that the value is treated a character 138 * in the range of 0x0000 to 0x00FF. 139 * <p> 140 * This constructor sets up the parsing table to parse the stream in the 141 * following manner: 142 * <ul> 143 * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF 144 * are initialized as alphabetic</li> 145 * <li>The values 0x00 through 0x20 are initialized as whitespace</li> 146 * <li>The values '\'' and '"' are initialized as quote characters</li> 147 * <li>'/' is a comment character</li> 148 * <li>Numbers will be parsed</li> 149 * <li>EOL is not treated as significant</li> 150 * <li>C and C++ (//) comments are not recognized</li> 151 * </ul> 152 * 153 * @param r The <code>Reader</code> to read chars from 154 */ 155 public StreamTokenizer(Reader r) 156 { 157 in = new PushbackReader(r); 158 159 whitespaceChars(0x00, 0x20); 160 wordChars('A', 'Z'); 161 wordChars('a', 'z'); 162 wordChars(0xA0, 0xFF); 163 commentChar('/'); 164 quoteChar('\''); 165 quoteChar('"'); 166 parseNumbers(); 167 } 168 169 /** 170 * This method sets the comment attribute on the specified 171 * character. Other attributes for the character are cleared. 172 * 173 * @param ch The character to set the comment attribute for, passed as an int 174 */ 175 public void commentChar(int ch) 176 { 177 if (ch >= 0 && ch <= 255) 178 { 179 comment[ch] = true; 180 whitespace[ch] = false; 181 alphabetic[ch] = false; 182 numeric[ch] = false; 183 quote[ch] = false; 184 } 185 } 186 187 /** 188 * This method sets a flag that indicates whether or not the end of line 189 * sequence terminates and is a token. The defaults to <code>false</code> 190 * 191 * @param flag <code>true</code> if EOF is significant, <code>false</code> 192 * otherwise 193 */ 194 public void eolIsSignificant(boolean flag) 195 { 196 eolSignificant = flag; 197 } 198 199 /** 200 * This method returns the current line number. Note that if the 201 * <code>pushBack()</code> method is called, it has no effect on the 202 * line number returned by this method. 203 * 204 * @return The current line number 205 */ 206 public int lineno() 207 { 208 return lineNumber; 209 } 210 211 /** 212 * This method sets a flag that indicates whether or not alphabetic 213 * tokens that are returned should be converted to lower case. 214 * 215 * @param flag <code>true</code> to convert to lower case, 216 * <code>false</code> otherwise 217 */ 218 public void lowerCaseMode(boolean flag) 219 { 220 lowerCase = flag; 221 } 222 223 private boolean isWhitespace(int ch) 224 { 225 return (ch >= 0 && ch <= 255 && whitespace[ch]); 226 } 227 228 private boolean isAlphabetic(int ch) 229 { 230 return ((ch > 255) || (ch >= 0 && alphabetic[ch])); 231 } 232 233 private boolean isNumeric(int ch) 234 { 235 return (ch >= 0 && ch <= 255 && numeric[ch]); 236 } 237 238 private boolean isQuote(int ch) 239 { 240 return (ch >= 0 && ch <= 255 && quote[ch]); 241 } 242 243 private boolean isComment(int ch) 244 { 245 return (ch >= 0 && ch <= 255 && comment[ch]); 246 } 247 248 /** 249 * This method reads the next token from the stream. It sets the 250 * <code>ttype</code> variable to the appropriate token type and 251 * returns it. It also can set <code>sval</code> or <code>nval</code> 252 * as described below. The parsing strategy is as follows: 253 * <ul> 254 * <li>Skip any whitespace characters.</li> 255 * <li>If a numeric character is encountered, attempt to parse a numeric 256 * value. Leading '-' characters indicate a numeric only if followed by 257 * another non-'-' numeric. The value of the numeric token is terminated 258 * by either the first non-numeric encountered, or the second occurrence of 259 * '-' or '.'. The token type returned is TT_NUMBER and <code>nval</code> 260 * is set to the value parsed.</li> 261 * <li>If an alphabetic character is parsed, all subsequent characters 262 * are read until the first non-alphabetic or non-numeric character is 263 * encountered. The token type returned is TT_WORD and the value parsed 264 * is stored in <code>sval</code>. If lower case mode is set, the token 265 * stored in <code>sval</code> is converted to lower case. The end of line 266 * sequence terminates a word only if EOL signficance has been turned on. 267 * The start of a comment also terminates a word. Any character with a 268 * non-alphabetic and non-numeric attribute (such as white space, a quote, 269 * or a commet) are treated as non-alphabetic and terminate the word.</li> 270 * <li>If a comment character is parsed, then all remaining characters on 271 * the current line are skipped and another token is parsed. Any EOL or 272 * EOF's encountered are not discarded, but rather terminate the comment.</li> 273 * <li>If a quote character is parsed, then all characters up to the 274 * second occurrence of the same quote character are parsed into a 275 * <code>String</code>. This <code>String</code> is stored as 276 * <code>sval</code>, but is not converted to lower case, even if lower case 277 * mode is enabled. The token type returned is the value of the quote 278 * character encountered. Any escape sequences 279 * (\b (backspace), \t (HTAB), \n (linefeed), \f (form feed), \r 280 * (carriage return), \" (double quote), \' (single quote), \\ 281 * (backslash), \XXX (octal esacpe)) are converted to the appropriate 282 * char values. Invalid esacape sequences are left in untranslated. 283 * Unicode characters like ('\ u0000') are not recognized. </li> 284 * <li>If the C++ comment sequence "//" is encountered, and the parser 285 * is configured to handle that sequence, then the remainder of the line 286 * is skipped and another token is read exactly as if a character with 287 * the comment attribute was encountered.</li> 288 * <li>If the C comment sequence "/*" is encountered, and the parser 289 * is configured to handle that sequence, then all characters up to and 290 * including the comment terminator sequence are discarded and another 291 * token is parsed.</li> 292 * <li>If all cases above are not met, then the character is an ordinary 293 * character that is parsed as a token by itself. The char encountered 294 * is returned as the token type.</li> 295 * </ul> 296 * 297 * @return The token type 298 * @exception IOException If an I/O error occurs 299 */ 300 public int nextToken() throws IOException 301 { 302 if (pushedBack) 303 { 304 pushedBack = false; 305 if (ttype != TT_NONE) 306 return ttype; 307 } 308 309 sval = null; 310 int ch; 311 312 // Skip whitespace. Deal with EOL along the way. 313 while (isWhitespace(ch = in.read())) 314 if (ch == '\n' || ch == '\r') 315 { 316 lineNumber++; 317 318 // Throw away \n if in combination with \r. 319 if (ch == '\r' && (ch = in.read()) != '\n') 320 { 321 if (ch != TT_EOF) 322 in.unread(ch); 323 } 324 if (eolSignificant) 325 return (ttype = TT_EOL); 326 } 327 328 if (ch == '/') 329 if ((ch = in.read()) == '/' && slashSlash) 330 { 331 while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 332 ; 333 334 if (ch != TT_EOF) 335 in.unread(ch); 336 return nextToken(); // Recursive, but not too deep in normal cases 337 } 338 else if (ch == '*' && slashStar) 339 { 340 while (true) 341 { 342 ch = in.read(); 343 if (ch == '*') 344 { 345 if ((ch = in.read()) == '/') 346 break; 347 else if (ch != TT_EOF) 348 in.unread(ch); 349 } 350 else if (ch == '\n' || ch == '\r') 351 { 352 lineNumber++; 353 if (ch == '\r' && (ch = in.read()) != '\n') 354 { 355 if (ch != TT_EOF) 356 in.unread(ch); 357 } 358 } 359 else if (ch == TT_EOF) 360 { 361 break; 362 } 363 } 364 return nextToken(); // Recursive, but not too deep in normal cases 365 } 366 else 367 { 368 if (ch != TT_EOF) 369 in.unread(ch); 370 ch = '/'; 371 } 372 373 if (ch == TT_EOF) 374 ttype = TT_EOF; 375 else if (isNumeric(ch)) 376 { 377 boolean isNegative = false; 378 if (ch == '-') 379 { 380 // Read ahead to see if this is an ordinary '-' rather than numeric. 381 ch = in.read(); 382 if (isNumeric(ch) && ch != '-') 383 { 384 isNegative = true; 385 } 386 else 387 { 388 if (ch != TT_EOF) 389 in.unread(ch); 390 return (ttype = '-'); 391 } 392 } 393 394 StringBuffer tokbuf = new StringBuffer(); 395 tokbuf.append((char) ch); 396 397 int decCount = 0; 398 while (isNumeric(ch = in.read()) && ch != '-') 399 if (ch == '.' && decCount++ > 0) 400 break; 401 else 402 tokbuf.append((char) ch); 403 404 if (ch != TT_EOF) 405 in.unread(ch); 406 ttype = TT_NUMBER; 407 try 408 { 409 nval = Double.valueOf(tokbuf.toString()).doubleValue(); 410 } 411 catch (NumberFormatException _) 412 { 413 nval = 0.0; 414 } 415 if (isNegative) 416 nval = -nval; 417 } 418 else if (isAlphabetic(ch)) 419 { 420 StringBuffer tokbuf = new StringBuffer(); 421 tokbuf.append((char) ch); 422 while (isAlphabetic(ch = in.read()) || isNumeric(ch)) 423 tokbuf.append((char) ch); 424 if (ch != TT_EOF) 425 in.unread(ch); 426 ttype = TT_WORD; 427 sval = tokbuf.toString(); 428 if (lowerCase) 429 sval = sval.toLowerCase(); 430 } 431 else if (isComment(ch)) 432 { 433 while ((ch = in.read()) != '\n' && ch != '\r' && ch != TT_EOF) 434 ; 435 436 if (ch != TT_EOF) 437 in.unread(ch); 438 return nextToken(); // Recursive, but not too deep in normal cases. 439 } 440 else if (isQuote(ch)) 441 { 442 ttype = ch; 443 StringBuffer tokbuf = new StringBuffer(); 444 while ((ch = in.read()) != ttype && ch != '\n' && ch != '\r' && 445 ch != TT_EOF) 446 { 447 if (ch == '\\') 448 switch (ch = in.read()) 449 { 450 case 'a': ch = 0x7; 451 break; 452 case 'b': ch = '\b'; 453 break; 454 case 'f': ch = 0xC; 455 break; 456 case 'n': ch = '\n'; 457 break; 458 case 'r': ch = '\r'; 459 break; 460 case 't': ch = '\t'; 461 break; 462 case 'v': ch = 0xB; 463 break; 464 case '\n': ch = '\n'; 465 break; 466 case '\r': ch = '\r'; 467 break; 468 case '\"': 469 case '\'': 470 case '\\': 471 break; 472 default: 473 int ch1, nextch; 474 if ((nextch = ch1 = ch) >= '0' && ch <= '7') 475 { 476 ch -= '0'; 477 if ((nextch = in.read()) >= '0' && nextch <= '7') 478 { 479 ch = ch * 8 + nextch - '0'; 480 if ((nextch = in.read()) >= '0' && nextch <= '7' && 481 ch1 >= '0' && ch1 <= '3') 482 { 483 ch = ch * 8 + nextch - '0'; 484 nextch = in.read(); 485 } 486 } 487 } 488 489 if (nextch != TT_EOF) 490 in.unread(nextch); 491 } 492 493 tokbuf.append((char) ch); 494 } 495 496 // Throw away matching quote char. 497 if (ch != ttype && ch != TT_EOF) 498 in.unread(ch); 499 500 sval = tokbuf.toString(); 501 } 502 else 503 { 504 ttype = ch; 505 } 506 507 return ttype; 508 } 509 510 private void resetChar(int ch) 511 { 512 whitespace[ch] = alphabetic[ch] = numeric[ch] = quote[ch] = comment[ch] = 513 false; 514 } 515 516 /** 517 * This method makes the specified character an ordinary character. This 518 * means that none of the attributes (whitespace, alphabetic, numeric, 519 * quote, or comment) will be set on this character. This character will 520 * parse as its own token. 521 * 522 * @param ch The character to make ordinary, passed as an int 523 */ 524 public void ordinaryChar(int ch) 525 { 526 if (ch >= 0 && ch <= 255) 527 resetChar(ch); 528 } 529 530 /** 531 * This method makes all the characters in the specified range, range 532 * terminators included, ordinary. This means the none of the attributes 533 * (whitespace, alphabetic, numeric, quote, or comment) will be set on 534 * any of the characters in the range. This makes each character in this 535 * range parse as its own token. 536 * 537 * @param low The low end of the range of values to set the whitespace 538 * attribute for 539 * @param hi The high end of the range of values to set the whitespace 540 * attribute for 541 */ 542 public void ordinaryChars(int low, int hi) 543 { 544 if (low < 0) 545 low = 0; 546 if (hi > 255) 547 hi = 255; 548 for (int i = low; i <= hi; i++) 549 resetChar(i); 550 } 551 552 /** 553 * This method sets the numeric attribute on the characters '0' - '9' and 554 * the characters '.' and '-'. 555 * When this method is used, the result of giving other attributes 556 * (whitespace, quote, or comment) to the numeric characters may 557 * vary depending on the implementation. For example, if 558 * parseNumbers() and then whitespaceChars('1', '1') are called, 559 * this implementation reads "121" as 2, while some other implementation 560 * will read it as 21. 561 */ 562 public void parseNumbers() 563 { 564 for (int i = 0; i <= 9; i++) 565 numeric['0' + i] = true; 566 567 numeric['.'] = true; 568 numeric['-'] = true; 569 } 570 571 /** 572 * Puts the current token back into the StreamTokenizer so 573 * <code>nextToken</code> will return the same value on the next call. 574 * May cause the lineno method to return an incorrect value 575 * if lineno is called before the next call to nextToken. 576 */ 577 public void pushBack() 578 { 579 pushedBack = true; 580 } 581 582 /** 583 * This method sets the quote attribute on the specified character. 584 * Other attributes for the character are cleared. 585 * 586 * @param ch The character to set the quote attribute for, passed as an int. 587 */ 588 public void quoteChar(int ch) 589 { 590 if (ch >= 0 && ch <= 255) 591 { 592 quote[ch] = true; 593 comment[ch] = false; 594 whitespace[ch] = false; 595 alphabetic[ch] = false; 596 numeric[ch] = false; 597 } 598 } 599 600 /** 601 * This method removes all attributes (whitespace, alphabetic, numeric, 602 * quote, and comment) from all characters. It is equivalent to calling 603 * <code>ordinaryChars(0x00, 0xFF)</code>. 604 * 605 * @see #ordinaryChars(int, int) 606 */ 607 public void resetSyntax() 608 { 609 ordinaryChars(0x00, 0xFF); 610 } 611 612 /** 613 * This method sets a flag that indicates whether or not "C++" language style 614 * comments ("//" comments through EOL ) are handled by the parser. 615 * If this is <code>true</code> commented out sequences are skipped and 616 * ignored by the parser. This defaults to <code>false</code>. 617 * 618 * @param flag <code>true</code> to recognized and handle "C++" style 619 * comments, <code>false</code> otherwise 620 */ 621 public void slashSlashComments(boolean flag) 622 { 623 slashSlash = flag; 624 } 625 626 /** 627 * This method sets a flag that indicates whether or not "C" language style 628 * comments (with nesting not allowed) are handled by the parser. 629 * If this is <code>true</code> commented out sequences are skipped and 630 * ignored by the parser. This defaults to <code>false</code>. 631 * 632 * @param flag <code>true</code> to recognized and handle "C" style comments, 633 * <code>false</code> otherwise 634 */ 635 public void slashStarComments(boolean flag) 636 { 637 slashStar = flag; 638 } 639 640 /** 641 * This method returns the current token value as a <code>String</code> in 642 * the form "Token[x], line n", where 'n' is the current line numbers and 643 * 'x' is determined as follows. 644 * <p> 645 * <ul> 646 * <li>If no token has been read, then 'x' is "NOTHING" and 'n' is 0</li> 647 * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li> 648 * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li> 649 * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li> 650 * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where 651 * 'strnval' is <code>String.valueOf(nval)</code>.</li> 652 * <li>If <code>ttype</code> is a quote character, then 'x' is 653 * <code>sval</code></li> 654 * <li>For all other cases, 'x' is <code>ttype</code></li> 655 * </ul> 656 */ 657 public String toString() 658 { 659 String tempstr; 660 if (ttype == TT_EOF) 661 tempstr = "EOF"; 662 else if (ttype == TT_EOL) 663 tempstr = "EOL"; 664 else if (ttype == TT_WORD) 665 tempstr = sval; 666 else if (ttype == TT_NUMBER) 667 tempstr = "n=" + nval; 668 else if (ttype == TT_NONE) 669 tempstr = "NOTHING"; 670 else // must be an ordinary char. 671 tempstr = "\'" + (char) ttype + "\'"; 672 673 return "Token[" + tempstr + "], line " + lineno(); 674 } 675 676 /** 677 * This method sets the whitespace attribute for all characters in the 678 * specified range, range terminators included. 679 * 680 * @param low The low end of the range of values to set the whitespace 681 * attribute for 682 * @param hi The high end of the range of values to set the whitespace 683 * attribute for 684 */ 685 public void whitespaceChars(int low, int hi) 686 { 687 if (low < 0) 688 low = 0; 689 if (hi > 255) 690 hi = 255; 691 for (int i = low; i <= hi; i++) 692 { 693 resetChar(i); 694 whitespace[i] = true; 695 } 696 } 697 698 /** 699 * This method sets the alphabetic attribute for all characters in the 700 * specified range, range terminators included. 701 * 702 * @param low The low end of the range of values to set the alphabetic 703 * attribute for 704 * @param hi The high end of the range of values to set the alphabetic 705 * attribute for 706 */ 707 public void wordChars(int low, int hi) 708 { 709 if (low < 0) 710 low = 0; 711 if (hi > 255) 712 hi = 255; 713 for (int i = low; i <= hi; i++) 714 alphabetic[i] = true; 715 } 716 }