001/* Matcher.java -- Instance of a regular expression applied to a char sequence. 002 Copyright (C) 2002, 2004, 2006 Free Software Foundation, Inc. 003 004This file is part of GNU Classpath. 005 006GNU Classpath is free software; you can redistribute it and/or modify 007it under the terms of the GNU General Public License as published by 008the Free Software Foundation; either version 2, or (at your option) 009any later version. 010 011GNU Classpath is distributed in the hope that it will be useful, but 012WITHOUT ANY WARRANTY; without even the implied warranty of 013MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 014General Public License for more details. 015 016You should have received a copy of the GNU General Public License 017along with GNU Classpath; see the file COPYING. If not, write to the 018Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 01902110-1301 USA. 020 021Linking this library statically or dynamically with other modules is 022making a combined work based on this library. Thus, the terms and 023conditions of the GNU General Public License cover the whole 024combination. 025 026As a special exception, the copyright holders of this library give you 027permission to link this library with independent modules to produce an 028executable, regardless of the license terms of these independent 029modules, and to copy and distribute the resulting executable under 030terms of your choice, provided that you also meet, for each linked 031independent module, the terms and conditions of the license of that 032module. An independent module is a module which is not derived from 033or based on this library. If you modify this library, you may extend 034this exception to your version of the library, but you are not 035obligated to do so. If you do not wish to do so, delete this 036exception statement from your version. */ 037 038 039package java.util.regex; 040 041import gnu.java.lang.CPStringBuilder; 042 043import gnu.java.util.regex.CharIndexed; 044import gnu.java.util.regex.RE; 045import gnu.java.util.regex.REMatch; 046 047/** 048 * Instance of a regular expression applied to a char sequence. 049 * 050 * @since 1.4 051 */ 052public final class Matcher implements MatchResult 053{ 054 private Pattern pattern; 055 private CharSequence input; 056 // We use CharIndexed as an input object to the getMatch method in order 057 // that /\G/ (the end of the previous match) may work. The information 058 // of the previous match is stored in the CharIndexed object. 059 private CharIndexed inputCharIndexed; 060 private int position; 061 private int appendPosition; 062 private REMatch match; 063 064 /** 065 * The start of the region of the input on which to match. 066 */ 067 private int regionStart; 068 069 /** 070 * The end of the region of the input on which to match. 071 */ 072 private int regionEnd; 073 074 /** 075 * True if the match process should look beyond the 076 * region marked by regionStart to regionEnd when 077 * performing lookAhead, lookBehind and boundary 078 * matching. 079 */ 080 private boolean transparentBounds; 081 082 /** 083 * The flags that affect the anchoring bounds. 084 * If {@link #hasAnchoringBounds()} is {@code true}, 085 * the match process will honour the 086 * anchoring bounds: ^, \A, \Z, \z and $. If 087 * {@link #hasAnchoringBounds()} is {@code false}, 088 * the anchors are ignored and appropriate flags, 089 * stored in this variable, are used to provide this 090 * behaviour. 091 */ 092 private int anchoringBounds; 093 094 Matcher(Pattern pattern, CharSequence input) 095 { 096 this.pattern = pattern; 097 this.input = input; 098 this.inputCharIndexed = RE.makeCharIndexed(input, 0); 099 regionStart = 0; 100 regionEnd = input.length(); 101 transparentBounds = false; 102 anchoringBounds = 0; 103 } 104 105 /** 106 * Changes the pattern used by the {@link Matcher} to 107 * the one specified. Existing match information is lost, 108 * but the input and the matcher's position within it is 109 * retained. 110 * 111 * @param newPattern the new pattern to use. 112 * @return this matcher. 113 * @throws IllegalArgumentException if {@code newPattern} is 114 * {@code null}. 115 * @since 1.5 116 */ 117 public Matcher usePattern(Pattern newPattern) 118 { 119 if (newPattern == null) 120 throw new IllegalArgumentException("The new pattern was null."); 121 pattern = newPattern; 122 match = null; 123 124 return this; 125 } 126 127 /** 128 * @param sb The target string buffer 129 * @param replacement The replacement string 130 * 131 * @exception IllegalStateException If no match has yet been attempted, 132 * or if the previous match operation failed 133 * @exception IndexOutOfBoundsException If the replacement string refers 134 * to a capturing group that does not exist in the pattern 135 */ 136 public Matcher appendReplacement (StringBuffer sb, String replacement) 137 throws IllegalStateException 138 { 139 assertMatchOp(); 140 sb.append(input.subSequence(appendPosition, 141 match.getStartIndex()).toString()); 142 sb.append(RE.getReplacement(replacement, match, 143 RE.REG_REPLACE_USE_BACKSLASHESCAPE)); 144 appendPosition = match.getEndIndex(); 145 return this; 146 } 147 148 /** 149 * @param sb The target string buffer 150 */ 151 public StringBuffer appendTail (StringBuffer sb) 152 { 153 sb.append(input.subSequence(appendPosition, input.length()).toString()); 154 return sb; 155 } 156 157 /** 158 * @exception IllegalStateException If no match has yet been attempted, 159 * or if the previous match operation failed 160 */ 161 public int end () 162 throws IllegalStateException 163 { 164 assertMatchOp(); 165 return match.getEndIndex(); 166 } 167 168 /** 169 * @param group The index of a capturing group in this matcher's pattern 170 * 171 * @exception IllegalStateException If no match has yet been attempted, 172 * or if the previous match operation failed 173 * @exception IndexOutOfBoundsException If the replacement string refers 174 * to a capturing group that does not exist in the pattern 175 */ 176 public int end (int group) 177 throws IllegalStateException 178 { 179 assertMatchOp(); 180 return match.getEndIndex(group); 181 } 182 183 public boolean find () 184 { 185 boolean first = (match == null); 186 if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) 187 match = pattern.getRE().getMatch(inputCharIndexed, position, anchoringBounds); 188 else 189 match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 190 position, anchoringBounds); 191 if (match != null) 192 { 193 int endIndex = match.getEndIndex(); 194 // Is the match within input limits? 195 if (endIndex > input.length()) 196 { 197 match = null; 198 return false; 199 } 200 // Are we stuck at the same position? 201 if (!first && endIndex == position) 202 { 203 match = null; 204 // Not at the end of the input yet? 205 if (position < input.length() - 1) 206 { 207 position++; 208 return find(position); 209 } 210 else 211 return false; 212 } 213 position = endIndex; 214 return true; 215 } 216 return false; 217 } 218 219 /** 220 * @param start The index to start the new pattern matching 221 * 222 * @exception IndexOutOfBoundsException If the replacement string refers 223 * to a capturing group that does not exist in the pattern 224 */ 225 public boolean find (int start) 226 { 227 if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) 228 match = pattern.getRE().getMatch(inputCharIndexed, start, anchoringBounds); 229 else 230 match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 231 start, anchoringBounds); 232 if (match != null) 233 { 234 position = match.getEndIndex(); 235 return true; 236 } 237 return false; 238 } 239 240 /** 241 * @exception IllegalStateException If no match has yet been attempted, 242 * or if the previous match operation failed 243 */ 244 public String group () 245 { 246 assertMatchOp(); 247 return match.toString(); 248 } 249 250 /** 251 * @param group The index of a capturing group in this matcher's pattern 252 * 253 * @exception IllegalStateException If no match has yet been attempted, 254 * or if the previous match operation failed 255 * @exception IndexOutOfBoundsException If the replacement string refers 256 * to a capturing group that does not exist in the pattern 257 */ 258 public String group (int group) 259 throws IllegalStateException 260 { 261 assertMatchOp(); 262 return match.toString(group); 263 } 264 265 /** 266 * @param replacement The replacement string 267 */ 268 public String replaceFirst (String replacement) 269 { 270 reset(); 271 // Semantics might not quite match 272 return pattern.getRE().substitute(input, replacement, position, 273 RE.REG_REPLACE_USE_BACKSLASHESCAPE); 274 } 275 276 /** 277 * @param replacement The replacement string 278 */ 279 public String replaceAll (String replacement) 280 { 281 reset(); 282 return pattern.getRE().substituteAll(input, replacement, position, 283 RE.REG_REPLACE_USE_BACKSLASHESCAPE); 284 } 285 286 public int groupCount () 287 { 288 return pattern.getRE().getNumSubs(); 289 } 290 291 public boolean lookingAt () 292 { 293 if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) 294 match = pattern.getRE().getMatch(inputCharIndexed, regionStart, 295 anchoringBounds|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX); 296 else 297 match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0, 298 anchoringBounds|RE.REG_FIX_STARTING_POSITION); 299 if (match != null) 300 { 301 if (match.getStartIndex() == 0) 302 { 303 position = match.getEndIndex(); 304 return true; 305 } 306 match = null; 307 } 308 return false; 309 } 310 311 /** 312 * Attempts to match the entire input sequence against the pattern. 313 * 314 * If the match succeeds then more information can be obtained via the 315 * start, end, and group methods. 316 * 317 * @see #start() 318 * @see #end() 319 * @see #group() 320 */ 321 public boolean matches () 322 { 323 if (transparentBounds || (regionStart == 0 && regionEnd == input.length())) 324 match = pattern.getRE().getMatch(inputCharIndexed, regionStart, 325 anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION|RE.REG_ANCHORINDEX); 326 else 327 match = pattern.getRE().getMatch(input.subSequence(regionStart, regionEnd), 0, 328 anchoringBounds|RE.REG_TRY_ENTIRE_MATCH|RE.REG_FIX_STARTING_POSITION); 329 if (match != null) 330 { 331 if (match.getStartIndex() == 0) 332 { 333 position = match.getEndIndex(); 334 if (position == input.length()) 335 return true; 336 } 337 match = null; 338 } 339 return false; 340 } 341 342 /** 343 * Returns the Pattern that is interpreted by this Matcher 344 */ 345 public Pattern pattern () 346 { 347 return pattern; 348 } 349 350 /** 351 * Resets the internal state of the matcher, including 352 * resetting the region to its default state of encompassing 353 * the whole input. The state of {@link #hasTransparentBounds()} 354 * and {@link #hasAnchoringBounds()} are unaffected. 355 * 356 * @return a reference to this matcher. 357 * @see #regionStart() 358 * @see #regionEnd() 359 * @see #hasTransparentBounds() 360 * @see #hasAnchoringBounds() 361 */ 362 public Matcher reset () 363 { 364 position = 0; 365 match = null; 366 regionStart = 0; 367 regionEnd = input.length(); 368 appendPosition = 0; 369 return this; 370 } 371 372 /** 373 * Resets the internal state of the matcher, including 374 * resetting the region to its default state of encompassing 375 * the whole input. The state of {@link #hasTransparentBounds()} 376 * and {@link #hasAnchoringBounds()} are unaffected. 377 * 378 * @param input The new input character sequence. 379 * @return a reference to this matcher. 380 * @see #regionStart() 381 * @see #regionEnd() 382 * @see #hasTransparentBounds() 383 * @see #hasAnchoringBounds() 384 */ 385 public Matcher reset (CharSequence input) 386 { 387 this.input = input; 388 this.inputCharIndexed = RE.makeCharIndexed(input, 0); 389 return reset(); 390 } 391 392 /** 393 * @return the index of a capturing group in this matcher's pattern 394 * 395 * @exception IllegalStateException If no match has yet been attempted, 396 * or if the previous match operation failed 397 */ 398 public int start () 399 throws IllegalStateException 400 { 401 assertMatchOp(); 402 return match.getStartIndex(); 403 } 404 405 /** 406 * @param group The index of a capturing group in this matcher's pattern 407 * 408 * @exception IllegalStateException If no match has yet been attempted, 409 * or if the previous match operation failed 410 * @exception IndexOutOfBoundsException If the replacement string refers 411 * to a capturing group that does not exist in the pattern 412 */ 413 public int start (int group) 414 throws IllegalStateException 415 { 416 assertMatchOp(); 417 return match.getStartIndex(group); 418 } 419 420 /** 421 * @return True if and only if the matcher hit the end of input. 422 * @since 1.5 423 */ 424 public boolean hitEnd() 425 { 426 return inputCharIndexed.hitEnd(); 427 } 428 429 /** 430 * @return A string expression of this matcher. 431 */ 432 public String toString() 433 { 434 CPStringBuilder sb = new CPStringBuilder(); 435 sb.append(this.getClass().getName()) 436 .append("[pattern=").append(pattern.pattern()) 437 .append(" region=").append(regionStart).append(",").append(regionEnd) 438 .append(" anchoringBounds=").append(anchoringBounds == 0) 439 .append(" transparentBounds=").append(transparentBounds) 440 .append(" lastmatch=").append(match == null ? "" : match.toString()) 441 .append("]"); 442 return sb.toString(); 443 } 444 445 private void assertMatchOp() 446 { 447 if (match == null) throw new IllegalStateException(); 448 } 449 450 /** 451 * <p> 452 * Defines the region of the input on which to match. 453 * By default, the {@link Matcher} attempts to match 454 * the whole string (from 0 to the length of the input), 455 * but a region between {@code start} (inclusive) and 456 * {@code end} (exclusive) on which to match may instead 457 * be defined using this method. 458 * </p> 459 * <p> 460 * The behaviour of region matching is further affected 461 * by the use of transparent or opaque bounds (see 462 * {@link #useTransparentBounds(boolean)}) and whether or not 463 * anchors ({@code ^} and {@code $}) are in use 464 * (see {@link #useAnchoringBounds(boolean)}). With transparent 465 * bounds, the matcher is aware of input outside the bounds 466 * set by this method, whereas, with opaque bounds (the default) 467 * only the input within the bounds is used. The use of 468 * anchors are affected by this setting; with transparent 469 * bounds, anchors will match the beginning of the real input, 470 * while with opaque bounds they match the beginning of the 471 * region. {@link #useAnchoringBounds(boolean)} can be used 472 * to turn on or off the matching of anchors. 473 * </p> 474 * 475 * @param start the start of the region (inclusive). 476 * @param end the end of the region (exclusive). 477 * @return a reference to this matcher. 478 * @throws IndexOutOfBoundsException if either {@code start} or 479 * {@code end} are less than zero, 480 * if either {@code start} or 481 * {@code end} are greater than the 482 * length of the input, or if 483 * {@code start} is greater than 484 * {@code end}. 485 * @see #regionStart() 486 * @see #regionEnd() 487 * @see #hasTransparentBounds() 488 * @see #useTransparentBounds(boolean) 489 * @see #hasAnchoringBounds() 490 * @see #useAnchoringBounds(boolean) 491 * @since 1.5 492 */ 493 public Matcher region(int start, int end) 494 { 495 int length = input.length(); 496 if (start < 0) 497 throw new IndexOutOfBoundsException("The start position was less than zero."); 498 if (start >= length) 499 throw new IndexOutOfBoundsException("The start position is after the end of the input."); 500 if (end < 0) 501 throw new IndexOutOfBoundsException("The end position was less than zero."); 502 if (end > length) 503 throw new IndexOutOfBoundsException("The end position is after the end of the input."); 504 if (start > end) 505 throw new IndexOutOfBoundsException("The start position is after the end position."); 506 reset(); 507 regionStart = start; 508 regionEnd = end; 509 return this; 510 } 511 512 /** 513 * The start of the region on which to perform matches (inclusive). 514 * 515 * @return the start index of the region. 516 * @see #region(int,int) 517 * #see #regionEnd() 518 * @since 1.5 519 */ 520 public int regionStart() 521 { 522 return regionStart; 523 } 524 525 /** 526 * The end of the region on which to perform matches (exclusive). 527 * 528 * @return the end index of the region. 529 * @see #region(int,int) 530 * @see #regionStart() 531 * @since 1.5 532 */ 533 public int regionEnd() 534 { 535 return regionEnd; 536 } 537 538 /** 539 * Returns true if the bounds of the region marked by 540 * {@link #regionStart()} and {@link #regionEnd()} are 541 * transparent. When these bounds are transparent, the 542 * matching process can look beyond them to perform 543 * lookahead, lookbehind and boundary matching operations. 544 * By default, the bounds are opaque. 545 * 546 * @return true if the bounds of the matching region are 547 * transparent. 548 * @see #useTransparentBounds(boolean) 549 * @see #region(int,int) 550 * @see #regionStart() 551 * @see #regionEnd() 552 * @since 1.5 553 */ 554 public boolean hasTransparentBounds() 555 { 556 return transparentBounds; 557 } 558 559 /** 560 * Sets the transparency of the bounds of the region 561 * marked by {@link #regionStart()} and {@link #regionEnd()}. 562 * A value of {@code true} makes the bounds transparent, 563 * so the matcher can see beyond them to perform lookahead, 564 * lookbehind and boundary matching operations. A value 565 * of {@code false} (the default) makes the bounds opaque, 566 * restricting the match to the input region denoted 567 * by {@link #regionStart()} and {@link #regionEnd()}. 568 * 569 * @param transparent true if the bounds should be transparent. 570 * @return a reference to this matcher. 571 * @see #hasTransparentBounds() 572 * @see #region(int,int) 573 * @see #regionStart() 574 * @see #regionEnd() 575 * @since 1.5 576 */ 577 public Matcher useTransparentBounds(boolean transparent) 578 { 579 transparentBounds = transparent; 580 return this; 581 } 582 583 /** 584 * Returns true if the matcher will honour the use of 585 * the anchoring bounds: {@code ^}, {@code \A}, {@code \Z}, 586 * {@code \z} and {@code $}. By default, the anchors 587 * are used. Note that the effect of the anchors is 588 * also affected by {@link #hasTransparentBounds()}. 589 * 590 * @return true if the matcher will attempt to match 591 * the anchoring bounds. 592 * @see #useAnchoringBounds(boolean) 593 * @see #hasTransparentBounds() 594 * @since 1.5 595 */ 596 public boolean hasAnchoringBounds() 597 { 598 return anchoringBounds == 0; 599 } 600 601 /** 602 * Enables or disables the use of the anchoring bounds: 603 * {@code ^}, {@code \A}, {@code \Z}, {@code \z} and 604 * {@code $}. By default, their use is enabled. When 605 * disabled, the matcher will not attempt to match 606 * the anchors. 607 * 608 * @param useAnchors true if anchoring bounds should be used. 609 * @return a reference to this matcher. 610 * @since 1.5 611 * @see #hasAnchoringBounds() 612 */ 613 public Matcher useAnchoringBounds(boolean useAnchors) 614 { 615 if (useAnchors) 616 anchoringBounds = 0; 617 else 618 anchoringBounds = RE.REG_NOTBOL|RE.REG_NOTEOL; 619 return this; 620 } 621 622 /** 623 * Returns a read-only snapshot of the current state of 624 * the {@link Matcher} as a {@link MatchResult}. Any 625 * subsequent changes to this instance are not reflected 626 * in the returned {@link MatchResult}. 627 * 628 * @return a {@link MatchResult} instance representing the 629 * current state of the {@link Matcher}. 630 */ 631 public MatchResult toMatchResult() 632 { 633 Matcher snapshot = new Matcher(pattern, input); 634 if (match != null) 635 snapshot.match = (REMatch) match.clone(); 636 return snapshot; 637 } 638 639 /** 640 * Returns a literalized string of s where characters {@code $} and {@code 641 * \\} are escaped. 642 * 643 * @param s the string to literalize. 644 * @return the literalized string. 645 * @since 1.5 646 */ 647 public static String quoteReplacement(String s) 648 { 649 if (s == null) 650 throw new NullPointerException(); 651 CPStringBuilder sb = new CPStringBuilder(); 652 for (int i = 0; i < s.length(); i++) 653 { 654 char ch = s.charAt(i); 655 if (ch == '$' || ch == '\\') 656 sb.append('\\'); 657 sb.append(ch); 658 } 659 return sb.toString(); 660 } 661 662}