ICU 4.4.2 4.4.2
|
00001 /* 00002 ********************************************************************** 00003 * Copyright (C) 2002-2010, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ********************************************************************** 00006 * file name: regex.h 00007 * encoding: US-ASCII 00008 * indentation:4 00009 * 00010 * created on: 2002oct22 00011 * created by: Andy Heninger 00012 * 00013 * ICU Regular Expressions, API for C++ 00014 */ 00015 00016 #ifndef REGEX_H 00017 #define REGEX_H 00018 00019 //#define REGEX_DEBUG 00020 00045 #include "unicode/utypes.h" 00046 00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 00048 00049 #include "unicode/uobject.h" 00050 #include "unicode/unistr.h" 00051 #include "unicode/utext.h" 00052 #include "unicode/parseerr.h" 00053 00054 #include "unicode/uregex.h" 00055 00056 U_NAMESPACE_BEGIN 00057 00058 00059 // Forward Declarations... 00060 00061 class RegexMatcher; 00062 class RegexPattern; 00063 class UVector; 00064 class UVector32; 00065 class UVector64; 00066 class UnicodeSet; 00067 struct REStackFrame; 00068 struct Regex8BitSet; 00069 class RuleBasedBreakIterator; 00070 class RegexCImpl; 00071 00072 00073 00074 00079 #ifdef REGEX_DEBUG 00080 U_INTERNAL void U_EXPORT2 00081 RegexPatternDump(const RegexPattern *pat); 00082 #else 00083 #undef RegexPatternDump 00084 #define RegexPatternDump(pat) 00085 #endif 00086 00087 00088 00100 class U_I18N_API RegexPattern: public UObject { 00101 public: 00102 00110 RegexPattern(); 00111 00118 RegexPattern(const RegexPattern &source); 00119 00125 virtual ~RegexPattern(); 00126 00135 UBool operator==(const RegexPattern& that) const; 00136 00145 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}; 00146 00152 RegexPattern &operator =(const RegexPattern &source); 00153 00161 virtual RegexPattern *clone() const; 00162 00163 00188 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00189 UParseError &pe, 00190 UErrorCode &status); 00191 00192 00219 static RegexPattern * U_EXPORT2 compile( UText *regex, 00220 UParseError &pe, 00221 UErrorCode &status); 00222 00247 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00248 uint32_t flags, 00249 UParseError &pe, 00250 UErrorCode &status); 00251 00252 00279 static RegexPattern * U_EXPORT2 compile( UText *regex, 00280 uint32_t flags, 00281 UParseError &pe, 00282 UErrorCode &status); 00283 00284 00307 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00308 uint32_t flags, 00309 UErrorCode &status); 00310 00311 00336 static RegexPattern * U_EXPORT2 compile( UText *regex, 00337 uint32_t flags, 00338 UErrorCode &status); 00339 00340 00346 virtual uint32_t flags() const; 00347 00365 virtual RegexMatcher *matcher(const UnicodeString &input, 00366 UErrorCode &status) const; 00367 00368 00373 enum PatternIsUTextFlag { PATTERN_IS_UTEXT }; 00374 00394 virtual RegexMatcher *matcher(UText *input, 00395 PatternIsUTextFlag flag, 00396 UErrorCode &status) const; 00397 00398 private: 00412 RegexMatcher *matcher(const UChar *input, 00413 UErrorCode &status) const; 00414 public: 00415 00416 00428 virtual RegexMatcher *matcher(UErrorCode &status) const; 00429 00430 00445 static UBool U_EXPORT2 matches(const UnicodeString ®ex, 00446 const UnicodeString &input, 00447 UParseError &pe, 00448 UErrorCode &status); 00449 00450 00465 static UBool U_EXPORT2 matches(UText *regex, 00466 UText *input, 00467 UParseError &pe, 00468 UErrorCode &status); 00469 00470 00479 virtual UnicodeString pattern() const; 00480 00481 00492 virtual UText *patternText() const; 00493 00494 00520 virtual int32_t split(const UnicodeString &input, 00521 UnicodeString dest[], 00522 int32_t destCapacity, 00523 UErrorCode &status) const; 00524 00525 00551 virtual int32_t split(UText *input, 00552 UText *dest[], 00553 int32_t destCapacity, 00554 UErrorCode &status) const; 00555 00556 00562 virtual UClassID getDynamicClassID() const; 00563 00569 static UClassID U_EXPORT2 getStaticClassID(); 00570 00571 private: 00572 // 00573 // Implementation Data 00574 // 00575 UText *fPattern; // The original pattern string. 00576 UnicodeString *fPatternString; // The original pattern UncodeString if relevant 00577 uint32_t fFlags; // The flags used when compiling the pattern. 00578 // 00579 UVector64 *fCompiledPat; // The compiled pattern p-code. 00580 UnicodeString fLiteralText; // Any literal string data from the pattern, 00581 // after un-escaping, for use during the match. 00582 00583 UVector *fSets; // Any UnicodeSets referenced from the pattern. 00584 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 00585 00586 00587 UErrorCode fDeferredStatus; // status if some prior error has left this 00588 // RegexPattern in an unusable state. 00589 00590 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 00591 // >= this value. For some patterns, this calculated 00592 // value may be less than the true shortest 00593 // possible match. 00594 00595 int32_t fFrameSize; // Size of a state stack frame in the 00596 // execution engine. 00597 00598 int32_t fDataSize; // The size of the data needed by the pattern that 00599 // does not go on the state stack, but has just 00600 // a single copy per matcher. 00601 00602 UVector32 *fGroupMap; // Map from capture group number to position of 00603 // the group's variables in the matcher stack frame. 00604 00605 int32_t fMaxCaptureDigits; 00606 00607 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 00608 // regex character classes, e.g. Word. 00609 00610 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 00611 // sets for predefined regex classes. 00612 00613 int32_t fStartType; // Info on how a match must start. 00614 int32_t fInitialStringIdx; // 00615 int32_t fInitialStringLen; 00616 UnicodeSet *fInitialChars; 00617 UChar32 fInitialChar; 00618 Regex8BitSet *fInitialChars8; 00619 UBool fNeedsAltInput; 00620 00621 friend class RegexCompile; 00622 friend class RegexMatcher; 00623 friend class RegexCImpl; 00624 00625 // 00626 // Implementation Methods 00627 // 00628 void init(); // Common initialization, for use by constructors. 00629 void zap(); // Common cleanup 00630 #ifdef REGEX_DEBUG 00631 void dumpOp(int32_t index) const; 00632 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *); 00633 #endif 00634 00635 }; 00636 00637 00638 00648 class U_I18N_API RegexMatcher: public UObject { 00649 public: 00650 00665 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status); 00666 00682 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status); 00683 00705 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 00706 uint32_t flags, UErrorCode &status); 00707 00729 RegexMatcher(UText *regexp, UText *input, 00730 uint32_t flags, UErrorCode &status); 00731 00732 private: 00746 RegexMatcher(const UnicodeString ®exp, const UChar *input, 00747 uint32_t flags, UErrorCode &status); 00748 public: 00749 00750 00756 virtual ~RegexMatcher(); 00757 00758 00765 virtual UBool matches(UErrorCode &status); 00766 00767 00778 virtual UBool matches(int32_t startIndex, UErrorCode &status); 00779 00780 00794 virtual UBool lookingAt(UErrorCode &status); 00795 00796 00810 virtual UBool lookingAt(int32_t startIndex, UErrorCode &status); 00811 00812 00825 virtual UBool find(); 00826 00827 00837 virtual UBool find(int32_t start, UErrorCode &status); 00838 00839 00849 virtual UnicodeString group(UErrorCode &status) const; 00850 00851 00856 enum MatcherDestIsUTextFlag { MATCHER_DEST_IS_UTEXT }; 00857 00873 virtual UText *group(UText *dest, MatcherDestIsUTextFlag flag, UErrorCode &status) const; 00874 00875 00888 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 00889 00890 00906 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; 00907 00908 00914 virtual int32_t groupCount() const; 00915 00916 00924 virtual int32_t start(UErrorCode &status) const; 00925 00926 00940 virtual int32_t start(int32_t group, UErrorCode &status) const; 00941 00942 00952 virtual int32_t end(UErrorCode &status) const; 00953 00954 00968 virtual int32_t end(int32_t group, UErrorCode &status) const; 00969 00970 00979 virtual RegexMatcher &reset(); 00980 00981 00997 virtual RegexMatcher &reset(int32_t index, UErrorCode &status); 00998 00999 01017 virtual RegexMatcher &reset(const UnicodeString &input); 01018 01019 01033 virtual RegexMatcher &reset(UText *input); 01034 01035 private: 01049 RegexMatcher &reset(const UChar *input); 01050 public: 01051 01059 virtual const UnicodeString &input() const; 01060 01069 virtual UText *inputText() const; 01070 01080 virtual UText *getInput(UText *dest) const; 01081 01082 01101 virtual RegexMatcher ®ion(int32_t start, int32_t limit, UErrorCode &status); 01102 01103 01112 virtual int32_t regionStart() const; 01113 01114 01123 virtual int32_t regionEnd() const; 01124 01133 virtual UBool hasTransparentBounds() const; 01134 01153 virtual RegexMatcher &useTransparentBounds(UBool b); 01154 01155 01163 virtual UBool hasAnchoringBounds() const; 01164 01165 01178 virtual RegexMatcher &useAnchoringBounds(UBool b); 01179 01180 01193 virtual UBool hitEnd() const; 01194 01204 virtual UBool requireEnd() const; 01205 01206 01212 virtual const RegexPattern &pattern() const; 01213 01214 01231 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 01232 01233 01254 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status); 01255 01256 01277 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 01278 01279 01304 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status); 01305 01306 01334 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 01335 const UnicodeString &replacement, UErrorCode &status); 01336 01337 01365 virtual RegexMatcher &appendReplacement(UText *dest, 01366 UText *replacement, UErrorCode &status); 01367 01368 01379 virtual UnicodeString &appendTail(UnicodeString &dest); 01380 01381 01394 virtual UText *appendTail(UText *dest); 01395 01396 01420 virtual int32_t split(const UnicodeString &input, 01421 UnicodeString dest[], 01422 int32_t destCapacity, 01423 UErrorCode &status); 01424 01425 01449 virtual int32_t split(UText *input, 01450 UText *dest[], 01451 int32_t destCapacity, 01452 UErrorCode &status); 01453 01475 virtual void setTimeLimit(int32_t limit, UErrorCode &status); 01476 01483 virtual int32_t getTimeLimit() const; 01484 01506 virtual void setStackLimit(int32_t limit, UErrorCode &status); 01507 01515 virtual int32_t getStackLimit() const; 01516 01517 01531 virtual void setMatchCallback(URegexMatchCallback *callback, 01532 const void *context, 01533 UErrorCode &status); 01534 01535 01546 virtual void getMatchCallback(URegexMatchCallback *&callback, 01547 const void *&context, 01548 UErrorCode &status); 01549 01550 01556 void setTrace(UBool state); 01557 01558 01564 static UClassID U_EXPORT2 getStaticClassID(); 01565 01571 virtual UClassID getDynamicClassID() const; 01572 01573 private: 01574 // Constructors and other object boilerplate are private. 01575 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. 01576 RegexMatcher(); // default constructor not implemented 01577 RegexMatcher(const RegexPattern *pat); 01578 RegexMatcher(const RegexMatcher &other); 01579 RegexMatcher &operator =(const RegexMatcher &rhs); 01580 void init(UErrorCode &status); // Common initialization 01581 void init2(UText *t, UErrorCode &e); // Common initialization, part 2. 01582 01583 friend class RegexPattern; 01584 friend class RegexCImpl; 01585 public: 01587 void resetPreserveRegion(); // Reset matcher state, but preserve any region. 01588 private: 01589 01590 // 01591 // MatchAt This is the internal interface to the match engine itself. 01592 // Match status comes back in matcher member variables. 01593 // 01594 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status); 01595 inline void backTrack(int64_t &inputIdx, int32_t &patIdx); 01596 UBool isWordBoundary(int64_t pos); // perform Perl-like \b test 01597 UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test 01598 REStackFrame *resetStack(); 01599 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status); 01600 void IncrementTime(UErrorCode &status); 01601 01602 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const; 01603 01604 UBool findUsingChunk(); 01605 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status); 01606 UBool isChunkWordBoundary(int32_t pos); 01607 01608 const RegexPattern *fPattern; 01609 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 01610 // should delete it when through. 01611 01612 const UnicodeString *fInput; // The string being matched. Only used for input() 01613 UText *fInputText; // The text being matched. Is never NULL. 01614 UText *fAltInputText; // A shallow copy of the text being matched. 01615 // Only created if the pattern contains backreferences. 01616 int64_t fInputLength; // Full length of the input text. 01617 int32_t fFrameSize; // The size of a frame in the backtrack stack. 01618 01619 int64_t fRegionStart; // Start of the input region, default = 0. 01620 int64_t fRegionLimit; // End of input region, default to input.length. 01621 01622 int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $). 01623 int64_t fAnchorLimit; // See useAnchoringBounds 01624 01625 int64_t fLookStart; // Region bounds for look-ahead/behind and 01626 int64_t fLookLimit; // and other boundary tests. See 01627 // useTransparentBounds 01628 01629 int64_t fActiveStart; // Currently active bounds for matching. 01630 int64_t fActiveLimit; // Usually is the same as region, but 01631 // is changed to fLookStart/Limit when 01632 // entering look around regions. 01633 01634 UBool fTransparentBounds; // True if using transparent bounds. 01635 UBool fAnchoringBounds; // True if using anchoring bounds. 01636 01637 UBool fMatch; // True if the last attempted match was successful. 01638 int64_t fMatchStart; // Position of the start of the most recent match 01639 int64_t fMatchEnd; // First position after the end of the most recent match 01640 // Zero if no previous match, even when a region 01641 // is active. 01642 int64_t fLastMatchEnd; // First position after the end of the previous match, 01643 // or -1 if there was no previous match. 01644 int64_t fAppendPosition; // First position after the end of the previous 01645 // appendReplacement(). As described by the 01646 // JavaDoc for Java Matcher, where it is called 01647 // "append position" 01648 UBool fHitEnd; // True if the last match touched the end of input. 01649 UBool fRequireEnd; // True if the last match required end-of-input 01650 // (matched $ or Z) 01651 01652 UVector64 *fStack; 01653 REStackFrame *fFrame; // After finding a match, the last active stack frame, 01654 // which will contain the capture group results. 01655 // NOT valid while match engine is running. 01656 01657 int64_t *fData; // Data area for use by the compiled pattern. 01658 int64_t fSmallData[8]; // Use this for data if it's enough. 01659 01660 int32_t fTimeLimit; // Max time (in arbitrary steps) to let the 01661 // match engine run. Zero for unlimited. 01662 01663 int32_t fTime; // Match time, accumulates while matching. 01664 int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves. 01665 // Kept separately from fTime to keep as much 01666 // code as possible out of the inline 01667 // StateSave function. 01668 01669 int32_t fStackLimit; // Maximum memory size to use for the backtrack 01670 // stack, in bytes. Zero for unlimited. 01671 01672 URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct. 01673 // NULL if there is no callback. 01674 const void *fCallbackContext; // User Context ptr for callback function. 01675 01676 UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility. 01677 01678 UBool fTraceDebug; // Set true for debug tracing of match engine. 01679 01680 UErrorCode fDeferredStatus; // Save error state that cannot be immediately 01681 // reported, or that permanently disables this matcher. 01682 01683 RuleBasedBreakIterator *fWordBreakItr; 01684 01685 01686 }; 01687 01688 U_NAMESPACE_END 01689 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 01690 #endif