ICU 4.2.1
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2009, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/parseerr.h"
52 
53 #include "unicode/uregex.h"
54 
56 
57 
58 // Forward Declarations...
59 
60 class RegexMatcher;
61 class RegexPattern;
62 class UVector;
63 class UVector32;
64 class UnicodeSet;
65 struct REStackFrame;
66 struct Regex8BitSet;
68 class RegexCImpl;
69 
70 
71 
72 
77 #ifdef REGEX_DEBUG
79  RegexPatternDump(const RegexPattern *pat);
80 #else
81  #define RegexPatternDump(pat)
82 #endif
83 
84 
85 
98 public:
99 
107  RegexPattern();
108 
115  RegexPattern(const RegexPattern &source);
116 
122  virtual ~RegexPattern();
123 
132  UBool operator==(const RegexPattern& that) const;
133 
142  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);};
143 
149  RegexPattern &operator =(const RegexPattern &source);
150 
158  virtual RegexPattern *clone() const;
159 
160 
185  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
186  UParseError &pe,
187  UErrorCode &status);
188 
213  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
214  uint32_t flags,
215  UParseError &pe,
216  UErrorCode &status);
217 
218 
241  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
242  uint32_t flags,
243  UErrorCode &status);
244 
245 
251  virtual uint32_t flags() const;
252 
270  virtual RegexMatcher *matcher(const UnicodeString &input,
271  UErrorCode &status) const;
272 
273 private:
285  RegexMatcher *matcher(const UChar *input,
286  UErrorCode &status) const;
287 public:
288 
289 
301  virtual RegexMatcher *matcher(UErrorCode &status) const;
302 
303 
318  static UBool U_EXPORT2 matches(const UnicodeString &regex,
319  const UnicodeString &input,
320  UParseError &pe,
321  UErrorCode &status);
322 
323 
328  virtual UnicodeString pattern() const;
329 
330 
356  virtual int32_t split(const UnicodeString &input,
357  UnicodeString dest[],
358  int32_t destCapacity,
359  UErrorCode &status) const;
360 
361 
367  virtual UClassID getDynamicClassID() const;
368 
374  static UClassID U_EXPORT2 getStaticClassID();
375 
376 private:
377  //
378  // Implementation Data
379  //
380  UnicodeString fPattern; // The original pattern string.
381  uint32_t fFlags; // The flags used when compiling the pattern.
382  //
383  UVector32 *fCompiledPat; // The compiled pattern p-code.
384  UnicodeString fLiteralText; // Any literal string data from the pattern,
385  // after un-escaping, for use during the match.
386 
387  UVector *fSets; // Any UnicodeSets referenced from the pattern.
388  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
389 
390 
391  UErrorCode fDeferredStatus; // status if some prior error has left this
392  // RegexPattern in an unusable state.
393 
394  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
395  // >= this value. For some patterns, this calculated
396  // value may be less than the true shortest
397  // possible match.
398 
399  int32_t fFrameSize; // Size of a state stack frame in the
400  // execution engine.
401 
402  int32_t fDataSize; // The size of the data needed by the pattern that
403  // does not go on the state stack, but has just
404  // a single copy per matcher.
405 
406  UVector32 *fGroupMap; // Map from capture group number to position of
407  // the group's variables in the matcher stack frame.
408 
409  int32_t fMaxCaptureDigits;
410 
411  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
412  // regex character classes, e.g. Word.
413 
414  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
415  // sets for predefined regex classes.
416 
417  int32_t fStartType; // Info on how a match must start.
418  int32_t fInitialStringIdx; //
419  int32_t fInitialStringLen;
420  UnicodeSet *fInitialChars;
421  UChar32 fInitialChar;
422  Regex8BitSet *fInitialChars8;
423 
424  friend class RegexCompile;
425  friend class RegexMatcher;
426  friend class RegexCImpl;
427 
428  //
429  // Implementation Methods
430  //
431  void init(); // Common initialization, for use by constructors.
432  void zap(); // Common cleanup
433 #ifdef REGEX_DEBUG
434  void dumpOp(int32_t index) const;
435  friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
436 #endif
437 
438 };
439 
440 
441 
452 public:
453 
468  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
469 
491  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
492  uint32_t flags, UErrorCode &status);
493 
494 private:
506  RegexMatcher(const UnicodeString &regexp, const UChar *input,
507  uint32_t flags, UErrorCode &status);
508 public:
509 
510 
516  virtual ~RegexMatcher();
517 
518 
525  virtual UBool matches(UErrorCode &status);
526 
537  virtual UBool matches(int32_t startIndex, UErrorCode &status);
538 
539 
540 
541 
555  virtual UBool lookingAt(UErrorCode &status);
556 
557 
571  virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
572 
585  virtual UBool find();
586 
587 
597  virtual UBool find(int32_t start, UErrorCode &status);
598 
599 
609  virtual UnicodeString group(UErrorCode &status) const;
610 
611 
624  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
625 
626 
632  virtual int32_t groupCount() const;
633 
634 
642  virtual int32_t start(UErrorCode &status) const;
643 
644 
658  virtual int32_t start(int32_t group, UErrorCode &status) const;
659 
660 
670  virtual int32_t end(UErrorCode &status) const;
671 
672 
686  virtual int32_t end(int32_t group, UErrorCode &status) const;
687 
688 
697  virtual RegexMatcher &reset();
698 
699 
715  virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
716 
717 
731  virtual RegexMatcher &reset(const UnicodeString &input);
732 
733 private:
745  RegexMatcher &reset(const UChar *input);
746 public:
747 
754  virtual const UnicodeString &input() const;
755 
756 
757 
776  virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
777 
778 
787  virtual int32_t regionStart() const;
788 
789 
798  virtual int32_t regionEnd() const;
799 
808  virtual UBool hasTransparentBounds() const;
809 
828  virtual RegexMatcher &useTransparentBounds(UBool b);
829 
830 
838  virtual UBool hasAnchoringBounds() const;
839 
852  virtual RegexMatcher &useAnchoringBounds(UBool b);
853 
866  virtual UBool hitEnd() const;
867 
877  virtual UBool requireEnd() const;
878 
879 
880 
881 
882 
888  virtual const RegexPattern &pattern() const;
889 
890 
907  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
908 
909 
930  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
931 
959  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
960  const UnicodeString &replacement, UErrorCode &status);
961 
962 
973  virtual UnicodeString &appendTail(UnicodeString &dest);
974 
975 
976 
1001  virtual int32_t split(const UnicodeString &input,
1002  UnicodeString dest[],
1003  int32_t destCapacity,
1004  UErrorCode &status);
1005 
1027  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1028 
1035  virtual int32_t getTimeLimit() const;
1036 
1058  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1059 
1067  virtual int32_t getStackLimit() const;
1068 
1069 
1083  virtual void setMatchCallback(URegexMatchCallback *callback,
1084  const void *context,
1085  UErrorCode &status);
1086 
1087 
1088 
1099  virtual void getMatchCallback(URegexMatchCallback *&callback,
1100  const void *&context,
1101  UErrorCode &status);
1102 
1103 
1109  void setTrace(UBool state);
1110 
1111 
1117  static UClassID U_EXPORT2 getStaticClassID();
1118 
1124  virtual UClassID getDynamicClassID() const;
1125 
1126 private:
1127  // Constructors and other object boilerplate are private.
1128  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1129  RegexMatcher(); // default constructor not implemented
1130  RegexMatcher(const RegexPattern *pat);
1131  RegexMatcher(const RegexMatcher &other);
1132  RegexMatcher &operator =(const RegexMatcher &rhs);
1133  void init(UErrorCode &status); // Common initialization
1134  void init2(const UnicodeString &s, UErrorCode &e); // Common initialization, part 2.
1135 
1136  friend class RegexPattern;
1137  friend class RegexCImpl;
1138 public:
1140  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1141 private:
1142 
1143  //
1144  // MatchAt This is the internal interface to the match engine itself.
1145  // Match status comes back in matcher member variables.
1146  //
1147  void MatchAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1148  inline void backTrack(int32_t &inputIdx, int32_t &patIdx);
1149  UBool isWordBoundary(int32_t pos); // perform Perl-like \b test
1150  UBool isUWordBoundary(int32_t pos); // perform RBBI based \b test
1151  REStackFrame *resetStack();
1152  inline REStackFrame *StateSave(REStackFrame *fp, int32_t savePatIdx, UErrorCode &status);
1153  void IncrementTime(UErrorCode &status);
1154 
1155 
1156  const RegexPattern *fPattern;
1157  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1158  // should delete it when through.
1159 
1160  const UnicodeString *fInput; // The text being matched. Is never NULL.
1161  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1162 
1163  int32_t fRegionStart; // Start of the input region, default = 0.
1164  int32_t fRegionLimit; // End of input region, default to input.length.
1165 
1166  int32_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1167  int32_t fAnchorLimit; // See useAnchoringBounds
1168 
1169  int32_t fLookStart; // Region bounds for look-ahead/behind and
1170  int32_t fLookLimit; // other boundary tests. See
1171  // useTransparentBounds
1172 
1173  int32_t fActiveStart; // Currently active bounds for matching.
1174  int32_t fActiveLimit; // Usually is the same as region, but
1175  // is changed to fLookStart/Limit when
1176  // entering look around regions.
1177 
1178  UBool fTransparentBounds; // True if using transparent bounds.
1179  UBool fAnchoringBounds; // True if using anchoring bounds.
1180 
1181  UBool fMatch; // True if the last attempted match was successful.
1182  int32_t fMatchStart; // Position of the start of the most recent match
1183  int32_t fMatchEnd; // First position after the end of the most recent match
1184  // Zero if no previous match, even when a region
1185  // is active.
1186  int32_t fLastMatchEnd; // First position after the end of the previous match,
1187  // or -1 if there was no previous match.
1188  int32_t fAppendPosition; // First position after the end of the previous
1189  // appendReplacement(). As described by the
1190  // JavaDoc for Java Matcher, where it is called
1191  // "append position"
1192  UBool fHitEnd; // True if the last match touched the end of input.
1193  UBool fRequireEnd; // True if the last match required end-of-input
1194  // (matched $ or Z)
1195 
1196  UVector32 *fStack;
1197  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1198  // which will contain the capture group results.
1199  // NOT valid while match engine is running.
1200 
1201  int32_t *fData; // Data area for use by the compiled pattern.
1202  int32_t fSmallData[8]; // Use this for data if it's enough.
1203 
1204  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1205  // match engine run. Zero for unlimited.
1206 
1207  int32_t fTime; // Match time, accumulates while matching.
1208  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1209  // Kept separately from fTime to keep as much
1210  // code as possible out of the inline
1211  // StateSave function.
1212 
1213  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1214  // stack, in bytes. Zero for unlimited.
1215 
1216  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1217  // NULL if there is no callback.
1218  const void *fCallbackContext; // User Context ptr for callback function.
1219 
1220  UBool fTraceDebug; // Set true for debug tracing of match engine.
1221 
1222  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1223  // reported, or that permanently disables this matcher.
1224 
1225  RuleBasedBreakIterator *fWordBreakItr;
1226 
1227 
1228 };
1229 
1231 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1232 #endif
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:65
#define RegexPatternDump(pat)
RegexPatternDump: debug function that displays the compiled form of a pattern.
Definition: regex.h:81
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:97
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:272
C++ API: Unicode String.
#define U_INTERNAL
This is used to declare a function as an internal ICU C API.
Definition: umachine.h:125
unsigned int uint32_t
Define 64 bit limits.
Definition: pwin32.h:147
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:475
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:183
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:319
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:187
#define U_EXPORT2
Definition: platform.h:338
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be wchar_t if that is 16 bits wide; always assumed to be unsigned.
Definition: umachine.h:299
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API. If the compiler doesn't support namespaces...
Definition: uversion.h:184
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:943
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:212
C API: Parse Error Information.
void * UClassID
UClassID is used to identify classes without using RTTI, since RTTI is not yet supported by all C++ c...
Definition: utypes.h:339
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:593
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:56
Basic definitions for ICU, for both C and C++ APIs.
class RegexMatcher bundles together a regular expression pattern and input text to which the expressio...
Definition: regex.h:451
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:142
virtual UClassID getDynamicClassID() const =0
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
signed int int32_t
Define 64 bit limits.
Definition: pwin32.h:143
int8_t UBool
The ICU boolean type.
Definition: umachine.h:208