ICU 4.2.1
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
rbbi.h
Go to the documentation of this file.
1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2008 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 ***************************************************************************
6 
7 **********************************************************************
8 * Date Name Description
9 * 10/22/99 alan Creation.
10 * 11/11/99 rgillam Complete port from Java.
11 **********************************************************************
12 */
13 
14 #ifndef RBBI_H
15 #define RBBI_H
16 
17 #include "unicode/utypes.h"
18 
24 #if !UCONFIG_NO_BREAK_ITERATION
25 
26 #include "unicode/brkiter.h"
27 #include "unicode/udata.h"
28 #include "unicode/parseerr.h"
29 #include "unicode/schriter.h"
30 #include "unicode/uchriter.h"
31 
32 
33 struct UTrie;
34 
36 
38 struct RBBIDataHeader;
39 class RuleBasedBreakIteratorTables;
40 class BreakIterator;
41 class RBBIDataWrapper;
42 class UStack;
43 class LanguageBreakEngine;
44 class UnhandledEngine;
45 struct RBBIStateTable;
46 
47 
48 
49 
66 
67 protected:
73 
80 
87 
94 
99  RBBIDataWrapper *fData;
100 
105 
113 
120 
129 
135 
142 
151 
159  UnhandledEngine *fUnhandledBreakEngine;
160 
167 
168 protected:
169  //=======================================================================
170  // constructors
171  //=======================================================================
172 
181  enum EDontAdopt {
182  kDontAdopt
183  };
184 
195  RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
196 
205  RuleBasedBreakIterator(const RBBIDataHeader* data, enum EDontAdopt dontAdopt, UErrorCode &status);
206 
207 
208  friend class RBBIRuleBuilder;
210  friend class BreakIterator;
211 
212 
213 
214 public:
215 
221 
229 
239  UParseError &parseError,
240  UErrorCode &status);
241 
242 
256 
261  virtual ~RuleBasedBreakIterator();
262 
270  RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
271 
280  virtual UBool operator==(const BreakIterator& that) const;
281 
289  UBool operator!=(const BreakIterator& that) const;
290 
301  virtual BreakIterator* clone() const;
302 
308  virtual int32_t hashCode(void) const;
309 
315  virtual const UnicodeString& getRules(void) const;
316 
317  //=======================================================================
318  // BreakIterator overrides
319  //=======================================================================
320 
346  virtual CharacterIterator& getText(void) const;
347 
348 
363  virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
364 
372  virtual void adoptText(CharacterIterator* newText);
373 
380  virtual void setText(const UnicodeString& newText);
381 
395  virtual void setText(UText *text, UErrorCode &status);
396 
402  virtual int32_t first(void);
403 
409  virtual int32_t last(void);
410 
421  virtual int32_t next(int32_t n);
422 
428  virtual int32_t next(void);
429 
435  virtual int32_t previous(void);
436 
444  virtual int32_t following(int32_t offset);
445 
453  virtual int32_t preceding(int32_t offset);
454 
463  virtual UBool isBoundary(int32_t offset);
464 
470  virtual int32_t current(void) const;
471 
472 
505  virtual int32_t getRuleStatus() const;
506 
530  virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
531 
543  virtual UClassID getDynamicClassID(void) const;
544 
556  static UClassID U_EXPORT2 getStaticClassID(void);
557 
558  /*
559  * Create a clone (copy) of this break iterator in memory provided
560  * by the caller. The idea is to increase performance by avoiding
561  * a storage allocation. Use of this function is NOT RECOMMENDED.
562  * Performance gains are minimal, and correct buffer management is
563  * tricky. Use clone() instead.
564  *
565  * @param stackBuffer The pointer to the memory into which the cloned object
566  * should be placed. If NULL, allocate heap memory
567  * for the cloned object.
568  * @param BufferSize The size of the buffer. If zero, return the required
569  * buffer size, but do not clone the object. If the
570  * size was too small (but not zero), allocate heap
571  * storage for the cloned object.
572  *
573  * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
574  * returned if the provided buffer was too small, and
575  * the clone was therefore put on the heap.
576  *
577  * @return Pointer to the clone object. This may differ from the stackBuffer
578  * address if the byte alignment of the stack buffer was not suitable
579  * or if the stackBuffer was too small to hold the clone.
580  * @stable ICU 2.0
581  */
582  virtual BreakIterator * createBufferClone(void *stackBuffer,
583  int32_t &BufferSize,
584  UErrorCode &status);
585 
586 
604  virtual const uint8_t *getBinaryRules(uint32_t &length);
605 
606 
607 protected:
608  //=======================================================================
609  // implementation
610  //=======================================================================
616  virtual void reset(void);
617 
618 #if 0
619 
627  virtual UBool isDictionaryChar(UChar32);
628 
633  virtual int32_t getBreakType() const;
634 #endif
635 
640  virtual void setBreakType(int32_t type);
641 
647  void init();
648 
649 private:
650 
660  int32_t handlePrevious(const RBBIStateTable *statetable);
661 
671  int32_t handleNext(const RBBIStateTable *statetable);
672 
673 protected:
674 
689  int32_t checkDictionary(int32_t startPos, int32_t endPos, UBool reverse);
690 
691 private:
692 
699  const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
700 
704  void makeRuleStatusValid();
705 
706 };
707 
708 //------------------------------------------------------------------------------
709 //
710 // Inline Functions Definitions ...
711 //
712 //------------------------------------------------------------------------------
713 
715  return !operator==(that);
716 }
717 
719 
720 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
721 
722 #endif
C++ API: Break Iterator.
The BreakIterator class implements methods for finding the location of boundaries in text...
Definition: brkiter.h:100
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:65
UBool operator!=(const BreakIterator &that) const
Not-equal operator.
Definition: rbbi.h:714
EDontAdopt
Constant to be used in the constructor RuleBasedBreakIterator(RBBIDataHeader*, EDontAdopt, UErrorCode &); which does not adopt the memory indicated by the RBBIDataHeader* parameter.
Definition: rbbi.h:181
virtual CharacterIterator & getText(void) const =0
Return a CharacterIterator over the text being analyzed.
virtual BreakIterator * createBufferClone(void *stackBuffer, int32_t &BufferSize, UErrorCode &status)=0
Thread-safe client-buffer-based cloning operation. Do NOT call delete on a safeclone, since 'new' is not used to create it.
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: uchriter.h:33
int32_t fLastRuleStatusIndex
Index of the Rule {tag} values for the most recent match.
Definition: rbbi.h:104
virtual UBool operator==(const BreakIterator &that) const
Equality operator.
unsigned char uint8_t
Define 64 bit limits.
Definition: pwin32.h:131
unsigned int uint32_t
Define 64 bit limits.
Definition: pwin32.h:147
virtual int32_t current(void) const =0
Return character index of the current iterator position within the text.
virtual void setText(const UnicodeString &text)=0
Change the text over which this operates.
UBool operator!=(const BreakIterator &rhs) const
Returns the complement of the result of operator==.
Definition: brkiter.h:129
virtual int32_t next(void)=0
Return the boundary following the current boundary.
virtual void adoptText(CharacterIterator *it)=0
Change the text over which this operates.
Abstract class that defines an API for iteration on text objects.
Definition: chariter.h:356
CharacterIterator * fCharIter
A character iterator that refers to the same text as the UText, above.
Definition: rbbi.h:79
UText * fText
The UText through which this BreakIterator accesses the text.
Definition: rbbi.h:72
RBBIDataWrapper * fData
The rule data for this BreakIterator instance.
Definition: rbbi.h:99
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:183
virtual int32_t first(void)=0
Return the index of the first character in the text being scanned.
UStack * fLanguageBreakEngines
If present, UStack of LanguageBreakEngine objects that might handle dictionary characters.
Definition: rbbi.h:150
virtual int32_t last(void)=0
Return the index immediately BEYOND the last character in the text being scanned. ...
C++ API: String Character Iterator.
virtual BreakIterator * clone(void) const =0
Return a polymorphic copy of this object.
virtual int32_t following(int32_t offset)=0
Return the first boundary following the specified offset.
uint32_t fDictionaryCharCount
Counter for the number of characters encountered with the "dictionary" flag set.
Definition: rbbi.h:119
int32_t fNumCachedBreakPositions
The number of elements in fCachedBreakPositions.
Definition: rbbi.h:134
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:319
C API: Data loading interface.
virtual int32_t preceding(int32_t offset)=0
Return the first boundary preceding the specified offset.
struct UDataMemory UDataMemory
Forward declaration of the data memory type.
Definition: udata.h:150
virtual UBool operator==(const BreakIterator &) const =0
Return true if another object is semantically equal to this one.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:187
#define U_EXPORT2
Definition: platform.h:338
UnhandledEngine * fUnhandledBreakEngine
If present, the special LanguageBreakEngine used for handling characters that are in the dictionary s...
Definition: rbbi.h:159
C++ API: UChar Character Iterator.
A concrete subclass of CharacterIterator that iterates over the characters (code units or code points...
Definition: schriter.h:43
virtual UBool isBoundary(int32_t offset)=0
Return true if the specified position is a boundary position.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API If the compiler doesn't support namespaces...
Definition: uversion.h:184
UCharCharacterIterator * fDCharIter
When the input text is provided by a UText, this dummy CharacterIterator over an empty string will be...
Definition: rbbi.h:93
C API: Parse Error Information.
int32_t fPositionInCache
If fCachedBreakPositions is not null, this indicates which item in the cache the current iteration po...
Definition: rbbi.h:141
UBool fLastStatusIndexValid
Rule tag value valid flag.
Definition: rbbi.h:112
void * UClassID
UClassID is used to identify classes without using RTTI, since RTTI is not yet supported by all C++ c...
Definition: utypes.h:339
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:593
virtual int32_t previous(void)=0
Return the boundary preceding the current boundary.
int32_t * fCachedBreakPositions
When a range of characters is divided up using the dictionary, the break positions that are discovere...
Definition: rbbi.h:128
int32_t fBreakType
The type of the break iterator, or -1 if it has not been set.
Definition: rbbi.h:166
virtual UText * getUText(UText *fillIn, UErrorCode &status) const =0
Get a UText for the text being analyzed.
UText struct.
Definition: utext.h:1307
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:56
Basic definitions for ICU, for both C and C++ APIs.
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:474
StringCharacterIterator * fSCharIter
When the input text is provided by a UnicodeString, this will point to a characterIterator that wraps...
Definition: rbbi.h:86
virtual UClassID getDynamicClassID(void) const =0
Return a polymorphic class ID for this object.
signed int int32_t
Define 64 bit limits.
Definition: pwin32.h:143
int8_t UBool
The ICU boolean type.
Definition: umachine.h:208