Line data Source code
1 : /*
2 : ********************************************************************************
3 : * Copyright (C) 1997-2007, International Business Machines
4 : * Corporation and others. All Rights Reserved.
5 : ********************************************************************************
6 : *
7 : * File brkiter.h
8 : *
9 : * Modification History:
10 : *
11 : * Date Name Description
12 : * 02/18/97 aliu Added typedef for TextCount. Made DONE const.
13 : * 05/07/97 aliu Fixed DLL declaration.
14 : * 07/09/97 jfitz Renamed BreakIterator and interface synced with JDK
15 : * 08/11/98 helena Sync-up JDK1.2.
16 : * 01/13/2000 helena Added UErrorCode parameter to createXXXInstance methods.
17 : ********************************************************************************
18 : */
19 :
20 : #ifndef BRKITER_H
21 : #define BRKITER_H
22 :
23 : #include "unicode/utypes.h"
24 :
25 : /**
26 : * \file
27 : * \brief C++ API: Break Iterator.
28 : */
29 :
30 : #if UCONFIG_NO_BREAK_ITERATION
31 :
32 : U_NAMESPACE_BEGIN
33 :
34 : /*
35 : * Allow the declaration of APIs with pointers to BreakIterator
36 : * even when break iteration is removed from the build.
37 : */
38 : class BreakIterator;
39 :
40 : U_NAMESPACE_END
41 :
42 : #else
43 :
44 : #include "unicode/uobject.h"
45 : #include "unicode/unistr.h"
46 : #include "unicode/chariter.h"
47 : #include "unicode/locid.h"
48 : #include "unicode/ubrk.h"
49 : #include "unicode/strenum.h"
50 : #include "unicode/utext.h"
51 : #include "unicode/umisc.h"
52 :
53 : U_NAMESPACE_BEGIN
54 :
55 : /**
56 : * The BreakIterator class implements methods for finding the location
57 : * of boundaries in text. BreakIterator is an abstract base class.
58 : * Instances of BreakIterator maintain a current position and scan over
59 : * text returning the index of characters where boundaries occur.
60 : * <p>
61 : * Line boundary analysis determines where a text string can be broken
62 : * when line-wrapping. The mechanism correctly handles punctuation and
63 : * hyphenated words.
64 : * <p>
65 : * Sentence boundary analysis allows selection with correct
66 : * interpretation of periods within numbers and abbreviations, and
67 : * trailing punctuation marks such as quotation marks and parentheses.
68 : * <p>
69 : * Word boundary analysis is used by search and replace functions, as
70 : * well as within text editing applications that allow the user to
71 : * select words with a double click. Word selection provides correct
72 : * interpretation of punctuation marks within and following
73 : * words. Characters that are not part of a word, such as symbols or
74 : * punctuation marks, have word-breaks on both sides.
75 : * <p>
76 : * Character boundary analysis allows users to interact with
77 : * characters as they expect to, for example, when moving the cursor
78 : * through a text string. Character boundary analysis provides correct
79 : * navigation through character strings, regardless of how the
80 : * character is stored. For example, an accented character might be
81 : * stored as a base character and a diacritical mark. What users
82 : * consider to be a character can differ between languages.
83 : * <p>
84 : * The text boundary positions are found according to the rules
85 : * described in Unicode Standard Annex #29, Text Boundaries, and
86 : * Unicode Standard Annex #14, Line Breaking Properties. These
87 : * are available at http://www.unicode.org/reports/tr14/ and
88 : * http://www.unicode.org/reports/tr29/.
89 : * <p>
90 : * In addition to the C++ API defined in this header file, a
91 : * plain C API with equivalent functionality is defined in the
92 : * file ubrk.h
93 : * <p>
94 : * Code snippets illustrating the use of the Break Iterator APIs
95 : * are available in the ICU User Guide,
96 : * http://icu-project.org/userguide/boundaryAnalysis.html
97 : * and in the sample program icu/source/samples/break/break.cpp
98 : *
99 : */
100 : class U_COMMON_API BreakIterator : public UObject {
101 : public:
102 : /**
103 : * destructor
104 : * @stable ICU 2.0
105 : */
106 : virtual ~BreakIterator();
107 :
108 : /**
109 : * Return true if another object is semantically equal to this
110 : * one. The other object should be an instance of the same subclass of
111 : * BreakIterator. Objects of different subclasses are considered
112 : * unequal.
113 : * <P>
114 : * Return true if this BreakIterator is at the same position in the
115 : * same text, and is the same class and type (word, line, etc.) of
116 : * BreakIterator, as the argument. Text is considered the same if
117 : * it contains the same characters, it need not be the same
118 : * object, and styles are not considered.
119 : * @stable ICU 2.0
120 : */
121 : virtual UBool operator==(const BreakIterator&) const = 0;
122 :
123 : /**
124 : * Returns the complement of the result of operator==
125 : * @param rhs The BreakIterator to be compared for inequality
126 : * @return the complement of the result of operator==
127 : * @stable ICU 2.0
128 : */
129 : UBool operator!=(const BreakIterator& rhs) const { return !operator==(rhs); }
130 :
131 : /**
132 : * Return a polymorphic copy of this object. This is an abstract
133 : * method which subclasses implement.
134 : * @stable ICU 2.0
135 : */
136 : virtual BreakIterator* clone(void) const = 0;
137 :
138 : /**
139 : * Return a polymorphic class ID for this object. Different subclasses
140 : * will return distinct unequal values.
141 : * @stable ICU 2.0
142 : */
143 : virtual UClassID getDynamicClassID(void) const = 0;
144 :
145 : /**
146 : * Return a CharacterIterator over the text being analyzed.
147 : * @stable ICU 2.0
148 : */
149 : virtual CharacterIterator& getText(void) const = 0;
150 :
151 :
152 : /**
153 : * Get a UText for the text being analyzed.
154 : * The returned UText is a shallow clone of the UText used internally
155 : * by the break iterator implementation. It can safely be used to
156 : * access the text without impacting any break iterator operations,
157 : * but the underlying text itself must not be altered.
158 : *
159 : * @param fillIn A UText to be filled in. If NULL, a new UText will be
160 : * allocated to hold the result.
161 : * @param status receives any error codes.
162 : * @return The current UText for this break iterator. If an input
163 : * UText was provided, it will always be returned.
164 : * @stable ICU 3.4
165 : */
166 : virtual UText *getUText(UText *fillIn, UErrorCode &status) const = 0;
167 :
168 : /**
169 : * Change the text over which this operates. The text boundary is
170 : * reset to the start.
171 : * @param text The UnicodeString used to change the text.
172 : * @stable ICU 2.0
173 : */
174 : virtual void setText(const UnicodeString &text) = 0;
175 :
176 : /**
177 : * Reset the break iterator to operate over the text represented by
178 : * the UText. The iterator position is reset to the start.
179 : *
180 : * This function makes a shallow clone of the supplied UText. This means
181 : * that the caller is free to immediately close or otherwise reuse the
182 : * UText that was passed as a parameter, but that the underlying text itself
183 : * must not be altered while being referenced by the break iterator.
184 : *
185 : * @param text The UText used to change the text.
186 : * @param status receives any error codes.
187 : * @stable ICU 3.4
188 : */
189 : virtual void setText(UText *text, UErrorCode &status) = 0;
190 :
191 : /**
192 : * Change the text over which this operates. The text boundary is
193 : * reset to the start.
194 : * Note that setText(UText *) provides similar functionality to this function,
195 : * and is more efficient.
196 : * @param it The CharacterIterator used to change the text.
197 : * @stable ICU 2.0
198 : */
199 : virtual void adoptText(CharacterIterator* it) = 0;
200 :
201 : enum {
202 : /**
203 : * DONE is returned by previous() and next() after all valid
204 : * boundaries have been returned.
205 : * @stable ICU 2.0
206 : */
207 : DONE = (int32_t)-1
208 : };
209 :
210 : /**
211 : * Return the index of the first character in the text being scanned.
212 : * @stable ICU 2.0
213 : */
214 : virtual int32_t first(void) = 0;
215 :
216 : /**
217 : * Return the index immediately BEYOND the last character in the text being scanned.
218 : * @stable ICU 2.0
219 : */
220 : virtual int32_t last(void) = 0;
221 :
222 : /**
223 : * Return the boundary preceding the current boundary.
224 : * @return The character index of the previous text boundary or DONE if all
225 : * boundaries have been returned.
226 : * @stable ICU 2.0
227 : */
228 : virtual int32_t previous(void) = 0;
229 :
230 : /**
231 : * Return the boundary following the current boundary.
232 : * @return The character index of the next text boundary or DONE if all
233 : * boundaries have been returned.
234 : * @stable ICU 2.0
235 : */
236 : virtual int32_t next(void) = 0;
237 :
238 : /**
239 : * Return character index of the current iterator position within the text.
240 : * @return The boundary most recently returned.
241 : * @stable ICU 2.0
242 : */
243 : virtual int32_t current(void) const = 0;
244 :
245 : /**
246 : * Return the first boundary following the specified offset.
247 : * The value returned is always greater than the offset or
248 : * the value BreakIterator::DONE
249 : * @param offset the offset to begin scanning.
250 : * @return The first boundary after the specified offset.
251 : * @stable ICU 2.0
252 : */
253 : virtual int32_t following(int32_t offset) = 0;
254 :
255 : /**
256 : * Return the first boundary preceding the specified offset.
257 : * The value returned is always smaller than the offset or
258 : * the value BreakIterator::DONE
259 : * @param offset the offset to begin scanning.
260 : * @return The first boundary before the specified offset.
261 : * @stable ICU 2.0
262 : */
263 : virtual int32_t preceding(int32_t offset) = 0;
264 :
265 : /**
266 : * Return true if the specified position is a boundary position.
267 : * As a side effect, the current position of the iterator is set
268 : * to the first boundary position at or following the specified offset.
269 : * @param offset the offset to check.
270 : * @return True if "offset" is a boundary position.
271 : * @stable ICU 2.0
272 : */
273 : virtual UBool isBoundary(int32_t offset) = 0;
274 :
275 : /**
276 : * Return the nth boundary from the current boundary
277 : * @param n which boundary to return. A value of 0
278 : * does nothing. Negative values move to previous boundaries
279 : * and positive values move to later boundaries.
280 : * @return The index of the nth boundary from the current position, or
281 : * DONE if there are fewer than |n| boundaries in the specified direction.
282 : * @stable ICU 2.0
283 : */
284 : virtual int32_t next(int32_t n) = 0;
285 :
286 : /**
287 : * Create BreakIterator for word-breaks using the given locale.
288 : * Returns an instance of a BreakIterator implementing word breaks.
289 : * WordBreak is useful for word selection (ex. double click)
290 : * @param where the locale.
291 : * @param status the error code
292 : * @return A BreakIterator for word-breaks. The UErrorCode& status
293 : * parameter is used to return status information to the user.
294 : * To check whether the construction succeeded or not, you should check
295 : * the value of U_SUCCESS(status). If you wish more detailed information, you
296 : * can check for informational error results which still indicate success.
297 : * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
298 : * example, 'de_CH' was requested, but nothing was found there, so 'de' was
299 : * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
300 : * used; neither the requested locale nor any of its fall back locales
301 : * could be found.
302 : * The caller owns the returned object and is responsible for deleting it.
303 : * @stable ICU 2.0
304 : */
305 : static BreakIterator* U_EXPORT2
306 : createWordInstance(const Locale& where, UErrorCode& status);
307 :
308 : /**
309 : * Create BreakIterator for line-breaks using specified locale.
310 : * Returns an instance of a BreakIterator implementing line breaks. Line
311 : * breaks are logically possible line breaks, actual line breaks are
312 : * usually determined based on display width.
313 : * LineBreak is useful for word wrapping text.
314 : * @param where the locale.
315 : * @param status The error code.
316 : * @return A BreakIterator for line-breaks. The UErrorCode& status
317 : * parameter is used to return status information to the user.
318 : * To check whether the construction succeeded or not, you should check
319 : * the value of U_SUCCESS(status). If you wish more detailed information, you
320 : * can check for informational error results which still indicate success.
321 : * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
322 : * example, 'de_CH' was requested, but nothing was found there, so 'de' was
323 : * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
324 : * used; neither the requested locale nor any of its fall back locales
325 : * could be found.
326 : * The caller owns the returned object and is responsible for deleting it.
327 : * @stable ICU 2.0
328 : */
329 : static BreakIterator* U_EXPORT2
330 : createLineInstance(const Locale& where, UErrorCode& status);
331 :
332 : /**
333 : * Create BreakIterator for character-breaks using specified locale.
334 : * Returns an instance of a BreakIterator implementing character breaks.
335 : * Character breaks are boundaries of combining character sequences.
336 : * @param where the locale.
337 : * @param status The error code.
338 : * @return A BreakIterator for character-breaks. The UErrorCode& status
339 : * parameter is used to return status information to the user.
340 : * To check whether the construction succeeded or not, you should check
341 : * the value of U_SUCCESS(status). If you wish more detailed information, you
342 : * can check for informational error results which still indicate success.
343 : * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
344 : * example, 'de_CH' was requested, but nothing was found there, so 'de' was
345 : * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
346 : * used; neither the requested locale nor any of its fall back locales
347 : * could be found.
348 : * The caller owns the returned object and is responsible for deleting it.
349 : * @stable ICU 2.0
350 : */
351 : static BreakIterator* U_EXPORT2
352 : createCharacterInstance(const Locale& where, UErrorCode& status);
353 :
354 : /**
355 : * Create BreakIterator for sentence-breaks using specified locale.
356 : * Returns an instance of a BreakIterator implementing sentence breaks.
357 : * @param where the locale.
358 : * @param status The error code.
359 : * @return A BreakIterator for sentence-breaks. The UErrorCode& status
360 : * parameter is used to return status information to the user.
361 : * To check whether the construction succeeded or not, you should check
362 : * the value of U_SUCCESS(status). If you wish more detailed information, you
363 : * can check for informational error results which still indicate success.
364 : * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
365 : * example, 'de_CH' was requested, but nothing was found there, so 'de' was
366 : * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
367 : * used; neither the requested locale nor any of its fall back locales
368 : * could be found.
369 : * The caller owns the returned object and is responsible for deleting it.
370 : * @stable ICU 2.0
371 : */
372 : static BreakIterator* U_EXPORT2
373 : createSentenceInstance(const Locale& where, UErrorCode& status);
374 :
375 : /**
376 : * Create BreakIterator for title-casing breaks using the specified locale.
377 : * Returns an instance of a BreakIterator implementing title breaks.
378 : * The iterator returned locates title boundaries as described for
379 : * Unicode 3.2 only. For Unicode 4.0 and above title boundary iteration,
380 : * please use a word boundary iterator (see createWordInstance()).
381 : *
382 : * @param where the locale.
383 : * @param status The error code.
384 : * @return A BreakIterator for title-breaks. The UErrorCode& status
385 : * parameter is used to return status information to the user.
386 : * To check whether the construction succeeded or not, you should check
387 : * the value of U_SUCCESS(status). If you wish more detailed information, you
388 : * can check for informational error results which still indicate success.
389 : * U_USING_FALLBACK_WARNING indicates that a fall back locale was used. For
390 : * example, 'de_CH' was requested, but nothing was found there, so 'de' was
391 : * used. U_USING_DEFAULT_WARNING indicates that the default locale data was
392 : * used; neither the requested locale nor any of its fall back locales
393 : * could be found.
394 : * The caller owns the returned object and is responsible for deleting it.
395 : * @stable ICU 2.1
396 : */
397 : static BreakIterator* U_EXPORT2
398 : createTitleInstance(const Locale& where, UErrorCode& status);
399 :
400 : /**
401 : * Get the set of Locales for which TextBoundaries are installed.
402 : * <p><b>Note:</b> this will not return locales added through the register
403 : * call. To see the registered locales too, use the getAvailableLocales
404 : * function that returns a StringEnumeration object </p>
405 : * @param count the output parameter of number of elements in the locale list
406 : * @return available locales
407 : * @stable ICU 2.0
408 : */
409 : static const Locale* U_EXPORT2 getAvailableLocales(int32_t& count);
410 :
411 : /**
412 : * Get name of the object for the desired Locale, in the desired language.
413 : * @param objectLocale must be from getAvailableLocales.
414 : * @param displayLocale specifies the desired locale for output.
415 : * @param name the fill-in parameter of the return value
416 : * Uses best match.
417 : * @return user-displayable name
418 : * @stable ICU 2.0
419 : */
420 : static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
421 : const Locale& displayLocale,
422 : UnicodeString& name);
423 :
424 : /**
425 : * Get name of the object for the desired Locale, in the language of the
426 : * default locale.
427 : * @param objectLocale must be from getAvailableLocales.
428 : * @param name the fill-in parameter of the return value
429 : * @return user-displayable name
430 : * @stable ICU 2.0
431 : */
432 : static UnicodeString& U_EXPORT2 getDisplayName(const Locale& objectLocale,
433 : UnicodeString& name);
434 :
435 : /**
436 : * Thread safe client-buffer-based cloning operation
437 : * Do NOT call delete on a safeclone, since 'new' is not used to create it.
438 : * @param stackBuffer user allocated space for the new clone. If NULL new memory will be allocated.
439 : * If buffer is not large enough, new memory will be allocated.
440 : * @param BufferSize reference to size of allocated space.
441 : * If BufferSize == 0, a sufficient size for use in cloning will
442 : * be returned ('pre-flighting')
443 : * If BufferSize is not enough for a stack-based safe clone,
444 : * new memory will be allocated.
445 : * @param status to indicate whether the operation went on smoothly or there were errors
446 : * An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were
447 : * necessary.
448 : * @return pointer to the new clone
449 : *
450 : * @stable ICU 2.0
451 : */
452 : virtual BreakIterator * createBufferClone(void *stackBuffer,
453 : int32_t &BufferSize,
454 : UErrorCode &status) = 0;
455 :
456 : /**
457 : * Determine whether the BreakIterator was created in user memory by
458 : * createBufferClone(), and thus should not be deleted. Such objects
459 : * must be closed by an explicit call to the destructor (not delete).
460 : * @stable ICU 2.0
461 : */
462 : inline UBool isBufferClone(void);
463 :
464 : #if !UCONFIG_NO_SERVICE
465 : /**
466 : * Register a new break iterator of the indicated kind, to use in the given locale.
467 : * The break iterator will be adopted. Clones of the iterator will be returned
468 : * if a request for a break iterator of the given kind matches or falls back to
469 : * this locale.
470 : * @param toAdopt the BreakIterator instance to be adopted
471 : * @param locale the Locale for which this instance is to be registered
472 : * @param kind the type of iterator for which this instance is to be registered
473 : * @param status the in/out status code, no special meanings are assigned
474 : * @return a registry key that can be used to unregister this instance
475 : * @stable ICU 2.4
476 : */
477 : static URegistryKey U_EXPORT2 registerInstance(BreakIterator* toAdopt,
478 : const Locale& locale,
479 : UBreakIteratorType kind,
480 : UErrorCode& status);
481 :
482 : /**
483 : * Unregister a previously-registered BreakIterator using the key returned from the
484 : * register call. Key becomes invalid after a successful call and should not be used again.
485 : * The BreakIterator corresponding to the key will be deleted.
486 : * @param key the registry key returned by a previous call to registerInstance
487 : * @param status the in/out status code, no special meanings are assigned
488 : * @return TRUE if the iterator for the key was successfully unregistered
489 : * @stable ICU 2.4
490 : */
491 : static UBool U_EXPORT2 unregister(URegistryKey key, UErrorCode& status);
492 :
493 : /**
494 : * Return a StringEnumeration over the locales available at the time of the call,
495 : * including registered locales.
496 : * @return a StringEnumeration over the locales available at the time of the call
497 : * @stable ICU 2.4
498 : */
499 : static StringEnumeration* U_EXPORT2 getAvailableLocales(void);
500 : #endif
501 :
502 : /**
503 : * Returns the locale for this break iterator. Two flavors are available: valid and
504 : * actual locale, selected by the type argument.
505 : * @stable ICU 2.8
506 : */
507 : Locale getLocale(ULocDataLocaleType type, UErrorCode& status) const;
508 :
509 : /** Get the locale for this break iterator object. You can choose between valid and actual locale.
510 : * @param type type of the locale we're looking for (valid or actual)
511 : * @param status error code for the operation
512 : * @return the locale
513 : * @internal
514 : */
515 : const char *getLocaleID(ULocDataLocaleType type, UErrorCode& status) const;
516 :
517 : private:
518 : static BreakIterator* buildInstance(const Locale& loc, const char *type, int32_t kind, UErrorCode& status);
519 : static BreakIterator* createInstance(const Locale& loc, int32_t kind, UErrorCode& status);
520 : static BreakIterator* makeInstance(const Locale& loc, int32_t kind, UErrorCode& status);
521 :
522 : friend class ICUBreakIteratorFactory;
523 : friend class ICUBreakIteratorService;
524 :
525 : protected:
526 : /** @internal */
527 : BreakIterator();
528 : /** @internal Flag returned by isBufferClone(). */
529 : UBool fBufferClone;
530 : /** @internal Copy constructor; sets fBufferClone to FALSE. NOTE(review): actualLocale/validLocale are not initialized here - confirm they are set elsewhere before getLocale()/getLocaleID() is used on a copy. */
531 1 : BreakIterator (const BreakIterator &other) : UObject(other), fBufferClone(FALSE) {}
532 :
533 : private:
534 :
535 : /** @internal */
536 : char actualLocale[ULOC_FULLNAME_CAPACITY];
537 : char validLocale[ULOC_FULLNAME_CAPACITY];
538 :
539 : /**
540 : * The assignment operator has no real implementation.
541 : * It's provided to make the compiler happy. Do not call.
542 : */
543 : BreakIterator& operator=(const BreakIterator&);
544 : };
545 :
546 : inline UBool BreakIterator::isBufferClone() // Accessor for the protected fBufferClone flag.
547 : {
548 : return fBufferClone; // copy-constructed objects start with FALSE; presumably set TRUE by createBufferClone() implementations - TODO confirm
549 : }
550 :
551 : U_NAMESPACE_END
552 :
553 : #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
554 :
555 : #endif // BRKITER_H
556 : //eof
557 :
|