1 /* 2 * Copyright (c) 1996, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 /* 27 * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved 28 * (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved 29 * 30 * The original version of this source code and documentation 31 * is copyrighted and owned by Taligent, Inc., a wholly-owned 32 * subsidiary of IBM. These materials are provided under terms 33 * of a License Agreement between Taligent and Sun. This technology 34 * is protected by multiple US and International patents. 35 * 36 * This notice and attribution to Taligent may not be removed. 37 * Taligent is a registered trademark of Taligent, Inc. 
38 * 39 */ 40 41 package java.text; 42 43 import java.lang.ref.SoftReference; 44 import java.text.spi.BreakIteratorProvider; 45 import java.util.Locale; 46 import sun.util.locale.provider.LocaleProviderAdapter; 47 import sun.util.locale.provider.LocaleServiceProviderPool; 48 49 50 /** 51 * The {@code BreakIterator} class implements methods for finding 52 * the location of boundaries in text. Instances of {@code BreakIterator} 53 * maintain a current position and scan over text 54 * returning the index of characters where boundaries occur. 55 * Internally, {@code BreakIterator} scans text using a 56 * {@code CharacterIterator}, and is thus able to scan text held 57 * by any object implementing that protocol. A {@code StringCharacterIterator} 58 * is used to scan {@code String} objects passed to {@code setText}. 59 * 60 * <p> 61 * You use the factory methods provided by this class to create 62 * instances of various types of break iterators. In particular, 63 * use {@code getWordInstance}, {@code getLineInstance}, 64 * {@code getSentenceInstance}, and {@code getCharacterInstance} 65 * to create {@code BreakIterator}s that perform 66 * word, line, sentence, and character boundary analysis respectively. 67 * A single {@code BreakIterator} can work only on one unit 68 * (word, line, sentence, and so on). You must use a different iterator 69 * for each unit boundary analysis you wish to perform. 70 * 71 * <p><a id="line"></a> 72 * Line boundary analysis determines where a text string can be 73 * broken when line-wrapping. The mechanism correctly handles 74 * punctuation and hyphenated words. Actual line breaking needs 75 * to also consider the available line width and is handled by 76 * higher-level software. 77 * 78 * <p><a id="sentence"></a> 79 * Sentence boundary analysis allows selection with correct interpretation 80 * of periods within numbers and abbreviations, and trailing punctuation 81 * marks such as quotation marks and parentheses. 
82 * 83 * <p><a id="word"></a> 84 * Word boundary analysis is used by search and replace functions, as 85 * well as within text editing applications that allow the user to 86 * select words with a double click. Word selection provides correct 87 * interpretation of punctuation marks within and following 88 * words. Characters that are not part of a word, such as symbols 89 * or punctuation marks, have word-breaks on both sides. 90 * 91 * <p><a id="character"></a> 92 * Character boundary analysis allows users to interact with characters 93 * as they expect to, for example, when moving the cursor through a text 94 * string. Character boundary analysis provides correct navigation 95 * through character strings, regardless of how the character is stored. 96 * The boundaries returned may be those of supplementary characters, 97 * combining character sequences, or ligature clusters. 98 * For example, an accented character might be stored as a base character 99 * and a diacritical mark. What users consider to be a character can 100 * differ between languages. 101 * 102 * <p> 103 * The {@code BreakIterator} instances returned by the factory methods 104 * of this class are intended for use with natural languages only, not for 105 * programming language text. It is however possible to define subclasses 106 * that tokenize a programming language. 
107 * 108 * <P> 109 * <strong>Examples</strong>:<P> 110 * Creating and using text boundaries: 111 * <blockquote> 112 * <pre> 113 * public static void main(String args[]) { 114 * if (args.length == 1) { 115 * String stringToExamine = args[0]; 116 * //print each word in order 117 * BreakIterator boundary = BreakIterator.getWordInstance(); 118 * boundary.setText(stringToExamine); 119 * printEachForward(boundary, stringToExamine); 120 * //print each sentence in reverse order 121 * boundary = BreakIterator.getSentenceInstance(Locale.US); 122 * boundary.setText(stringToExamine); 123 * printEachBackward(boundary, stringToExamine); 124 * printFirst(boundary, stringToExamine); 125 * printLast(boundary, stringToExamine); 126 * } 127 * } 128 * </pre> 129 * </blockquote> 130 * 131 * Print each element in order: 132 * <blockquote> 133 * <pre> 134 * public static void printEachForward(BreakIterator boundary, String source) { 135 * int start = boundary.first(); 136 * for (int end = boundary.next(); 137 * end != BreakIterator.DONE; 138 * start = end, end = boundary.next()) { 139 * System.out.println(source.substring(start,end)); 140 * } 141 * } 142 * </pre> 143 * </blockquote> 144 * 145 * Print each element in reverse order: 146 * <blockquote> 147 * <pre> 148 * public static void printEachBackward(BreakIterator boundary, String source) { 149 * int end = boundary.last(); 150 * for (int start = boundary.previous(); 151 * start != BreakIterator.DONE; 152 * end = start, start = boundary.previous()) { 153 * System.out.println(source.substring(start,end)); 154 * } 155 * } 156 * </pre> 157 * </blockquote> 158 * 159 * Print first element: 160 * <blockquote> 161 * <pre> 162 * public static void printFirst(BreakIterator boundary, String source) { 163 * int start = boundary.first(); 164 * int end = boundary.next(); 165 * System.out.println(source.substring(start,end)); 166 * } 167 * </pre> 168 * </blockquote> 169 * 170 * Print last element: 171 * <blockquote> 172 * <pre> 173 * public static 
void printLast(BreakIterator boundary, String source) { 174 * int end = boundary.last(); 175 * int start = boundary.previous(); 176 * System.out.println(source.substring(start,end)); 177 * } 178 * </pre> 179 * </blockquote> 180 * 181 * Print the element at a specified position: 182 * <blockquote> 183 * <pre> 184 * public static void printAt(BreakIterator boundary, int pos, String source) { 185 * int end = boundary.following(pos); 186 * int start = boundary.previous(); 187 * System.out.println(source.substring(start,end)); 188 * } 189 * </pre> 190 * </blockquote> 191 * 192 * Find the next word: 193 * <blockquote> 194 * <pre>{@code 195 * public static int nextWordStartAfter(int pos, String text) { 196 * BreakIterator wb = BreakIterator.getWordInstance(); 197 * wb.setText(text); 198 * int last = wb.following(pos); 199 * int current = wb.next(); 200 * while (current != BreakIterator.DONE) { 201 * for (int p = last; p < current; p++) { 202 * if (Character.isLetter(text.codePointAt(p))) 203 * return last; 204 * } 205 * last = current; 206 * current = wb.next(); 207 * } 208 * return BreakIterator.DONE; 209 * } 210 * }</pre> 211 * (The iterator returned by BreakIterator.getWordInstance() is unique in that 212 * the break positions it returns don't represent both the start and end of the 213 * thing being iterated over. That is, a sentence-break iterator returns breaks 214 * that each represent the end of one sentence and the beginning of the next. 215 * With the word-break iterator, the characters between two boundaries might be a 216 * word, or they might be the punctuation or whitespace between two words. 
The
 * above code uses a simple heuristic to determine which boundary is the beginning
 * of a word: If the characters between this boundary and the next boundary
 * include at least one letter (this can be an alphabetical letter, a CJK ideograph,
 * a Hangul syllable, a Kana character, etc.), then the text between this boundary
 * and the next is a word; otherwise, it's the material between words.)
 * </blockquote>
 *
 * @since 1.1
 * @see CharacterIterator
 *
 */

public abstract class BreakIterator implements Cloneable
{
    /**
     * Constructor. BreakIterator is stateless and has no default behavior.
     */
    protected BreakIterator()
    {
    }

    /**
     * Create a copy of this iterator
     * @return A copy of this
     */
    @Override
    public Object clone()
    {
        try {
            return super.clone();
        }
        catch (CloneNotSupportedException e) {
            // Cannot happen: this class implements Cloneable.
            throw new InternalError(e);
        }
    }

    /**
     * DONE is returned by previous(), next(), next(int), preceding(int)
     * and following(int) when either the first or last text boundary has been
     * reached.
     */
    public static final int DONE = -1;

    /**
     * Returns the first boundary. The iterator's current position is set
     * to the first text boundary.
     * @return The character index of the first text boundary.
     */
    public abstract int first();

    /**
     * Returns the last boundary. The iterator's current position is set
     * to the last text boundary.
     * @return The character index of the last text boundary.
     */
    public abstract int last();

    /**
     * Returns the nth boundary from the current boundary. If either
     * the first or last text boundary has been reached, it returns
     * {@code BreakIterator.DONE} and the current position is set to either
     * the first or last text boundary depending on which one is reached. Otherwise,
     * the iterator's current position is set to the new boundary.
     * For example, if the iterator's current position is the mth text boundary
     * and three more boundaries exist from the current boundary to the last text
     * boundary, the next(2) call will return m + 2. The new text position is set
     * to the (m + 2)th text boundary. A next(4) call would return
     * {@code BreakIterator.DONE} and the last text boundary would become the
     * new text position.
     * @param n which boundary to return.  A value of 0
     * does nothing.  Negative values move to previous boundaries
     * and positive values move to later boundaries.
     * @return The character index of the nth boundary from the current position
     * or {@code BreakIterator.DONE} if either first or last text boundary
     * has been reached.
     */
    public abstract int next(int n);

    /**
     * Returns the boundary following the current boundary. If the current boundary
     * is the last text boundary, it returns {@code BreakIterator.DONE} and
     * the iterator's current position is unchanged. Otherwise, the iterator's
     * current position is set to the boundary following the current boundary.
     * @return The character index of the next text boundary or
     * {@code BreakIterator.DONE} if the current boundary is the last text
     * boundary.
     * Equivalent to next(1).
     * @see #next(int)
     */
    public abstract int next();

    /**
     * Returns the boundary preceding the current boundary. If the current boundary
     * is the first text boundary, it returns {@code BreakIterator.DONE} and
     * the iterator's current position is unchanged. Otherwise, the iterator's
     * current position is set to the boundary preceding the current boundary.
     * @return The character index of the previous text boundary or
     * {@code BreakIterator.DONE} if the current boundary is the first text
     * boundary.
     */
    public abstract int previous();

    /**
     * Returns the first boundary following the specified character offset. If the
     * specified offset is equal to the last text boundary, it returns
     * {@code BreakIterator.DONE} and the iterator's current position is unchanged.
     * Otherwise, the iterator's current position is set to the returned boundary.
     * The value returned is always greater than the offset or the value
     * {@code BreakIterator.DONE}.
     * @param offset the character offset to begin scanning.
     * @return The first boundary after the specified offset or
     * {@code BreakIterator.DONE} if the last text boundary is passed in
     * as the offset.
     * @throws     IllegalArgumentException if the specified offset is less than
     * the first text boundary or greater than the last text boundary.
     */
    public abstract int following(int offset);

    /**
     * Returns the last boundary preceding the specified character offset. If the
     * specified offset is equal to the first text boundary, it returns
     * {@code BreakIterator.DONE} and the iterator's current position is unchanged.
     * Otherwise, the iterator's current position is set to the returned boundary.
     * The value returned is always less than the offset or the value
     * {@code BreakIterator.DONE}.
     * <p>
     * This default implementation is built on {@link #following(int)},
     * {@link #last()} and {@link #previous()}: it locates a boundary at or
     * after {@code offset} and then steps backwards until it finds one that is
     * strictly less than {@code offset}.
     * @param offset the character offset to begin scanning.
     * @return The last boundary before the specified offset or
     * {@code BreakIterator.DONE} if the first text boundary is passed in
     * as the offset.
     * @throws     IllegalArgumentException if the specified offset is less than
     * the first text boundary or greater than the last text boundary.
     * @since 1.2
     */
    public int preceding(int offset) {
        // NOTE:  This implementation is here solely because we can't add new
        // abstract methods to an existing class.  There is almost ALWAYS a
        // better, faster way to do this.
        int pos = following(offset);
        if (pos == DONE) {
            // following() reports DONE when offset is the last text boundary
            // and leaves the position untouched.  DONE (-1) would end the loop
            // below immediately and be returned, although a boundary before
            // offset may well exist, so restart the backward scan from the
            // last boundary instead.
            pos = last();
        }
        while (pos >= offset && pos != DONE) {
            pos = previous();
        }
        return pos;
    }

    /**
     * Returns true if the specified character offset is a text boundary.
     * @param offset the character offset to check.
     * @return {@code true} if "offset" is a boundary position,
     * {@code false} otherwise.
     * @throws     IllegalArgumentException if the specified offset is less than
     * the first text boundary or greater than the last text boundary.
     * @since 1.2
     */
    public boolean isBoundary(int offset) {
        // NOTE: This implementation probably is wrong for most situations
        // because it fails to take into account the possibility that a
        // CharacterIterator passed to setText() may not have a begin offset
        // of 0.  But since the abstract BreakIterator doesn't have that
        // knowledge, it assumes the begin offset is 0.  If you subclass
        // BreakIterator, copy the SimpleTextBoundary implementation of this
        // function into your subclass.  [This should have been abstract at
        // this level, but it's too late to fix that now.]
        if (offset == 0) {
            return true;
        }
        // offset is a boundary exactly when it is the first boundary found
        // after the character position just before it.
        int boundary = following(offset - 1);
        if (boundary == DONE) {
            throw new IllegalArgumentException();
        }
        return boundary == offset;
    }

    /**
     * Returns character index of the text boundary that was most
     * recently returned by next(), next(int), previous(), first(), last(),
     * following(int) or preceding(int). If any of these methods returns
     * {@code BreakIterator.DONE} because either first or last text boundary
     * has been reached, it returns the first or last text boundary depending on
     * which one is reached.
     * @return The text boundary returned from the above methods, first or last
     * text boundary.
     * @see #next()
     * @see #next(int)
     * @see #previous()
     * @see #first()
     * @see #last()
     * @see #following(int)
     * @see #preceding(int)
     */
    public abstract int current();

    /**
     * Get the text being scanned
     * @return the text being scanned
     */
    public abstract CharacterIterator getText();

    /**
     * Set a new text string to be scanned.  The current scan
     * position is reset to first().
     * @param newText new text to scan.
     */
    public void setText(String newText)
    {
        setText(new StringCharacterIterator(newText));
    }

    /**
     * Set a new text for scanning.  The current scan
     * position is reset to first().
     * @param newText new text to scan.
     */
    public abstract void setText(CharacterIterator newText);

    // Slot numbers into iterCache, one per kind of break iterator.
    private static final int CHARACTER_INDEX = 0;
    private static final int WORD_INDEX = 1;
    private static final int LINE_INDEX = 2;
    private static final int SENTENCE_INDEX = 3;

    // One softly-referenced cache entry per iterator kind; each entry holds
    // the iterator created for the most recently requested locale.
    @SuppressWarnings("unchecked")
    private static final SoftReference<BreakIteratorCache>[] iterCache =
        (SoftReference<BreakIteratorCache>[]) new SoftReference<?>[4];

    /**
     * Returns a new {@code BreakIterator} instance
     * for <a href="BreakIterator.html#word">word breaks</a>
     * for the {@linkplain Locale#getDefault() default locale}.
     * @return A break iterator for word breaks
     */
    public static BreakIterator getWordInstance()
    {
        return getWordInstance(Locale.getDefault());
    }

    /**
     * Returns a new {@code BreakIterator} instance
     * for <a href="BreakIterator.html#word">word breaks</a>
     * for the given locale.
     * @param locale the desired locale
     * @return A break iterator for word breaks
     * @throws    NullPointerException if {@code locale} is null
     */
    public static BreakIterator getWordInstance(Locale locale)
    {
        return getBreakInstance(locale, WORD_INDEX);
    }

    /**
     * Returns a new {@code BreakIterator} instance
     * for <a href="BreakIterator.html#line">line breaks</a>
     * for the {@linkplain Locale#getDefault() default locale}.
     * @return A break iterator for line breaks
     */
    public static BreakIterator getLineInstance()
    {
        return getLineInstance(Locale.getDefault());
    }

    /**
     * Returns a new {@code BreakIterator} instance
     * for <a href="BreakIterator.html#line">line breaks</a>
     * for the given locale.
     * @param locale the desired locale
     * @return A break iterator for line breaks
     * @throws    NullPointerException if {@code locale} is null
     */
    public static BreakIterator getLineInstance(Locale locale)
    {
        return getBreakInstance(locale, LINE_INDEX);
    }

    /**
     * Returns a new {@code BreakIterator} instance
     * for <a href="BreakIterator.html#character">character breaks</a>
     * for the {@linkplain Locale#getDefault() default locale}.
     * @return A break iterator for character breaks
     */
    public static BreakIterator getCharacterInstance()
    {
        return getCharacterInstance(Locale.getDefault());
    }

    /**
     * Returns a new {@code BreakIterator} instance
     * for <a href="BreakIterator.html#character">character breaks</a>
     * for the given locale.
     * @param locale the desired locale
     * @return A break iterator for character breaks
     * @throws    NullPointerException if {@code locale} is null
     */
    public static BreakIterator getCharacterInstance(Locale locale)
    {
        return getBreakInstance(locale, CHARACTER_INDEX);
    }

    /**
     * Returns a new {@code BreakIterator} instance
     * for <a href="BreakIterator.html#sentence">sentence breaks</a>
     * for the {@linkplain Locale#getDefault() default locale}.
     * @return A break iterator for sentence breaks
     */
    public static BreakIterator getSentenceInstance()
    {
        return getSentenceInstance(Locale.getDefault());
    }

    /**
     * Returns a new {@code BreakIterator} instance
     * for <a href="BreakIterator.html#sentence">sentence breaks</a>
     * for the given locale.
     * @param locale the desired locale
     * @return A break iterator for sentence breaks
     * @throws    NullPointerException if {@code locale} is null
     */
    public static BreakIterator getSentenceInstance(Locale locale)
    {
        return getBreakInstance(locale, SENTENCE_INDEX);
    }

    /**
     * Returns a break iterator of the given kind for the given locale,
     * serving a clone of the cached iterator when the cache slot for
     * {@code type} still holds an entry for an equal locale, and otherwise
     * creating a new iterator and replacing the slot's entry with it.
     *
     * @param locale the desired locale
     * @param type one of the {@code *_INDEX} constants
     * @return a break iterator for the requested kind and locale
     */
    private static BreakIterator getBreakInstance(Locale locale, int type) {
        // Read the shared slot once so that the null check and the
        // dereference below operate on the same reference.
        SoftReference<BreakIteratorCache> ref = iterCache[type];
        if (ref != null) {
            BreakIteratorCache cache = ref.get();
            if (cache != null && cache.getLocale().equals(locale)) {
                return cache.createBreakInstance();
            }
        }

        BreakIterator result = createBreakInstance(locale, type);
        iterCache[type] = new SoftReference<>(new BreakIteratorCache(locale, result));
        return result;
    }

    /**
     * Creates a break iterator through the provider adapter selected for
     * {@code locale}, falling back to the JRE adapter when that adapter's
     * provider yields {@code null}.
     *
     * @param locale the desired locale
     * @param type one of the {@code *_INDEX} constants
     * @return the iterator obtained from the selected or the JRE adapter
     */
    private static BreakIterator createBreakInstance(Locale locale,
                                                     int type) {
        LocaleProviderAdapter adapter =
            LocaleProviderAdapter.getAdapter(BreakIteratorProvider.class, locale);
        BreakIterator iterator = createBreakInstance(adapter, locale, type);
        if (iterator == null) {
            iterator = createBreakInstance(LocaleProviderAdapter.forJRE(), locale, type);
        }
        return iterator;
    }

    /**
     * Asks the given adapter's {@code BreakIteratorProvider} for an iterator
     * of the requested kind.
     *
     * @param adapter the adapter whose provider is consulted
     * @param locale the desired locale
     * @param type one of the {@code *_INDEX} constants
     * @return the provider's iterator, or {@code null} if {@code type} is
     * not one of the known constants
     */
    private static BreakIterator createBreakInstance(LocaleProviderAdapter adapter,
                                                     Locale locale,
                                                     int type) {
        BreakIteratorProvider breakIteratorProvider = adapter.getBreakIteratorProvider();
        switch (type) {
        case CHARACTER_INDEX:
            return breakIteratorProvider.getCharacterInstance(locale);
        case WORD_INDEX:
            return breakIteratorProvider.getWordInstance(locale);
        case LINE_INDEX:
            return breakIteratorProvider.getLineInstance(locale);
        case SENTENCE_INDEX:
            return breakIteratorProvider.getSentenceInstance(locale);
        default:
            return null;
        }
    }

    /**
     * Returns an array of all locales for which the
     * {@code get*Instance} methods of this class can return
     * localized instances.
     * The returned array represents the union of locales supported by the Java
     * runtime and by installed
     * {@link java.text.spi.BreakIteratorProvider BreakIteratorProvider} implementations.
     * It must contain at least a {@code Locale}
     * instance equal to {@link java.util.Locale#US Locale.US}.
     *
     * @return An array of locales for which localized
     *         {@code BreakIterator} instances are available.
     */
    public static synchronized Locale[] getAvailableLocales()
    {
        LocaleServiceProviderPool pool =
            LocaleServiceProviderPool.getPool(BreakIteratorProvider.class);
        return pool.getAvailableLocales();
    }

    /**
     * Cache entry pairing a locale with a private copy of the break iterator
     * created for it.  The entry stores a clone of the iterator it is given
     * and hands out a further clone on every request, so the stored copy is
     * never shared with callers.
     */
    private static final class BreakIteratorCache {

        private final BreakIterator iter;
        private final Locale locale;

        BreakIteratorCache(Locale locale, BreakIterator iter) {
            this.locale = locale;
            this.iter = (BreakIterator) iter.clone();
        }

        Locale getLocale() {
            return locale;
        }

        BreakIterator createBreakInstance() {
            return (BreakIterator) iter.clone();
        }
    }
}