Frames | No Frames |
1: /* java.lang.Character -- Wrapper class for char, and Unicode subsets 2: Copyright (C) 1998, 1999, 2001, 2002, 2005, 2006 Free Software Foundation, Inc. 3: 4: This file is part of GNU Classpath. 5: 6: GNU Classpath is free software; you can redistribute it and/or modify 7: it under the terms of the GNU General Public License as published by 8: the Free Software Foundation; either version 2, or (at your option) 9: any later version. 10: 11: GNU Classpath is distributed in the hope that it will be useful, but 12: WITHOUT ANY WARRANTY; without even the implied warranty of 13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14: General Public License for more details. 15: 16: You should have received a copy of the GNU General Public License 17: along with GNU Classpath; see the file COPYING. If not, write to the 18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 19: 02110-1301 USA. 20: 21: Linking this library statically or dynamically with other modules is 22: making a combined work based on this library. Thus, the terms and 23: conditions of the GNU General Public License cover the whole 24: combination. 25: 26: As a special exception, the copyright holders of this library give you 27: permission to link this library with independent modules to produce an 28: executable, regardless of the license terms of these independent 29: modules, and to copy and distribute the resulting executable under 30: terms of your choice, provided that you also meet, for each linked 31: independent module, the terms and conditions of the license of that 32: module. An independent module is a module which is not derived from 33: or based on this library. If you modify this library, you may extend 34: this exception to your version of the library, but you are not 35: obligated to do so. If you do not wish to do so, delete this 36: exception statement from your version. */ 37: 38: /* 39: * Note: This class must not be merged with Classpath. 
Gcj uses C-style 40: * arrays (see include/java-chartables.h) to store the Unicode character 41: * database, whereas Classpath uses Java objects (char[] extracted from 42: * String constants) in gnu.java.lang.CharData. Gcj's approach is more 43: * efficient, because there is no vtable or data relocation to worry about. 44: * However, despite the difference in the database interface, the two 45: * versions share identical algorithms. 46: */ 47: 48: package java.lang; 49: 50: import java.io.Serializable; 51: import java.text.Collator; 52: import java.util.Locale; 53: 54: /** 55: * Wrapper class for the primitive char data type. In addition, this class 56: * allows one to retrieve property information and perform transformations 57: * on the 57,707 defined characters in the Unicode Standard, Version 3.0.0. 58: * java.lang.Character is designed to be very dynamic, and as such, it 59: * retrieves information on the Unicode character set from a separate 60: * database, gnu.java.lang.CharData, which can be easily upgraded. 61: * 62: * <p>For predicates, boundaries are used to describe 63: * the set of characters for which the method will return true. 64: * This syntax uses fairly normal regular expression notation. 65: * See 5.13 of the Unicode Standard, Version 3.0, for the 66: * boundary specification. 67: * 68: * <p>See <a href="http://www.unicode.org">http://www.unicode.org</a> 69: * for more information on the Unicode Standard. 70: * 71: * @author Tom Tromey (tromey@cygnus.com) 72: * @author Paul N. Fisher 73: * @author Jochen Hoenicke 74: * @author Eric Blake (ebb9@email.byu.edu) 75: * @since 1.0 76: * @status updated to 1.4 77: */ 78: public final class Character implements Serializable, Comparable 79: { 80: /** 81: * A subset of Unicode blocks. 82: * 83: * @author Paul N. Fisher 84: * @author Eric Blake (ebb9@email.byu.edu) 85: * @since 1.2 86: */ 87: public static class Subset 88: { 89: /** The name of the subset. 
*/ 90: private final String name; 91: 92: /** 93: * Construct a new subset of characters. 94: * 95: * @param name the name of the subset 96: * @throws NullPointerException if name is null 97: */ 98: protected Subset(String name) 99: { 100: // Note that name.toString() is name, unless name was null. 101: this.name = name.toString(); 102: } 103: 104: /** 105: * Compares two Subsets for equality. This is <code>final</code>, and 106: * restricts the comparison on the <code>==</code> operator, so it returns 107: * true only for the same object. 108: * 109: * @param o the object to compare 110: * @return true if o is this 111: */ 112: public final boolean equals(Object o) 113: { 114: return o == this; 115: } 116: 117: /** 118: * Makes the original hashCode of Object final, to be consistent with 119: * equals. 120: * 121: * @return the hash code for this object 122: */ 123: public final int hashCode() 124: { 125: return super.hashCode(); 126: } 127: 128: /** 129: * Returns the name of the subset. 130: * 131: * @return the name 132: */ 133: public final String toString() 134: { 135: return name; 136: } 137: } // class Subset 138: 139: /** 140: * A family of character subsets in the Unicode specification. A character 141: * is in at most one of these blocks. 142: * 143: * This inner class was generated automatically from 144: * <code>libjava/gnu/gcj/convert/Blocks-3.txt</code>, by some perl scripts. 145: * This Unicode definition file can be found on the 146: * <a href="http://www.unicode.org">http://www.unicode.org</a> website. 147: * JDK 1.4 uses Unicode version 3.0.0. 148: * 149: * @author scripts/unicode-blocks.pl (written by Eric Blake) 150: * @since 1.2 151: */ 152: public static final class UnicodeBlock extends Subset 153: { 154: /** The start of the subset. */ 155: private final int start; 156: 157: /** The end of the subset. */ 158: private final int end; 159: 160: /** The canonical name of the block according to the Unicode standard. 
*/ 161: private final String canonicalName; 162: 163: /** Constants for the <code>forName()</code> method */ 164: private static final int CANONICAL_NAME = 0; 165: private static final int NO_SPACES_NAME = 1; 166: private static final int CONSTANT_NAME = 2; 167: 168: /** 169: * Constructor for strictly defined blocks. 170: * 171: * @param start the start character of the range 172: * @param end the end character of the range 173: * @param name the block name 174: */ 175: private UnicodeBlock(int start, int end, String name, 176: String canonicalName) 177: { 178: super(name); 179: this.start = start; 180: this.end = end; 181: this.canonicalName = canonicalName; 182: } 183: 184: /** 185: * Returns the Unicode character block which a character belongs to. 186: * <strong>Note</strong>: This method does not support the use of 187: * supplementary characters. For such support, <code>of(int)</code> 188: * should be used instead. 189: * 190: * @param ch the character to look up 191: * @return the set it belongs to, or null if it is not in one 192: */ 193: public static UnicodeBlock of(char ch) 194: { 195: return of((int) ch); 196: } 197: 198: /** 199: * Returns the Unicode character block which a code point belongs to. 200: * 201: * @param codePoint the character to look up 202: * @return the set it belongs to, or null if it is not in one. 203: * @throws IllegalArgumentException if the specified code point is 204: * invalid. 205: * @since 1.5 206: */ 207: public static UnicodeBlock of(int codePoint) 208: { 209: if (codePoint > MAX_CODE_POINT) 210: throw new IllegalArgumentException("The supplied integer value is " + 211: "too large to be a codepoint."); 212: // Simple binary search for the correct block. 
213: int low = 0; 214: int hi = sets.length - 1; 215: while (low <= hi) 216: { 217: int mid = (low + hi) >> 1; 218: UnicodeBlock b = sets[mid]; 219: if (codePoint < b.start) 220: hi = mid - 1; 221: else if (codePoint > b.end) 222: low = mid + 1; 223: else 224: return b; 225: } 226: return null; 227: } 228: 229: /** 230: * <p> 231: * Returns the <code>UnicodeBlock</code> with the given name, as defined 232: * by the Unicode standard. The version of Unicode in use is defined by 233: * the <code>Character</code> class, and the names are given in the 234: * <code>Blocks-<version>.txt</code> file corresponding to that version. 235: * The name may be specified in one of three ways: 236: * </p> 237: * <ol> 238: * <li>The canonical, human-readable name used by the Unicode standard. 239: * This is the name with all spaces and hyphens retained. For example, 240: * `Basic Latin' retrieves the block, UnicodeBlock.BASIC_LATIN.</li> 241: * <li>The canonical name with all spaces removed e.g. `BasicLatin'.</li> 242: * <li>The name used for the constants specified by this class, which 243: * is the canonical name with all spaces and hyphens replaced with 244: * underscores e.g. `BASIC_LATIN'</li> 245: * </ol> 246: * <p> 247: * The names are compared case-insensitively using the case comparison 248: * associated with the U.S. English locale. The method recognises the 249: * previous names used for blocks as well as the current ones. At 250: * present, this simply means that the deprecated `SURROGATES_AREA' 251: * will be recognised by this method (the <code>of()</code> methods 252: * only return one of the three new surrogate blocks). 253: * </p> 254: * 255: * @param blockName the name of the block to look up. 256: * @return the specified block. 257: * @throws NullPointerException if the <code>blockName</code> is 258: * <code>null</code>. 259: * @throws IllegalArgumentException if the name does not match any Unicode 260: * block. 
261: * @since 1.5 262: */ 263: public static final UnicodeBlock forName(String blockName) 264: { 265: int type; 266: if (blockName.indexOf(' ') != -1) 267: type = CANONICAL_NAME; 268: else if (blockName.indexOf('_') != -1) 269: type = CONSTANT_NAME; 270: else 271: type = NO_SPACES_NAME; 272: Collator usCollator = Collator.getInstance(Locale.US); 273: usCollator.setStrength(Collator.PRIMARY); 274: /* Special case for deprecated blocks not in sets */ 275: switch (type) 276: { 277: case CANONICAL_NAME: 278: if (usCollator.compare(blockName, "Surrogates Area") == 0) 279: return SURROGATES_AREA; 280: break; 281: case NO_SPACES_NAME: 282: if (usCollator.compare(blockName, "SurrogatesArea") == 0) 283: return SURROGATES_AREA; 284: break; 285: case CONSTANT_NAME: 286: if (usCollator.compare(blockName, "SURROGATES_AREA") == 0) 287: return SURROGATES_AREA; 288: break; 289: } 290: /* Other cases */ 291: int setLength = sets.length; 292: switch (type) 293: { 294: case CANONICAL_NAME: 295: for (int i = 0; i < setLength; i++) 296: { 297: UnicodeBlock block = sets[i]; 298: if (usCollator.compare(blockName, block.canonicalName) == 0) 299: return block; 300: } 301: break; 302: case NO_SPACES_NAME: 303: for (int i = 0; i < setLength; i++) 304: { 305: UnicodeBlock block = sets[i]; 306: String nsName = block.canonicalName.replaceAll(" ",""); 307: if (usCollator.compare(blockName, nsName) == 0) 308: return block; 309: } 310: break; 311: case CONSTANT_NAME: 312: for (int i = 0; i < setLength; i++) 313: { 314: UnicodeBlock block = sets[i]; 315: if (usCollator.compare(blockName, block.toString()) == 0) 316: return block; 317: } 318: break; 319: } 320: throw new IllegalArgumentException("No Unicode block found for " + 321: blockName + "."); 322: } 323: 324: /** 325: * Basic Latin. 326: * 0x0000 - 0x007F. 327: */ 328: public static final UnicodeBlock BASIC_LATIN 329: = new UnicodeBlock(0x0000, 0x007F, 330: "BASIC_LATIN", 331: "Basic Latin"); 332: 333: /** 334: * Latin-1 Supplement. 
335: * 0x0080 - 0x00FF. 336: */ 337: public static final UnicodeBlock LATIN_1_SUPPLEMENT 338: = new UnicodeBlock(0x0080, 0x00FF, 339: "LATIN_1_SUPPLEMENT", 340: "Latin-1 Supplement"); 341: 342: /** 343: * Latin Extended-A. 344: * 0x0100 - 0x017F. 345: */ 346: public static final UnicodeBlock LATIN_EXTENDED_A 347: = new UnicodeBlock(0x0100, 0x017F, 348: "LATIN_EXTENDED_A", 349: "Latin Extended-A"); 350: 351: /** 352: * Latin Extended-B. 353: * 0x0180 - 0x024F. 354: */ 355: public static final UnicodeBlock LATIN_EXTENDED_B 356: = new UnicodeBlock(0x0180, 0x024F, 357: "LATIN_EXTENDED_B", 358: "Latin Extended-B"); 359: 360: /** 361: * IPA Extensions. 362: * 0x0250 - 0x02AF. 363: */ 364: public static final UnicodeBlock IPA_EXTENSIONS 365: = new UnicodeBlock(0x0250, 0x02AF, 366: "IPA_EXTENSIONS", 367: "IPA Extensions"); 368: 369: /** 370: * Spacing Modifier Letters. 371: * 0x02B0 - 0x02FF. 372: */ 373: public static final UnicodeBlock SPACING_MODIFIER_LETTERS 374: = new UnicodeBlock(0x02B0, 0x02FF, 375: "SPACING_MODIFIER_LETTERS", 376: "Spacing Modifier Letters"); 377: 378: /** 379: * Combining Diacritical Marks. 380: * 0x0300 - 0x036F. 381: */ 382: public static final UnicodeBlock COMBINING_DIACRITICAL_MARKS 383: = new UnicodeBlock(0x0300, 0x036F, 384: "COMBINING_DIACRITICAL_MARKS", 385: "Combining Diacritical Marks"); 386: 387: /** 388: * Greek. 389: * 0x0370 - 0x03FF. 390: */ 391: public static final UnicodeBlock GREEK 392: = new UnicodeBlock(0x0370, 0x03FF, 393: "GREEK", 394: "Greek"); 395: 396: /** 397: * Cyrillic. 398: * 0x0400 - 0x04FF. 399: */ 400: public static final UnicodeBlock CYRILLIC 401: = new UnicodeBlock(0x0400, 0x04FF, 402: "CYRILLIC", 403: "Cyrillic"); 404: 405: /** 406: * Cyrillic Supplementary. 407: * 0x0500 - 0x052F. 408: * @since 1.5 409: */ 410: public static final UnicodeBlock CYRILLIC_SUPPLEMENTARY 411: = new UnicodeBlock(0x0500, 0x052F, 412: "CYRILLIC_SUPPLEMENTARY", 413: "Cyrillic Supplementary"); 414: 415: /** 416: * Armenian. 
417: * 0x0530 - 0x058F. 418: */ 419: public static final UnicodeBlock ARMENIAN 420: = new UnicodeBlock(0x0530, 0x058F, 421: "ARMENIAN", 422: "Armenian"); 423: 424: /** 425: * Hebrew. 426: * 0x0590 - 0x05FF. 427: */ 428: public static final UnicodeBlock HEBREW 429: = new UnicodeBlock(0x0590, 0x05FF, 430: "HEBREW", 431: "Hebrew"); 432: 433: /** 434: * Arabic. 435: * 0x0600 - 0x06FF. 436: */ 437: public static final UnicodeBlock ARABIC 438: = new UnicodeBlock(0x0600, 0x06FF, 439: "ARABIC", 440: "Arabic"); 441: 442: /** 443: * Syriac. 444: * 0x0700 - 0x074F. 445: * @since 1.4 446: */ 447: public static final UnicodeBlock SYRIAC 448: = new UnicodeBlock(0x0700, 0x074F, 449: "SYRIAC", 450: "Syriac"); 451: 452: /** 453: * Thaana. 454: * 0x0780 - 0x07BF. 455: * @since 1.4 456: */ 457: public static final UnicodeBlock THAANA 458: = new UnicodeBlock(0x0780, 0x07BF, 459: "THAANA", 460: "Thaana"); 461: 462: /** 463: * Devanagari. 464: * 0x0900 - 0x097F. 465: */ 466: public static final UnicodeBlock DEVANAGARI 467: = new UnicodeBlock(0x0900, 0x097F, 468: "DEVANAGARI", 469: "Devanagari"); 470: 471: /** 472: * Bengali. 473: * 0x0980 - 0x09FF. 474: */ 475: public static final UnicodeBlock BENGALI 476: = new UnicodeBlock(0x0980, 0x09FF, 477: "BENGALI", 478: "Bengali"); 479: 480: /** 481: * Gurmukhi. 482: * 0x0A00 - 0x0A7F. 483: */ 484: public static final UnicodeBlock GURMUKHI 485: = new UnicodeBlock(0x0A00, 0x0A7F, 486: "GURMUKHI", 487: "Gurmukhi"); 488: 489: /** 490: * Gujarati. 491: * 0x0A80 - 0x0AFF. 492: */ 493: public static final UnicodeBlock GUJARATI 494: = new UnicodeBlock(0x0A80, 0x0AFF, 495: "GUJARATI", 496: "Gujarati"); 497: 498: /** 499: * Oriya. 500: * 0x0B00 - 0x0B7F. 501: */ 502: public static final UnicodeBlock ORIYA 503: = new UnicodeBlock(0x0B00, 0x0B7F, 504: "ORIYA", 505: "Oriya"); 506: 507: /** 508: * Tamil. 509: * 0x0B80 - 0x0BFF. 
510: */ 511: public static final UnicodeBlock TAMIL 512: = new UnicodeBlock(0x0B80, 0x0BFF, 513: "TAMIL", 514: "Tamil"); 515: 516: /** 517: * Telugu. 518: * 0x0C00 - 0x0C7F. 519: */ 520: public static final UnicodeBlock TELUGU 521: = new UnicodeBlock(0x0C00, 0x0C7F, 522: "TELUGU", 523: "Telugu"); 524: 525: /** 526: * Kannada. 527: * 0x0C80 - 0x0CFF. 528: */ 529: public static final UnicodeBlock KANNADA 530: = new UnicodeBlock(0x0C80, 0x0CFF, 531: "KANNADA", 532: "Kannada"); 533: 534: /** 535: * Malayalam. 536: * 0x0D00 - 0x0D7F. 537: */ 538: public static final UnicodeBlock MALAYALAM 539: = new UnicodeBlock(0x0D00, 0x0D7F, 540: "MALAYALAM", 541: "Malayalam"); 542: 543: /** 544: * Sinhala. 545: * 0x0D80 - 0x0DFF. 546: * @since 1.4 547: */ 548: public static final UnicodeBlock SINHALA 549: = new UnicodeBlock(0x0D80, 0x0DFF, 550: "SINHALA", 551: "Sinhala"); 552: 553: /** 554: * Thai. 555: * 0x0E00 - 0x0E7F. 556: */ 557: public static final UnicodeBlock THAI 558: = new UnicodeBlock(0x0E00, 0x0E7F, 559: "THAI", 560: "Thai"); 561: 562: /** 563: * Lao. 564: * 0x0E80 - 0x0EFF. 565: */ 566: public static final UnicodeBlock LAO 567: = new UnicodeBlock(0x0E80, 0x0EFF, 568: "LAO", 569: "Lao"); 570: 571: /** 572: * Tibetan. 573: * 0x0F00 - 0x0FFF. 574: */ 575: public static final UnicodeBlock TIBETAN 576: = new UnicodeBlock(0x0F00, 0x0FFF, 577: "TIBETAN", 578: "Tibetan"); 579: 580: /** 581: * Myanmar. 582: * 0x1000 - 0x109F. 583: * @since 1.4 584: */ 585: public static final UnicodeBlock MYANMAR 586: = new UnicodeBlock(0x1000, 0x109F, 587: "MYANMAR", 588: "Myanmar"); 589: 590: /** 591: * Georgian. 592: * 0x10A0 - 0x10FF. 593: */ 594: public static final UnicodeBlock GEORGIAN 595: = new UnicodeBlock(0x10A0, 0x10FF, 596: "GEORGIAN", 597: "Georgian"); 598: 599: /** 600: * Hangul Jamo. 601: * 0x1100 - 0x11FF. 602: */ 603: public static final UnicodeBlock HANGUL_JAMO 604: = new UnicodeBlock(0x1100, 0x11FF, 605: "HANGUL_JAMO", 606: "Hangul Jamo"); 607: 608: /** 609: * Ethiopic. 
610: * 0x1200 - 0x137F. 611: * @since 1.4 612: */ 613: public static final UnicodeBlock ETHIOPIC 614: = new UnicodeBlock(0x1200, 0x137F, 615: "ETHIOPIC", 616: "Ethiopic"); 617: 618: /** 619: * Cherokee. 620: * 0x13A0 - 0x13FF. 621: * @since 1.4 622: */ 623: public static final UnicodeBlock CHEROKEE 624: = new UnicodeBlock(0x13A0, 0x13FF, 625: "CHEROKEE", 626: "Cherokee"); 627: 628: /** 629: * Unified Canadian Aboriginal Syllabics. 630: * 0x1400 - 0x167F. 631: * @since 1.4 632: */ 633: public static final UnicodeBlock UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS 634: = new UnicodeBlock(0x1400, 0x167F, 635: "UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS", 636: "Unified Canadian Aboriginal Syllabics"); 637: 638: /** 639: * Ogham. 640: * 0x1680 - 0x169F. 641: * @since 1.4 642: */ 643: public static final UnicodeBlock OGHAM 644: = new UnicodeBlock(0x1680, 0x169F, 645: "OGHAM", 646: "Ogham"); 647: 648: /** 649: * Runic. 650: * 0x16A0 - 0x16FF. 651: * @since 1.4 652: */ 653: public static final UnicodeBlock RUNIC 654: = new UnicodeBlock(0x16A0, 0x16FF, 655: "RUNIC", 656: "Runic"); 657: 658: /** 659: * Tagalog. 660: * 0x1700 - 0x171F. 661: * @since 1.5 662: */ 663: public static final UnicodeBlock TAGALOG 664: = new UnicodeBlock(0x1700, 0x171F, 665: "TAGALOG", 666: "Tagalog"); 667: 668: /** 669: * Hanunoo. 670: * 0x1720 - 0x173F. 671: * @since 1.5 672: */ 673: public static final UnicodeBlock HANUNOO 674: = new UnicodeBlock(0x1720, 0x173F, 675: "HANUNOO", 676: "Hanunoo"); 677: 678: /** 679: * Buhid. 680: * 0x1740 - 0x175F. 681: * @since 1.5 682: */ 683: public static final UnicodeBlock BUHID 684: = new UnicodeBlock(0x1740, 0x175F, 685: "BUHID", 686: "Buhid"); 687: 688: /** 689: * Tagbanwa. 690: * 0x1760 - 0x177F. 691: * @since 1.5 692: */ 693: public static final UnicodeBlock TAGBANWA 694: = new UnicodeBlock(0x1760, 0x177F, 695: "TAGBANWA", 696: "Tagbanwa"); 697: 698: /** 699: * Khmer. 700: * 0x1780 - 0x17FF. 
701: * @since 1.4 702: */ 703: public static final UnicodeBlock KHMER 704: = new UnicodeBlock(0x1780, 0x17FF, 705: "KHMER", 706: "Khmer"); 707: 708: /** 709: * Mongolian. 710: * 0x1800 - 0x18AF. 711: * @since 1.4 712: */ 713: public static final UnicodeBlock MONGOLIAN 714: = new UnicodeBlock(0x1800, 0x18AF, 715: "MONGOLIAN", 716: "Mongolian"); 717: 718: /** 719: * Limbu. 720: * 0x1900 - 0x194F. 721: * @since 1.5 722: */ 723: public static final UnicodeBlock LIMBU 724: = new UnicodeBlock(0x1900, 0x194F, 725: "LIMBU", 726: "Limbu"); 727: 728: /** 729: * Tai Le. 730: * 0x1950 - 0x197F. 731: * @since 1.5 732: */ 733: public static final UnicodeBlock TAI_LE 734: = new UnicodeBlock(0x1950, 0x197F, 735: "TAI_LE", 736: "Tai Le"); 737: 738: /** 739: * Khmer Symbols. 740: * 0x19E0 - 0x19FF. 741: * @since 1.5 742: */ 743: public static final UnicodeBlock KHMER_SYMBOLS 744: = new UnicodeBlock(0x19E0, 0x19FF, 745: "KHMER_SYMBOLS", 746: "Khmer Symbols"); 747: 748: /** 749: * Phonetic Extensions. 750: * 0x1D00 - 0x1D7F. 751: * @since 1.5 752: */ 753: public static final UnicodeBlock PHONETIC_EXTENSIONS 754: = new UnicodeBlock(0x1D00, 0x1D7F, 755: "PHONETIC_EXTENSIONS", 756: "Phonetic Extensions"); 757: 758: /** 759: * Latin Extended Additional. 760: * 0x1E00 - 0x1EFF. 761: */ 762: public static final UnicodeBlock LATIN_EXTENDED_ADDITIONAL 763: = new UnicodeBlock(0x1E00, 0x1EFF, 764: "LATIN_EXTENDED_ADDITIONAL", 765: "Latin Extended Additional"); 766: 767: /** 768: * Greek Extended. 769: * 0x1F00 - 0x1FFF. 770: */ 771: public static final UnicodeBlock GREEK_EXTENDED 772: = new UnicodeBlock(0x1F00, 0x1FFF, 773: "GREEK_EXTENDED", 774: "Greek Extended"); 775: 776: /** 777: * General Punctuation. 778: * 0x2000 - 0x206F. 779: */ 780: public static final UnicodeBlock GENERAL_PUNCTUATION 781: = new UnicodeBlock(0x2000, 0x206F, 782: "GENERAL_PUNCTUATION", 783: "General Punctuation"); 784: 785: /** 786: * Superscripts and Subscripts. 787: * 0x2070 - 0x209F. 
788: */ 789: public static final UnicodeBlock SUPERSCRIPTS_AND_SUBSCRIPTS 790: = new UnicodeBlock(0x2070, 0x209F, 791: "SUPERSCRIPTS_AND_SUBSCRIPTS", 792: "Superscripts and Subscripts"); 793: 794: /** 795: * Currency Symbols. 796: * 0x20A0 - 0x20CF. 797: */ 798: public static final UnicodeBlock CURRENCY_SYMBOLS 799: = new UnicodeBlock(0x20A0, 0x20CF, 800: "CURRENCY_SYMBOLS", 801: "Currency Symbols"); 802: 803: /** 804: * Combining Marks for Symbols. 805: * 0x20D0 - 0x20FF. 806: */ 807: public static final UnicodeBlock COMBINING_MARKS_FOR_SYMBOLS 808: = new UnicodeBlock(0x20D0, 0x20FF, 809: "COMBINING_MARKS_FOR_SYMBOLS", 810: "Combining Marks for Symbols"); 811: 812: /** 813: * Letterlike Symbols. 814: * 0x2100 - 0x214F. 815: */ 816: public static final UnicodeBlock LETTERLIKE_SYMBOLS 817: = new UnicodeBlock(0x2100, 0x214F, 818: "LETTERLIKE_SYMBOLS", 819: "Letterlike Symbols"); 820: 821: /** 822: * Number Forms. 823: * 0x2150 - 0x218F. 824: */ 825: public static final UnicodeBlock NUMBER_FORMS 826: = new UnicodeBlock(0x2150, 0x218F, 827: "NUMBER_FORMS", 828: "Number Forms"); 829: 830: /** 831: * Arrows. 832: * 0x2190 - 0x21FF. 833: */ 834: public static final UnicodeBlock ARROWS 835: = new UnicodeBlock(0x2190, 0x21FF, 836: "ARROWS", 837: "Arrows"); 838: 839: /** 840: * Mathematical Operators. 841: * 0x2200 - 0x22FF. 842: */ 843: public static final UnicodeBlock MATHEMATICAL_OPERATORS 844: = new UnicodeBlock(0x2200, 0x22FF, 845: "MATHEMATICAL_OPERATORS", 846: "Mathematical Operators"); 847: 848: /** 849: * Miscellaneous Technical. 850: * 0x2300 - 0x23FF. 851: */ 852: public static final UnicodeBlock MISCELLANEOUS_TECHNICAL 853: = new UnicodeBlock(0x2300, 0x23FF, 854: "MISCELLANEOUS_TECHNICAL", 855: "Miscellaneous Technical"); 856: 857: /** 858: * Control Pictures. 859: * 0x2400 - 0x243F. 
860: */ 861: public static final UnicodeBlock CONTROL_PICTURES 862: = new UnicodeBlock(0x2400, 0x243F, 863: "CONTROL_PICTURES", 864: "Control Pictures"); 865: 866: /** 867: * Optical Character Recognition. 868: * 0x2440 - 0x245F. 869: */ 870: public static final UnicodeBlock OPTICAL_CHARACTER_RECOGNITION 871: = new UnicodeBlock(0x2440, 0x245F, 872: "OPTICAL_CHARACTER_RECOGNITION", 873: "Optical Character Recognition"); 874: 875: /** 876: * Enclosed Alphanumerics. 877: * 0x2460 - 0x24FF. 878: */ 879: public static final UnicodeBlock ENCLOSED_ALPHANUMERICS 880: = new UnicodeBlock(0x2460, 0x24FF, 881: "ENCLOSED_ALPHANUMERICS", 882: "Enclosed Alphanumerics"); 883: 884: /** 885: * Box Drawing. 886: * 0x2500 - 0x257F. 887: */ 888: public static final UnicodeBlock BOX_DRAWING 889: = new UnicodeBlock(0x2500, 0x257F, 890: "BOX_DRAWING", 891: "Box Drawing"); 892: 893: /** 894: * Block Elements. 895: * 0x2580 - 0x259F. 896: */ 897: public static final UnicodeBlock BLOCK_ELEMENTS 898: = new UnicodeBlock(0x2580, 0x259F, 899: "BLOCK_ELEMENTS", 900: "Block Elements"); 901: 902: /** 903: * Geometric Shapes. 904: * 0x25A0 - 0x25FF. 905: */ 906: public static final UnicodeBlock GEOMETRIC_SHAPES 907: = new UnicodeBlock(0x25A0, 0x25FF, 908: "GEOMETRIC_SHAPES", 909: "Geometric Shapes"); 910: 911: /** 912: * Miscellaneous Symbols. 913: * 0x2600 - 0x26FF. 914: */ 915: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS 916: = new UnicodeBlock(0x2600, 0x26FF, 917: "MISCELLANEOUS_SYMBOLS", 918: "Miscellaneous Symbols"); 919: 920: /** 921: * Dingbats. 922: * 0x2700 - 0x27BF. 923: */ 924: public static final UnicodeBlock DINGBATS 925: = new UnicodeBlock(0x2700, 0x27BF, 926: "DINGBATS", 927: "Dingbats"); 928: 929: /** 930: * Miscellaneous Mathematical Symbols-A. 931: * 0x27C0 - 0x27EF. 
932: * @since 1.5 933: */ 934: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A 935: = new UnicodeBlock(0x27C0, 0x27EF, 936: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A", 937: "Miscellaneous Mathematical Symbols-A"); 938: 939: /** 940: * Supplemental Arrows-A. 941: * 0x27F0 - 0x27FF. 942: * @since 1.5 943: */ 944: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_A 945: = new UnicodeBlock(0x27F0, 0x27FF, 946: "SUPPLEMENTAL_ARROWS_A", 947: "Supplemental Arrows-A"); 948: 949: /** 950: * Braille Patterns. 951: * 0x2800 - 0x28FF. 952: * @since 1.4 953: */ 954: public static final UnicodeBlock BRAILLE_PATTERNS 955: = new UnicodeBlock(0x2800, 0x28FF, 956: "BRAILLE_PATTERNS", 957: "Braille Patterns"); 958: 959: /** 960: * Supplemental Arrows-B. 961: * 0x2900 - 0x297F. 962: * @since 1.5 963: */ 964: public static final UnicodeBlock SUPPLEMENTAL_ARROWS_B 965: = new UnicodeBlock(0x2900, 0x297F, 966: "SUPPLEMENTAL_ARROWS_B", 967: "Supplemental Arrows-B"); 968: 969: /** 970: * Miscellaneous Mathematical Symbols-B. 971: * 0x2980 - 0x29FF. 972: * @since 1.5 973: */ 974: public static final UnicodeBlock MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B 975: = new UnicodeBlock(0x2980, 0x29FF, 976: "MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B", 977: "Miscellaneous Mathematical Symbols-B"); 978: 979: /** 980: * Supplemental Mathematical Operators. 981: * 0x2A00 - 0x2AFF. 982: * @since 1.5 983: */ 984: public static final UnicodeBlock SUPPLEMENTAL_MATHEMATICAL_OPERATORS 985: = new UnicodeBlock(0x2A00, 0x2AFF, 986: "SUPPLEMENTAL_MATHEMATICAL_OPERATORS", 987: "Supplemental Mathematical Operators"); 988: 989: /** 990: * Miscellaneous Symbols and Arrows. 991: * 0x2B00 - 0x2BFF. 992: * @since 1.5 993: */ 994: public static final UnicodeBlock MISCELLANEOUS_SYMBOLS_AND_ARROWS 995: = new UnicodeBlock(0x2B00, 0x2BFF, 996: "MISCELLANEOUS_SYMBOLS_AND_ARROWS", 997: "Miscellaneous Symbols and Arrows"); 998: 999: /** 1000: * CJK Radicals Supplement. 1001: * 0x2E80 - 0x2EFF. 
1002: * @since 1.4 1003: */ 1004: public static final UnicodeBlock CJK_RADICALS_SUPPLEMENT 1005: = new UnicodeBlock(0x2E80, 0x2EFF, 1006: "CJK_RADICALS_SUPPLEMENT", 1007: "CJK Radicals Supplement"); 1008: 1009: /** 1010: * Kangxi Radicals. 1011: * 0x2F00 - 0x2FDF. 1012: * @since 1.4 1013: */ 1014: public static final UnicodeBlock KANGXI_RADICALS 1015: = new UnicodeBlock(0x2F00, 0x2FDF, 1016: "KANGXI_RADICALS", 1017: "Kangxi Radicals"); 1018: 1019: /** 1020: * Ideographic Description Characters. 1021: * 0x2FF0 - 0x2FFF. 1022: * @since 1.4 1023: */ 1024: public static final UnicodeBlock IDEOGRAPHIC_DESCRIPTION_CHARACTERS 1025: = new UnicodeBlock(0x2FF0, 0x2FFF, 1026: "IDEOGRAPHIC_DESCRIPTION_CHARACTERS", 1027: "Ideographic Description Characters"); 1028: 1029: /** 1030: * CJK Symbols and Punctuation. 1031: * 0x3000 - 0x303F. 1032: */ 1033: public static final UnicodeBlock CJK_SYMBOLS_AND_PUNCTUATION 1034: = new UnicodeBlock(0x3000, 0x303F, 1035: "CJK_SYMBOLS_AND_PUNCTUATION", 1036: "CJK Symbols and Punctuation"); 1037: 1038: /** 1039: * Hiragana. 1040: * 0x3040 - 0x309F. 1041: */ 1042: public static final UnicodeBlock HIRAGANA 1043: = new UnicodeBlock(0x3040, 0x309F, 1044: "HIRAGANA", 1045: "Hiragana"); 1046: 1047: /** 1048: * Katakana. 1049: * 0x30A0 - 0x30FF. 1050: */ 1051: public static final UnicodeBlock KATAKANA 1052: = new UnicodeBlock(0x30A0, 0x30FF, 1053: "KATAKANA", 1054: "Katakana"); 1055: 1056: /** 1057: * Bopomofo. 1058: * 0x3100 - 0x312F. 1059: */ 1060: public static final UnicodeBlock BOPOMOFO 1061: = new UnicodeBlock(0x3100, 0x312F, 1062: "BOPOMOFO", 1063: "Bopomofo"); 1064: 1065: /** 1066: * Hangul Compatibility Jamo. 1067: * 0x3130 - 0x318F. 1068: */ 1069: public static final UnicodeBlock HANGUL_COMPATIBILITY_JAMO 1070: = new UnicodeBlock(0x3130, 0x318F, 1071: "HANGUL_COMPATIBILITY_JAMO", 1072: "Hangul Compatibility Jamo"); 1073: 1074: /** 1075: * Kanbun. 1076: * 0x3190 - 0x319F. 
1077: */ 1078: public static final UnicodeBlock KANBUN 1079: = new UnicodeBlock(0x3190, 0x319F, 1080: "KANBUN", 1081: "Kanbun"); 1082: 1083: /** 1084: * Bopomofo Extended. 1085: * 0x31A0 - 0x31BF. 1086: * @since 1.4 1087: */ 1088: public static final UnicodeBlock BOPOMOFO_EXTENDED 1089: = new UnicodeBlock(0x31A0, 0x31BF, 1090: "BOPOMOFO_EXTENDED", 1091: "Bopomofo Extended"); 1092: 1093: /** 1094: * Katakana Phonetic Extensions. 1095: * 0x31F0 - 0x31FF. 1096: * @since 1.5 1097: */ 1098: public static final UnicodeBlock KATAKANA_PHONETIC_EXTENSIONS 1099: = new UnicodeBlock(0x31F0, 0x31FF, 1100: "KATAKANA_PHONETIC_EXTENSIONS", 1101: "Katakana Phonetic Extensions"); 1102: 1103: /** 1104: * Enclosed CJK Letters and Months. 1105: * 0x3200 - 0x32FF. 1106: */ 1107: public static final UnicodeBlock ENCLOSED_CJK_LETTERS_AND_MONTHS 1108: = new UnicodeBlock(0x3200, 0x32FF, 1109: "ENCLOSED_CJK_LETTERS_AND_MONTHS", 1110: "Enclosed CJK Letters and Months"); 1111: 1112: /** 1113: * CJK Compatibility. 1114: * 0x3300 - 0x33FF. 1115: */ 1116: public static final UnicodeBlock CJK_COMPATIBILITY 1117: = new UnicodeBlock(0x3300, 0x33FF, 1118: "CJK_COMPATIBILITY", 1119: "CJK Compatibility"); 1120: 1121: /** 1122: * CJK Unified Ideographs Extension A. 1123: * 0x3400 - 0x4DBF. 1124: * @since 1.4 1125: */ 1126: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A 1127: = new UnicodeBlock(0x3400, 0x4DBF, 1128: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A", 1129: "CJK Unified Ideographs Extension A"); 1130: 1131: /** 1132: * Yijing Hexagram Symbols. 1133: * 0x4DC0 - 0x4DFF. 1134: * @since 1.5 1135: */ 1136: public static final UnicodeBlock YIJING_HEXAGRAM_SYMBOLS 1137: = new UnicodeBlock(0x4DC0, 0x4DFF, 1138: "YIJING_HEXAGRAM_SYMBOLS", 1139: "Yijing Hexagram Symbols"); 1140: 1141: /** 1142: * CJK Unified Ideographs. 1143: * 0x4E00 - 0x9FFF. 
1144: */ 1145: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS 1146: = new UnicodeBlock(0x4E00, 0x9FFF, 1147: "CJK_UNIFIED_IDEOGRAPHS", 1148: "CJK Unified Ideographs"); 1149: 1150: /** 1151: * Yi Syllables. 1152: * 0xA000 - 0xA48F. 1153: * @since 1.4 1154: */ 1155: public static final UnicodeBlock YI_SYLLABLES 1156: = new UnicodeBlock(0xA000, 0xA48F, 1157: "YI_SYLLABLES", 1158: "Yi Syllables"); 1159: 1160: /** 1161: * Yi Radicals. 1162: * 0xA490 - 0xA4CF. 1163: * @since 1.4 1164: */ 1165: public static final UnicodeBlock YI_RADICALS 1166: = new UnicodeBlock(0xA490, 0xA4CF, 1167: "YI_RADICALS", 1168: "Yi Radicals"); 1169: 1170: /** 1171: * Hangul Syllables. 1172: * 0xAC00 - 0xD7AF. 1173: */ 1174: public static final UnicodeBlock HANGUL_SYLLABLES 1175: = new UnicodeBlock(0xAC00, 0xD7AF, 1176: "HANGUL_SYLLABLES", 1177: "Hangul Syllables"); 1178: 1179: /** 1180: * High Surrogates. 1181: * 0xD800 - 0xDB7F. 1182: * @since 1.5 1183: */ 1184: public static final UnicodeBlock HIGH_SURROGATES 1185: = new UnicodeBlock(0xD800, 0xDB7F, 1186: "HIGH_SURROGATES", 1187: "High Surrogates"); 1188: 1189: /** 1190: * High Private Use Surrogates. 1191: * 0xDB80 - 0xDBFF. 1192: * @since 1.5 1193: */ 1194: public static final UnicodeBlock HIGH_PRIVATE_USE_SURROGATES 1195: = new UnicodeBlock(0xDB80, 0xDBFF, 1196: "HIGH_PRIVATE_USE_SURROGATES", 1197: "High Private Use Surrogates"); 1198: 1199: /** 1200: * Low Surrogates. 1201: * 0xDC00 - 0xDFFF. 1202: * @since 1.5 1203: */ 1204: public static final UnicodeBlock LOW_SURROGATES 1205: = new UnicodeBlock(0xDC00, 0xDFFF, 1206: "LOW_SURROGATES", 1207: "Low Surrogates"); 1208: 1209: /** 1210: * Private Use Area. 1211: * 0xE000 - 0xF8FF. 1212: */ 1213: public static final UnicodeBlock PRIVATE_USE_AREA 1214: = new UnicodeBlock(0xE000, 0xF8FF, 1215: "PRIVATE_USE_AREA", 1216: "Private Use Area"); 1217: 1218: /** 1219: * CJK Compatibility Ideographs. 1220: * 0xF900 - 0xFAFF. 
1221: */ 1222: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS 1223: = new UnicodeBlock(0xF900, 0xFAFF, 1224: "CJK_COMPATIBILITY_IDEOGRAPHS", 1225: "CJK Compatibility Ideographs"); 1226: 1227: /** 1228: * Alphabetic Presentation Forms. 1229: * 0xFB00 - 0xFB4F. 1230: */ 1231: public static final UnicodeBlock ALPHABETIC_PRESENTATION_FORMS 1232: = new UnicodeBlock(0xFB00, 0xFB4F, 1233: "ALPHABETIC_PRESENTATION_FORMS", 1234: "Alphabetic Presentation Forms"); 1235: 1236: /** 1237: * Arabic Presentation Forms-A. 1238: * 0xFB50 - 0xFDFF. 1239: */ 1240: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_A 1241: = new UnicodeBlock(0xFB50, 0xFDFF, 1242: "ARABIC_PRESENTATION_FORMS_A", 1243: "Arabic Presentation Forms-A"); 1244: 1245: /** 1246: * Variation Selectors. 1247: * 0xFE00 - 0xFE0F. 1248: * @since 1.5 1249: */ 1250: public static final UnicodeBlock VARIATION_SELECTORS 1251: = new UnicodeBlock(0xFE00, 0xFE0F, 1252: "VARIATION_SELECTORS", 1253: "Variation Selectors"); 1254: 1255: /** 1256: * Combining Half Marks. 1257: * 0xFE20 - 0xFE2F. 1258: */ 1259: public static final UnicodeBlock COMBINING_HALF_MARKS 1260: = new UnicodeBlock(0xFE20, 0xFE2F, 1261: "COMBINING_HALF_MARKS", 1262: "Combining Half Marks"); 1263: 1264: /** 1265: * CJK Compatibility Forms. 1266: * 0xFE30 - 0xFE4F. 1267: */ 1268: public static final UnicodeBlock CJK_COMPATIBILITY_FORMS 1269: = new UnicodeBlock(0xFE30, 0xFE4F, 1270: "CJK_COMPATIBILITY_FORMS", 1271: "CJK Compatibility Forms"); 1272: 1273: /** 1274: * Small Form Variants. 1275: * 0xFE50 - 0xFE6F. 1276: */ 1277: public static final UnicodeBlock SMALL_FORM_VARIANTS 1278: = new UnicodeBlock(0xFE50, 0xFE6F, 1279: "SMALL_FORM_VARIANTS", 1280: "Small Form Variants"); 1281: 1282: /** 1283: * Arabic Presentation Forms-B. 1284: * 0xFE70 - 0xFEFF. 
1285: */ 1286: public static final UnicodeBlock ARABIC_PRESENTATION_FORMS_B 1287: = new UnicodeBlock(0xFE70, 0xFEFF, 1288: "ARABIC_PRESENTATION_FORMS_B", 1289: "Arabic Presentation Forms-B"); 1290: 1291: /** 1292: * Halfwidth and Fullwidth Forms. 1293: * 0xFF00 - 0xFFEF. 1294: */ 1295: public static final UnicodeBlock HALFWIDTH_AND_FULLWIDTH_FORMS 1296: = new UnicodeBlock(0xFF00, 0xFFEF, 1297: "HALFWIDTH_AND_FULLWIDTH_FORMS", 1298: "Halfwidth and Fullwidth Forms"); 1299: 1300: /** 1301: * Specials. 1302: * 0xFFF0 - 0xFFFF. 1303: */ 1304: public static final UnicodeBlock SPECIALS 1305: = new UnicodeBlock(0xFFF0, 0xFFFF, 1306: "SPECIALS", 1307: "Specials"); 1308: 1309: /** 1310: * Linear B Syllabary. 1311: * 0x10000 - 0x1007F. 1312: * @since 1.5 1313: */ 1314: public static final UnicodeBlock LINEAR_B_SYLLABARY 1315: = new UnicodeBlock(0x10000, 0x1007F, 1316: "LINEAR_B_SYLLABARY", 1317: "Linear B Syllabary"); 1318: 1319: /** 1320: * Linear B Ideograms. 1321: * 0x10080 - 0x100FF. 1322: * @since 1.5 1323: */ 1324: public static final UnicodeBlock LINEAR_B_IDEOGRAMS 1325: = new UnicodeBlock(0x10080, 0x100FF, 1326: "LINEAR_B_IDEOGRAMS", 1327: "Linear B Ideograms"); 1328: 1329: /** 1330: * Aegean Numbers. 1331: * 0x10100 - 0x1013F. 1332: * @since 1.5 1333: */ 1334: public static final UnicodeBlock AEGEAN_NUMBERS 1335: = new UnicodeBlock(0x10100, 0x1013F, 1336: "AEGEAN_NUMBERS", 1337: "Aegean Numbers"); 1338: 1339: /** 1340: * Old Italic. 1341: * 0x10300 - 0x1032F. 1342: * @since 1.5 1343: */ 1344: public static final UnicodeBlock OLD_ITALIC 1345: = new UnicodeBlock(0x10300, 0x1032F, 1346: "OLD_ITALIC", 1347: "Old Italic"); 1348: 1349: /** 1350: * Gothic. 1351: * 0x10330 - 0x1034F. 1352: * @since 1.5 1353: */ 1354: public static final UnicodeBlock GOTHIC 1355: = new UnicodeBlock(0x10330, 0x1034F, 1356: "GOTHIC", 1357: "Gothic"); 1358: 1359: /** 1360: * Ugaritic. 1361: * 0x10380 - 0x1039F. 
1362: * @since 1.5 1363: */ 1364: public static final UnicodeBlock UGARITIC 1365: = new UnicodeBlock(0x10380, 0x1039F, 1366: "UGARITIC", 1367: "Ugaritic"); 1368: 1369: /** 1370: * Deseret. 1371: * 0x10400 - 0x1044F. 1372: * @since 1.5 1373: */ 1374: public static final UnicodeBlock DESERET 1375: = new UnicodeBlock(0x10400, 0x1044F, 1376: "DESERET", 1377: "Deseret"); 1378: 1379: /** 1380: * Shavian. 1381: * 0x10450 - 0x1047F. 1382: * @since 1.5 1383: */ 1384: public static final UnicodeBlock SHAVIAN 1385: = new UnicodeBlock(0x10450, 0x1047F, 1386: "SHAVIAN", 1387: "Shavian"); 1388: 1389: /** 1390: * Osmanya. 1391: * 0x10480 - 0x104AF. 1392: * @since 1.5 1393: */ 1394: public static final UnicodeBlock OSMANYA 1395: = new UnicodeBlock(0x10480, 0x104AF, 1396: "OSMANYA", 1397: "Osmanya"); 1398: 1399: /** 1400: * Cypriot Syllabary. 1401: * 0x10800 - 0x1083F. 1402: * @since 1.5 1403: */ 1404: public static final UnicodeBlock CYPRIOT_SYLLABARY 1405: = new UnicodeBlock(0x10800, 0x1083F, 1406: "CYPRIOT_SYLLABARY", 1407: "Cypriot Syllabary"); 1408: 1409: /** 1410: * Byzantine Musical Symbols. 1411: * 0x1D000 - 0x1D0FF. 1412: * @since 1.5 1413: */ 1414: public static final UnicodeBlock BYZANTINE_MUSICAL_SYMBOLS 1415: = new UnicodeBlock(0x1D000, 0x1D0FF, 1416: "BYZANTINE_MUSICAL_SYMBOLS", 1417: "Byzantine Musical Symbols"); 1418: 1419: /** 1420: * Musical Symbols. 1421: * 0x1D100 - 0x1D1FF. 1422: * @since 1.5 1423: */ 1424: public static final UnicodeBlock MUSICAL_SYMBOLS 1425: = new UnicodeBlock(0x1D100, 0x1D1FF, 1426: "MUSICAL_SYMBOLS", 1427: "Musical Symbols"); 1428: 1429: /** 1430: * Tai Xuan Jing Symbols. 1431: * 0x1D300 - 0x1D35F. 1432: * @since 1.5 1433: */ 1434: public static final UnicodeBlock TAI_XUAN_JING_SYMBOLS 1435: = new UnicodeBlock(0x1D300, 0x1D35F, 1436: "TAI_XUAN_JING_SYMBOLS", 1437: "Tai Xuan Jing Symbols"); 1438: 1439: /** 1440: * Mathematical Alphanumeric Symbols. 1441: * 0x1D400 - 0x1D7FF. 
1442: * @since 1.5 1443: */ 1444: public static final UnicodeBlock MATHEMATICAL_ALPHANUMERIC_SYMBOLS 1445: = new UnicodeBlock(0x1D400, 0x1D7FF, 1446: "MATHEMATICAL_ALPHANUMERIC_SYMBOLS", 1447: "Mathematical Alphanumeric Symbols"); 1448: 1449: /** 1450: * CJK Unified Ideographs Extension B. 1451: * 0x20000 - 0x2A6DF. 1452: * @since 1.5 1453: */ 1454: public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B 1455: = new UnicodeBlock(0x20000, 0x2A6DF, 1456: "CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B", 1457: "CJK Unified Ideographs Extension B"); 1458: 1459: /** 1460: * CJK Compatibility Ideographs Supplement. 1461: * 0x2F800 - 0x2FA1F. 1462: * @since 1.5 1463: */ 1464: public static final UnicodeBlock CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT 1465: = new UnicodeBlock(0x2F800, 0x2FA1F, 1466: "CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT", 1467: "CJK Compatibility Ideographs Supplement"); 1468: 1469: /** 1470: * Tags. 1471: * 0xE0000 - 0xE007F. 1472: * @since 1.5 1473: */ 1474: public static final UnicodeBlock TAGS 1475: = new UnicodeBlock(0xE0000, 0xE007F, 1476: "TAGS", 1477: "Tags"); 1478: 1479: /** 1480: * Variation Selectors Supplement. 1481: * 0xE0100 - 0xE01EF. 1482: * @since 1.5 1483: */ 1484: public static final UnicodeBlock VARIATION_SELECTORS_SUPPLEMENT 1485: = new UnicodeBlock(0xE0100, 0xE01EF, 1486: "VARIATION_SELECTORS_SUPPLEMENT", 1487: "Variation Selectors Supplement"); 1488: 1489: /** 1490: * Supplementary Private Use Area-A. 1491: * 0xF0000 - 0xFFFFF. 1492: * @since 1.5 1493: */ 1494: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_A 1495: = new UnicodeBlock(0xF0000, 0xFFFFF, 1496: "SUPPLEMENTARY_PRIVATE_USE_AREA_A", 1497: "Supplementary Private Use Area-A"); 1498: 1499: /** 1500: * Supplementary Private Use Area-B. 1501: * 0x100000 - 0x10FFFF. 
1502: * @since 1.5 1503: */ 1504: public static final UnicodeBlock SUPPLEMENTARY_PRIVATE_USE_AREA_B 1505: = new UnicodeBlock(0x100000, 0x10FFFF, 1506: "SUPPLEMENTARY_PRIVATE_USE_AREA_B", 1507: "Supplementary Private Use Area-B"); 1508: 1509: /** 1510: * Surrogates Area. 1511: * 'D800' - 'DFFF'. 1512: * @deprecated As of 1.5, the three areas, 1513: * <a href="#HIGH_SURROGATES">HIGH_SURROGATES</a>, 1514: * <a href="#HIGH_PRIVATE_USE_SURROGATES">HIGH_PRIVATE_USE_SURROGATES</a> 1515: * and <a href="#LOW_SURROGATES">LOW_SURROGATES</a>, as defined 1516: * by the Unicode standard, should be used in preference to 1517: * this. These are also returned from calls to <code>of(int)</code> 1518: * and <code>of(char)</code>. 1519: */ 1520: public static final UnicodeBlock SURROGATES_AREA 1521: = new UnicodeBlock(0xD800, 0xDFFF, 1522: "SURROGATES_AREA", 1523: "Surrogates Area"); 1524: 1525: /** 1526: * The defined subsets. 1527: */ 1528: private static final UnicodeBlock sets[] = { 1529: BASIC_LATIN, 1530: LATIN_1_SUPPLEMENT, 1531: LATIN_EXTENDED_A, 1532: LATIN_EXTENDED_B, 1533: IPA_EXTENSIONS, 1534: SPACING_MODIFIER_LETTERS, 1535: COMBINING_DIACRITICAL_MARKS, 1536: GREEK, 1537: CYRILLIC, 1538: CYRILLIC_SUPPLEMENTARY, 1539: ARMENIAN, 1540: HEBREW, 1541: ARABIC, 1542: SYRIAC, 1543: THAANA, 1544: DEVANAGARI, 1545: BENGALI, 1546: GURMUKHI, 1547: GUJARATI, 1548: ORIYA, 1549: TAMIL, 1550: TELUGU, 1551: KANNADA, 1552: MALAYALAM, 1553: SINHALA, 1554: THAI, 1555: LAO, 1556: TIBETAN, 1557: MYANMAR, 1558: GEORGIAN, 1559: HANGUL_JAMO, 1560: ETHIOPIC, 1561: CHEROKEE, 1562: UNIFIED_CANADIAN_ABORIGINAL_SYLLABICS, 1563: OGHAM, 1564: RUNIC, 1565: TAGALOG, 1566: HANUNOO, 1567: BUHID, 1568: TAGBANWA, 1569: KHMER, 1570: MONGOLIAN, 1571: LIMBU, 1572: TAI_LE, 1573: KHMER_SYMBOLS, 1574: PHONETIC_EXTENSIONS, 1575: LATIN_EXTENDED_ADDITIONAL, 1576: GREEK_EXTENDED, 1577: GENERAL_PUNCTUATION, 1578: SUPERSCRIPTS_AND_SUBSCRIPTS, 1579: CURRENCY_SYMBOLS, 1580: COMBINING_MARKS_FOR_SYMBOLS, 1581: 
// Continuation of the block table; entries are listed one per line.
  LETTERLIKE_SYMBOLS,
  NUMBER_FORMS,
  ARROWS,
  MATHEMATICAL_OPERATORS,
  MISCELLANEOUS_TECHNICAL,
  CONTROL_PICTURES,
  OPTICAL_CHARACTER_RECOGNITION,
  ENCLOSED_ALPHANUMERICS,
  BOX_DRAWING,
  BLOCK_ELEMENTS,
  GEOMETRIC_SHAPES,
  MISCELLANEOUS_SYMBOLS,
  DINGBATS,
  MISCELLANEOUS_MATHEMATICAL_SYMBOLS_A,
  SUPPLEMENTAL_ARROWS_A,
  BRAILLE_PATTERNS,
  SUPPLEMENTAL_ARROWS_B,
  MISCELLANEOUS_MATHEMATICAL_SYMBOLS_B,
  SUPPLEMENTAL_MATHEMATICAL_OPERATORS,
  MISCELLANEOUS_SYMBOLS_AND_ARROWS,
  CJK_RADICALS_SUPPLEMENT,
  KANGXI_RADICALS,
  IDEOGRAPHIC_DESCRIPTION_CHARACTERS,
  CJK_SYMBOLS_AND_PUNCTUATION,
  HIRAGANA,
  KATAKANA,
  BOPOMOFO,
  HANGUL_COMPATIBILITY_JAMO,
  KANBUN,
  BOPOMOFO_EXTENDED,
  KATAKANA_PHONETIC_EXTENSIONS,
  ENCLOSED_CJK_LETTERS_AND_MONTHS,
  CJK_COMPATIBILITY,
  CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
  YIJING_HEXAGRAM_SYMBOLS,
  CJK_UNIFIED_IDEOGRAPHS,
  YI_SYLLABLES,
  YI_RADICALS,
  HANGUL_SYLLABLES,
  // The three surrogate blocks; the deprecated SURROGATES_AREA that
  // overlaps them (0xD800 - 0xDFFF) is not part of this table.
  HIGH_SURROGATES,
  HIGH_PRIVATE_USE_SURROGATES,
  LOW_SURROGATES,
  PRIVATE_USE_AREA,
  CJK_COMPATIBILITY_IDEOGRAPHS,
  ALPHABETIC_PRESENTATION_FORMS,
  ARABIC_PRESENTATION_FORMS_A,
  VARIATION_SELECTORS,
  COMBINING_HALF_MARKS,
  CJK_COMPATIBILITY_FORMS,
  SMALL_FORM_VARIANTS,
  ARABIC_PRESENTATION_FORMS_B,
  HALFWIDTH_AND_FULLWIDTH_FORMS,
  SPECIALS,
  // Blocks from here on start at or above 0x10000.
  LINEAR_B_SYLLABARY,
  LINEAR_B_IDEOGRAMS,
  AEGEAN_NUMBERS,
  OLD_ITALIC,
  GOTHIC,
  UGARITIC,
  DESERET,
  SHAVIAN,
  OSMANYA,
  CYPRIOT_SYLLABARY,
  BYZANTINE_MUSICAL_SYMBOLS,
  MUSICAL_SYMBOLS,
  TAI_XUAN_JING_SYMBOLS,
  MATHEMATICAL_ALPHANUMERIC_SYMBOLS,
  CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
  CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
  TAGS,
  VARIATION_SELECTORS_SUPPLEMENT,
  SUPPLEMENTARY_PRIVATE_USE_AREA_A,
1653: SUPPLEMENTARY_PRIVATE_USE_AREA_B, 1654: }; 1655: } // class UnicodeBlock 1656: 1657: /** 1658: * The immutable value of this Character. 1659: * 1660: * @serial the value of this Character 1661: */ 1662: private final char value; 1663: 1664: /** 1665: * Compatible with JDK 1.0+. 1666: */ 1667: private static final long serialVersionUID = 3786198910865385080L; 1668: 1669: /** 1670: * Smallest value allowed for radix arguments in Java. This value is 2. 1671: * 1672: * @see #digit(char, int) 1673: * @see #forDigit(int, int) 1674: * @see Integer#toString(int, int) 1675: * @see Integer#valueOf(String) 1676: */ 1677: public static final int MIN_RADIX = 2; 1678: 1679: /** 1680: * Largest value allowed for radix arguments in Java. This value is 36. 1681: * 1682: * @see #digit(char, int) 1683: * @see #forDigit(int, int) 1684: * @see Integer#toString(int, int) 1685: * @see Integer#valueOf(String) 1686: */ 1687: public static final int MAX_RADIX = 36; 1688: 1689: /** 1690: * The minimum value the char data type can hold. 1691: * This value is <code>'\\u0000'</code>. 1692: */ 1693: public static final char MIN_VALUE = '\u0000'; 1694: 1695: /** 1696: * The maximum value the char data type can hold. 1697: * This value is <code>'\\uFFFF'</code>. 1698: */ 1699: public static final char MAX_VALUE = '\uFFFF'; 1700: 1701: /** 1702: * Class object representing the primitive char data type. 1703: * 1704: * @since 1.1 1705: */ 1706: public static final Class TYPE = VMClassLoader.getPrimitiveClass('C'); 1707: 1708: /** 1709: * The number of bits needed to represent a <code>char</code>. 1710: * @since 1.5 1711: */ 1712: public static final int SIZE = 16; 1713: 1714: // This caches some Character values, and is used by boxing 1715: // conversions via valueOf(). We must cache at least 0..127; 1716: // this constant controls how much we actually cache. 
1717: private static final int MAX_CACHE = 127; 1718: private static Character[] charCache = new Character[MAX_CACHE + 1]; 1719: 1720: /** 1721: * Lu = Letter, Uppercase (Informative). 1722: * 1723: * @since 1.1 1724: */ 1725: public static final byte UPPERCASE_LETTER = 1; 1726: 1727: /** 1728: * Ll = Letter, Lowercase (Informative). 1729: * 1730: * @since 1.1 1731: */ 1732: public static final byte LOWERCASE_LETTER = 2; 1733: 1734: /** 1735: * Lt = Letter, Titlecase (Informative). 1736: * 1737: * @since 1.1 1738: */ 1739: public static final byte TITLECASE_LETTER = 3; 1740: 1741: /** 1742: * Mn = Mark, Non-Spacing (Normative). 1743: * 1744: * @since 1.1 1745: */ 1746: public static final byte NON_SPACING_MARK = 6; 1747: 1748: /** 1749: * Mc = Mark, Spacing Combining (Normative). 1750: * 1751: * @since 1.1 1752: */ 1753: public static final byte COMBINING_SPACING_MARK = 8; 1754: 1755: /** 1756: * Me = Mark, Enclosing (Normative). 1757: * 1758: * @since 1.1 1759: */ 1760: public static final byte ENCLOSING_MARK = 7; 1761: 1762: /** 1763: * Nd = Number, Decimal Digit (Normative). 1764: * 1765: * @since 1.1 1766: */ 1767: public static final byte DECIMAL_DIGIT_NUMBER = 9; 1768: 1769: /** 1770: * Nl = Number, Letter (Normative). 1771: * 1772: * @since 1.1 1773: */ 1774: public static final byte LETTER_NUMBER = 10; 1775: 1776: /** 1777: * No = Number, Other (Normative). 1778: * 1779: * @since 1.1 1780: */ 1781: public static final byte OTHER_NUMBER = 11; 1782: 1783: /** 1784: * Zs = Separator, Space (Normative). 1785: * 1786: * @since 1.1 1787: */ 1788: public static final byte SPACE_SEPARATOR = 12; 1789: 1790: /** 1791: * Zl = Separator, Line (Normative). 1792: * 1793: * @since 1.1 1794: */ 1795: public static final byte LINE_SEPARATOR = 13; 1796: 1797: /** 1798: * Zp = Separator, Paragraph (Normative). 1799: * 1800: * @since 1.1 1801: */ 1802: public static final byte PARAGRAPH_SEPARATOR = 14; 1803: 1804: /** 1805: * Cc = Other, Control (Normative). 
1806: * 1807: * @since 1.1 1808: */ 1809: public static final byte CONTROL = 15; 1810: 1811: /** 1812: * Cf = Other, Format (Normative). 1813: * 1814: * @since 1.1 1815: */ 1816: public static final byte FORMAT = 16; 1817: 1818: /** 1819: * Cs = Other, Surrogate (Normative). 1820: * 1821: * @since 1.1 1822: */ 1823: public static final byte SURROGATE = 19; 1824: 1825: /** 1826: * Co = Other, Private Use (Normative). 1827: * 1828: * @since 1.1 1829: */ 1830: public static final byte PRIVATE_USE = 18; 1831: 1832: /** 1833: * Cn = Other, Not Assigned (Normative). 1834: * 1835: * @since 1.1 1836: */ 1837: public static final byte UNASSIGNED = 0; 1838: 1839: /** 1840: * Lm = Letter, Modifier (Informative). 1841: * 1842: * @since 1.1 1843: */ 1844: public static final byte MODIFIER_LETTER = 4; 1845: 1846: /** 1847: * Lo = Letter, Other (Informative). 1848: * 1849: * @since 1.1 1850: */ 1851: public static final byte OTHER_LETTER = 5; 1852: 1853: /** 1854: * Pc = Punctuation, Connector (Informative). 1855: * 1856: * @since 1.1 1857: */ 1858: public static final byte CONNECTOR_PUNCTUATION = 23; 1859: 1860: /** 1861: * Pd = Punctuation, Dash (Informative). 1862: * 1863: * @since 1.1 1864: */ 1865: public static final byte DASH_PUNCTUATION = 20; 1866: 1867: /** 1868: * Ps = Punctuation, Open (Informative). 1869: * 1870: * @since 1.1 1871: */ 1872: public static final byte START_PUNCTUATION = 21; 1873: 1874: /** 1875: * Pe = Punctuation, Close (Informative). 1876: * 1877: * @since 1.1 1878: */ 1879: public static final byte END_PUNCTUATION = 22; 1880: 1881: /** 1882: * Pi = Punctuation, Initial Quote (Informative). 1883: * 1884: * @since 1.4 1885: */ 1886: public static final byte INITIAL_QUOTE_PUNCTUATION = 29; 1887: 1888: /** 1889: * Pf = Punctuation, Final Quote (Informative). 1890: * 1891: * @since 1.4 1892: */ 1893: public static final byte FINAL_QUOTE_PUNCTUATION = 30; 1894: 1895: /** 1896: * Po = Punctuation, Other (Informative). 
1897: * 1898: * @since 1.1 1899: */ 1900: public static final byte OTHER_PUNCTUATION = 24; 1901: 1902: /** 1903: * Sm = Symbol, Math (Informative). 1904: * 1905: * @since 1.1 1906: */ 1907: public static final byte MATH_SYMBOL = 25; 1908: 1909: /** 1910: * Sc = Symbol, Currency (Informative). 1911: * 1912: * @since 1.1 1913: */ 1914: public static final byte CURRENCY_SYMBOL = 26; 1915: 1916: /** 1917: * Sk = Symbol, Modifier (Informative). 1918: * 1919: * @since 1.1 1920: */ 1921: public static final byte MODIFIER_SYMBOL = 27; 1922: 1923: /** 1924: * So = Symbol, Other (Informative). 1925: * 1926: * @since 1.1 1927: */ 1928: public static final byte OTHER_SYMBOL = 28; 1929: 1930: /** 1931: * Undefined bidirectional character type. Undefined char values have 1932: * undefined directionality in the Unicode specification. 1933: * 1934: * @since 1.4 1935: */ 1936: public static final byte DIRECTIONALITY_UNDEFINED = -1; 1937: 1938: /** 1939: * Strong bidirectional character type "L". 1940: * 1941: * @since 1.4 1942: */ 1943: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT = 0; 1944: 1945: /** 1946: * Strong bidirectional character type "R". 1947: * 1948: * @since 1.4 1949: */ 1950: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT = 1; 1951: 1952: /** 1953: * Strong bidirectional character type "AL". 1954: * 1955: * @since 1.4 1956: */ 1957: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = 2; 1958: 1959: /** 1960: * Weak bidirectional character type "EN". 1961: * 1962: * @since 1.4 1963: */ 1964: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER = 3; 1965: 1966: /** 1967: * Weak bidirectional character type "ES". 1968: * 1969: * @since 1.4 1970: */ 1971: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = 4; 1972: 1973: /** 1974: * Weak bidirectional character type "ET". 
1975: * 1976: * @since 1.4 1977: */ 1978: public static final byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = 5; 1979: 1980: /** 1981: * Weak bidirectional character type "AN". 1982: * 1983: * @since 1.4 1984: */ 1985: public static final byte DIRECTIONALITY_ARABIC_NUMBER = 6; 1986: 1987: /** 1988: * Weak bidirectional character type "CS". 1989: * 1990: * @since 1.4 1991: */ 1992: public static final byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = 7; 1993: 1994: /** 1995: * Weak bidirectional character type "NSM". 1996: * 1997: * @since 1.4 1998: */ 1999: public static final byte DIRECTIONALITY_NONSPACING_MARK = 8; 2000: 2001: /** 2002: * Weak bidirectional character type "BN". 2003: * 2004: * @since 1.4 2005: */ 2006: public static final byte DIRECTIONALITY_BOUNDARY_NEUTRAL = 9; 2007: 2008: /** 2009: * Neutral bidirectional character type "B". 2010: * 2011: * @since 1.4 2012: */ 2013: public static final byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = 10; 2014: 2015: /** 2016: * Neutral bidirectional character type "S". 2017: * 2018: * @since 1.4 2019: */ 2020: public static final byte DIRECTIONALITY_SEGMENT_SEPARATOR = 11; 2021: 2022: /** 2023: * Strong bidirectional character type "WS". 2024: * 2025: * @since 1.4 2026: */ 2027: public static final byte DIRECTIONALITY_WHITESPACE = 12; 2028: 2029: /** 2030: * Neutral bidirectional character type "ON". 2031: * 2032: * @since 1.4 2033: */ 2034: public static final byte DIRECTIONALITY_OTHER_NEUTRALS = 13; 2035: 2036: /** 2037: * Strong bidirectional character type "LRE". 2038: * 2039: * @since 1.4 2040: */ 2041: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = 14; 2042: 2043: /** 2044: * Strong bidirectional character type "LRO". 2045: * 2046: * @since 1.4 2047: */ 2048: public static final byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = 15; 2049: 2050: /** 2051: * Strong bidirectional character type "RLE". 
2052: * 2053: * @since 1.4 2054: */ 2055: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = 16; 2056: 2057: /** 2058: * Strong bidirectional character type "RLO". 2059: * 2060: * @since 1.4 2061: */ 2062: public static final byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = 17; 2063: 2064: /** 2065: * Weak bidirectional character type "PDF". 2066: * 2067: * @since 1.4 2068: */ 2069: public static final byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = 18; 2070: 2071: /** 2072: * Mask for grabbing the type out of the result of readChar. 2073: * @see #readChar(char) 2074: */ 2075: private static final int TYPE_MASK = 0x1F; 2076: 2077: /** 2078: * Mask for grabbing the non-breaking space flag out of the result of 2079: * readChar. 2080: * @see #readChar(char) 2081: */ 2082: private static final int NO_BREAK_MASK = 0x20; 2083: 2084: /** 2085: * Mask for grabbing the mirrored directionality flag out of the result 2086: * of readChar. 2087: * @see #readChar(char) 2088: */ 2089: private static final int MIRROR_MASK = 0x40; 2090: 2091: /** 2092: * Min value for supplementary code point. 2093: * 2094: * @since 1.5 2095: */ 2096: public static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 2097: 2098: /** 2099: * Min value for code point. 2100: * 2101: * @since 1.5 2102: */ 2103: public static final int MIN_CODE_POINT = 0; 2104: 2105: 2106: /** 2107: * Max value for code point. 2108: * 2109: * @since 1.5 2110: */ 2111: public static final int MAX_CODE_POINT = 0x010ffff; 2112: 2113: 2114: /** 2115: * Minimum high surrogate code in UTF-16 encoding. 2116: * 2117: * @since 1.5 2118: */ 2119: public static final char MIN_HIGH_SURROGATE = '\ud800'; 2120: 2121: /** 2122: * Maximum high surrogate code in UTF-16 encoding. 2123: * 2124: * @since 1.5 2125: */ 2126: public static final char MAX_HIGH_SURROGATE = '\udbff'; 2127: 2128: /** 2129: * Minimum low surrogate code in UTF-16 encoding. 
2130: * 2131: * @since 1.5 2132: */ 2133: public static final char MIN_LOW_SURROGATE = '\udc00'; 2134: 2135: /** 2136: * Maximum low surrogate code in UTF-16 encoding. 2137: * 2138: * @since 1.5 2139: */ 2140: public static final char MAX_LOW_SURROGATE = '\udfff'; 2141: 2142: /** 2143: * Minimum surrogate code in UTF-16 encoding. 2144: * 2145: * @since 1.5 2146: */ 2147: public static final char MIN_SURROGATE = MIN_HIGH_SURROGATE; 2148: 2149: /** 2150: * Maximum low surrogate code in UTF-16 encoding. 2151: * 2152: * @since 1.5 2153: */ 2154: public static final char MAX_SURROGATE = MAX_LOW_SURROGATE; 2155: 2156: /** 2157: * Grabs an attribute offset from the Unicode attribute database. The lower 2158: * 5 bits are the character type, the next 2 bits are flags, and the top 2159: * 9 bits are the offset into the attribute tables. Note that the top 9 2160: * bits are meaningless in this context; they are useful only in the native 2161: * code. 2162: * 2163: * @param ch the character to look up 2164: * @return the character's attribute offset and type 2165: * @see #TYPE_MASK 2166: * @see #NO_BREAK_MASK 2167: * @see #MIRROR_MASK 2168: */ 2169: private static native char readChar(char ch); 2170: 2171: /** 2172: * Wraps up a character. 2173: * 2174: * @param value the character to wrap 2175: */ 2176: public Character(char value) 2177: { 2178: this.value = value; 2179: } 2180: 2181: /** 2182: * Returns the character which has been wrapped by this class. 2183: * 2184: * @return the character wrapped 2185: */ 2186: public char charValue() 2187: { 2188: return value; 2189: } 2190: 2191: /** 2192: * Returns the numerical value (unsigned) of the wrapped character. 2193: * Range of returned values: 0x0000-0xFFFF. 2194: * 2195: * @return the value of the wrapped character 2196: */ 2197: public int hashCode() 2198: { 2199: return value; 2200: } 2201: 2202: /** 2203: * Determines if an object is equal to this object. 
This is only true for 2204: * another Character object wrapping the same value. 2205: * 2206: * @param o object to compare 2207: * @return true if o is a Character with the same value 2208: */ 2209: public boolean equals(Object o) 2210: { 2211: return o instanceof Character && value == ((Character) o).value; 2212: } 2213: 2214: /** 2215: * Converts the wrapped character into a String. 2216: * 2217: * @return a String containing one character -- the wrapped character 2218: * of this instance 2219: */ 2220: public String toString() 2221: { 2222: // This assumes that String.valueOf(char) can create a single-character 2223: // String more efficiently than through the public API. 2224: return String.valueOf(value); 2225: } 2226: 2227: /** 2228: * Returns a String of length 1 representing the specified character. 2229: * 2230: * @param ch the character to convert 2231: * @return a String containing the character 2232: * @since 1.4 2233: */ 2234: public static String toString(char ch) 2235: { 2236: // This assumes that String.valueOf(char) can create a single-character 2237: // String more efficiently than through the public API. 2238: return String.valueOf(ch); 2239: } 2240: 2241: /** 2242: * Determines if a character is a Unicode lowercase letter. For example, 2243: * <code>'a'</code> is lowercase. 2244: * <br> 2245: * lowercase = [Ll] 2246: * 2247: * @param ch character to test 2248: * @return true if ch is a Unicode lowercase letter, else false 2249: * @see #isUpperCase(char) 2250: * @see #isTitleCase(char) 2251: * @see #toLowerCase(char) 2252: * @see #getType(char) 2253: */ 2254: public static boolean isLowerCase(char ch) 2255: { 2256: return getType(ch) == LOWERCASE_LETTER; 2257: } 2258: 2259: /** 2260: * Determines if a character is a Unicode uppercase letter. For example, 2261: * <code>'A'</code> is uppercase. 
2262: * <br> 2263: * uppercase = [Lu] 2264: * 2265: * @param ch character to test 2266: * @return true if ch is a Unicode uppercase letter, else false 2267: * @see #isLowerCase(char) 2268: * @see #isTitleCase(char) 2269: * @see #toUpperCase(char) 2270: * @see #getType(char) 2271: */ 2272: public static boolean isUpperCase(char ch) 2273: { 2274: return getType(ch) == UPPERCASE_LETTER; 2275: } 2276: 2277: /** 2278: * Determines if a character is a Unicode titlecase letter. For example, 2279: * the character "Lj" (Latin capital L with small letter j) is titlecase. 2280: * <br> 2281: * titlecase = [Lt] 2282: * 2283: * @param ch character to test 2284: * @return true if ch is a Unicode titlecase letter, else false 2285: * @see #isLowerCase(char) 2286: * @see #isUpperCase(char) 2287: * @see #toTitleCase(char) 2288: * @see #getType(char) 2289: */ 2290: public static boolean isTitleCase(char ch) 2291: { 2292: return getType(ch) == TITLECASE_LETTER; 2293: } 2294: 2295: /** 2296: * Determines if a character is a Unicode decimal digit. For example, 2297: * <code>'0'</code> is a digit. 2298: * <br> 2299: * Unicode decimal digit = [Nd] 2300: * 2301: * @param ch character to test 2302: * @return true if ch is a Unicode decimal digit, else false 2303: * @see #digit(char, int) 2304: * @see #forDigit(int, int) 2305: * @see #getType(char) 2306: */ 2307: public static boolean isDigit(char ch) 2308: { 2309: return getType(ch) == DECIMAL_DIGIT_NUMBER; 2310: } 2311: 2312: /** 2313: * Determines if a character is part of the Unicode Standard. This is an 2314: * evolving standard, but covers every character in the data file. 
2315: * <br> 2316: * defined = not [Cn] 2317: * 2318: * @param ch character to test 2319: * @return true if ch is a Unicode character, else false 2320: * @see #isDigit(char) 2321: * @see #isLetter(char) 2322: * @see #isLetterOrDigit(char) 2323: * @see #isLowerCase(char) 2324: * @see #isTitleCase(char) 2325: * @see #isUpperCase(char) 2326: */ 2327: public static boolean isDefined(char ch) 2328: { 2329: return getType(ch) != UNASSIGNED; 2330: } 2331: 2332: /** 2333: * Determines if a character is a Unicode letter. Not all letters have case, 2334: * so this may return true when isLowerCase and isUpperCase return false. 2335: * <br> 2336: * letter = [Lu]|[Ll]|[Lt]|[Lm]|[Lo] 2337: * 2338: * @param ch character to test 2339: * @return true if ch is a Unicode letter, else false 2340: * @see #isDigit(char) 2341: * @see #isJavaIdentifierStart(char) 2342: * @see #isJavaLetter(char) 2343: * @see #isJavaLetterOrDigit(char) 2344: * @see #isLetterOrDigit(char) 2345: * @see #isLowerCase(char) 2346: * @see #isTitleCase(char) 2347: * @see #isUnicodeIdentifierStart(char) 2348: * @see #isUpperCase(char) 2349: */ 2350: public static boolean isLetter(char ch) 2351: { 2352: return ((1 << getType(ch)) 2353: & ((1 << UPPERCASE_LETTER) 2354: | (1 << LOWERCASE_LETTER) 2355: | (1 << TITLECASE_LETTER) 2356: | (1 << MODIFIER_LETTER) 2357: | (1 << OTHER_LETTER))) != 0; 2358: } 2359: 2360: /** 2361: * Determines if a character is a Unicode letter or a Unicode digit. This 2362: * is the combination of isLetter and isDigit. 
2363: * <br> 2364: * letter or digit = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nd] 2365: * 2366: * @param ch character to test 2367: * @return true if ch is a Unicode letter or a Unicode digit, else false 2368: * @see #isDigit(char) 2369: * @see #isJavaIdentifierPart(char) 2370: * @see #isJavaLetter(char) 2371: * @see #isJavaLetterOrDigit(char) 2372: * @see #isLetter(char) 2373: * @see #isUnicodeIdentifierPart(char) 2374: */ 2375: public static boolean isLetterOrDigit(char ch) 2376: { 2377: return ((1 << getType(ch)) 2378: & ((1 << UPPERCASE_LETTER) 2379: | (1 << LOWERCASE_LETTER) 2380: | (1 << TITLECASE_LETTER) 2381: | (1 << MODIFIER_LETTER) 2382: | (1 << OTHER_LETTER) 2383: | (1 << DECIMAL_DIGIT_NUMBER))) != 0; 2384: } 2385: 2386: /** 2387: * Determines if a character can start a Java identifier. This is the 2388: * combination of isLetter, any character where getType returns 2389: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 2390: * (like '_'). 2391: * 2392: * @param ch character to test 2393: * @return true if ch can start a Java identifier, else false 2394: * @deprecated Replaced by {@link #isJavaIdentifierStart(char)} 2395: * @see #isJavaLetterOrDigit(char) 2396: * @see #isJavaIdentifierStart(char) 2397: * @see #isJavaIdentifierPart(char) 2398: * @see #isLetter(char) 2399: * @see #isLetterOrDigit(char) 2400: * @see #isUnicodeIdentifierStart(char) 2401: */ 2402: public static boolean isJavaLetter(char ch) 2403: { 2404: return isJavaIdentifierStart(ch); 2405: } 2406: 2407: /** 2408: * Determines if a character can follow the first letter in 2409: * a Java identifier. This is the combination of isJavaLetter (isLetter, 2410: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 2411: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 2412: * or isIdentifierIgnorable. 
2413: * 2414: * @param ch character to test 2415: * @return true if ch can follow the first letter in a Java identifier 2416: * @deprecated Replaced by {@link #isJavaIdentifierPart(char)} 2417: * @see #isJavaLetter(char) 2418: * @see #isJavaIdentifierStart(char) 2419: * @see #isJavaIdentifierPart(char) 2420: * @see #isLetter(char) 2421: * @see #isLetterOrDigit(char) 2422: * @see #isUnicodeIdentifierPart(char) 2423: * @see #isIdentifierIgnorable(char) 2424: */ 2425: public static boolean isJavaLetterOrDigit(char ch) 2426: { 2427: return isJavaIdentifierPart(ch); 2428: } 2429: 2430: /** 2431: * Determines if a character can start a Java identifier. This is the 2432: * combination of isLetter, any character where getType returns 2433: * LETTER_NUMBER, currency symbols (like '$'), and connecting punctuation 2434: * (like '_'). 2435: * <br> 2436: * Java identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc] 2437: * 2438: * @param ch character to test 2439: * @return true if ch can start a Java identifier, else false 2440: * @see #isJavaIdentifierPart(char) 2441: * @see #isLetter(char) 2442: * @see #isUnicodeIdentifierStart(char) 2443: * @since 1.1 2444: */ 2445: public static boolean isJavaIdentifierStart(char ch) 2446: { 2447: return ((1 << getType(ch)) 2448: & ((1 << UPPERCASE_LETTER) 2449: | (1 << LOWERCASE_LETTER) 2450: | (1 << TITLECASE_LETTER) 2451: | (1 << MODIFIER_LETTER) 2452: | (1 << OTHER_LETTER) 2453: | (1 << LETTER_NUMBER) 2454: | (1 << CURRENCY_SYMBOL) 2455: | (1 << CONNECTOR_PUNCTUATION))) != 0; 2456: } 2457: 2458: /** 2459: * Determines if a character can follow the first letter in 2460: * a Java identifier. This is the combination of isJavaLetter (isLetter, 2461: * type of LETTER_NUMBER, currency, connecting punctuation) and digit, 2462: * numeric letter (like Roman numerals), combining marks, non-spacing marks, 2463: * or isIdentifierIgnorable. 
2464: * <br> 2465: * Java identifier extender = 2466: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Sc]|[Pc]|[Mn]|[Mc]|[Nd]|[Cf] 2467: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2468: * 2469: * @param ch character to test 2470: * @return true if ch can follow the first letter in a Java identifier 2471: * @see #isIdentifierIgnorable(char) 2472: * @see #isJavaIdentifierStart(char) 2473: * @see #isLetterOrDigit(char) 2474: * @see #isUnicodeIdentifierPart(char) 2475: * @since 1.1 2476: */ 2477: public static boolean isJavaIdentifierPart(char ch) 2478: { 2479: int category = getType(ch); 2480: return ((1 << category) 2481: & ((1 << UPPERCASE_LETTER) 2482: | (1 << LOWERCASE_LETTER) 2483: | (1 << TITLECASE_LETTER) 2484: | (1 << MODIFIER_LETTER) 2485: | (1 << OTHER_LETTER) 2486: | (1 << NON_SPACING_MARK) 2487: | (1 << COMBINING_SPACING_MARK) 2488: | (1 << DECIMAL_DIGIT_NUMBER) 2489: | (1 << LETTER_NUMBER) 2490: | (1 << CURRENCY_SYMBOL) 2491: | (1 << CONNECTOR_PUNCTUATION) 2492: | (1 << FORMAT))) != 0 2493: || (category == CONTROL && isIdentifierIgnorable(ch)); 2494: } 2495: 2496: /** 2497: * Determines if a character can start a Unicode identifier. Only 2498: * letters can start a Unicode identifier, but this includes characters 2499: * in LETTER_NUMBER. 
2500: * <br> 2501: * Unicode identifier start = [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl] 2502: * 2503: * @param ch character to test 2504: * @return true if ch can start a Unicode identifier, else false 2505: * @see #isJavaIdentifierStart(char) 2506: * @see #isLetter(char) 2507: * @see #isUnicodeIdentifierPart(char) 2508: * @since 1.1 2509: */ 2510: public static boolean isUnicodeIdentifierStart(char ch) 2511: { 2512: return ((1 << getType(ch)) 2513: & ((1 << UPPERCASE_LETTER) 2514: | (1 << LOWERCASE_LETTER) 2515: | (1 << TITLECASE_LETTER) 2516: | (1 << MODIFIER_LETTER) 2517: | (1 << OTHER_LETTER) 2518: | (1 << LETTER_NUMBER))) != 0; 2519: } 2520: 2521: /** 2522: * Determines if a character can follow the first letter in 2523: * a Unicode identifier. This includes letters, connecting punctuation, 2524: * digits, numeric letters, combining marks, non-spacing marks, and 2525: * isIdentifierIgnorable. 2526: * <br> 2527: * Unicode identifier extender = 2528: * [Lu]|[Ll]|[Lt]|[Lm]|[Lo]|[Nl]|[Mn]|[Mc]|[Nd]|[Pc]|[Cf]| 2529: * |U+0000-U+0008|U+000E-U+001B|U+007F-U+009F 2530: * 2531: * @param ch character to test 2532: * @return true if ch can follow the first letter in a Unicode identifier 2533: * @see #isIdentifierIgnorable(char) 2534: * @see #isJavaIdentifierPart(char) 2535: * @see #isLetterOrDigit(char) 2536: * @see #isUnicodeIdentifierStart(char) 2537: * @since 1.1 2538: */ 2539: public static boolean isUnicodeIdentifierPart(char ch) 2540: { 2541: int category = getType(ch); 2542: return ((1 << category) 2543: & ((1 << UPPERCASE_LETTER) 2544: | (1 << LOWERCASE_LETTER) 2545: | (1 << TITLECASE_LETTER) 2546: | (1 << MODIFIER_LETTER) 2547: | (1 << OTHER_LETTER) 2548: | (1 << NON_SPACING_MARK) 2549: | (1 << COMBINING_SPACING_MARK) 2550: | (1 << DECIMAL_DIGIT_NUMBER) 2551: | (1 << LETTER_NUMBER) 2552: | (1 << CONNECTOR_PUNCTUATION) 2553: | (1 << FORMAT))) != 0 2554: || (category == CONTROL && isIdentifierIgnorable(ch)); 2555: } 2556: 2557: /** 2558: * Determines if a character is 
ignorable in a Unicode identifier. This 2559: * includes the non-whitespace ISO control characters (<code>'\u0000'</code> 2560: * through <code>'\u0008'</code>, <code>'\u000E'</code> through 2561: * <code>'\u001B'</code>, and <code>'\u007F'</code> through 2562: * <code>'\u009F'</code>), and FORMAT characters. 2563: * <br> 2564: * Unicode identifier ignorable = [Cf]|U+0000-U+0008|U+000E-U+001B 2565: * |U+007F-U+009F 2566: * 2567: * @param ch character to test 2568: * @return true if ch is ignorable in a Unicode or Java identifier 2569: * @see #isJavaIdentifierPart(char) 2570: * @see #isUnicodeIdentifierPart(char) 2571: * @since 1.1 2572: */ 2573: public static boolean isIdentifierIgnorable(char ch) 2574: { 2575: return (ch <= '\u009F' && (ch < '\t' || ch >= '\u007F' 2576: || (ch <= '\u001B' && ch >= '\u000E'))) 2577: || getType(ch) == FORMAT; 2578: } 2579: 2580: /** 2581: * Converts a Unicode character into its lowercase equivalent mapping. 2582: * If a mapping does not exist, then the character passed is returned. 2583: * Note that isLowerCase(toLowerCase(ch)) does not always return true. 2584: * 2585: * @param ch character to convert to lowercase 2586: * @return lowercase mapping of ch, or ch if lowercase mapping does 2587: * not exist 2588: * @see #isLowerCase(char) 2589: * @see #isUpperCase(char) 2590: * @see #toTitleCase(char) 2591: * @see #toUpperCase(char) 2592: */ 2593: public static native char toLowerCase(char ch); 2594: 2595: /** 2596: * Converts a Unicode character into its uppercase equivalent mapping. 2597: * If a mapping does not exist, then the character passed is returned. 2598: * Note that isUpperCase(toUpperCase(ch)) does not always return true. 
2599: * 2600: * @param ch character to convert to uppercase 2601: * @return uppercase mapping of ch, or ch if uppercase mapping does 2602: * not exist 2603: * @see #isLowerCase(char) 2604: * @see #isUpperCase(char) 2605: * @see #toLowerCase(char) 2606: * @see #toTitleCase(char) 2607: */ 2608: public static native char toUpperCase(char ch); 2609: 2610: /** 2611: * Converts a Unicode character into its titlecase equivalent mapping. 2612: * If a mapping does not exist, then the character passed is returned. 2613: * Note that isTitleCase(toTitleCase(ch)) does not always return true. 2614: * 2615: * @param ch character to convert to titlecase 2616: * @return titlecase mapping of ch, or ch if titlecase mapping does 2617: * not exist 2618: * @see #isTitleCase(char) 2619: * @see #toLowerCase(char) 2620: * @see #toUpperCase(char) 2621: */ 2622: public static native char toTitleCase(char ch); 2623: 2624: /** 2625: * Converts a character into a digit of the specified radix. If the radix 2626: * exceeds MIN_RADIX or MAX_RADIX, or if the result of getNumericValue(ch) 2627: * exceeds the radix, or if ch is not a decimal digit or in the case 2628: * insensitive set of 'a'-'z', the result is -1. 2629: * <br> 2630: * character argument boundary = [Nd]|U+0041-U+005A|U+0061-U+007A 2631: * |U+FF21-U+FF3A|U+FF41-U+FF5A 2632: * 2633: * @param ch character to convert into a digit 2634: * @param radix radix in which ch is a digit 2635: * @return digit which ch represents in radix, or -1 not a valid digit 2636: * @see #MIN_RADIX 2637: * @see #MAX_RADIX 2638: * @see #forDigit(int, int) 2639: * @see #isDigit(char) 2640: * @see #getNumericValue(char) 2641: */ 2642: public static native int digit(char ch, int radix); 2643: 2644: /** 2645: * Returns the Unicode numeric value property of a character. For example, 2646: * <code>'\\u216C'</code> (the Roman numeral fifty) returns 50. 
2647: * 2648: * <p>This method also returns values for the letters A through Z, (not 2649: * specified by Unicode), in these ranges: <code>'\u0041'</code> 2650: * through <code>'\u005A'</code> (uppercase); <code>'\u0061'</code> 2651: * through <code>'\u007A'</code> (lowercase); and <code>'\uFF21'</code> 2652: * through <code>'\uFF3A'</code>, <code>'\uFF41'</code> through 2653: * <code>'\uFF5A'</code> (full width variants). 2654: * 2655: * <p>If the character lacks a numeric value property, -1 is returned. 2656: * If the character has a numeric value property which is not representable 2657: * as a nonnegative integer, such as a fraction, -2 is returned. 2658: * 2659: * character argument boundary = [Nd]|[Nl]|[No]|U+0041-U+005A|U+0061-U+007A 2660: * |U+FF21-U+FF3A|U+FF41-U+FF5A 2661: * 2662: * @param ch character from which the numeric value property will 2663: * be retrieved 2664: * @return the numeric value property of ch, or -1 if it does not exist, or 2665: * -2 if it is not representable as a nonnegative integer 2666: * @see #forDigit(int, int) 2667: * @see #digit(char, int) 2668: * @see #isDigit(char) 2669: * @since 1.1 2670: */ 2671: public static native int getNumericValue(char ch); 2672: 2673: /** 2674: * Determines if a character is a ISO-LATIN-1 space. This is only the five 2675: * characters <code>'\t'</code>, <code>'\n'</code>, <code>'\f'</code>, 2676: * <code>'\r'</code>, and <code>' '</code>. 2677: * <br> 2678: * Java space = U+0020|U+0009|U+000A|U+000C|U+000D 2679: * 2680: * @param ch character to test 2681: * @return true if ch is a space, else false 2682: * @deprecated Replaced by {@link #isWhitespace(char)} 2683: * @see #isSpaceChar(char) 2684: * @see #isWhitespace(char) 2685: */ 2686: public static boolean isSpace(char ch) 2687: { 2688: // Performing the subtraction up front alleviates need to compare longs. 
2689: return ch-- <= ' ' && ((1 << ch) 2690: & ((1 << (' ' - 1)) 2691: | (1 << ('\t' - 1)) 2692: | (1 << ('\n' - 1)) 2693: | (1 << ('\r' - 1)) 2694: | (1 << ('\f' - 1)))) != 0; 2695: } 2696: 2697: /** 2698: * Determines if a character is a Unicode space character. This includes 2699: * SPACE_SEPARATOR, LINE_SEPARATOR, and PARAGRAPH_SEPARATOR. 2700: * <br> 2701: * Unicode space = [Zs]|[Zp]|[Zl] 2702: * 2703: * @param ch character to test 2704: * @return true if ch is a Unicode space, else false 2705: * @see #isWhitespace(char) 2706: * @since 1.1 2707: */ 2708: public static boolean isSpaceChar(char ch) 2709: { 2710: return ((1 << getType(ch)) 2711: & ((1 << SPACE_SEPARATOR) 2712: | (1 << LINE_SEPARATOR) 2713: | (1 << PARAGRAPH_SEPARATOR))) != 0; 2714: } 2715: 2716: /** 2717: * Determines if a character is Java whitespace. This includes Unicode 2718: * space characters (SPACE_SEPARATOR, LINE_SEPARATOR, and 2719: * PARAGRAPH_SEPARATOR) except the non-breaking spaces 2720: * (<code>'\u00A0'</code>, <code>'\u2007'</code>, and <code>'\u202F'</code>); 2721: * and these characters: <code>'\u0009'</code>, <code>'\u000A'</code>, 2722: * <code>'\u000B'</code>, <code>'\u000C'</code>, <code>'\u000D'</code>, 2723: * <code>'\u001C'</code>, <code>'\u001D'</code>, <code>'\u001E'</code>, 2724: * and <code>'\u001F'</code>. 
2725: * <br> 2726: * Java whitespace = ([Zs] not Nb)|[Zl]|[Zp]|U+0009-U+000D|U+001C-U+001F 2727: * 2728: * @param ch character to test 2729: * @return true if ch is Java whitespace, else false 2730: * @see #isSpaceChar(char) 2731: * @since 1.1 2732: */ 2733: public static boolean isWhitespace(char ch) 2734: { 2735: int attr = readChar(ch); 2736: return ((((1 << (attr & TYPE_MASK)) 2737: & ((1 << SPACE_SEPARATOR) 2738: | (1 << LINE_SEPARATOR) 2739: | (1 << PARAGRAPH_SEPARATOR))) != 0) 2740: && (attr & NO_BREAK_MASK) == 0) 2741: || (ch <= '\u001F' && ((1 << ch) 2742: & ((1 << '\t') 2743: | (1 << '\n') 2744: | (1 << '\u000B') 2745: | (1 << '\u000C') 2746: | (1 << '\r') 2747: | (1 << '\u001C') 2748: | (1 << '\u001D') 2749: | (1 << '\u001E') 2750: | (1 << '\u001F'))) != 0); 2751: } 2752: 2753: /** 2754: * Determines if a character has the ISO Control property. 2755: * <br> 2756: * ISO Control = [Cc] 2757: * 2758: * @param ch character to test 2759: * @return true if ch is an ISO Control character, else false 2760: * @see #isSpaceChar(char) 2761: * @see #isWhitespace(char) 2762: * @since 1.1 2763: */ 2764: public static boolean isISOControl(char ch) 2765: { 2766: return getType(ch) == CONTROL; 2767: } 2768: 2769: /** 2770: * Returns the Unicode general category property of a character. 
2771: * 2772: * @param ch character from which the general category property will 2773: * be retrieved 2774: * @return the character category property of ch as an integer 2775: * @see #UNASSIGNED 2776: * @see #UPPERCASE_LETTER 2777: * @see #LOWERCASE_LETTER 2778: * @see #TITLECASE_LETTER 2779: * @see #MODIFIER_LETTER 2780: * @see #OTHER_LETTER 2781: * @see #NON_SPACING_MARK 2782: * @see #ENCLOSING_MARK 2783: * @see #COMBINING_SPACING_MARK 2784: * @see #DECIMAL_DIGIT_NUMBER 2785: * @see #LETTER_NUMBER 2786: * @see #OTHER_NUMBER 2787: * @see #SPACE_SEPARATOR 2788: * @see #LINE_SEPARATOR 2789: * @see #PARAGRAPH_SEPARATOR 2790: * @see #CONTROL 2791: * @see #FORMAT 2792: * @see #PRIVATE_USE 2793: * @see #SURROGATE 2794: * @see #DASH_PUNCTUATION 2795: * @see #START_PUNCTUATION 2796: * @see #END_PUNCTUATION 2797: * @see #CONNECTOR_PUNCTUATION 2798: * @see #OTHER_PUNCTUATION 2799: * @see #MATH_SYMBOL 2800: * @see #CURRENCY_SYMBOL 2801: * @see #MODIFIER_SYMBOL 2802: * @see #INITIAL_QUOTE_PUNCTUATION 2803: * @see #FINAL_QUOTE_PUNCTUATION 2804: * @since 1.1 2805: */ 2806: public static native int getType(char ch); 2807: 2808: /** 2809: * Converts a digit into a character which represents that digit 2810: * in a specified radix. If the radix exceeds MIN_RADIX or MAX_RADIX, 2811: * or the digit exceeds the radix, then the null character <code>'\0'</code> 2812: * is returned. Otherwise the return value is in '0'-'9' and 'a'-'z'. 2813: * <br> 2814: * return value boundary = U+0030-U+0039|U+0061-U+007A 2815: * 2816: * @param digit digit to be converted into a character 2817: * @param radix radix of digit 2818: * @return character representing digit in radix, or '\0' 2819: * @see #MIN_RADIX 2820: * @see #MAX_RADIX 2821: * @see #digit(char, int) 2822: */ 2823: public static char forDigit(int digit, int radix) 2824: { 2825: if (radix < MIN_RADIX || radix > MAX_RADIX 2826: || digit < 0 || digit >= radix) 2827: return '\0'; 2828: return (char) (digit < 10 ? 
('0' + digit) : ('a' - 10 + digit)); 2829: } 2830: 2831: /** 2832: * Returns the Unicode directionality property of the character. This 2833: * is used in the visual ordering of text. 2834: * 2835: * @param ch the character to look up 2836: * @return the directionality constant, or DIRECTIONALITY_UNDEFINED 2837: * @see #DIRECTIONALITY_UNDEFINED 2838: * @see #DIRECTIONALITY_LEFT_TO_RIGHT 2839: * @see #DIRECTIONALITY_RIGHT_TO_LEFT 2840: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC 2841: * @see #DIRECTIONALITY_EUROPEAN_NUMBER 2842: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR 2843: * @see #DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR 2844: * @see #DIRECTIONALITY_ARABIC_NUMBER 2845: * @see #DIRECTIONALITY_COMMON_NUMBER_SEPARATOR 2846: * @see #DIRECTIONALITY_NONSPACING_MARK 2847: * @see #DIRECTIONALITY_BOUNDARY_NEUTRAL 2848: * @see #DIRECTIONALITY_PARAGRAPH_SEPARATOR 2849: * @see #DIRECTIONALITY_SEGMENT_SEPARATOR 2850: * @see #DIRECTIONALITY_WHITESPACE 2851: * @see #DIRECTIONALITY_OTHER_NEUTRALS 2852: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING 2853: * @see #DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE 2854: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING 2855: * @see #DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE 2856: * @see #DIRECTIONALITY_POP_DIRECTIONAL_FORMAT 2857: * @since 1.4 2858: */ 2859: public static native byte getDirectionality(char ch); 2860: 2861: /** 2862: * Determines whether the character is mirrored according to Unicode. For 2863: * example, <code>\u0028</code> (LEFT PARENTHESIS) appears as '(' in 2864: * left-to-right text, but ')' in right-to-left text. 2865: * 2866: * @param ch the character to look up 2867: * @return true if the character is mirrored 2868: * @since 1.4 2869: */ 2870: public static boolean isMirrored(char ch) 2871: { 2872: return (readChar(ch) & MIRROR_MASK) != 0; 2873: } 2874: 2875: /** 2876: * Compares another Character to this Character, numerically. 
2877: * 2878: * @param anotherCharacter Character to compare with this Character 2879: * @return a negative integer if this Character is less than 2880: * anotherCharacter, zero if this Character is equal, and 2881: * a positive integer if this Character is greater 2882: * @throws NullPointerException if anotherCharacter is null 2883: * @since 1.2 2884: */ 2885: public int compareTo(Character anotherCharacter) 2886: { 2887: return value - anotherCharacter.value; 2888: } 2889: 2890: /** 2891: * Compares an object to this Character. Assuming the object is a 2892: * Character object, this method performs the same comparison as 2893: * compareTo(Character). 2894: * 2895: * @param o object to compare 2896: * @return the comparison value 2897: * @throws ClassCastException if o is not a Character object 2898: * @throws NullPointerException if o is null 2899: * @see #compareTo(Character) 2900: * @since 1.2 2901: */ 2902: public int compareTo(Object o) 2903: { 2904: return compareTo((Character) o); 2905: } 2906: 2907: /** 2908: * Returns an <code>Character</code> object wrapping the value. 2909: * In contrast to the <code>Character</code> constructor, this method 2910: * will cache some values. It is used by boxing conversion. 2911: * 2912: * @param val the value to wrap 2913: * @return the <code>Character</code> 2914: * 2915: * @since 1.5 2916: */ 2917: public static Character valueOf(char val) 2918: { 2919: if (val > MAX_CACHE) 2920: return new Character(val); 2921: synchronized (charCache) 2922: { 2923: if (charCache[val - MIN_VALUE] == null) 2924: charCache[val - MIN_VALUE] = new Character(val); 2925: return charCache[val - MIN_VALUE]; 2926: } 2927: } 2928: 2929: /** 2930: * Reverse the bytes in val. 2931: * @since 1.5 2932: */ 2933: public static char reverseBytes(char val) 2934: { 2935: return (char) (((val >> 8) & 0xff) | ((val << 8) & 0xff00)); 2936: } 2937: 2938: /** 2939: * Converts a unicode code point to a UTF-16 representation of that 2940: * code point. 
2941: * 2942: * @param codePoint the unicode code point 2943: * 2944: * @return the UTF-16 representation of that code point 2945: * 2946: * @throws IllegalArgumentException if the code point is not a valid 2947: * unicode code point 2948: * 2949: * @since 1.5 2950: */ 2951: public static char[] toChars(int codePoint) 2952: { 2953: char[] result = new char[charCount(codePoint)]; 2954: int ignore = toChars(codePoint, result, 0); 2955: return result; 2956: } 2957: 2958: /** 2959: * Converts a unicode code point to its UTF-16 representation. 2960: * 2961: * @param codePoint the unicode code point 2962: * @param dst the target char array 2963: * @param dstIndex the start index for the target 2964: * 2965: * @return number of characters written to <code>dst</code> 2966: * 2967: * @throws IllegalArgumentException if <code>codePoint</code> is not a 2968: * valid unicode code point 2969: * @throws NullPointerException if <code>dst</code> is <code>null</code> 2970: * @throws IndexOutOfBoundsException if <code>dstIndex</code> is not valid 2971: * in <code>dst</code> or if the UTF-16 representation does not 2972: * fit into <code>dst</code> 2973: * 2974: * @since 1.5 2975: */ 2976: public static int toChars(int codePoint, char[] dst, int dstIndex) 2977: { 2978: if (!isValidCodePoint(codePoint)) 2979: { 2980: throw new IllegalArgumentException("not a valid code point: " 2981: + codePoint); 2982: } 2983: 2984: int result; 2985: if (isSupplementaryCodePoint(codePoint)) 2986: { 2987: // Write second char first to cause IndexOutOfBoundsException 2988: // immediately. 
2989: final int cp2 = codePoint - 0x10000; 2990: dst[dstIndex + 1] = (char) ((cp2 % 0x400) + (int) MIN_LOW_SURROGATE); 2991: dst[dstIndex] = (char) ((cp2 / 0x400) + (int) MIN_HIGH_SURROGATE); 2992: result = 2; 2993: } 2994: else 2995: { 2996: dst[dstIndex] = (char) codePoint; 2997: result = 1; 2998: } 2999: return result; 3000: } 3001: 3002: /** 3003: * Return number of 16-bit characters required to represent the given 3004: * code point. 3005: * 3006: * @param codePoint a unicode code point 3007: * 3008: * @return 2 if codePoint >= 0x10000, 1 otherwise. 3009: * 3010: * @since 1.5 3011: */ 3012: public static int charCount(int codePoint) 3013: { 3014: return 3015: (codePoint >= MIN_SUPPLEMENTARY_CODE_POINT) 3016: ? 2 3017: : 1; 3018: } 3019: 3020: /** 3021: * Determines whether the specified code point is 3022: * in the range 0x10000 .. 0x10FFFF, i.e. the character is within the Unicode 3023: * supplementary character range. 3024: * 3025: * @param codePoint a Unicode code point 3026: * 3027: * @return <code>true</code> if code point is in supplementary range 3028: * 3029: * @since 1.5 3030: */ 3031: public static boolean isSupplementaryCodePoint(int codePoint) 3032: { 3033: return codePoint >= MIN_SUPPLEMENTARY_CODE_POINT 3034: && codePoint <= MAX_CODE_POINT; 3035: } 3036: 3037: /** 3038: * Determines whether the specified code point is 3039: * in the range 0x0000 .. 0x10FFFF, i.e. it is a valid Unicode code point. 3040: * 3041: * @param codePoint a Unicode code point 3042: * 3043: * @return <code>true</code> if code point is valid 3044: * 3045: * @since 1.5 3046: */ 3047: public static boolean isValidCodePoint(int codePoint) 3048: { 3049: return codePoint >= MIN_CODE_POINT && codePoint <= MAX_CODE_POINT; 3050: } 3051: 3052: /** 3053: * Return true if the given character is a high surrogate. 
3054: * @param ch the character 3055: * @return true if the character is a high surrogate character 3056: * 3057: * @since 1.5 3058: */ 3059: public static boolean isHighSurrogate(char ch) 3060: { 3061: return ch >= MIN_HIGH_SURROGATE && ch <= MAX_HIGH_SURROGATE; 3062: } 3063: 3064: /** 3065: * Return true if the given character is a low surrogate. 3066: * @param ch the character 3067: * @return true if the character is a low surrogate character 3068: * 3069: * @since 1.5 3070: */ 3071: public static boolean isLowSurrogate(char ch) 3072: { 3073: return ch >= MIN_LOW_SURROGATE && ch <= MAX_LOW_SURROGATE; 3074: } 3075: 3076: /** 3077: * Return true if the given characters compose a surrogate pair. 3078: * This is true if the first character is a high surrogate and the 3079: * second character is a low surrogate. 3080: * @param ch1 the first character 3081: * @param ch2 the first character 3082: * @return true if the characters compose a surrogate pair 3083: * 3084: * @since 1.5 3085: */ 3086: public static boolean isSurrogatePair(char ch1, char ch2) 3087: { 3088: return isHighSurrogate(ch1) && isLowSurrogate(ch2); 3089: } 3090: 3091: /** 3092: * Given a valid surrogate pair, this returns the corresponding 3093: * code point. 3094: * @param high the high character of the pair 3095: * @param low the low character of the pair 3096: * @return the corresponding code point 3097: * 3098: * @since 1.5 3099: */ 3100: public static int toCodePoint(char high, char low) 3101: { 3102: return ((high - MIN_HIGH_SURROGATE) * 0x400) + 3103: (low - MIN_LOW_SURROGATE) + 0x10000; 3104: } 3105: 3106: /** 3107: * Get the code point at the specified index in the CharSequence. 3108: * This is like CharSequence#charAt(int), but if the character is 3109: * the start of a surrogate pair, and there is a following 3110: * character, and this character completes the pair, then the 3111: * corresponding supplementary code point is returned. 
Otherwise, 3112: * the character at the index is returned. 3113: * 3114: * @param sequence the CharSequence 3115: * @param index the index of the codepoint to get, starting at 0 3116: * @return the codepoint at the specified index 3117: * @throws IndexOutOfBoundsException if index is negative or >= length() 3118: * @since 1.5 3119: */ 3120: public static int codePointAt(CharSequence sequence, int index) 3121: { 3122: int len = sequence.length(); 3123: if (index < 0 || index >= len) 3124: throw new IndexOutOfBoundsException(); 3125: char high = sequence.charAt(index); 3126: if (! isHighSurrogate(high) || ++index >= len) 3127: return high; 3128: char low = sequence.charAt(index); 3129: if (! isLowSurrogate(low)) 3130: return high; 3131: return toCodePoint(high, low); 3132: } 3133: 3134: /** 3135: * Get the code point at the specified index in the CharSequence. 3136: * If the character is the start of a surrogate pair, and there is a 3137: * following character, and this character completes the pair, then 3138: * the corresponding supplementary code point is returned. 3139: * Otherwise, the character at the index is returned. 3140: * 3141: * @param chars the character array in which to look 3142: * @param index the index of the codepoint to get, starting at 0 3143: * @return the codepoint at the specified index 3144: * @throws IndexOutOfBoundsException if index is negative or >= length() 3145: * @since 1.5 3146: */ 3147: public static int codePointAt(char[] chars, int index) 3148: { 3149: return codePointAt(chars, index, chars.length); 3150: } 3151: 3152: /** 3153: * Get the code point at the specified index in the CharSequence. 3154: * If the character is the start of a surrogate pair, and there is a 3155: * following character within the specified range, and this 3156: * character completes the pair, then the corresponding 3157: * supplementary code point is returned. Otherwise, the character 3158: * at the index is returned. 
3159: * 3160: * @param chars the character array in which to look 3161: * @param index the index of the codepoint to get, starting at 0 3162: * @param limit the limit past which characters should not be examined 3163: * @return the codepoint at the specified index 3164: * @throws IndexOutOfBoundsException if index is negative or >= 3165: * limit, or if limit is negative or >= the length of the array 3166: * @since 1.5 3167: */ 3168: public static int codePointAt(char[] chars, int index, int limit) 3169: { 3170: if (index < 0 || index >= limit || limit < 0 || limit >= chars.length) 3171: throw new IndexOutOfBoundsException(); 3172: char high = chars[index]; 3173: if (! isHighSurrogate(high) || ++index >= limit) 3174: return high; 3175: char low = chars[index]; 3176: if (! isLowSurrogate(low)) 3177: return high; 3178: return toCodePoint(high, low); 3179: } 3180: 3181: /** 3182: * Get the code point before the specified index. This is like 3183: * #codePointAt(char[], int), but checks the characters at 3184: * <code>index-1</code> and <code>index-2</code> to see if they form 3185: * a supplementary code point. If they do not, the character at 3186: * <code>index-1</code> is returned. 3187: * 3188: * @param chars the character array 3189: * @param index the index just past the codepoint to get, starting at 0 3190: * @return the codepoint at the specified index 3191: * @throws IndexOutOfBoundsException if index is negative or >= length() 3192: * @since 1.5 3193: */ 3194: public static int codePointBefore(char[] chars, int index) 3195: { 3196: return codePointBefore(chars, index, 1); 3197: } 3198: 3199: /** 3200: * Get the code point before the specified index. This is like 3201: * #codePointAt(char[], int), but checks the characters at 3202: * <code>index-1</code> and <code>index-2</code> to see if they form 3203: * a supplementary code point. If they do not, the character at 3204: * <code>index-1</code> is returned. 
The start parameter is used to 3205: * limit the range of the array which may be examined. 3206: * 3207: * @param chars the character array 3208: * @param index the index just past the codepoint to get, starting at 0 3209: * @param start the index before which characters should not be examined 3210: * @return the codepoint at the specified index 3211: * @throws IndexOutOfBoundsException if index is > start or > 3212: * the length of the array, or if limit is negative or >= the 3213: * length of the array 3214: * @since 1.5 3215: */ 3216: public static int codePointBefore(char[] chars, int index, int start) 3217: { 3218: if (index < start || index > chars.length 3219: || start < 0 || start >= chars.length) 3220: throw new IndexOutOfBoundsException(); 3221: --index; 3222: char low = chars[index]; 3223: if (! isLowSurrogate(low) || --index < start) 3224: return low; 3225: char high = chars[index]; 3226: if (! isHighSurrogate(high)) 3227: return low; 3228: return toCodePoint(high, low); 3229: } 3230: 3231: /** 3232: * Get the code point before the specified index. This is like 3233: * #codePointAt(CharSequence, int), but checks the characters at 3234: * <code>index-1</code> and <code>index-2</code> to see if they form 3235: * a supplementary code point. If they do not, the character at 3236: * <code>index-1</code> is returned. 3237: * 3238: * @param sequence the CharSequence 3239: * @param index the index just past the codepoint to get, starting at 0 3240: * @return the codepoint at the specified index 3241: * @throws IndexOutOfBoundsException if index is negative or >= length() 3242: * @since 1.5 3243: */ 3244: public static int codePointBefore(CharSequence sequence, int index) 3245: { 3246: int len = sequence.length(); 3247: if (index < 1 || index > len) 3248: throw new IndexOutOfBoundsException(); 3249: --index; 3250: char low = sequence.charAt(index); 3251: if (! 
isLowSurrogate(low) || --index < 0) 3252: return low; 3253: char high = sequence.charAt(index); 3254: if (! isHighSurrogate(high)) 3255: return low; 3256: return toCodePoint(high, low); 3257: } 3258: } // class Character