1 /* 2 * Copyright (c) 2011, 2016, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.util.regex; 27 28 import java.util.HashMap; 29 import java.util.Locale; 30 import java.util.regex.Pattern.CharPredicate; 31 import java.util.regex.Pattern.BmpCharPredicate; 32 33 class CharPredicates { 34 35 static final CharPredicate ALPHABETIC() { 36 return Character::isAlphabetic; 37 } 38 39 // \p{gc=Decimal_Number} 40 static final CharPredicate DIGIT() { 41 return Character::isDigit; 42 } 43 44 static final CharPredicate LETTER() { 45 return Character::isLetter; 46 } 47 48 static final CharPredicate IDEOGRAPHIC() { 49 return Character::isIdeographic; 50 } 51 52 static final CharPredicate LOWERCASE() { 53 return Character::isLowerCase; 54 } 55 56 static final CharPredicate UPPERCASE() { 57 return Character::isUpperCase; 58 } 59 60 static final CharPredicate TITLECASE() { 61 return Character::isTitleCase; 62 } 63 64 // \p{Whitespace} 65 static final CharPredicate WHITE_SPACE() { 66 return ch -> 67 ((((1 << Character.SPACE_SEPARATOR) | 68 (1 << Character.LINE_SEPARATOR) | 69 (1 << Character.PARAGRAPH_SEPARATOR)) >> Character.getType(ch)) & 1) 70 != 0 || (ch >= 0x9 && ch <= 0xd) || (ch == 0x85); 71 } 72 73 // \p{gc=Control} 74 static final CharPredicate CONTROL() { 75 return ch -> Character.getType(ch) == Character.CONTROL; 76 } 77 78 // \p{gc=Punctuation} 79 static final CharPredicate PUNCTUATION() { 80 return ch -> 81 ((((1 << Character.CONNECTOR_PUNCTUATION) | 82 (1 << Character.DASH_PUNCTUATION) | 83 (1 << Character.START_PUNCTUATION) | 84 (1 << Character.END_PUNCTUATION) | 85 (1 << Character.OTHER_PUNCTUATION) | 86 (1 << Character.INITIAL_QUOTE_PUNCTUATION) | 87 (1 << Character.FINAL_QUOTE_PUNCTUATION)) >> Character.getType(ch)) & 1) 88 != 0; 89 } 90 91 // \p{gc=Decimal_Number} 92 // \p{Hex_Digit} -> PropList.txt: Hex_Digit 93 static final CharPredicate HEX_DIGIT() { 94 return DIGIT().union(ch -> (ch >= 0x0030 && ch <= 0x0039) || 95 (ch >= 0x0041 && ch <= 0x0046) || 96 (ch >= 0x0061 && ch <= 0x0066) || 97 (ch >= 0xFF10 && ch <= 0xFF19) || 98 (ch >= 0xFF21 && ch <= 0xFF26) || 99 (ch >= 0xFF41 && ch <= 0xFF46)); 100 } 101 102 static final CharPredicate ASSIGNED() { 103 return ch -> Character.getType(ch) != Character.UNASSIGNED; 104 } 105 106 // PropList.txt:Noncharacter_Code_Point 107 static final CharPredicate NONCHARACTER_CODE_POINT() { 108 return ch -> (ch & 0xfffe) == 0xfffe || (ch >= 0xfdd0 && ch <= 0xfdef); 109 } 110 111 // \p{alpha} 112 // \p{digit} 113 static final CharPredicate ALNUM() { 114 return ALPHABETIC().union(DIGIT()); 115 } 116 117 // \p{Whitespace} -- 118 // [\N{LF} \N{VT} \N{FF} \N{CR} \N{NEL} -> 0xa, 0xb, 0xc, 0xd, 0x85 119 // \p{gc=Line_Separator} 120 // \p{gc=Paragraph_Separator}] 121 static final CharPredicate BLANK() { 122 return ch -> 123 Character.getType(ch) == Character.SPACE_SEPARATOR || 124 ch == 0x9; // \N{HT} 125 } 126 127 // [^ 128 // \p{space} 129 // \p{gc=Control} 130 // \p{gc=Surrogate} 131 // \p{gc=Unassigned}] 132 static final CharPredicate GRAPH() { 133 return ch -> 134 ((((1 << Character.SPACE_SEPARATOR) | 135 (1 << Character.LINE_SEPARATOR) | 136 (1 << Character.PARAGRAPH_SEPARATOR) | 137 (1 << Character.CONTROL) | 138 (1 << Character.SURROGATE) | 139 (1 << Character.UNASSIGNED)) >> Character.getType(ch)) & 1) 140 == 0; 141 } 142 143 // \p{graph} 144 // \p{blank} 145 // -- \p{cntrl} 146 static final CharPredicate PRINT() { 147 return GRAPH().union(BLANK()).and(CONTROL().negate()); 148 } 149 150 // 200C..200D PropList.txt:Join_Control 151 static final CharPredicate JOIN_CONTROL() { 152 return ch -> ch == 0x200C || ch == 0x200D; 153 } 154 155 // \p{alpha} 156 // \p{gc=Mark} 157 // \p{digit} 158 // \p{gc=Connector_Punctuation} 159 // \p{Join_Control} 200C..200D 160 static final CharPredicate WORD() { 161 return ALPHABETIC().union(ch -> ((((1 << Character.NON_SPACING_MARK) | 162 (1 << Character.ENCLOSING_MARK) | 163 (1 << Character.COMBINING_SPACING_MARK) | 164 (1 << Character.DECIMAL_DIGIT_NUMBER) | 165 (1 << Character.CONNECTOR_PUNCTUATION)) 166 >> Character.getType(ch)) & 1) != 0, 167 JOIN_CONTROL()); 168 } 169 170 ///////////////////////////////////////////////////////////////////////////// 171 172 private static CharPredicate getPosixPredicate(String name) { 173 switch (name) { 174 case "ALPHA": return ALPHABETIC(); 175 case "LOWER": return LOWERCASE(); 176 case "UPPER": return UPPERCASE(); 177 case "SPACE": return WHITE_SPACE(); 178 case "PUNCT": return PUNCTUATION(); 179 case "XDIGIT": return HEX_DIGIT(); 180 case "ALNUM": return ALNUM(); 181 case "CNTRL": return CONTROL(); 182 case "DIGIT": return DIGIT(); 183 case "BLANK": return BLANK(); 184 case "GRAPH": return GRAPH(); 185 case "PRINT": return PRINT(); 186 default: return null; 187 } 188 } 189 190 private static CharPredicate getUnicodePredicate(String name) { 191 switch (name) { 192 case "ALPHABETIC": return ALPHABETIC(); 193 case "ASSIGNED": return ASSIGNED(); 194 case "CONTROL": return CONTROL(); 195 case "HEXDIGIT": return HEX_DIGIT(); 196 case "IDEOGRAPHIC": return IDEOGRAPHIC(); 197 case "JOINCONTROL": return JOIN_CONTROL(); 198 case "LETTER": return LETTER(); 199 case "LOWERCASE": return LOWERCASE(); 200 case "NONCHARACTERCODEPOINT": return NONCHARACTER_CODE_POINT(); 201 case "TITLECASE": return TITLECASE(); 202 case "PUNCTUATION": return PUNCTUATION(); 203 case "UPPERCASE": return UPPERCASE(); 204 case "WHITESPACE": return WHITE_SPACE(); 205 case "WORD": return WORD(); 206 case "WHITE_SPACE": return WHITE_SPACE(); 207 case "HEX_DIGIT": return HEX_DIGIT(); 208 case "NONCHARACTER_CODE_POINT": return NONCHARACTER_CODE_POINT(); 209 case "JOIN_CONTROL": return JOIN_CONTROL(); 210 default: return null; 211 } 212 } 213 214 public static CharPredicate forUnicodeProperty(String propName) { 215 propName = propName.toUpperCase(Locale.ROOT); 216 CharPredicate p = getUnicodePredicate(propName); 217 if (p != null) 218 return p; 219 return getPosixPredicate(propName); 220 } 221 222 public static CharPredicate forPOSIXName(String propName) { 223 return getPosixPredicate(propName.toUpperCase(Locale.ENGLISH)); 224 } 225 226 ///////////////////////////////////////////////////////////////////////////// 227 228 /** 229 * Returns a predicate matching all characters belong to a named 230 * UnicodeScript. 231 */ 232 static CharPredicate forUnicodeScript(String name) { 233 final Character.UnicodeScript script; 234 try { 235 script = Character.UnicodeScript.forName(name); 236 return ch -> script == Character.UnicodeScript.of(ch); 237 } catch (IllegalArgumentException iae) {} 238 return null; 239 } 240 241 /** 242 * Returns a predicate matching all characters in a UnicodeBlock. 243 */ 244 static CharPredicate forUnicodeBlock(String name) { 245 final Character.UnicodeBlock block; 246 try { 247 block = Character.UnicodeBlock.forName(name); 248 return ch -> block == Character.UnicodeBlock.of(ch); 249 } catch (IllegalArgumentException iae) {} 250 return null; 251 } 252 253 ///////////////////////////////////////////////////////////////////////////// 254 255 // unicode categories, aliases, properties, java methods ... 256 257 static CharPredicate forProperty(String name) { 258 // Unicode character property aliases, defined in 259 // http://www.unicode.org/Public/UNIDATA/PropertyValueAliases.txt 260 switch (name) { 261 case "Cn": return category(1<<Character.UNASSIGNED); 262 case "Lu": return category(1<<Character.UPPERCASE_LETTER); 263 case "Ll": return category(1<<Character.LOWERCASE_LETTER); 264 case "Lt": return category(1<<Character.TITLECASE_LETTER); 265 case "Lm": return category(1<<Character.MODIFIER_LETTER); 266 case "Lo": return category(1<<Character.OTHER_LETTER); 267 case "Mn": return category(1<<Character.NON_SPACING_MARK); 268 case "Me": return category(1<<Character.ENCLOSING_MARK); 269 case "Mc": return category(1<<Character.COMBINING_SPACING_MARK); 270 case "Nd": return category(1<<Character.DECIMAL_DIGIT_NUMBER); 271 case "Nl": return category(1<<Character.LETTER_NUMBER); 272 case "No": return category(1<<Character.OTHER_NUMBER); 273 case "Zs": return category(1<<Character.SPACE_SEPARATOR); 274 case "Zl": return category(1<<Character.LINE_SEPARATOR); 275 case "Zp": return category(1<<Character.PARAGRAPH_SEPARATOR); 276 case "Cc": return category(1<<Character.CONTROL); 277 case "Cf": return category(1<<Character.FORMAT); 278 case "Co": return category(1<<Character.PRIVATE_USE); 279 case "Cs": return category(1<<Character.SURROGATE); 280 case "Pd": return category(1<<Character.DASH_PUNCTUATION); 281 case "Ps": return category(1<<Character.START_PUNCTUATION); 282 case "Pe": return category(1<<Character.END_PUNCTUATION); 283 case "Pc": return category(1<<Character.CONNECTOR_PUNCTUATION); 284 case "Po": return category(1<<Character.OTHER_PUNCTUATION); 285 case "Sm": return category(1<<Character.MATH_SYMBOL); 286 case "Sc": return category(1<<Character.CURRENCY_SYMBOL); 287 case "Sk": return category(1<<Character.MODIFIER_SYMBOL); 288 case "So": return category(1<<Character.OTHER_SYMBOL); 289 case "Pi": return category(1<<Character.INITIAL_QUOTE_PUNCTUATION); 290 case "Pf": return category(1<<Character.FINAL_QUOTE_PUNCTUATION); 291 case "L": return category(((1<<Character.UPPERCASE_LETTER) | 292 (1<<Character.LOWERCASE_LETTER) | 293 (1<<Character.TITLECASE_LETTER) | 294 (1<<Character.MODIFIER_LETTER) | 295 (1<<Character.OTHER_LETTER))); 296 case "M": return category(((1<<Character.NON_SPACING_MARK) | 297 (1<<Character.ENCLOSING_MARK) | 298 (1<<Character.COMBINING_SPACING_MARK))); 299 case "N": return category(((1<<Character.DECIMAL_DIGIT_NUMBER) | 300 (1<<Character.LETTER_NUMBER) | 301 (1<<Character.OTHER_NUMBER))); 302 case "Z": return category(((1<<Character.SPACE_SEPARATOR) | 303 (1<<Character.LINE_SEPARATOR) | 304 (1<<Character.PARAGRAPH_SEPARATOR))); 305 case "C": return category(((1<<Character.CONTROL) | 306 (1<<Character.FORMAT) | 307 (1<<Character.PRIVATE_USE) | 308 (1<<Character.SURROGATE) | 309 (1<<Character.UNASSIGNED))); // Other 310 case "P": return category(((1<<Character.DASH_PUNCTUATION) | 311 (1<<Character.START_PUNCTUATION) | 312 (1<<Character.END_PUNCTUATION) | 313 (1<<Character.CONNECTOR_PUNCTUATION) | 314 (1<<Character.OTHER_PUNCTUATION) | 315 (1<<Character.INITIAL_QUOTE_PUNCTUATION) | 316 (1<<Character.FINAL_QUOTE_PUNCTUATION))); 317 case "S": return category(((1<<Character.MATH_SYMBOL) | 318 (1<<Character.CURRENCY_SYMBOL) | 319 (1<<Character.MODIFIER_SYMBOL) | 320 (1<<Character.OTHER_SYMBOL))); 321 case "LC": return category(((1<<Character.UPPERCASE_LETTER) | 322 (1<<Character.LOWERCASE_LETTER) | 323 (1<<Character.TITLECASE_LETTER))); 324 case "LD": return category(((1<<Character.UPPERCASE_LETTER) | 325 (1<<Character.LOWERCASE_LETTER) | 326 (1<<Character.TITLECASE_LETTER) | 327 (1<<Character.MODIFIER_LETTER) | 328 (1<<Character.OTHER_LETTER) | 329 (1<<Character.DECIMAL_DIGIT_NUMBER))); 330 case "L1": return range(0x00, 0xFF); // Latin-1 331 case "all": return Pattern.ALL(); 332 // Posix regular expression character classes, defined in 333 // http://www.unix.org/onlinepubs/009695399/basedefs/xbd_chap09.html 334 case "ASCII": return range(0x00, 0x7F); // ASCII 335 case "Alnum": return ctype(ASCII.ALNUM); // Alphanumeric characters 336 case "Alpha": return ctype(ASCII.ALPHA); // Alphabetic characters 337 case "Blank": return ctype(ASCII.BLANK); // Space and tab characters 338 case "Cntrl": return ctype(ASCII.CNTRL); // Control characters 339 case "Digit": return range('0', '9'); // Numeric characters 340 case "Graph": return ctype(ASCII.GRAPH); // printable and visible 341 case "Lower": return range('a', 'z'); // Lower-case alphabetic 342 case "Print": return range(0x20, 0x7E); // Printable characters 343 case "Punct": return ctype(ASCII.PUNCT); // Punctuation characters 344 case "Space": return ctype(ASCII.SPACE); // Space characters 345 case "Upper": return range('A', 'Z'); // Upper-case alphabetic 346 case "XDigit": return ctype(ASCII.XDIGIT); // hexadecimal digits 347 348 // Java character properties, defined by methods in Character.java 349 case "javaLowerCase": return java.lang.Character::isLowerCase; 350 case "javaUpperCase": return Character::isUpperCase; 351 case "javaAlphabetic": return java.lang.Character::isAlphabetic; 352 case "javaIdeographic": return java.lang.Character::isIdeographic; 353 case "javaTitleCase": return java.lang.Character::isTitleCase; 354 case "javaDigit": return java.lang.Character::isDigit; 355 case "javaDefined": return java.lang.Character::isDefined; 356 case "javaLetter": return java.lang.Character::isLetter; 357 case "javaLetterOrDigit": return java.lang.Character::isLetterOrDigit; 358 case "javaJavaIdentifierStart": return java.lang.Character::isJavaIdentifierStart; 359 case "javaJavaIdentifierPart": return java.lang.Character::isJavaIdentifierPart; 360 case "javaUnicodeIdentifierStart": return java.lang.Character::isUnicodeIdentifierStart; 361 case "javaUnicodeIdentifierPart": return java.lang.Character::isUnicodeIdentifierPart; 362 case "javaIdentifierIgnorable": return java.lang.Character::isIdentifierIgnorable; 363 case "javaSpaceChar": return java.lang.Character::isSpaceChar; 364 case "javaWhitespace": return java.lang.Character::isWhitespace; 365 case "javaISOControl": return java.lang.Character::isISOControl; 366 case "javaMirrored": return java.lang.Character::isMirrored; 367 default: return null; 368 } 369 } 370 371 private static CharPredicate category(final int typeMask) { 372 return ch -> (typeMask & (1 << Character.getType(ch))) != 0; 373 } 374 375 private static CharPredicate range(final int lower, final int upper) { 376 return (BmpCharPredicate)ch -> lower <= ch && ch <= upper; 377 } 378 379 private static CharPredicate ctype(final int ctype) { 380 return (BmpCharPredicate)ch -> ch < 128 && ASCII.isType(ch, ctype); 381 } 382 383 ///////////////////////////////////////////////////////////////////////////// 384 385 /** 386 * Posix ASCII variants, not in the lookup map 387 */ 388 static final BmpCharPredicate ASCII_DIGIT() { 389 return ch -> ch < 128 && ASCII.isDigit(ch); 390 } 391 static final BmpCharPredicate ASCII_WORD() { 392 return ch -> ch < 128 && ASCII.isWord(ch); 393 } 394 static final BmpCharPredicate ASCII_SPACE() { 395 return ch -> ch < 128 && ASCII.isSpace(ch); 396 } 397 398 }