1 /* 2 * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.lang; 27 28 import java.io.UnsupportedEncodingException; 29 import java.lang.ref.SoftReference; 30 import java.nio.ByteBuffer; 31 import java.nio.CharBuffer; 32 import java.nio.charset.Charset; 33 import java.nio.charset.CharsetDecoder; 34 import java.nio.charset.CharsetEncoder; 35 import java.nio.charset.CharacterCodingException; 36 import java.nio.charset.CoderResult; 37 import java.nio.charset.CodingErrorAction; 38 import java.nio.charset.IllegalCharsetNameException; 39 import java.nio.charset.UnsupportedCharsetException; 40 import java.util.Arrays; 41 import jdk.internal.HotSpotIntrinsicCandidate; 42 import sun.nio.cs.HistoricallyNamedCharset; 43 import sun.nio.cs.ArrayDecoder; 44 import sun.nio.cs.ArrayEncoder; 45 46 import static java.lang.String.LATIN1; 47 import static java.lang.String.UTF16; 48 import static java.lang.String.COMPACT_STRINGS; 49 import static java.lang.Character.isSurrogate; 50 import static java.lang.Character.highSurrogate; 51 import static java.lang.Character.lowSurrogate; 52 import static java.lang.Character.isSupplementaryCodePoint; 53 import static java.lang.StringUTF16.putChar; 54 55 /** 56 * Utility class for string encoding and decoding. 57 */ 58 59 class StringCoding { 60 61 private StringCoding() { } 62 63 /** The cached coders for each thread */ 64 private static final ThreadLocal<SoftReference<StringDecoder>> decoder = 65 new ThreadLocal<>(); 66 private static final ThreadLocal<SoftReference<StringEncoder>> encoder = 67 new ThreadLocal<>(); 68 69 private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; 70 private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; 71 private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; 72 73 private static <T> T deref(ThreadLocal<SoftReference<T>> tl) { 74 SoftReference<T> sr = tl.get(); 75 if (sr == null) 76 return null; 77 return sr.get(); 78 } 79 80 private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) { 81 tl.set(new SoftReference<>(ob)); 82 } 83 84 // Trim the given byte array to the given length 85 private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { 86 if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) 87 return ba; 88 else 89 return Arrays.copyOf(ba, len); 90 } 91 92 private static int scale(int len, float expansionFactor) { 93 // We need to perform double, not float, arithmetic; otherwise 94 // we lose low order bits when len is larger than 2**24. 95 return (int)(len * (double)expansionFactor); 96 } 97 98 private static Charset lookupCharset(String csn) { 99 if (Charset.isSupported(csn)) { 100 try { 101 return Charset.forName(csn); 102 } catch (UnsupportedCharsetException x) { 103 throw new Error(x); 104 } 105 } 106 return null; 107 } 108 109 static class Result { 110 byte[] value; 111 byte coder; 112 113 Result with() { 114 coder = COMPACT_STRINGS ? LATIN1 : UTF16; 115 value = new byte[0]; 116 return this; 117 } 118 119 Result with(char[] val, int off, int len) { 120 if (String.COMPACT_STRINGS) { 121 byte[] bs = StringUTF16.compress(val, off, len); 122 if (bs != null) { 123 value = bs; 124 coder = LATIN1; 125 return this; 126 } 127 } 128 coder = UTF16; 129 value = StringUTF16.toBytes(val, off, len); 130 return this; 131 } 132 133 Result with(byte[] val, byte coder) { 134 this.coder = coder; 135 value = val; 136 return this; 137 } 138 } 139 140 @HotSpotIntrinsicCandidate 141 public static boolean hasNegatives(byte[] ba, int off, int len) { 142 for (int i = off; i < off + len; i++) { 143 if (ba[i] < 0) { 144 return true; 145 } 146 } 147 return false; 148 } 149 150 // -- Decoding -- 151 static class StringDecoder { 152 private final String requestedCharsetName; 153 private final Charset cs; 154 private final boolean isASCIICompatible; 155 private final CharsetDecoder cd; 156 protected final Result result; 157 158 StringDecoder(Charset cs, String rcn) { 159 this.requestedCharsetName = rcn; 160 this.cs = cs; 161 this.cd = cs.newDecoder() 162 .onMalformedInput(CodingErrorAction.REPLACE) 163 .onUnmappableCharacter(CodingErrorAction.REPLACE); 164 this.result = new Result(); 165 this.isASCIICompatible = (cd instanceof ArrayDecoder) && 166 ((ArrayDecoder)cd).isASCIICompatible(); 167 } 168 169 String charsetName() { 170 if (cs instanceof HistoricallyNamedCharset) 171 return ((HistoricallyNamedCharset)cs).historicalName(); 172 return cs.name(); 173 } 174 175 final String requestedCharsetName() { 176 return requestedCharsetName; 177 } 178 179 Result decode(byte[] ba, int off, int len) { 180 if (len == 0) { 181 return result.with(); 182 } 183 // fastpath for ascii compatible 184 if (isASCIICompatible && !hasNegatives(ba, off, len)) { 185 if (COMPACT_STRINGS) { 186 return result.with(Arrays.copyOfRange(ba, off, off + len), 187 LATIN1); 188 } else { 189 return result.with(StringLatin1.inflate(ba, off, len), UTF16); 190 } 191 } 192 int en = scale(len, cd.maxCharsPerByte()); 193 char[] ca = new char[en]; 194 if (cd instanceof ArrayDecoder) { 195 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); 196 return result.with(ca, 0, clen); 197 } 198 cd.reset(); 199 ByteBuffer bb = ByteBuffer.wrap(ba, off, len); 200 CharBuffer cb = CharBuffer.wrap(ca); 201 try { 202 CoderResult cr = cd.decode(bb, cb, true); 203 if (!cr.isUnderflow()) 204 cr.throwException(); 205 cr = cd.flush(cb); 206 if (!cr.isUnderflow()) 207 cr.throwException(); 208 } catch (CharacterCodingException x) { 209 // Substitution is always enabled, 210 // so this shouldn't happen 211 throw new Error(x); 212 } 213 return result.with(ca, 0, cb.position()); 214 } 215 } 216 217 static Result decode(String charsetName, byte[] ba, int off, int len) 218 throws UnsupportedEncodingException 219 { 220 StringDecoder sd = deref(decoder); 221 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; 222 if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) 223 || csn.equals(sd.charsetName()))) { 224 sd = null; 225 try { 226 Charset cs = lookupCharset(csn); 227 if (cs != null) { 228 if (cs == UTF_8) { 229 return decodeUTF8(ba, off, len, true); 230 } 231 if (cs == ISO_8859_1) { 232 return decodeLatin1(ba, off, len); 233 } 234 if (cs == US_ASCII) { 235 return decodeASCII(ba, off, len); 236 } 237 sd = new StringDecoder(cs, csn); 238 } 239 } catch (IllegalCharsetNameException x) {} 240 if (sd == null) 241 throw new UnsupportedEncodingException(csn); 242 set(decoder, sd); 243 } 244 return sd.decode(ba, off, len); 245 } 246 247 static Result decode(Charset cs, byte[] ba, int off, int len) { 248 if (cs == UTF_8) { 249 return decodeUTF8(ba, off, len, true); 250 } 251 if (cs == ISO_8859_1) { 252 return decodeLatin1(ba, off, len); 253 } 254 if (cs == US_ASCII) { 255 return decodeASCII(ba, off, len); 256 } 257 258 // (1)We never cache the "external" cs, the only benefit of creating 259 // an additional StringDe/Encoder object to wrap it is to share the 260 // de/encode() method. These SD/E objects are short-lived, the young-gen 261 // gc should be able to take care of them well. But the best approach 262 // is still not to generate them if not really necessary. 263 // (2)The defensive copy of the input byte/char[] has a big performance 264 // impact, as well as the outgoing result byte/char[]. Need to do the 265 // optimization check of (sm==null && classLoader0==null) for both. 266 // (3)There might be a timing gap in isTrusted setting. getClassLoader0() 267 // is only checked (and then isTrusted gets set) when (SM==null). It is 268 // possible that the SM==null for now but then SM is NOT null later 269 // when safeTrim() is invoked...the "safe" way to do is to redundant 270 // check (... && (isTrusted || SM == null || getClassLoader0())) in trim 271 // but it then can be argued that the SM is null when the operation 272 // is started... 273 CharsetDecoder cd = cs.newDecoder(); 274 // ascii fastpath 275 if ((cd instanceof ArrayDecoder) && 276 ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) { 277 return decodeLatin1(ba, off, len); 278 } 279 int en = scale(len, cd.maxCharsPerByte()); 280 if (len == 0) { 281 return new Result().with(); 282 } 283 cd.onMalformedInput(CodingErrorAction.REPLACE) 284 .onUnmappableCharacter(CodingErrorAction.REPLACE) 285 .reset(); 286 char[] ca = new char[en]; 287 if (cd instanceof ArrayDecoder) { 288 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); 289 return new Result().with(ca, 0, clen); 290 } 291 if (cs.getClass().getClassLoader0() != null && 292 System.getSecurityManager() != null) { 293 ba = Arrays.copyOfRange(ba, off, off + len); 294 off = 0; 295 } 296 ByteBuffer bb = ByteBuffer.wrap(ba, off, len); 297 CharBuffer cb = CharBuffer.wrap(ca); 298 try { 299 CoderResult cr = cd.decode(bb, cb, true); 300 if (!cr.isUnderflow()) 301 cr.throwException(); 302 cr = cd.flush(cb); 303 if (!cr.isUnderflow()) 304 cr.throwException(); 305 } catch (CharacterCodingException x) { 306 // Substitution is always enabled, 307 // so this shouldn't happen 308 throw new Error(x); 309 } 310 return new Result().with(ca, 0, cb.position()); 311 } 312 313 static Result decode(byte[] ba, int off, int len) { 314 Charset cs = Charset.defaultCharset(); 315 if (cs == UTF_8) { 316 return decodeUTF8(ba, off, len, true); 317 } 318 if (cs == ISO_8859_1) { 319 return decodeLatin1(ba, off, len); 320 } 321 if (cs == US_ASCII) { 322 return decodeASCII(ba, off, len); 323 } 324 StringDecoder sd = deref(decoder); 325 if (sd == null || !cs.name().equals(sd.cs.name())) { 326 sd = new StringDecoder(cs, cs.name()); 327 set(decoder, sd); 328 } 329 return sd.decode(ba, off, len); 330 } 331 332 // -- Encoding -- 333 private static class StringEncoder { 334 private Charset cs; 335 private CharsetEncoder ce; 336 private final boolean isASCIICompatible; 337 private final String requestedCharsetName; 338 private final boolean isTrusted; 339 340 private StringEncoder(Charset cs, String rcn) { 341 this.requestedCharsetName = rcn; 342 this.cs = cs; 343 this.ce = cs.newEncoder() 344 .onMalformedInput(CodingErrorAction.REPLACE) 345 .onUnmappableCharacter(CodingErrorAction.REPLACE); 346 this.isTrusted = (cs.getClass().getClassLoader0() == null); 347 this.isASCIICompatible = (ce instanceof ArrayEncoder) && 348 ((ArrayEncoder)ce).isASCIICompatible(); 349 } 350 351 String charsetName() { 352 if (cs instanceof HistoricallyNamedCharset) 353 return ((HistoricallyNamedCharset)cs).historicalName(); 354 return cs.name(); 355 } 356 357 final String requestedCharsetName() { 358 return requestedCharsetName; 359 } 360 361 byte[] encode(byte coder, byte[] val) { 362 // fastpath for ascii compatible 363 if (coder == LATIN1 && isASCIICompatible && 364 !hasNegatives(val, 0, val.length)) { 365 return Arrays.copyOf(val, val.length); 366 } 367 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 368 int en = scale(len, ce.maxBytesPerChar()); 369 byte[] ba = new byte[en]; 370 if (len == 0) { 371 return ba; 372 } 373 if (ce instanceof ArrayEncoder) { 374 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 375 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 376 if (blen != -1) { 377 return safeTrim(ba, blen, isTrusted); 378 } 379 } 380 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 381 : StringUTF16.toChars(val); 382 ce.reset(); 383 ByteBuffer bb = ByteBuffer.wrap(ba); 384 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 385 try { 386 CoderResult cr = ce.encode(cb, bb, true); 387 if (!cr.isUnderflow()) 388 cr.throwException(); 389 cr = ce.flush(bb); 390 if (!cr.isUnderflow()) 391 cr.throwException(); 392 } catch (CharacterCodingException x) { 393 // Substitution is always enabled, 394 // so this shouldn't happen 395 throw new Error(x); 396 } 397 return safeTrim(ba, bb.position(), isTrusted); 398 } 399 } 400 401 static byte[] encode(String charsetName, byte coder, byte[] val) 402 throws UnsupportedEncodingException 403 { 404 StringEncoder se = deref(encoder); 405 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; 406 if ((se == null) || !(csn.equals(se.requestedCharsetName()) 407 || csn.equals(se.charsetName()))) { 408 se = null; 409 try { 410 Charset cs = lookupCharset(csn); 411 if (cs != null) { 412 if (cs == UTF_8) { 413 return encodeUTF8(coder, val, true); 414 } 415 if (cs == ISO_8859_1) { 416 return encode8859_1(coder, val); 417 } 418 if (cs == US_ASCII) { 419 return encodeASCII(coder, val); 420 } 421 se = new StringEncoder(cs, csn); 422 } 423 } catch (IllegalCharsetNameException x) {} 424 if (se == null) { 425 throw new UnsupportedEncodingException (csn); 426 } 427 set(encoder, se); 428 } 429 return se.encode(coder, val); 430 } 431 432 static byte[] encode(Charset cs, byte coder, byte[] val) { 433 if (cs == UTF_8) { 434 return encodeUTF8(coder, val, true); 435 } 436 if (cs == ISO_8859_1) { 437 return encode8859_1(coder, val); 438 } 439 if (cs == US_ASCII) { 440 return encodeASCII(coder, val); 441 } 442 CharsetEncoder ce = cs.newEncoder(); 443 // fastpath for ascii compatible 444 if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && 445 ((ArrayEncoder)ce).isASCIICompatible() && 446 !hasNegatives(val, 0, val.length)))) { 447 return Arrays.copyOf(val, val.length); 448 } 449 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 450 int en = scale(len, ce.maxBytesPerChar()); 451 byte[] ba = new byte[en]; 452 if (len == 0) { 453 return ba; 454 } 455 ce.onMalformedInput(CodingErrorAction.REPLACE) 456 .onUnmappableCharacter(CodingErrorAction.REPLACE) 457 .reset(); 458 if (ce instanceof ArrayEncoder) { 459 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 460 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 461 if (blen != -1) { 462 return safeTrim(ba, blen, true); 463 } 464 } 465 boolean isTrusted = cs.getClass().getClassLoader0() == null || 466 System.getSecurityManager() == null; 467 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 468 : StringUTF16.toChars(val); 469 ByteBuffer bb = ByteBuffer.wrap(ba); 470 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 471 try { 472 CoderResult cr = ce.encode(cb, bb, true); 473 if (!cr.isUnderflow()) 474 cr.throwException(); 475 cr = ce.flush(bb); 476 if (!cr.isUnderflow()) 477 cr.throwException(); 478 } catch (CharacterCodingException x) { 479 throw new Error(x); 480 } 481 return safeTrim(ba, bb.position(), isTrusted); 482 } 483 484 static byte[] encode(byte coder, byte[] val) { 485 Charset cs = Charset.defaultCharset(); 486 if (cs == UTF_8) { 487 return encodeUTF8(coder, val, true); 488 } 489 if (cs == ISO_8859_1) { 490 return encode8859_1(coder, val); 491 } 492 if (cs == US_ASCII) { 493 return encodeASCII(coder, val); 494 } 495 StringEncoder se = deref(encoder); 496 if (se == null || !cs.name().equals(se.cs.name())) { 497 se = new StringEncoder(cs, cs.name()); 498 set(encoder, se); 499 } 500 return se.encode(coder, val); 501 } 502 503 /** 504 * Print a message directly to stderr, bypassing all character conversion 505 * methods. 506 * @param msg message to print 507 */ 508 private static native void err(String msg); 509 510 /* The cached Result for each thread */ 511 private static final ThreadLocal<StringCoding.Result> 512 resultCached = new ThreadLocal<>() { 513 protected StringCoding.Result initialValue() { 514 return new StringCoding.Result(); 515 }}; 516 517 ////////////////////////// ascii ////////////////////////////// 518 519 private static Result decodeASCII(byte[] ba, int off, int len) { 520 Result result = resultCached.get(); 521 if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) { 522 return result.with(Arrays.copyOfRange(ba, off, off + len), 523 LATIN1); 524 } 525 byte[] dst = new byte[len<<1]; 526 int dp = 0; 527 while (dp < len) { 528 int b = ba[off++]; 529 putChar(dst, dp++, (b >= 0) ? (char)b : repl); 530 } 531 return result.with(dst, UTF16); 532 } 533 534 private static byte[] encodeASCII(byte coder, byte[] val) { 535 if (coder == LATIN1) { 536 byte[] dst = new byte[val.length]; 537 for (int i = 0; i < val.length; i++) { 538 if (val[i] < 0) { 539 dst[i] = '?'; 540 } else { 541 dst[i] = val[i]; 542 } 543 } 544 return dst; 545 } 546 int len = val.length >> 1; 547 byte[] dst = new byte[len]; 548 int dp = 0; 549 for (int i = 0; i < len; i++) { 550 char c = StringUTF16.getChar(val, i); 551 if (c < 0x80) { 552 dst[dp++] = (byte)c; 553 continue; 554 } 555 if (Character.isHighSurrogate(c) && i + 1 < len && 556 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { 557 i++; 558 } 559 dst[dp++] = '?'; 560 } 561 if (len == dp) { 562 return dst; 563 } 564 return Arrays.copyOf(dst, dp); 565 } 566 567 ////////////////////////// latin1/8859_1 /////////////////////////// 568 569 private static Result decodeLatin1(byte[] ba, int off, int len) { 570 Result result = resultCached.get(); 571 if (COMPACT_STRINGS) { 572 return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); 573 } else { 574 return result.with(StringLatin1.inflate(ba, off, len), UTF16); 575 } 576 } 577 578 @HotSpotIntrinsicCandidate 579 private static int implEncodeISOArray(byte[] sa, int sp, 580 byte[] da, int dp, int len) { 581 int i = 0; 582 for (; i < len; i++) { 583 char c = StringUTF16.getChar(sa, sp++); 584 if (c > '\u00FF') 585 break; 586 da[dp++] = (byte)c; 587 } 588 return i; 589 } 590 591 private static byte[] encode8859_1(byte coder, byte[] val) { 592 return encode8859_1(coder, val, true); 593 } 594 595 private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) { 596 if (coder == LATIN1) { 597 return Arrays.copyOf(val, val.length); 598 } 599 int len = val.length >> 1; 600 byte[] dst = new byte[len]; 601 int dp = 0; 602 int sp = 0; 603 int sl = len; 604 while (sp < sl) { 605 int ret = implEncodeISOArray(val, sp, dst, dp, len); 606 sp = sp + ret; 607 dp = dp + ret; 608 if (ret != len) { 609 if (!doReplace) { 610 throwMalformed(sp, 1); 611 } 612 char c = StringUTF16.getChar(val, sp++); 613 if (Character.isHighSurrogate(c) && sp < sl && 614 Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { 615 sp++; 616 } 617 dst[dp++] = '?'; 618 len = sl - sp; 619 } 620 } 621 if (dp == dst.length) { 622 return dst; 623 } 624 return Arrays.copyOf(dst, dp); 625 } 626 627 //////////////////////////////// utf8 //////////////////////////////////// 628 629 private static boolean isNotContinuation(int b) { 630 return (b & 0xc0) != 0x80; 631 } 632 633 private static boolean isMalformed3(int b1, int b2, int b3) { 634 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 635 (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; 636 } 637 638 private static boolean isMalformed3_2(int b1, int b2) { 639 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 640 (b2 & 0xc0) != 0x80; 641 } 642 643 private static boolean isMalformed4(int b2, int b3, int b4) { 644 return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || 645 (b4 & 0xc0) != 0x80; 646 } 647 648 private static boolean isMalformed4_2(int b1, int b2) { 649 return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 650 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 651 (b2 & 0xc0) != 0x80; 652 } 653 654 private static boolean isMalformed4_3(int b3) { 655 return (b3 & 0xc0) != 0x80; 656 } 657 658 // for nb == 3/4 659 private static int malformedN(byte[] src, int sp, int nb) { 660 if (nb == 3) { 661 int b1 = src[sp++]; 662 int b2 = src[sp++]; // no need to lookup b3 663 return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 664 isNotContinuation(b2)) ? 1 : 2; 665 } else if (nb == 4) { // we don't care the speed here 666 int b1 = src[sp++] & 0xff; 667 int b2 = src[sp++] & 0xff; 668 if (b1 > 0xf4 || 669 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 670 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 671 isNotContinuation(b2)) 672 return 1; 673 if (isNotContinuation(src[sp++])) 674 return 2; 675 return 3; 676 } 677 assert false; 678 return -1; 679 } 680 681 private static void throwMalformed(int off, int nb) { 682 throw new IllegalArgumentException("malformed input off : " + off + 683 ", length : " + nb); 684 } 685 686 private static void throwMalformed(byte[] val) { 687 int dp = 0; 688 while (dp < val.length && val[dp] >=0) { dp++; } 689 throwMalformed(dp, 1); 690 } 691 692 private static char repl = '\ufffd'; 693 694 private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) { 695 // ascii-bais, which has a relative impact to the non-ascii-only bytes 696 if (COMPACT_STRINGS && !hasNegatives(src, sp, len)) 697 return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len), 698 LATIN1); 699 return decodeUTF8_0(src, sp, len, doReplace); 700 } 701 702 private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) { 703 Result ret = resultCached.get(); 704 705 int sl = sp + len; 706 int dp = 0; 707 byte[] dst = new byte[len]; 708 709 if (COMPACT_STRINGS) { 710 while (sp < sl) { 711 int b1 = src[sp]; 712 if (b1 >= 0) { 713 dst[dp++] = (byte)b1; 714 sp++; 715 continue; 716 } 717 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && 718 sp + 1 < sl) { 719 int b2 = src[sp + 1]; 720 if (!isNotContinuation(b2)) { 721 dst[dp++] = (byte)(((b1 << 6) ^ b2)^ 722 (((byte) 0xC0 << 6) ^ 723 ((byte) 0x80 << 0))); 724 sp += 2; 725 continue; 726 } 727 } 728 // anything not a latin1, including the repl 729 // we have to go with the utf16 730 break; 731 } 732 if (sp == sl) { 733 if (dp != dst.length) { 734 dst = Arrays.copyOf(dst, dp); 735 } 736 return ret.with(dst, LATIN1); 737 } 738 } 739 if (dp == 0) { 740 dst = new byte[len << 1]; 741 } else { 742 byte[] buf = new byte[len << 1]; 743 StringLatin1.inflate(dst, 0, buf, 0, dp); 744 dst = buf; 745 } 746 while (sp < sl) { 747 int b1 = src[sp++]; 748 if (b1 >= 0) { 749 putChar(dst, dp++, (char) b1); 750 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { 751 if (sp < sl) { 752 int b2 = src[sp++]; 753 if (isNotContinuation(b2)) { 754 if (!doReplace) { 755 throwMalformed(sp - 1, 1); 756 } 757 putChar(dst, dp++, repl); 758 sp--; 759 } else { 760 putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ 761 (((byte) 0xC0 << 6) ^ 762 ((byte) 0x80 << 0)))); 763 } 764 continue; 765 } 766 if (!doReplace) { 767 throwMalformed(sp, 1); // underflow() 768 } 769 putChar(dst, dp++, repl); 770 break; 771 } else if ((b1 >> 4) == -2) { 772 if (sp + 1 < sl) { 773 int b2 = src[sp++]; 774 int b3 = src[sp++]; 775 if (isMalformed3(b1, b2, b3)) { 776 if (!doReplace) { 777 throwMalformed(sp - 3, 3); 778 } 779 putChar(dst, dp++, repl); 780 sp -= 3; 781 sp += malformedN(src, sp, 3); 782 } else { 783 char c = (char)((b1 << 12) ^ 784 (b2 << 6) ^ 785 (b3 ^ 786 (((byte) 0xE0 << 12) ^ 787 ((byte) 0x80 << 6) ^ 788 ((byte) 0x80 << 0)))); 789 if (isSurrogate(c)) { 790 if (!doReplace) { 791 throwMalformed(sp - 3, 3); 792 } 793 putChar(dst, dp++, repl); 794 } else { 795 putChar(dst, dp++, c); 796 } 797 } 798 continue; 799 } 800 if (sp < sl && isMalformed3_2(b1, src[sp])) { 801 if (!doReplace) { 802 throwMalformed(sp - 1, 2); 803 } 804 putChar(dst, dp++, repl); 805 continue; 806 } 807 if (!doReplace){ 808 throwMalformed(sp, 1); 809 } 810 putChar(dst, dp++, repl); 811 break; 812 } else if ((b1 >> 3) == -2) { 813 if (sp + 2 < sl) { 814 int b2 = src[sp++]; 815 int b3 = src[sp++]; 816 int b4 = src[sp++]; 817 int uc = ((b1 << 18) ^ 818 (b2 << 12) ^ 819 (b3 << 6) ^ 820 (b4 ^ 821 (((byte) 0xF0 << 18) ^ 822 ((byte) 0x80 << 12) ^ 823 ((byte) 0x80 << 6) ^ 824 ((byte) 0x80 << 0)))); 825 if (isMalformed4(b2, b3, b4) || 826 !isSupplementaryCodePoint(uc)) { // shortest form check 827 if (!doReplace) { 828 throwMalformed(sp - 4, 4); 829 } 830 putChar(dst, dp++, repl); 831 sp -= 4; 832 sp += malformedN(src, sp, 4); 833 } else { 834 putChar(dst, dp++, highSurrogate(uc)); 835 putChar(dst, dp++, lowSurrogate(uc)); 836 } 837 continue; 838 } 839 b1 &= 0xff; 840 if (b1 > 0xf4 || 841 sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { 842 if (!doReplace) { 843 throwMalformed(sp - 1, 1); // or 2 844 } 845 putChar(dst, dp++, repl); 846 continue; 847 } 848 if (!doReplace) { 849 throwMalformed(sp - 1, 1); 850 } 851 sp++; 852 putChar(dst, dp++, repl); 853 if (sp < sl && isMalformed4_3(src[sp])) { 854 continue; 855 } 856 break; 857 } else { 858 if (!doReplace) { 859 throwMalformed(sp - 1, 1); 860 } 861 putChar(dst, dp++, repl); 862 } 863 } 864 if (dp != len) { 865 dst = Arrays.copyOf(dst, dp << 1); 866 } 867 return ret.with(dst, UTF16); 868 } 869 870 private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { 871 if (coder == UTF16) 872 return encodeUTF8_UTF16(val, doReplace); 873 874 if (!hasNegatives(val, 0, val.length)) 875 return Arrays.copyOf(val, val.length); 876 877 int dp = 0; 878 byte[] dst = new byte[val.length << 1]; 879 for (int sp = 0; sp < val.length; sp++) { 880 byte c = val[sp]; 881 if (c < 0) { 882 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); 883 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 884 } else { 885 dst[dp++] = c; 886 } 887 } 888 if (dp == dst.length) 889 return dst; 890 return Arrays.copyOf(dst, dp); 891 } 892 893 private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { 894 int dp = 0; 895 int sp = 0; 896 int sl = val.length >> 1; 897 byte[] dst = new byte[sl * 3]; 898 char c; 899 while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { 900 // ascii fast loop; 901 dst[dp++] = (byte)c; 902 sp++; 903 } 904 while (sp < sl) { 905 c = StringUTF16.getChar(val, sp++); 906 if (c < 0x80) { 907 dst[dp++] = (byte)c; 908 } else if (c < 0x800) { 909 dst[dp++] = (byte)(0xc0 | (c >> 6)); 910 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 911 } else if (Character.isSurrogate(c)) { 912 int uc = -1; 913 char c2; 914 if (Character.isHighSurrogate(c) && sp < sl && 915 Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { 916 uc = Character.toCodePoint(c, c2); 917 } 918 if (uc < 0) { 919 if (doReplace) { 920 dst[dp++] = '?'; 921 } else { 922 throwMalformed(sp - 1, 1); // or 2, does not matter here 923 } 924 } else { 925 dst[dp++] = (byte)(0xf0 | ((uc >> 18))); 926 dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); 927 dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); 928 dst[dp++] = (byte)(0x80 | (uc & 0x3f)); 929 sp++; // 2 chars 930 } 931 } else { 932 // 3 bytes, 16 bits 933 dst[dp++] = (byte)(0xe0 | ((c >> 12))); 934 dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); 935 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 936 } 937 } 938 if (dp == dst.length) { 939 return dst; 940 } 941 return Arrays.copyOf(dst, dp); 942 } 943 944 ////////////////////// for j.u.z.ZipCoder ////////////////////////// 945 946 /* 947 * Throws iae, instead of replacing, if malformed or unmappble. 948 */ 949 static String newStringUTF8NoRepl(byte[] src, int off, int len) { 950 if (COMPACT_STRINGS && !hasNegatives(src, off, len)) 951 return new String(Arrays.copyOfRange(src, off, off + len), LATIN1); 952 Result ret = decodeUTF8_0(src, off, len, false); 953 return new String(ret.value, ret.coder); 954 } 955 956 /* 957 * Throws iae, instead of replacing, if unmappble. 958 */ 959 static byte[] getBytesUTF8NoRepl(String s) { 960 return encodeUTF8(s.coder(), s.value(), false); 961 } 962 963 ////////////////////// for j.n.f.Files ////////////////////////// 964 965 private static boolean isASCII(byte[] src) { 966 return !hasNegatives(src, 0, src.length); 967 } 968 969 private static String newStringLatin1(byte[] src) { 970 if (COMPACT_STRINGS) 971 return new String(src, LATIN1); 972 return new String(StringLatin1.inflate(src, 0, src.length), UTF16); 973 } 974 975 static String newStringNoRepl(byte[] src, Charset cs) { 976 if (cs == UTF_8) { 977 if (COMPACT_STRINGS && isASCII(src)) 978 return new String(src, LATIN1); 979 Result ret = decodeUTF8_0(src, 0, src.length, false); 980 return new String(ret.value, ret.coder); 981 } 982 if (cs == ISO_8859_1) { 983 return newStringLatin1(src); 984 } 985 if (cs == US_ASCII) { 986 if (isASCII(src)) { 987 return newStringLatin1(src); 988 } else { 989 throwMalformed(src); 990 } 991 } 992 993 CharsetDecoder cd = cs.newDecoder(); 994 // ascii fastpath 995 if ((cd instanceof ArrayDecoder) && 996 ((ArrayDecoder)cd).isASCIICompatible() && isASCII(src)) { 997 return newStringLatin1(src); 998 } 999 int len = src.length; 1000 if (len == 0) { 1001 return ""; 1002 } 1003 int en = scale(len, cd.maxCharsPerByte()); 1004 char[] ca = new char[en]; 1005 if (cs.getClass().getClassLoader0() != null && 1006 System.getSecurityManager() != null) { 1007 src = Arrays.copyOf(src, len); 1008 } 1009 ByteBuffer bb = ByteBuffer.wrap(src); 1010 CharBuffer cb = CharBuffer.wrap(ca); 1011 try { 1012 CoderResult cr = cd.decode(bb, cb, true); 1013 if (!cr.isUnderflow()) 1014 cr.throwException(); 1015 cr = cd.flush(cb); 1016 if (!cr.isUnderflow()) 1017 cr.throwException(); 1018 } catch (CharacterCodingException x) { 1019 throw new IllegalArgumentException(x); // todo 1020 } 1021 Result ret = resultCached.get().with(ca, 0, cb.position()); 1022 return new String(ret.value, ret.coder); 1023 } 1024 1025 /* 1026 * Throws iae, instead of replacing, if unmappble. 1027 */ 1028 static byte[] getBytesNoRepl(String s, Charset cs) { 1029 byte[] val = s.value(); 1030 byte coder = s.coder(); 1031 if (cs == UTF_8) { 1032 if (isASCII(val)) { 1033 return val; 1034 } 1035 return encodeUTF8(coder, val, false); 1036 } 1037 if (cs == ISO_8859_1) { 1038 if (coder == LATIN1) { 1039 return val; 1040 } 1041 return encode8859_1(coder, val, false); 1042 } 1043 if (cs == US_ASCII) { 1044 if (coder == LATIN1) { 1045 if (isASCII(val)) { 1046 return val; 1047 } else { 1048 throwMalformed(val); 1049 } 1050 } 1051 } 1052 CharsetEncoder ce = cs.newEncoder(); 1053 // fastpath for ascii compatible 1054 if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && 1055 ((ArrayEncoder)ce).isASCIICompatible() && 1056 isASCII(val)))) { 1057 return val; 1058 } 1059 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 1060 int en = scale(len, ce.maxBytesPerChar()); 1061 byte[] ba = new byte[en]; 1062 if (len == 0) { 1063 return ba; 1064 } 1065 if (ce instanceof ArrayEncoder) { 1066 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 1067 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 1068 if (blen != -1) { 1069 return safeTrim(ba, blen, true); 1070 } 1071 } 1072 boolean isTrusted = cs.getClass().getClassLoader0() == null || 1073 System.getSecurityManager() == null; 1074 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 1075 : StringUTF16.toChars(val); 1076 ByteBuffer bb = ByteBuffer.wrap(ba); 1077 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 1078 try { 1079 CoderResult cr = ce.encode(cb, bb, true); 1080 if (!cr.isUnderflow()) 1081 cr.throwException(); 1082 cr = ce.flush(bb); 1083 if (!cr.isUnderflow()) 1084 cr.throwException(); 1085 } catch (CharacterCodingException x) { 1086 throw new Error(x); 1087 } 1088 return safeTrim(ba, bb.position(), isTrusted); 1089 } 1090 }