1 /* 2 * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. Oracle designates this 8 * particular file as subject to the "Classpath" exception as provided 9 * by Oracle in the LICENSE file that accompanied this code. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 */ 25 26 package java.lang; 27 28 import java.io.UnsupportedEncodingException; 29 import java.lang.ref.SoftReference; 30 import java.nio.ByteBuffer; 31 import java.nio.CharBuffer; 32 import java.nio.charset.Charset; 33 import java.nio.charset.CharsetDecoder; 34 import java.nio.charset.CharsetEncoder; 35 import java.nio.charset.CharacterCodingException; 36 import java.nio.charset.CoderResult; 37 import java.nio.charset.CodingErrorAction; 38 import java.nio.charset.IllegalCharsetNameException; 39 import java.nio.charset.UnsupportedCharsetException; 40 import java.util.Arrays; 41 import jdk.internal.HotSpotIntrinsicCandidate; 42 import sun.nio.cs.HistoricallyNamedCharset; 43 import sun.nio.cs.ArrayDecoder; 44 import sun.nio.cs.ArrayEncoder; 45 46 import static java.lang.String.LATIN1; 47 import static java.lang.String.UTF16; 48 import static java.lang.String.COMPACT_STRINGS; 49 import static java.lang.Character.isSurrogate; 50 import static java.lang.Character.highSurrogate; 51 import static java.lang.Character.lowSurrogate; 52 import static java.lang.Character.isSupplementaryCodePoint; 53 import static java.lang.StringUTF16.putChar; 54 55 /** 56 * Utility class for string encoding and decoding. 57 */ 58 59 class StringCoding { 60 61 private StringCoding() { } 62 63 /** The cached coders for each thread */ 64 private static final ThreadLocal<SoftReference<StringDecoder>> decoder = 65 new ThreadLocal<>(); 66 private static final ThreadLocal<SoftReference<StringEncoder>> encoder = 67 new ThreadLocal<>(); 68 69 private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE; 70 private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE; 71 private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE; 72 73 private static <T> T deref(ThreadLocal<SoftReference<T>> tl) { 74 SoftReference<T> sr = tl.get(); 75 if (sr == null) 76 return null; 77 return sr.get(); 78 } 79 80 private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) { 81 tl.set(new SoftReference<>(ob)); 82 } 83 84 // Trim the given byte array to the given length 85 private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) { 86 if (len == ba.length && (isTrusted || System.getSecurityManager() == null)) 87 return ba; 88 else 89 return Arrays.copyOf(ba, len); 90 } 91 92 private static int scale(int len, float expansionFactor) { 93 // We need to perform double, not float, arithmetic; otherwise 94 // we lose low order bits when len is larger than 2**24. 95 return (int)(len * (double)expansionFactor); 96 } 97 98 private static Charset lookupCharset(String csn) { 99 if (Charset.isSupported(csn)) { 100 try { 101 return Charset.forName(csn); 102 } catch (UnsupportedCharsetException x) { 103 throw new Error(x); 104 } 105 } 106 return null; 107 } 108 109 static class Result { 110 byte[] value; 111 byte coder; 112 113 Result with() { 114 coder = COMPACT_STRINGS ? LATIN1 : UTF16; 115 value = new byte[0]; 116 return this; 117 } 118 119 Result with(char[] val, int off, int len) { 120 if (String.COMPACT_STRINGS) { 121 byte[] bs = StringUTF16.compress(val, off, len); 122 if (bs != null) { 123 value = bs; 124 coder = LATIN1; 125 return this; 126 } 127 } 128 coder = UTF16; 129 value = StringUTF16.toBytes(val, off, len); 130 return this; 131 } 132 133 Result with(byte[] val, byte coder) { 134 this.coder = coder; 135 value = val; 136 return this; 137 } 138 } 139 140 @HotSpotIntrinsicCandidate 141 public static boolean hasNegatives(byte[] ba, int off, int len) { 142 for (int i = off; i < off + len; i++) { 143 if (ba[i] < 0) { 144 return true; 145 } 146 } 147 return false; 148 } 149 150 // -- Decoding -- 151 static class StringDecoder { 152 private final String requestedCharsetName; 153 private final Charset cs; 154 private final boolean isASCIICompatible; 155 private final CharsetDecoder cd; 156 protected final Result result; 157 158 StringDecoder(Charset cs, String rcn) { 159 this.requestedCharsetName = rcn; 160 this.cs = cs; 161 this.cd = cs.newDecoder() 162 .onMalformedInput(CodingErrorAction.REPLACE) 163 .onUnmappableCharacter(CodingErrorAction.REPLACE); 164 this.result = new Result(); 165 this.isASCIICompatible = (cd instanceof ArrayDecoder) && 166 ((ArrayDecoder)cd).isASCIICompatible(); 167 } 168 169 String charsetName() { 170 if (cs instanceof HistoricallyNamedCharset) 171 return ((HistoricallyNamedCharset)cs).historicalName(); 172 return cs.name(); 173 } 174 175 final String requestedCharsetName() { 176 return requestedCharsetName; 177 } 178 179 Result decode(byte[] ba, int off, int len) { 180 if (len == 0) { 181 return result.with(); 182 } 183 // fastpath for ascii compatible 184 if (isASCIICompatible && !hasNegatives(ba, off, len)) { 185 if (COMPACT_STRINGS) { 186 return result.with(Arrays.copyOfRange(ba, off, off + len), 187 LATIN1); 188 } else { 189 return result.with(StringLatin1.inflate(ba, off, len), UTF16); 190 } 191 } 192 int en = scale(len, cd.maxCharsPerByte()); 193 char[] ca = new char[en]; 194 if (cd instanceof ArrayDecoder) { 195 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); 196 return result.with(ca, 0, clen); 197 } 198 cd.reset(); 199 ByteBuffer bb = ByteBuffer.wrap(ba, off, len); 200 CharBuffer cb = CharBuffer.wrap(ca); 201 try { 202 CoderResult cr = cd.decode(bb, cb, true); 203 if (!cr.isUnderflow()) 204 cr.throwException(); 205 cr = cd.flush(cb); 206 if (!cr.isUnderflow()) 207 cr.throwException(); 208 } catch (CharacterCodingException x) { 209 // Substitution is always enabled, 210 // so this shouldn't happen 211 throw new Error(x); 212 } 213 return result.with(ca, 0, cb.position()); 214 } 215 } 216 217 static Result decode(String charsetName, byte[] ba, int off, int len) 218 throws UnsupportedEncodingException 219 { 220 StringDecoder sd = deref(decoder); 221 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; 222 if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) 223 || csn.equals(sd.charsetName()))) { 224 sd = null; 225 try { 226 Charset cs = lookupCharset(csn); 227 if (cs != null) { 228 if (cs == UTF_8) { 229 return decodeUTF8(ba, off, len, true); 230 } 231 if (cs == ISO_8859_1) { 232 return decodeLatin1(ba, off, len); 233 } 234 if (cs == US_ASCII) { 235 return decodeASCII(ba, off, len); 236 } 237 sd = new StringDecoder(cs, csn); 238 } 239 } catch (IllegalCharsetNameException x) {} 240 if (sd == null) 241 throw new UnsupportedEncodingException(csn); 242 set(decoder, sd); 243 } 244 return sd.decode(ba, off, len); 245 } 246 247 static Result decode(Charset cs, byte[] ba, int off, int len) { 248 if (cs == UTF_8) { 249 return decodeUTF8(ba, off, len, true); 250 } 251 if (cs == ISO_8859_1) { 252 return decodeLatin1(ba, off, len); 253 } 254 if (cs == US_ASCII) { 255 return decodeASCII(ba, off, len); 256 } 257 258 // (1)We never cache the "external" cs, the only benefit of creating 259 // an additional StringDe/Encoder object to wrap it is to share the 260 // de/encode() method. These SD/E objects are short-lived, the young-gen 261 // gc should be able to take care of them well. But the best approach 262 // is still not to generate them if not really necessary. 263 // (2)The defensive copy of the input byte/char[] has a big performance 264 // impact, as well as the outgoing result byte/char[]. Need to do the 265 // optimization check of (sm==null && classLoader0==null) for both. 266 // (3)There might be a timing gap in isTrusted setting. getClassLoader0() 267 // is only checked (and then isTrusted gets set) when (SM==null). It is 268 // possible that the SM==null for now but then SM is NOT null later 269 // when safeTrim() is invoked...the "safe" way to do is to redundant 270 // check (... && (isTrusted || SM == null || getClassLoader0())) in trim 271 // but it then can be argued that the SM is null when the operation 272 // is started... 273 CharsetDecoder cd = cs.newDecoder(); 274 // ascii fastpath 275 if ((cd instanceof ArrayDecoder) && 276 ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) { 277 return decodeLatin1(ba, off, len); 278 } 279 int en = scale(len, cd.maxCharsPerByte()); 280 if (len == 0) { 281 return new Result().with(); 282 } 283 cd.onMalformedInput(CodingErrorAction.REPLACE) 284 .onUnmappableCharacter(CodingErrorAction.REPLACE) 285 .reset(); 286 char[] ca = new char[en]; 287 if (cd instanceof ArrayDecoder) { 288 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca); 289 return new Result().with(ca, 0, clen); 290 } 291 if (cs.getClass().getClassLoader0() != null && 292 System.getSecurityManager() != null) { 293 ba = Arrays.copyOfRange(ba, off, off + len); 294 off = 0; 295 } 296 ByteBuffer bb = ByteBuffer.wrap(ba, off, len); 297 CharBuffer cb = CharBuffer.wrap(ca); 298 try { 299 CoderResult cr = cd.decode(bb, cb, true); 300 if (!cr.isUnderflow()) 301 cr.throwException(); 302 cr = cd.flush(cb); 303 if (!cr.isUnderflow()) 304 cr.throwException(); 305 } catch (CharacterCodingException x) { 306 // Substitution is always enabled, 307 // so this shouldn't happen 308 throw new Error(x); 309 } 310 return new Result().with(ca, 0, cb.position()); 311 } 312 313 static Result decode(byte[] ba, int off, int len) { 314 Charset cs = Charset.defaultCharset(); 315 if (cs == UTF_8) { 316 return decodeUTF8(ba, off, len, true); 317 } 318 if (cs == ISO_8859_1) { 319 return decodeLatin1(ba, off, len); 320 } 321 if (cs == US_ASCII) { 322 return decodeASCII(ba, off, len); 323 } 324 StringDecoder sd = deref(decoder); 325 if (sd == null || !cs.name().equals(sd.cs.name())) { 326 sd = new StringDecoder(cs, cs.name()); 327 set(decoder, sd); 328 } 329 return sd.decode(ba, off, len); 330 } 331 332 // -- Encoding -- 333 private static class StringEncoder { 334 private Charset cs; 335 private CharsetEncoder ce; 336 private final boolean isASCIICompatible; 337 private final String requestedCharsetName; 338 private final boolean isTrusted; 339 340 private StringEncoder(Charset cs, String rcn) { 341 this.requestedCharsetName = rcn; 342 this.cs = cs; 343 this.ce = cs.newEncoder() 344 .onMalformedInput(CodingErrorAction.REPLACE) 345 .onUnmappableCharacter(CodingErrorAction.REPLACE); 346 this.isTrusted = (cs.getClass().getClassLoader0() == null); 347 this.isASCIICompatible = (ce instanceof ArrayEncoder) && 348 ((ArrayEncoder)ce).isASCIICompatible(); 349 } 350 351 String charsetName() { 352 if (cs instanceof HistoricallyNamedCharset) 353 return ((HistoricallyNamedCharset)cs).historicalName(); 354 return cs.name(); 355 } 356 357 final String requestedCharsetName() { 358 return requestedCharsetName; 359 } 360 361 byte[] encode(byte coder, byte[] val) { 362 // fastpath for ascii compatible 363 if (coder == LATIN1 && isASCIICompatible && 364 !hasNegatives(val, 0, val.length)) { 365 return Arrays.copyOf(val, val.length); 366 } 367 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 368 int en = scale(len, ce.maxBytesPerChar()); 369 byte[] ba = new byte[en]; 370 if (len == 0) { 371 return ba; 372 } 373 if (ce instanceof ArrayEncoder) { 374 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 375 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 376 if (blen != -1) { 377 return safeTrim(ba, blen, isTrusted); 378 } 379 } 380 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 381 : StringUTF16.toChars(val); 382 ce.reset(); 383 ByteBuffer bb = ByteBuffer.wrap(ba); 384 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 385 try { 386 CoderResult cr = ce.encode(cb, bb, true); 387 if (!cr.isUnderflow()) 388 cr.throwException(); 389 cr = ce.flush(bb); 390 if (!cr.isUnderflow()) 391 cr.throwException(); 392 } catch (CharacterCodingException x) { 393 // Substitution is always enabled, 394 // so this shouldn't happen 395 throw new Error(x); 396 } 397 return safeTrim(ba, bb.position(), isTrusted); 398 } 399 } 400 401 static byte[] encode(String charsetName, byte coder, byte[] val) 402 throws UnsupportedEncodingException 403 { 404 StringEncoder se = deref(encoder); 405 String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; 406 if ((se == null) || !(csn.equals(se.requestedCharsetName()) 407 || csn.equals(se.charsetName()))) { 408 se = null; 409 try { 410 Charset cs = lookupCharset(csn); 411 if (cs != null) { 412 if (cs == UTF_8) { 413 return encodeUTF8(coder, val, true); 414 } 415 if (cs == ISO_8859_1) { 416 return encode8859_1(coder, val); 417 } 418 if (cs == US_ASCII) { 419 return encodeASCII(coder, val); 420 } 421 se = new StringEncoder(cs, csn); 422 } 423 } catch (IllegalCharsetNameException x) {} 424 if (se == null) { 425 throw new UnsupportedEncodingException (csn); 426 } 427 set(encoder, se); 428 } 429 return se.encode(coder, val); 430 } 431 432 static byte[] encode(Charset cs, byte coder, byte[] val) { 433 if (cs == UTF_8) { 434 return encodeUTF8(coder, val, true); 435 } 436 if (cs == ISO_8859_1) { 437 return encode8859_1(coder, val); 438 } 439 if (cs == US_ASCII) { 440 return encodeASCII(coder, val); 441 } 442 CharsetEncoder ce = cs.newEncoder(); 443 // fastpath for ascii compatible 444 if (coder == LATIN1 && (((ce instanceof ArrayEncoder) && 445 ((ArrayEncoder)ce).isASCIICompatible() && 446 !hasNegatives(val, 0, val.length)))) { 447 return Arrays.copyOf(val, val.length); 448 } 449 int len = val.length >> coder; // assume LATIN1=0/UTF16=1; 450 int en = scale(len, ce.maxBytesPerChar()); 451 byte[] ba = new byte[en]; 452 if (len == 0) { 453 return ba; 454 } 455 ce.onMalformedInput(CodingErrorAction.REPLACE) 456 .onUnmappableCharacter(CodingErrorAction.REPLACE) 457 .reset(); 458 if (ce instanceof ArrayEncoder) { 459 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba) 460 : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba); 461 if (blen != -1) { 462 return safeTrim(ba, blen, true); 463 } 464 } 465 boolean isTrusted = cs.getClass().getClassLoader0() == null || 466 System.getSecurityManager() == null; 467 char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val) 468 : StringUTF16.toChars(val); 469 ByteBuffer bb = ByteBuffer.wrap(ba); 470 CharBuffer cb = CharBuffer.wrap(ca, 0, len); 471 try { 472 CoderResult cr = ce.encode(cb, bb, true); 473 if (!cr.isUnderflow()) 474 cr.throwException(); 475 cr = ce.flush(bb); 476 if (!cr.isUnderflow()) 477 cr.throwException(); 478 } catch (CharacterCodingException x) { 479 throw new Error(x); 480 } 481 return safeTrim(ba, bb.position(), isTrusted); 482 } 483 484 static byte[] encode(byte coder, byte[] val) { 485 Charset cs = Charset.defaultCharset(); 486 if (cs == UTF_8) { 487 return encodeUTF8(coder, val, true); 488 } 489 if (cs == ISO_8859_1) { 490 return encode8859_1(coder, val); 491 } 492 if (cs == US_ASCII) { 493 return encodeASCII(coder, val); 494 } 495 StringEncoder se = deref(encoder); 496 if (se == null || !cs.name().equals(se.cs.name())) { 497 se = new StringEncoder(cs, cs.name()); 498 set(encoder, se); 499 } 500 return se.encode(coder, val); 501 } 502 503 /** 504 * Print a message directly to stderr, bypassing all character conversion 505 * methods. 506 * @param msg message to print 507 */ 508 private static native void err(String msg); 509 510 /* The cached Result for each thread */ 511 private static final ThreadLocal<StringCoding.Result> 512 resultCached = new ThreadLocal<>() { 513 protected StringCoding.Result initialValue() { 514 return new StringCoding.Result(); 515 }}; 516 517 ////////////////////////// ascii ////////////////////////////// 518 519 private static Result decodeASCII(byte[] ba, int off, int len) { 520 Result result = resultCached.get(); 521 if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) { 522 return result.with(Arrays.copyOfRange(ba, off, off + len), 523 LATIN1); 524 } 525 byte[] dst = new byte[len<<1]; 526 int dp = 0; 527 while (dp < len) { 528 int b = ba[off++]; 529 putChar(dst, dp++, (b >= 0) ? (char)b : repl); 530 } 531 return result.with(dst, UTF16); 532 } 533 534 private static byte[] encodeASCII(byte coder, byte[] val) { 535 if (coder == LATIN1) { 536 byte[] dst = new byte[val.length]; 537 for (int i = 0; i < val.length; i++) { 538 if (val[i] < 0) { 539 dst[i] = '?'; 540 } else { 541 dst[i] = val[i]; 542 } 543 } 544 return dst; 545 } 546 int len = val.length >> 1; 547 byte[] dst = new byte[len]; 548 int dp = 0; 549 for (int i = 0; i < len; i++) { 550 char c = StringUTF16.getChar(val, i); 551 if (c < 0x80) { 552 dst[dp++] = (byte)c; 553 continue; 554 } 555 if (Character.isHighSurrogate(c) && i + 1 < len && 556 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) { 557 i++; 558 } 559 dst[dp++] = '?'; 560 } 561 if (len == dp) { 562 return dst; 563 } 564 return Arrays.copyOf(dst, dp); 565 } 566 567 ////////////////////////// latin1/8859_1 /////////////////////////// 568 569 private static Result decodeLatin1(byte[] ba, int off, int len) { 570 Result result = resultCached.get(); 571 if (COMPACT_STRINGS) { 572 return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1); 573 } else { 574 return result.with(StringLatin1.inflate(ba, off, len), UTF16); 575 } 576 } 577 578 @HotSpotIntrinsicCandidate 579 private static int implEncodeISOArray(byte[] sa, int sp, 580 byte[] da, int dp, int len) { 581 int i = 0; 582 for (; i < len; i++) { 583 char c = StringUTF16.getChar(sa, sp++); 584 if (c > '\u00FF') 585 break; 586 da[dp++] = (byte)c; 587 } 588 return i; 589 } 590 591 private static byte[] encode8859_1(byte coder, byte[] val) { 592 if (coder == LATIN1) { 593 return Arrays.copyOf(val, val.length); 594 } 595 int len = val.length >> 1; 596 byte[] dst = new byte[len]; 597 int dp = 0; 598 int sp = 0; 599 int sl = len; 600 while (sp < sl) { 601 int ret = implEncodeISOArray(val, sp, dst, dp, len); 602 sp = sp + ret; 603 dp = dp + ret; 604 if (ret != len) { 605 char c = StringUTF16.getChar(val, sp++); 606 if (Character.isHighSurrogate(c) && sp < sl && 607 Character.isLowSurrogate(StringUTF16.getChar(val, sp))) { 608 sp++; 609 } 610 dst[dp++] = '?'; 611 len = sl - sp; 612 } 613 } 614 if (dp == dst.length) { 615 return dst; 616 } 617 return Arrays.copyOf(dst, dp); 618 } 619 620 //////////////////////////////// utf8 //////////////////////////////////// 621 622 private static boolean isNotContinuation(int b) { 623 return (b & 0xc0) != 0x80; 624 } 625 626 private static boolean isMalformed3(int b1, int b2, int b3) { 627 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 628 (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80; 629 } 630 631 private static boolean isMalformed3_2(int b1, int b2) { 632 return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 633 (b2 & 0xc0) != 0x80; 634 } 635 636 private static boolean isMalformed4(int b2, int b3, int b4) { 637 return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 || 638 (b4 & 0xc0) != 0x80; 639 } 640 641 private static boolean isMalformed4_2(int b1, int b2) { 642 return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 643 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 644 (b2 & 0xc0) != 0x80; 645 } 646 647 private static boolean isMalformed4_3(int b3) { 648 return (b3 & 0xc0) != 0x80; 649 } 650 651 // for nb == 3/4 652 private static int malformedN(byte[] src, int sp, int nb) { 653 if (nb == 3) { 654 int b1 = src[sp++]; 655 int b2 = src[sp++]; // no need to lookup b3 656 return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) || 657 isNotContinuation(b2)) ? 1 : 2; 658 } else if (nb == 4) { // we don't care the speed here 659 int b1 = src[sp++] & 0xff; 660 int b2 = src[sp++] & 0xff; 661 if (b1 > 0xf4 || 662 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) || 663 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) || 664 isNotContinuation(b2)) 665 return 1; 666 if (isNotContinuation(src[sp++])) 667 return 2; 668 return 3; 669 } 670 assert false; 671 return -1; 672 } 673 674 private static void throwMalformed(int off, int nb) { 675 throw new IllegalArgumentException("malformed input off : " + off + 676 ", length : " + nb); 677 } 678 679 private static char repl = '\ufffd'; 680 681 private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) { 682 // ascii-bais, which has a relative impact to the non-ascii-only bytes 683 if (COMPACT_STRINGS && !hasNegatives(src, sp, len)) 684 return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len), 685 LATIN1); 686 return decodeUTF8_0(src, sp, len, doReplace); 687 } 688 689 private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) { 690 Result ret = resultCached.get(); 691 692 int sl = sp + len; 693 int dp = 0; 694 byte[] dst = new byte[len]; 695 696 if (COMPACT_STRINGS) { 697 while (sp < sl) { 698 int b1 = src[sp]; 699 if (b1 >= 0) { 700 dst[dp++] = (byte)b1; 701 sp++; 702 continue; 703 } 704 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) && 705 sp + 1 < sl) { 706 int b2 = src[sp + 1]; 707 if (!isNotContinuation(b2)) { 708 dst[dp++] = (byte)(((b1 << 6) ^ b2)^ 709 (((byte) 0xC0 << 6) ^ 710 ((byte) 0x80 << 0))); 711 sp += 2; 712 continue; 713 } 714 } 715 // anything not a latin1, including the repl 716 // we have to go with the utf16 717 break; 718 } 719 if (sp == sl) { 720 if (dp != dst.length) { 721 dst = Arrays.copyOf(dst, dp); 722 } 723 return ret.with(dst, LATIN1); 724 } 725 } 726 if (dp == 0) { 727 dst = new byte[len << 1]; 728 } else { 729 byte[] buf = new byte[len << 1]; 730 StringLatin1.inflate(dst, 0, buf, 0, dp); 731 dst = buf; 732 } 733 while (sp < sl) { 734 int b1 = src[sp++]; 735 if (b1 >= 0) { 736 putChar(dst, dp++, (char) b1); 737 } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) { 738 if (sp < sl) { 739 int b2 = src[sp++]; 740 if (isNotContinuation(b2)) { 741 if (!doReplace) { 742 throwMalformed(sp - 1, 1); 743 } 744 putChar(dst, dp++, repl); 745 sp--; 746 } else { 747 putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^ 748 (((byte) 0xC0 << 6) ^ 749 ((byte) 0x80 << 0)))); 750 } 751 continue; 752 } 753 if (!doReplace) { 754 throwMalformed(sp, 1); // underflow() 755 } 756 putChar(dst, dp++, repl); 757 break; 758 } else if ((b1 >> 4) == -2) { 759 if (sp + 1 < sl) { 760 int b2 = src[sp++]; 761 int b3 = src[sp++]; 762 if (isMalformed3(b1, b2, b3)) { 763 if (!doReplace) { 764 throwMalformed(sp - 3, 3); 765 } 766 putChar(dst, dp++, repl); 767 sp -= 3; 768 sp += malformedN(src, sp, 3); 769 } else { 770 char c = (char)((b1 << 12) ^ 771 (b2 << 6) ^ 772 (b3 ^ 773 (((byte) 0xE0 << 12) ^ 774 ((byte) 0x80 << 6) ^ 775 ((byte) 0x80 << 0)))); 776 if (isSurrogate(c)) { 777 if (!doReplace) { 778 throwMalformed(sp - 3, 3); 779 } 780 putChar(dst, dp++, repl); 781 } else { 782 putChar(dst, dp++, c); 783 } 784 } 785 continue; 786 } 787 if (sp < sl && isMalformed3_2(b1, src[sp])) { 788 if (!doReplace) { 789 throwMalformed(sp - 1, 2); 790 } 791 putChar(dst, dp++, repl); 792 continue; 793 } 794 if (!doReplace){ 795 throwMalformed(sp, 1); 796 } 797 putChar(dst, dp++, repl); 798 break; 799 } else if ((b1 >> 3) == -2) { 800 if (sp + 2 < sl) { 801 int b2 = src[sp++]; 802 int b3 = src[sp++]; 803 int b4 = src[sp++]; 804 int uc = ((b1 << 18) ^ 805 (b2 << 12) ^ 806 (b3 << 6) ^ 807 (b4 ^ 808 (((byte) 0xF0 << 18) ^ 809 ((byte) 0x80 << 12) ^ 810 ((byte) 0x80 << 6) ^ 811 ((byte) 0x80 << 0)))); 812 if (isMalformed4(b2, b3, b4) || 813 !isSupplementaryCodePoint(uc)) { // shortest form check 814 if (!doReplace) { 815 throwMalformed(sp - 4, 4); 816 } 817 putChar(dst, dp++, repl); 818 sp -= 4; 819 sp += malformedN(src, sp, 4); 820 } else { 821 putChar(dst, dp++, highSurrogate(uc)); 822 putChar(dst, dp++, lowSurrogate(uc)); 823 } 824 continue; 825 } 826 b1 &= 0xff; 827 if (b1 > 0xf4 || 828 sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) { 829 if (!doReplace) { 830 throwMalformed(sp - 1, 1); // or 2 831 } 832 putChar(dst, dp++, repl); 833 continue; 834 } 835 if (!doReplace) { 836 throwMalformed(sp - 1, 1); 837 } 838 sp++; 839 putChar(dst, dp++, repl); 840 if (sp < sl && isMalformed4_3(src[sp])) { 841 continue; 842 } 843 break; 844 } else { 845 if (!doReplace) { 846 throwMalformed(sp - 1, 1); 847 } 848 putChar(dst, dp++, repl); 849 } 850 } 851 if (dp != len) { 852 dst = Arrays.copyOf(dst, dp << 1); 853 } 854 return ret.with(dst, UTF16); 855 } 856 857 private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) { 858 if (coder == UTF16) 859 return encodeUTF8_UTF16(val, doReplace); 860 861 if (!hasNegatives(val, 0, val.length)) 862 return Arrays.copyOf(val, val.length); 863 864 int dp = 0; 865 byte[] dst = new byte[val.length << 1]; 866 for (int sp = 0; sp < val.length; sp++) { 867 byte c = val[sp]; 868 if (c < 0) { 869 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6)); 870 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 871 } else { 872 dst[dp++] = c; 873 } 874 } 875 if (dp == dst.length) 876 return dst; 877 return Arrays.copyOf(dst, dp); 878 } 879 880 private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) { 881 int dp = 0; 882 int sp = 0; 883 int sl = val.length >> 1; 884 byte[] dst = new byte[sl * 3]; 885 char c; 886 while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') { 887 // ascii fast loop; 888 dst[dp++] = (byte)c; 889 sp++; 890 } 891 while (sp < sl) { 892 c = StringUTF16.getChar(val, sp++); 893 if (c < 0x80) { 894 dst[dp++] = (byte)c; 895 } else if (c < 0x800) { 896 dst[dp++] = (byte)(0xc0 | (c >> 6)); 897 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 898 } else if (Character.isSurrogate(c)) { 899 int uc = -1; 900 char c2; 901 if (Character.isHighSurrogate(c) && sp < sl && 902 Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) { 903 uc = Character.toCodePoint(c, c2); 904 } 905 if (uc < 0) { 906 if (doReplace) { 907 dst[dp++] = '?'; 908 } else { 909 throwMalformed(sp - 1, 1); // or 2, does not matter here 910 } 911 } else { 912 dst[dp++] = (byte)(0xf0 | ((uc >> 18))); 913 dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f)); 914 dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f)); 915 dst[dp++] = (byte)(0x80 | (uc & 0x3f)); 916 sp++; // 2 chars 917 } 918 } else { 919 // 3 bytes, 16 bits 920 dst[dp++] = (byte)(0xe0 | ((c >> 12))); 921 dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f)); 922 dst[dp++] = (byte)(0x80 | (c & 0x3f)); 923 } 924 } 925 if (dp == dst.length) { 926 return dst; 927 } 928 return Arrays.copyOf(dst, dp); 929 } 930 931 ////////////////////// for j.u.z.ZipCoder ////////////////////////// 932 933 /* 934 * Throws iae, instead of replacing, if malformed or unmappble. 935 */ 936 static String newStringUTF8NoRepl(byte[] src, int off, int len) { 937 if (COMPACT_STRINGS && !hasNegatives(src, off, len)) 938 return new String(Arrays.copyOfRange(src, off, off + len), LATIN1); 939 Result ret = decodeUTF8_0(src, off, len, false); 940 return new String(ret.value, ret.coder); 941 } 942 943 /* 944 * Throws iae, instead of replacing, if unmappble. 945 */ 946 static byte[] getBytesUTF8NoRepl(String s) { 947 return encodeUTF8(s.coder(), s.value(), false); 948 } 949 }