1 /*
   2  * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.  Oracle designates this
   8  * particular file as subject to the "Classpath" exception as provided
   9  * by Oracle in the LICENSE file that accompanied this code.
  10  *
  11  * This code is distributed in the hope that it will be useful, but WITHOUT
  12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14  * version 2 for more details (a copy is included in the LICENSE file that
  15  * accompanied this code).
  16  *
  17  * You should have received a copy of the GNU General Public License version
  18  * 2 along with this work; if not, write to the Free Software Foundation,
  19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  20  *
  21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  22  * or visit www.oracle.com if you need additional information or have any
  23  * questions.
  24  */
  25 
  26 package java.lang;
  27 
  28 import java.io.UnsupportedEncodingException;
  29 import java.lang.ref.SoftReference;
  30 import java.nio.ByteBuffer;
  31 import java.nio.CharBuffer;
  32 import java.nio.charset.Charset;
  33 import java.nio.charset.CharsetDecoder;
  34 import java.nio.charset.CharsetEncoder;
  35 import java.nio.charset.CharacterCodingException;
  36 import java.nio.charset.CoderResult;
  37 import java.nio.charset.CodingErrorAction;
  38 import java.nio.charset.IllegalCharsetNameException;
  39 import java.nio.charset.UnsupportedCharsetException;
  40 import java.util.Arrays;
  41 import jdk.internal.HotSpotIntrinsicCandidate;
  42 import sun.nio.cs.HistoricallyNamedCharset;
  43 import sun.nio.cs.ArrayDecoder;
  44 import sun.nio.cs.ArrayEncoder;
  45 
  46 import static java.lang.String.LATIN1;
  47 import static java.lang.String.UTF16;
  48 import static java.lang.String.COMPACT_STRINGS;
  49 import static java.lang.Character.isSurrogate;
  50 import static java.lang.Character.highSurrogate;
  51 import static java.lang.Character.lowSurrogate;
  52 import static java.lang.Character.isSupplementaryCodePoint;
  53 import static java.lang.StringUTF16.putChar;
  54 
  55 /**
  56  * Utility class for string encoding and decoding.
  57  */
  58 
  59 class StringCoding {
  60 
    // Private constructor: prevents instantiation from outside this class.
    private StringCoding() { }
  62 
    /**
     * The cached coders for each thread. Each slot holds, through a
     * {@code SoftReference}, the coder last installed by the name-based or
     * default-charset decode/encode entry points below.
     */
    private static final ThreadLocal<SoftReference<StringDecoder>> decoder =
        new ThreadLocal<>();
    private static final ThreadLocal<SoftReference<StringEncoder>> encoder =
        new ThreadLocal<>();

    // Charset instances that the entry points below compare against by
    // identity (==) in order to dispatch to the dedicated fast paths.
    private static final Charset ISO_8859_1 = sun.nio.cs.ISO_8859_1.INSTANCE;
    private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
    private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
  72 
  73     private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
  74         SoftReference<T> sr = tl.get();
  75         if (sr == null)
  76             return null;
  77         return sr.get();
  78     }
  79 
  80     private static <T> void set(ThreadLocal<SoftReference<T>> tl, T ob) {
  81         tl.set(new SoftReference<>(ob));
  82     }
  83 
  84     // Trim the given byte array to the given length
  85     private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
  86         if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
  87             return ba;
  88         else
  89             return Arrays.copyOf(ba, len);
  90     }
  91 
  92     private static int scale(int len, float expansionFactor) {
  93         // We need to perform double, not float, arithmetic; otherwise
  94         // we lose low order bits when len is larger than 2**24.
  95         return (int)(len * (double)expansionFactor);
  96     }
  97 
  98     private static Charset lookupCharset(String csn) {
  99         if (Charset.isSupported(csn)) {
 100             try {
 101                 return Charset.forName(csn);
 102             } catch (UnsupportedCharsetException x) {
 103                 throw new Error(x);
 104             }
 105         }
 106         return null;
 107     }
 108 
 109     static class Result {
 110         byte[] value;
 111         byte coder;
 112 
 113         Result with() {
 114             coder = COMPACT_STRINGS ? LATIN1 : UTF16;
 115             value = new byte[0];
 116             return this;
 117         }
 118 
 119         Result with(char[] val, int off, int len) {
 120             if (String.COMPACT_STRINGS) {
 121                 byte[] bs = StringUTF16.compress(val, off, len);
 122                 if (bs != null) {
 123                     value = bs;
 124                     coder = LATIN1;
 125                     return this;
 126                 }
 127             }
 128             coder = UTF16;
 129             value = StringUTF16.toBytes(val, off, len);
 130             return this;
 131         }
 132 
 133         Result with(byte[] val, byte coder) {
 134             this.coder = coder;
 135             value = val;
 136             return this;
 137         }
 138     }
 139 
 140     @HotSpotIntrinsicCandidate
 141     public static boolean hasNegatives(byte[] ba, int off, int len) {
 142         for (int i = off; i < off + len; i++) {
 143             if (ba[i] < 0) {
 144                 return true;
 145             }
 146         }
 147         return false;
 148     }
 149 
 150     // -- Decoding --
 151     static class StringDecoder {
 152         private final String requestedCharsetName;
 153         private final Charset cs;
 154         private final boolean isASCIICompatible;
 155         private final CharsetDecoder cd;
 156         protected final Result result;
 157 
 158         StringDecoder(Charset cs, String rcn) {
 159             this.requestedCharsetName = rcn;
 160             this.cs = cs;
 161             this.cd = cs.newDecoder()
 162                 .onMalformedInput(CodingErrorAction.REPLACE)
 163                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 164             this.result = new Result();
 165             this.isASCIICompatible = (cd instanceof ArrayDecoder) &&
 166                     ((ArrayDecoder)cd).isASCIICompatible();
 167         }
 168 
 169         String charsetName() {
 170             if (cs instanceof HistoricallyNamedCharset)
 171                 return ((HistoricallyNamedCharset)cs).historicalName();
 172             return cs.name();
 173         }
 174 
 175         final String requestedCharsetName() {
 176             return requestedCharsetName;
 177         }
 178 
 179         Result decode(byte[] ba, int off, int len) {
 180             if (len == 0) {
 181                 return result.with();
 182             }
 183             // fastpath for ascii compatible
 184             if (isASCIICompatible && !hasNegatives(ba, off, len)) {
 185                 if (COMPACT_STRINGS) {
 186                     return result.with(Arrays.copyOfRange(ba, off, off + len),
 187                                       LATIN1);
 188                 } else {
 189                     return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 190                 }
 191             }
 192             int en = scale(len, cd.maxCharsPerByte());
 193             char[] ca = new char[en];
 194             if (cd instanceof ArrayDecoder) {
 195                 int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 196                 return result.with(ca, 0, clen);
 197             }
 198             cd.reset();
 199             ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 200             CharBuffer cb = CharBuffer.wrap(ca);
 201             try {
 202                 CoderResult cr = cd.decode(bb, cb, true);
 203                 if (!cr.isUnderflow())
 204                     cr.throwException();
 205                 cr = cd.flush(cb);
 206                 if (!cr.isUnderflow())
 207                     cr.throwException();
 208             } catch (CharacterCodingException x) {
 209                 // Substitution is always enabled,
 210                 // so this shouldn't happen
 211                 throw new Error(x);
 212             }
 213             return result.with(ca, 0, cb.position());
 214         }
 215     }
 216 
 217     static Result decode(String charsetName, byte[] ba, int off, int len)
 218         throws UnsupportedEncodingException
 219     {
 220         StringDecoder sd = deref(decoder);
 221         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 222         if ((sd == null) || !(csn.equals(sd.requestedCharsetName())
 223                               || csn.equals(sd.charsetName()))) {
 224             sd = null;
 225             try {
 226                 Charset cs = lookupCharset(csn);
 227                 if (cs != null) {
 228                     if (cs == UTF_8) {
 229                         return decodeUTF8(ba, off, len, true);
 230                     }
 231                     if (cs == ISO_8859_1) {
 232                         return decodeLatin1(ba, off, len);
 233                     }
 234                     if (cs == US_ASCII) {
 235                         return decodeASCII(ba, off, len);
 236                     }
 237                     sd = new StringDecoder(cs, csn);
 238                 }
 239             } catch (IllegalCharsetNameException x) {}
 240             if (sd == null)
 241                 throw new UnsupportedEncodingException(csn);
 242             set(decoder, sd);
 243         }
 244         return sd.decode(ba, off, len);
 245     }
 246 
 247     static Result decode(Charset cs, byte[] ba, int off, int len) {
 248         if (cs == UTF_8) {
 249             return decodeUTF8(ba, off, len, true);
 250         }
 251         if (cs == ISO_8859_1) {
 252             return decodeLatin1(ba, off, len);
 253         }
 254         if (cs == US_ASCII) {
 255             return decodeASCII(ba, off, len);
 256         }
 257 
 258         // (1)We never cache the "external" cs, the only benefit of creating
 259         // an additional StringDe/Encoder object to wrap it is to share the
 260         // de/encode() method. These SD/E objects are short-lived, the young-gen
 261         // gc should be able to take care of them well. But the best approach
 262         // is still not to generate them if not really necessary.
 263         // (2)The defensive copy of the input byte/char[] has a big performance
 264         // impact, as well as the outgoing result byte/char[]. Need to do the
 265         // optimization check of (sm==null && classLoader0==null) for both.
 266         // (3)There might be a timing gap in isTrusted setting. getClassLoader0()
 267         // is only checked (and then isTrusted gets set) when (SM==null). It is
 268         // possible that the SM==null for now but then SM is NOT null later
 269         // when safeTrim() is invoked...the "safe" way to do is to redundant
 270         // check (... && (isTrusted || SM == null || getClassLoader0())) in trim
 271         // but it then can be argued that the SM is null when the operation
 272         // is started...
 273         CharsetDecoder cd = cs.newDecoder();
 274         // ascii fastpath
 275         if ((cd instanceof ArrayDecoder) &&
 276             ((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) {
 277             return decodeLatin1(ba, off, len);
 278         }
 279         int en = scale(len, cd.maxCharsPerByte());
 280         if (len == 0) {
 281             return new Result().with();
 282         }
 283         cd.onMalformedInput(CodingErrorAction.REPLACE)
 284           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 285           .reset();
 286         char[] ca = new char[en];
 287         if (cd instanceof ArrayDecoder) {
 288             int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
 289             return new Result().with(ca, 0, clen);
 290         }
 291         if (cs.getClass().getClassLoader0() != null &&
 292             System.getSecurityManager() != null) {
 293             ba = Arrays.copyOfRange(ba, off, off + len);
 294             off = 0;
 295         }
 296         ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
 297         CharBuffer cb = CharBuffer.wrap(ca);
 298         try {
 299             CoderResult cr = cd.decode(bb, cb, true);
 300             if (!cr.isUnderflow())
 301                 cr.throwException();
 302             cr = cd.flush(cb);
 303             if (!cr.isUnderflow())
 304                 cr.throwException();
 305         } catch (CharacterCodingException x) {
 306             // Substitution is always enabled,
 307             // so this shouldn't happen
 308             throw new Error(x);
 309         }
 310         return new Result().with(ca, 0, cb.position());
 311     }
 312 
 313     static Result decode(byte[] ba, int off, int len) {
 314         Charset cs = Charset.defaultCharset();
 315         if (cs == UTF_8) {
 316             return decodeUTF8(ba, off, len, true);
 317         }
 318         if (cs == ISO_8859_1) {
 319             return decodeLatin1(ba, off, len);
 320         }
 321         if (cs == US_ASCII) {
 322             return decodeASCII(ba, off, len);
 323         }
 324         StringDecoder sd = deref(decoder);
 325         if (sd == null || !cs.name().equals(sd.cs.name())) {
 326             sd = new StringDecoder(cs, cs.name());
 327             set(decoder, sd);
 328         }
 329         return sd.decode(ba, off, len);
 330     }
 331 
 332     // -- Encoding --
 333     private static class StringEncoder {
 334         private Charset cs;
 335         private CharsetEncoder ce;
 336         private final boolean isASCIICompatible;
 337         private final String requestedCharsetName;
 338         private final boolean isTrusted;
 339 
 340         private StringEncoder(Charset cs, String rcn) {
 341             this.requestedCharsetName = rcn;
 342             this.cs = cs;
 343             this.ce = cs.newEncoder()
 344                 .onMalformedInput(CodingErrorAction.REPLACE)
 345                 .onUnmappableCharacter(CodingErrorAction.REPLACE);
 346             this.isTrusted = (cs.getClass().getClassLoader0() == null);
 347             this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
 348                     ((ArrayEncoder)ce).isASCIICompatible();
 349         }
 350 
 351         String charsetName() {
 352             if (cs instanceof HistoricallyNamedCharset)
 353                 return ((HistoricallyNamedCharset)cs).historicalName();
 354             return cs.name();
 355         }
 356 
 357         final String requestedCharsetName() {
 358             return requestedCharsetName;
 359         }
 360 
 361         byte[] encode(byte coder, byte[] val) {
 362             // fastpath for ascii compatible
 363             if (coder == LATIN1 && isASCIICompatible &&
 364                 !hasNegatives(val, 0, val.length)) {
 365                 return Arrays.copyOf(val, val.length);
 366             }
 367             int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 368             int en = scale(len, ce.maxBytesPerChar());
 369             byte[] ba = new byte[en];
 370             if (len == 0) {
 371                 return ba;
 372             }
 373             if (ce instanceof ArrayEncoder) {
 374                 int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 375                                               : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 376                 if (blen != -1) {
 377                     return safeTrim(ba, blen, isTrusted);
 378                 }
 379             }
 380             char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 381                                            : StringUTF16.toChars(val);
 382             ce.reset();
 383             ByteBuffer bb = ByteBuffer.wrap(ba);
 384             CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 385             try {
 386                 CoderResult cr = ce.encode(cb, bb, true);
 387                 if (!cr.isUnderflow())
 388                     cr.throwException();
 389                 cr = ce.flush(bb);
 390                 if (!cr.isUnderflow())
 391                     cr.throwException();
 392             } catch (CharacterCodingException x) {
 393                 // Substitution is always enabled,
 394                 // so this shouldn't happen
 395                 throw new Error(x);
 396             }
 397             return safeTrim(ba, bb.position(), isTrusted);
 398         }
 399     }
 400 
 401     static byte[] encode(String charsetName, byte coder, byte[] val)
 402         throws UnsupportedEncodingException
 403     {
 404         StringEncoder se = deref(encoder);
 405         String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
 406         if ((se == null) || !(csn.equals(se.requestedCharsetName())
 407                               || csn.equals(se.charsetName()))) {
 408             se = null;
 409             try {
 410                 Charset cs = lookupCharset(csn);
 411                 if (cs != null) {
 412                     if (cs == UTF_8) {
 413                         return encodeUTF8(coder, val, true);
 414                     }
 415                     if (cs == ISO_8859_1) {
 416                         return encode8859_1(coder, val);
 417                     }
 418                     if (cs == US_ASCII) {
 419                         return encodeASCII(coder, val);
 420                     }
 421                     se = new StringEncoder(cs, csn);
 422                 }
 423             } catch (IllegalCharsetNameException x) {}
 424             if (se == null) {
 425                 throw new UnsupportedEncodingException (csn);
 426             }
 427             set(encoder, se);
 428         }
 429         return se.encode(coder, val);
 430     }
 431 
 432     static byte[] encode(Charset cs, byte coder, byte[] val) {
 433         if (cs == UTF_8) {
 434             return encodeUTF8(coder, val, true);
 435         }
 436         if (cs == ISO_8859_1) {
 437             return encode8859_1(coder, val);
 438         }
 439         if (cs == US_ASCII) {
 440             return encodeASCII(coder, val);
 441         }
 442         CharsetEncoder ce = cs.newEncoder();
 443         // fastpath for ascii compatible
 444         if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
 445                                  ((ArrayEncoder)ce).isASCIICompatible() &&
 446                                  !hasNegatives(val, 0, val.length)))) {
 447             return Arrays.copyOf(val, val.length);
 448         }
 449         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
 450         int en = scale(len, ce.maxBytesPerChar());
 451         byte[] ba = new byte[en];
 452         if (len == 0) {
 453             return ba;
 454         }
 455         ce.onMalformedInput(CodingErrorAction.REPLACE)
 456           .onUnmappableCharacter(CodingErrorAction.REPLACE)
 457           .reset();
 458         if (ce instanceof ArrayEncoder) {
 459             int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
 460                                           : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
 461             if (blen != -1) {
 462                 return safeTrim(ba, blen, true);
 463             }
 464         }
 465         boolean isTrusted = cs.getClass().getClassLoader0() == null ||
 466                             System.getSecurityManager() == null;
 467         char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
 468                                        : StringUTF16.toChars(val);
 469         ByteBuffer bb = ByteBuffer.wrap(ba);
 470         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
 471         try {
 472             CoderResult cr = ce.encode(cb, bb, true);
 473             if (!cr.isUnderflow())
 474                 cr.throwException();
 475             cr = ce.flush(bb);
 476             if (!cr.isUnderflow())
 477                 cr.throwException();
 478         } catch (CharacterCodingException x) {
 479             throw new Error(x);
 480         }
 481         return safeTrim(ba, bb.position(), isTrusted);
 482     }
 483 
 484     static byte[] encode(byte coder, byte[] val) {
 485         Charset cs = Charset.defaultCharset();
 486         if (cs == UTF_8) {
 487             return encodeUTF8(coder, val, true);
 488         }
 489         if (cs == ISO_8859_1) {
 490             return encode8859_1(coder, val);
 491         }
 492         if (cs == US_ASCII) {
 493             return encodeASCII(coder, val);
 494         }
 495         StringEncoder se = deref(encoder);
 496         if (se == null || !cs.name().equals(se.cs.name())) {
 497             se = new StringEncoder(cs, cs.name());
 498             set(encoder, se);
 499         }
 500         return se.encode(coder, val);
 501     }
 502 
    /**
     * Prints a message directly to stderr, bypassing all character conversion
     * methods.
     *
     * <p>Declared {@code native}; the implementation is not part of this file.
     *
     * @param msg  message to print
     */
    private static native void err(String msg);
 509 
     /*
      * The cached Result for each thread. A thread's first access creates a
      * new Result through initialValue(). The dedicated decoders in this file
      * (decodeASCII, decodeLatin1, decodeUTF8) fill and return this shared
      * instance instead of allocating a new Result per call.
      */
    private static final ThreadLocal<StringCoding.Result>
        resultCached = new ThreadLocal<>() {
            protected StringCoding.Result initialValue() {
                return new StringCoding.Result();
            }};
 516 
 517     ////////////////////////// ascii //////////////////////////////
 518 
 519     private static Result decodeASCII(byte[] ba, int off, int len) {
 520         Result result = resultCached.get();
 521         if (COMPACT_STRINGS && !hasNegatives(ba, off, len)) {
 522             return result.with(Arrays.copyOfRange(ba, off, off + len),
 523                                LATIN1);
 524         }
 525         byte[] dst = new byte[len<<1];
 526         int dp = 0;
 527         while (dp < len) {
 528             int b = ba[off++];
 529             putChar(dst, dp++, (b >= 0) ? (char)b : repl);
 530         }
 531         return result.with(dst, UTF16);
 532     }
 533 
 534     private static byte[] encodeASCII(byte coder, byte[] val) {
 535         if (coder == LATIN1) {
 536             byte[] dst = new byte[val.length];
 537             for (int i = 0; i < val.length; i++) {
 538                 if (val[i] < 0) {
 539                     dst[i] = '?';
 540                 } else {
 541                     dst[i] = val[i];
 542                 }
 543             }
 544             return dst;
 545         }
 546         int len = val.length >> 1;
 547         byte[] dst = new byte[len];
 548         int dp = 0;
 549         for (int i = 0; i < len; i++) {
 550             char c = StringUTF16.getChar(val, i);
 551             if (c < 0x80) {
 552                 dst[dp++] = (byte)c;
 553                 continue;
 554             }
 555             if (Character.isHighSurrogate(c) && i + 1 < len &&
 556                 Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
 557                 i++;
 558             }
 559             dst[dp++] = '?';
 560         }
 561         if (len == dp) {
 562             return dst;
 563         }
 564         return Arrays.copyOf(dst, dp);
 565     }
 566 
 567     ////////////////////////// latin1/8859_1 ///////////////////////////
 568 
 569     private static Result decodeLatin1(byte[] ba, int off, int len) {
 570        Result result = resultCached.get();
 571        if (COMPACT_STRINGS) {
 572            return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
 573        } else {
 574            return result.with(StringLatin1.inflate(ba, off, len), UTF16);
 575        }
 576     }
 577 
 578     @HotSpotIntrinsicCandidate
 579     private static int implEncodeISOArray(byte[] sa, int sp,
 580                                           byte[] da, int dp, int len) {
 581         int i = 0;
 582         for (; i < len; i++) {
 583             char c = StringUTF16.getChar(sa, sp++);
 584             if (c > '\u00FF')
 585                 break;
 586             da[dp++] = (byte)c;
 587         }
 588         return i;
 589     }
 590 
 591     private static byte[] encode8859_1(byte coder, byte[] val) {
 592         return encode8859_1(coder, val, true);
 593     }
 594 
 595     private static byte[] encode8859_1(byte coder, byte[] val, boolean doReplace) {
 596         if (coder == LATIN1) {
 597             return Arrays.copyOf(val, val.length);
 598         }
 599         int len = val.length >> 1;
 600         byte[] dst = new byte[len];
 601         int dp = 0;
 602         int sp = 0;
 603         int sl = len;
 604         while (sp < sl) {
 605             int ret = implEncodeISOArray(val, sp, dst, dp, len);
 606             sp = sp + ret;
 607             dp = dp + ret;
 608             if (ret != len) {
 609                 if (!doReplace) {
 610                     throwMalformed(sp, 1);
 611                 }
 612                 char c = StringUTF16.getChar(val, sp++);
 613                 if (Character.isHighSurrogate(c) && sp < sl &&
 614                     Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
 615                     sp++;
 616                 }
 617                 dst[dp++] = '?';
 618                 len = sl - sp;
 619             }
 620         }
 621         if (dp == dst.length) {
 622             return dst;
 623         }
 624         return Arrays.copyOf(dst, dp);
 625     }
 626 
 627     //////////////////////////////// utf8 ////////////////////////////////////
 628 
 629     private static boolean isNotContinuation(int b) {
 630         return (b & 0xc0) != 0x80;
 631     }
 632 
 633     private static boolean isMalformed3(int b1, int b2, int b3) {
 634         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 635                (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
 636     }
 637 
 638     private static boolean isMalformed3_2(int b1, int b2) {
 639         return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 640                (b2 & 0xc0) != 0x80;
 641     }
 642 
 643     private static boolean isMalformed4(int b2, int b3, int b4) {
 644         return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
 645                (b4 & 0xc0) != 0x80;
 646     }
 647 
 648     private static boolean isMalformed4_2(int b1, int b2) {
 649         return (b1 == 0xf0 && (b2  < 0x90 || b2 > 0xbf)) ||
 650                (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 651                (b2 & 0xc0) != 0x80;
 652     }
 653 
 654     private static boolean isMalformed4_3(int b3) {
 655         return (b3 & 0xc0) != 0x80;
 656     }
 657 
 658     // for nb == 3/4
 659     private static int malformedN(byte[] src, int sp, int nb) {
 660         if (nb == 3) {
 661             int b1 = src[sp++];
 662             int b2 = src[sp++];    // no need to lookup b3
 663             return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
 664                     isNotContinuation(b2)) ? 1 : 2;
 665         } else if (nb == 4) { // we don't care the speed here
 666             int b1 = src[sp++] & 0xff;
 667             int b2 = src[sp++] & 0xff;
 668             if (b1 > 0xf4 ||
 669                 (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
 670                 (b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
 671                 isNotContinuation(b2))
 672                 return 1;
 673             if (isNotContinuation(src[sp++]))
 674                 return 2;
 675             return 3;
 676         }
 677         assert false;
 678         return -1;
 679     }
 680 
 681     private static void throwMalformed(int off, int nb) {
 682         throw new IllegalArgumentException("malformed input off : " + off +
 683                                            ", length : " + nb);
 684     }
 685 
 686     private static void throwMalformed(byte[] val) {
 687         int dp = 0;
 688         while (dp < val.length && val[dp] >=0) { dp++; }
 689         throwMalformed(dp, 1);
 690     }
 691 
    // Replacement character (U+FFFD) that the decoders in this file write in
    // place of input they cannot decode.
    // NOTE(review): not reassigned in the code visible here; presumably it
    // could be declared final. Confirm against the rest of the file.
    private static char repl = '\ufffd';
 693 
 694     private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
 695         // ascii-bais, which has a relative impact to the non-ascii-only bytes
 696         if (COMPACT_STRINGS && !hasNegatives(src, sp, len))
 697             return resultCached.get().with(Arrays.copyOfRange(src, sp, sp + len),
 698                                            LATIN1);
 699         return decodeUTF8_0(src, sp, len, doReplace);
 700     }
 701 
 702     private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) {
 703         Result ret = resultCached.get();
 704 
 705         int sl = sp + len;
 706         int dp = 0;
 707         byte[] dst = new byte[len];
 708 
 709         if (COMPACT_STRINGS) {
 710             while (sp < sl) {
 711                 int b1 = src[sp];
 712                 if (b1 >= 0) {
 713                     dst[dp++] = (byte)b1;
 714                     sp++;
 715                     continue;
 716                 }
 717                 if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
 718                     sp + 1 < sl) {
 719                     int b2 = src[sp + 1];
 720                     if (!isNotContinuation(b2)) {
 721                         dst[dp++] = (byte)(((b1 << 6) ^ b2)^
 722                                            (((byte) 0xC0 << 6) ^
 723                                            ((byte) 0x80 << 0)));
 724                         sp += 2;
 725                         continue;
 726                     }
 727                 }
 728                 // anything not a latin1, including the repl
 729                 // we have to go with the utf16
 730                 break;
 731             }
 732             if (sp == sl) {
 733                 if (dp != dst.length) {
 734                     dst = Arrays.copyOf(dst, dp);
 735                 }
 736                 return ret.with(dst, LATIN1);
 737             }
 738         }
 739         if (dp == 0) {
 740             dst = new byte[len << 1];
 741         } else {
 742             byte[] buf = new byte[len << 1];
 743             StringLatin1.inflate(dst, 0, buf, 0, dp);
 744             dst = buf;
 745         }
 746         while (sp < sl) {
 747             int b1 = src[sp++];
 748             if (b1 >= 0) {
 749                 putChar(dst, dp++, (char) b1);
 750             } else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
 751                 if (sp < sl) {
 752                     int b2 = src[sp++];
 753                     if (isNotContinuation(b2)) {
 754                         if (!doReplace) {
 755                             throwMalformed(sp - 1, 1);
 756                         }
 757                         putChar(dst, dp++, repl);
 758                         sp--;
 759                     } else {
 760                         putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
 761                                                   (((byte) 0xC0 << 6) ^
 762                                                   ((byte) 0x80 << 0))));
 763                     }
 764                     continue;
 765                 }
 766                 if (!doReplace) {
 767                     throwMalformed(sp, 1);  // underflow()
 768                 }
 769                 putChar(dst, dp++, repl);
 770                 break;
 771             } else if ((b1 >> 4) == -2) {
 772                 if (sp + 1 < sl) {
 773                     int b2 = src[sp++];
 774                     int b3 = src[sp++];
 775                     if (isMalformed3(b1, b2, b3)) {
 776                         if (!doReplace) {
 777                             throwMalformed(sp - 3, 3);
 778                         }
 779                         putChar(dst, dp++, repl);
 780                         sp -= 3;
 781                         sp += malformedN(src, sp, 3);
 782                     } else {
 783                         char c = (char)((b1 << 12) ^
 784                                         (b2 <<  6) ^
 785                                         (b3 ^
 786                                          (((byte) 0xE0 << 12) ^
 787                                          ((byte) 0x80 <<  6) ^
 788                                          ((byte) 0x80 <<  0))));
 789                         if (isSurrogate(c)) {
 790                             if (!doReplace) {
 791                                 throwMalformed(sp - 3, 3);
 792                             }
 793                             putChar(dst, dp++, repl);
 794                         } else {
 795                             putChar(dst, dp++, c);
 796                         }
 797                     }
 798                     continue;
 799                 }
 800                 if (sp  < sl && isMalformed3_2(b1, src[sp])) {
 801                     if (!doReplace) {
 802                         throwMalformed(sp - 1, 2);
 803                     }
 804                     putChar(dst, dp++, repl);
 805                     continue;
 806                 }
 807                 if (!doReplace){
 808                     throwMalformed(sp, 1);
 809                 }
 810                 putChar(dst, dp++, repl);
 811                 break;
 812             } else if ((b1 >> 3) == -2) {
 813                 if (sp + 2 < sl) {
 814                     int b2 = src[sp++];
 815                     int b3 = src[sp++];
 816                     int b4 = src[sp++];
 817                     int uc = ((b1 << 18) ^
 818                               (b2 << 12) ^
 819                               (b3 <<  6) ^
 820                               (b4 ^
 821                                (((byte) 0xF0 << 18) ^
 822                                ((byte) 0x80 << 12) ^
 823                                ((byte) 0x80 <<  6) ^
 824                                ((byte) 0x80 <<  0))));
 825                     if (isMalformed4(b2, b3, b4) ||
 826                         !isSupplementaryCodePoint(uc)) { // shortest form check
 827                         if (!doReplace) {
 828                             throwMalformed(sp - 4, 4);
 829                         }
 830                         putChar(dst, dp++, repl);
 831                         sp -= 4;
 832                         sp += malformedN(src, sp, 4);
 833                     } else {
 834                         putChar(dst, dp++, highSurrogate(uc));
 835                         putChar(dst, dp++, lowSurrogate(uc));
 836                     }
 837                     continue;
 838                 }
 839                 b1 &= 0xff;
 840                 if (b1 > 0xf4 ||
 841                     sp  < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
 842                     if (!doReplace) {
 843                         throwMalformed(sp - 1, 1);  // or 2
 844                     }
 845                     putChar(dst, dp++, repl);
 846                     continue;
 847                 }
 848                 if (!doReplace) {
 849                     throwMalformed(sp - 1, 1);
 850                 }
 851                 sp++;
 852                 putChar(dst, dp++, repl);
 853                 if (sp  < sl && isMalformed4_3(src[sp])) {
 854                     continue;
 855                 }
 856                 break;
 857             } else {
 858                 if (!doReplace) {
 859                     throwMalformed(sp - 1, 1);
 860                 }
 861                 putChar(dst, dp++, repl);
 862             }
 863         }
 864         if (dp != len) {
 865             dst = Arrays.copyOf(dst, dp << 1);
 866         }
 867         return ret.with(dst, UTF16);
 868     }
 869 
 870     private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) {
 871         if (coder == UTF16)
 872             return encodeUTF8_UTF16(val, doReplace);
 873 
 874         if (!hasNegatives(val, 0, val.length))
 875             return Arrays.copyOf(val, val.length);
 876 
 877         int dp = 0;
 878         byte[] dst = new byte[val.length << 1];
 879         for (int sp = 0; sp < val.length; sp++) {
 880             byte c = val[sp];
 881             if (c < 0) {
 882                 dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
 883                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 884             } else {
 885                 dst[dp++] = c;
 886             }
 887         }
 888         if (dp == dst.length)
 889             return dst;
 890         return Arrays.copyOf(dst, dp);
 891     }
 892 
 893     private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) {
 894         int dp = 0;
 895         int sp = 0;
 896         int sl = val.length >> 1;
 897         byte[] dst = new byte[sl * 3];
 898         char c;
 899         while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
 900             // ascii fast loop;
 901             dst[dp++] = (byte)c;
 902             sp++;
 903         }
 904         while (sp < sl) {
 905             c = StringUTF16.getChar(val, sp++);
 906             if (c < 0x80) {
 907                 dst[dp++] = (byte)c;
 908             } else if (c < 0x800) {
 909                 dst[dp++] = (byte)(0xc0 | (c >> 6));
 910                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 911             } else if (Character.isSurrogate(c)) {
 912                 int uc = -1;
 913                 char c2;
 914                 if (Character.isHighSurrogate(c) && sp < sl &&
 915                     Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
 916                     uc = Character.toCodePoint(c, c2);
 917                 }
 918                 if (uc < 0) {
 919                     if (doReplace) {
 920                         dst[dp++] = '?';
 921                     } else {
 922                         throwMalformed(sp - 1, 1); // or 2, does not matter here
 923                     }
 924                 } else {
 925                     dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
 926                     dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
 927                     dst[dp++] = (byte)(0x80 | ((uc >>  6) & 0x3f));
 928                     dst[dp++] = (byte)(0x80 | (uc & 0x3f));
 929                     sp++;  // 2 chars
 930                 }
 931             } else {
 932                 // 3 bytes, 16 bits
 933                 dst[dp++] = (byte)(0xe0 | ((c >> 12)));
 934                 dst[dp++] = (byte)(0x80 | ((c >>  6) & 0x3f));
 935                 dst[dp++] = (byte)(0x80 | (c & 0x3f));
 936             }
 937         }
 938         if (dp == dst.length) {
 939             return dst;
 940         }
 941         return Arrays.copyOf(dst, dp);
 942     }
 943 
 944     ////////////////////// for j.u.z.ZipCoder //////////////////////////
 945 
    /*
     * Throws IAE, instead of replacing, if malformed or unmappable.
     */
 949     static String newStringUTF8NoRepl(byte[] src, int off, int len) {
 950         if (COMPACT_STRINGS && !hasNegatives(src, off, len))
 951             return new String(Arrays.copyOfRange(src, off, off + len), LATIN1);
 952         Result ret = decodeUTF8_0(src, off, len, false);
 953         return new String(ret.value, ret.coder);
 954     }
 955 
    /*
     * Throws IAE, instead of replacing, if unmappable.
     */
 959     static byte[] getBytesUTF8NoRepl(String s) {
 960         return encodeUTF8(s.coder(), s.value(), false);
 961     }
 962 
 963     ////////////////////// for j.n.f.Files //////////////////////////
 964 
 965     private static boolean isASCII(byte[] src) {
 966         return !hasNegatives(src, 0, src.length);
 967     }
 968 
 969     private static String newStringLatin1(byte[] src) {
 970         if (COMPACT_STRINGS)
 971            return new String(src, LATIN1);
 972         return new String(StringLatin1.inflate(src, 0, src.length), UTF16);
 973     }
 974 
 975     static String newStringNoRepl(byte[] src, Charset cs) {
 976         if (cs == UTF_8) {
 977             if (COMPACT_STRINGS && isASCII(src))
 978                 return new String(src, LATIN1);
 979             Result ret = decodeUTF8_0(src, 0, src.length, false);
 980             return new String(ret.value, ret.coder);
 981         }
 982         if (cs == ISO_8859_1) {
 983             return newStringLatin1(src);
 984         }
 985         if (cs == US_ASCII) {
 986             if (isASCII(src)) {
 987                 return newStringLatin1(src);
 988             } else {
 989                 throwMalformed(src);
 990             }
 991         }
 992 
 993         CharsetDecoder cd = cs.newDecoder();
 994         // ascii fastpath
 995         if ((cd instanceof ArrayDecoder) &&
 996             ((ArrayDecoder)cd).isASCIICompatible() && isASCII(src)) {
 997             return newStringLatin1(src);
 998         }
 999         int len = src.length;
1000         if (len == 0) {
1001             return "";
1002         }
1003         int en = scale(len, cd.maxCharsPerByte());
1004         char[] ca = new char[en];
1005         if (cs.getClass().getClassLoader0() != null &&
1006             System.getSecurityManager() != null) {
1007             src = Arrays.copyOf(src, len);
1008         }
1009         ByteBuffer bb = ByteBuffer.wrap(src);
1010         CharBuffer cb = CharBuffer.wrap(ca);
1011         try {
1012             CoderResult cr = cd.decode(bb, cb, true);
1013             if (!cr.isUnderflow())
1014                 cr.throwException();
1015             cr = cd.flush(cb);
1016             if (!cr.isUnderflow())
1017                 cr.throwException();
1018         } catch (CharacterCodingException x) {
1019             throw new IllegalArgumentException(x);  // todo
1020         }
1021         Result ret = resultCached.get().with(ca, 0, cb.position());
1022         return new String(ret.value, ret.coder);
1023     }
1024 
    /*
     * Throws IAE, instead of replacing, if unmappable.
     */
1028     static byte[] getBytesNoRepl(String s, Charset cs) {
1029         byte[] val = s.value();
1030         byte coder = s.coder();
1031         if (cs == UTF_8) {
1032             if (isASCII(val)) {
1033                 return val;
1034             }
1035             return encodeUTF8(coder, val, false);
1036         }
1037         if (cs == ISO_8859_1) {
1038             if (coder == LATIN1) {
1039                 return val;
1040             }
1041             return encode8859_1(coder, val, false);
1042         }
1043         if (cs == US_ASCII) {
1044             if (coder == LATIN1) {
1045                 if (isASCII(val)) {
1046                     return val;
1047                 } else {
1048                     throwMalformed(val);
1049                 }
1050             }
1051         }
1052         CharsetEncoder ce = cs.newEncoder();
1053         // fastpath for ascii compatible
1054         if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
1055                                  ((ArrayEncoder)ce).isASCIICompatible() &&
1056                                  isASCII(val)))) {
1057             return val;
1058         }
1059         int len = val.length >> coder;  // assume LATIN1=0/UTF16=1;
1060         int en = scale(len, ce.maxBytesPerChar());
1061         byte[] ba = new byte[en];
1062         if (len == 0) {
1063             return ba;
1064         }
1065         if (ce instanceof ArrayEncoder) {
1066             int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
1067                                           : ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
1068             if (blen != -1) {
1069                 return safeTrim(ba, blen, true);
1070             }
1071         }
1072         boolean isTrusted = cs.getClass().getClassLoader0() == null ||
1073                             System.getSecurityManager() == null;
1074         char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
1075                                        : StringUTF16.toChars(val);
1076         ByteBuffer bb = ByteBuffer.wrap(ba);
1077         CharBuffer cb = CharBuffer.wrap(ca, 0, len);
1078         try {
1079             CoderResult cr = ce.encode(cb, bb, true);
1080             if (!cr.isUnderflow())
1081                 cr.throwException();
1082             cr = ce.flush(bb);
1083             if (!cr.isUnderflow())
1084                 cr.throwException();
1085         } catch (CharacterCodingException x) {
1086             throw new Error(x);
1087         }
1088         return safeTrim(ba, bb.position(), isTrusted);
1089     }
1090 }