--- old/jdk/src/jdk.charsets/unix/classes/sun/nio/cs/ext/COMPOUND_TEXT_Decoder.java 2015-03-25 15:40:27.000000000 -0700 +++ /dev/null 2015-03-25 15:40:27.000000000 -0700 @@ -1,714 +0,0 @@ -/* - * Copyright (c) 2001, 2005, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. Oracle designates this - * particular file as subject to the "Classpath" exception as provided - * by Oracle in the LICENSE file that accompanied this code. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -package sun.nio.cs.ext; -import java.io.ByteArrayOutputStream; -import java.nio.ByteBuffer; -import java.nio.CharBuffer; -import java.nio.charset.*; - -/** - * An algorithmic conversion from COMPOUND_TEXT to Unicode. - */ - -public class COMPOUND_TEXT_Decoder extends CharsetDecoder { - - private static final int NORMAL_BYTES = 0; - private static final int NONSTANDARD_BYTES = 1; - private static final int VERSION_SEQUENCE_V = 2; - private static final int VERSION_SEQUENCE_TERM = 3; - private static final int ESCAPE_SEQUENCE = 4; - private static final int CHARSET_NGIIF = 5; - private static final int CHARSET_NLIIF = 6; - private static final int CHARSET_NLIF = 7; - private static final int CHARSET_NRIIF = 8; - private static final int CHARSET_NRIF = 9; - private static final int CHARSET_NONSTANDARD_FOML = 10; - private static final int CHARSET_NONSTANDARD_OML = 11; - private static final int CHARSET_NONSTANDARD_ML = 12; - private static final int CHARSET_NONSTANDARD_L = 13; - private static final int CHARSET_NONSTANDARD = 14; - private static final int CHARSET_LIIF = 15; - private static final int CHARSET_LIF = 16; - private static final int CHARSET_RIIF = 17; - private static final int CHARSET_RIF = 18; - private static final int CONTROL_SEQUENCE_PIF = 19; - private static final int CONTROL_SEQUENCE_IF = 20; - private static final int EXTENSION_ML = 21; - private static final int EXTENSION_L = 22; - private static final int EXTENSION = 23; - private static final int ESCAPE_SEQUENCE_OTHER = 24; - - private static final String ERR_LATIN1 = "ISO8859_1 unsupported"; - private static final String ERR_ILLSTATE = "Illegal state"; - private static final String ERR_ESCBYTE = - "Illegal byte in 0x1B escape sequence"; - private static final String ERR_ENCODINGBYTE = - "Illegal byte in non-standard character set name"; - private static final String ERR_CTRLBYTE = - "Illegal byte in 0x9B control sequence"; - private static final String ERR_CTRLPI = - "P following I in 0x9B control sequence"; - private static final String ERR_VERSTART = - "Versioning escape sequence can only appear at start of byte stream"; - private static final String ERR_VERMANDATORY = - "Cannot parse mandatory extensions"; - private static final String ERR_ENCODING = "Unknown encoding: "; - private static final String ERR_FLUSH = - "Escape sequence, control sequence, or ML extension not terminated"; - - private int state = NORMAL_BYTES ; - private int ext_count, ext_offset; - private boolean versionSequenceAllowed = true; - private byte[] byteBuf = new byte[1]; - private ByteBuffer inBB = ByteBuffer.allocate(16); - private ByteArrayOutputStream queue = new ByteArrayOutputStream(), - encodingQueue = new ByteArrayOutputStream(); - - private CharsetDecoder glDecoder, grDecoder, nonStandardDecoder, - lastDecoder; - private boolean glHigh = false, grHigh = true; - - - public COMPOUND_TEXT_Decoder(Charset cs) { - super(cs, 1.0f, 1.0f); - try { - // Initial state in ISO 2022 designates Latin-1 charset. - glDecoder = Charset.forName("ASCII").newDecoder(); - grDecoder = Charset.forName("ISO8859_1").newDecoder(); - } catch (IllegalArgumentException e) { - error(ERR_LATIN1); - } - initDecoder(glDecoder); - initDecoder(grDecoder); - } - - protected CoderResult decodeLoop(ByteBuffer src, CharBuffer des) { - CoderResult cr = CoderResult.UNDERFLOW; - byte[] input = src.array(); - int inOff = src.arrayOffset() + src.position(); - int inEnd = src.arrayOffset() + src.limit(); - - try { - while (inOff < inEnd && cr.isUnderflow()) { - // Byte parsing is done with shorts instead of bytes because - // Java bytes are signed, while COMPOUND_TEXT bytes are not. If - // we used the Java byte type, the > and < tests during parsing - // would not work correctly. - cr = handleByte((short)(input[inOff] & 0xFF), des); - inOff++; - } - return cr; - } finally { - src.position(inOff - src.arrayOffset()); - } - } - - private CoderResult handleByte(short newByte, CharBuffer cb) { - CoderResult cr = CoderResult.UNDERFLOW; - switch (state) { - case NORMAL_BYTES: - cr= normalBytes(newByte, cb); - break; - case NONSTANDARD_BYTES: - cr = nonStandardBytes(newByte, cb); - break; - case VERSION_SEQUENCE_V: - case VERSION_SEQUENCE_TERM: - cr = versionSequence(newByte); - break; - case ESCAPE_SEQUENCE: - cr = escapeSequence(newByte); - break; - case CHARSET_NGIIF: - cr = charset94N(newByte); - break; - case CHARSET_NLIIF: - case CHARSET_NLIF: - cr = charset94NL(newByte, cb); - break; - case CHARSET_NRIIF: - case CHARSET_NRIF: - cr = charset94NR(newByte, cb); - break; - case CHARSET_NONSTANDARD_FOML: - case CHARSET_NONSTANDARD_OML: - case CHARSET_NONSTANDARD_ML: - case CHARSET_NONSTANDARD_L: - case CHARSET_NONSTANDARD: - cr = charsetNonStandard(newByte, cb); - break; - case CHARSET_LIIF: - case CHARSET_LIF: - cr = charset9496L(newByte, cb); - break; - case CHARSET_RIIF: - case CHARSET_RIF: - cr = charset9496R(newByte, cb); - break; - case CONTROL_SEQUENCE_PIF: - case CONTROL_SEQUENCE_IF: - cr = controlSequence(newByte); - break; - case EXTENSION_ML: - case EXTENSION_L: - case EXTENSION: - cr = extension(newByte); - break; - case ESCAPE_SEQUENCE_OTHER: - cr = escapeSequenceOther(newByte); - break; - default: - error(ERR_ILLSTATE); - } - return cr; - } - - private CoderResult normalBytes(short newByte, CharBuffer cb) { - CoderResult cr = CoderResult.UNDERFLOW; - if ((newByte >= 0x00 && newByte <= 0x1F) || // C0 - (newByte >= 0x80 && newByte <= 0x9F)) { // C1 - char newChar; - - switch (newByte) { - case 0x1B: - state = ESCAPE_SEQUENCE; - queue.write(newByte); - return cr; - case 0x9B: - state = CONTROL_SEQUENCE_PIF; - versionSequenceAllowed = false; - queue.write(newByte); - return cr; - case 0x09: - versionSequenceAllowed = false; - newChar = '\t'; - break; - case 0x0A: - versionSequenceAllowed = false; - newChar = '\n'; - break; - default: - versionSequenceAllowed = false; - return cr; - } - if (!cb.hasRemaining()) - return CoderResult.OVERFLOW; - else - cb.put(newChar); - } else { - CharsetDecoder decoder; - boolean high; - versionSequenceAllowed = false; - - if (newByte >= 0x20 && newByte <= 0x7F) { - decoder = glDecoder; - high = glHigh; - } else /* if (newByte >= 0xA0 && newByte <= 0xFF) */ { - decoder = grDecoder; - high = grHigh; - } - if (lastDecoder != null && decoder != lastDecoder) { - cr = flushDecoder(lastDecoder, cb); - } - lastDecoder = decoder; - - if (decoder != null) { - byte b = (byte)newByte; - if (high) { - b |= 0x80; - } else { - b &= 0x7F; - } - inBB.put(b); - inBB.flip(); - cr = decoder.decode(inBB, cb, false); - if (!inBB.hasRemaining() || cr.isMalformed()) { - inBB.clear(); - } else { - int pos = inBB.limit(); - inBB.clear(); - inBB.position(pos); - } - } else if (cb.remaining() < replacement().length()) { - cb.put(replacement()); - } else { - return CoderResult.OVERFLOW; - } - } - return cr; - } - - private CoderResult nonStandardBytes(short newByte, CharBuffer cb) - { - CoderResult cr = CoderResult.UNDERFLOW; - if (nonStandardDecoder != null) { - //byteBuf[0] = (byte)newByte; - inBB.put((byte)newByte); - inBB.flip(); - cr = nonStandardDecoder.decode(inBB, cb, false); - if (!inBB.hasRemaining()) { - inBB.clear(); - } else { - int pos = inBB.limit(); - inBB.clear(); - inBB.position(pos); - } - } else if (cb.remaining() < replacement().length()) { - cb.put(replacement()); - } else { - return CoderResult.OVERFLOW; - } - - ext_offset++; - if (ext_offset >= ext_count) { - ext_offset = ext_count = 0; - state = NORMAL_BYTES; - cr = flushDecoder(nonStandardDecoder, cb); - nonStandardDecoder = null; - } - return cr; - } - - private CoderResult escapeSequence(short newByte) { - switch (newByte) { - case 0x23: - state = VERSION_SEQUENCE_V; - break; - case 0x24: - state = CHARSET_NGIIF; - versionSequenceAllowed = false; - break; - case 0x25: - state = CHARSET_NONSTANDARD_FOML; - versionSequenceAllowed = false; - break; - case 0x28: - state = CHARSET_LIIF; - versionSequenceAllowed = false; - break; - case 0x29: - case 0x2D: - state = CHARSET_RIIF; - versionSequenceAllowed = false; - break; - default: - // escapeSequenceOther will write to queue if appropriate - return escapeSequenceOther(newByte); - } - - queue.write(newByte); - return CoderResult.UNDERFLOW; - } - - /** - * Test for unknown, but valid, escape sequences. - */ - private CoderResult escapeSequenceOther(short newByte) { - if (newByte >= 0x20 && newByte <= 0x2F) { - // {I} - state = ESCAPE_SEQUENCE_OTHER; - versionSequenceAllowed = false; - queue.write(newByte); - } else if (newByte >= 0x30 && newByte <= 0x7E) { - // F -- end of sequence - state = NORMAL_BYTES; - versionSequenceAllowed = false; - queue.reset(); - } else { - return malformedInput(ERR_ESCBYTE); - } - return CoderResult.UNDERFLOW; - } - - /** - * Parses directionality, as well as unknown, but valid, control sequences. - */ - private CoderResult controlSequence(short newByte) { - if (newByte >= 0x30 && newByte <= 0x3F) { - // {P} - if (state == CONTROL_SEQUENCE_IF) { - // P no longer allowed - return malformedInput(ERR_CTRLPI); - } - queue.write(newByte); - } else if (newByte >= 0x20 && newByte <= 0x2F) { - // {I} - state = CONTROL_SEQUENCE_IF; - queue.write(newByte); - } else if (newByte >= 0x40 && newByte <= 0x7E) { - // F -- end of sequence - state = NORMAL_BYTES; - queue.reset(); - } else { - return malformedInput(ERR_CTRLBYTE); - } - return CoderResult.UNDERFLOW; - } - - private CoderResult versionSequence(short newByte) { - if (state == VERSION_SEQUENCE_V) { - if (newByte >= 0x20 && newByte <= 0x2F) { - state = VERSION_SEQUENCE_TERM; - queue.write(newByte); - } else { - return escapeSequenceOther(newByte); - } - } else /* if (state == VERSION_SEQUENCE_TERM) */ { - switch (newByte) { - case 0x30: - if (!versionSequenceAllowed) { - return malformedInput(ERR_VERSTART); - } - - // OK to ignore extensions - versionSequenceAllowed = false; - state = NORMAL_BYTES; - queue.reset(); - break; - case 0x31: - return malformedInput((versionSequenceAllowed) - ? ERR_VERMANDATORY : ERR_VERSTART); - default: - return escapeSequenceOther(newByte); - } - } - return CoderResult.UNDERFLOW; - } - - private CoderResult charset94N(short newByte) { - switch (newByte) { - case 0x28: - state = CHARSET_NLIIF; - break; - case 0x29: - state = CHARSET_NRIIF; - break; - default: - // escapeSequenceOther will write byte if appropriate - return escapeSequenceOther(newByte); - } - - queue.write(newByte); - return CoderResult.UNDERFLOW; - } - - private CoderResult charset94NL(short newByte, CharBuffer cb) { - if (newByte >= 0x21 && - newByte <= (state == CHARSET_NLIIF ? 0x23 : 0x2F)) { - // {I} - state = CHARSET_NLIF; - queue.write(newByte); - } else if (newByte >= 0x40 && newByte <= 0x7E) { - // F - return switchDecoder(newByte, cb); - } else { - return escapeSequenceOther(newByte); - } - return CoderResult.UNDERFLOW; - } - - private CoderResult charset94NR(short newByte, CharBuffer cb) - { - if (newByte >= 0x21 && - newByte <= (state == CHARSET_NRIIF ? 0x23 : 0x2F)) { - // {I} - state = CHARSET_NRIF; - queue.write(newByte); - } else if (newByte >= 0x40 && newByte <= 0x7E) { - // F - return switchDecoder(newByte, cb); - } else { - return escapeSequenceOther(newByte); - } - return CoderResult.UNDERFLOW; - } - - private CoderResult charset9496L(short newByte, CharBuffer cb) { - if (newByte >= 0x21 && - newByte <= (state == CHARSET_LIIF ? 0x23 : 0x2F)) { - // {I} - state = CHARSET_LIF; - queue.write(newByte); - return CoderResult.UNDERFLOW; - } else if (newByte >= 0x40 && newByte <= 0x7E) { - // F - return switchDecoder(newByte, cb); - } else { - return escapeSequenceOther(newByte); - } - } - - private CoderResult charset9496R(short newByte, CharBuffer cb) { - if (newByte >= 0x21 && - newByte <= (state == CHARSET_RIIF ? 0x23 : 0x2F)) { - // {I} - state = CHARSET_RIF; - queue.write(newByte); - return CoderResult.UNDERFLOW; - } else if (newByte >= 0x40 && newByte <= 0x7E) { - // F - return switchDecoder(newByte, cb); - } else { - return escapeSequenceOther(newByte); - } - } - - private CoderResult charsetNonStandard(short newByte, CharBuffer cb) { - switch (state) { - case CHARSET_NONSTANDARD_FOML: - if (newByte == 0x2F) { - state = CHARSET_NONSTANDARD_OML; - queue.write(newByte); - } else { - return escapeSequenceOther(newByte); - } - break; - case CHARSET_NONSTANDARD_OML: - if (newByte >= 0x30 && newByte <= 0x34) { - state = CHARSET_NONSTANDARD_ML; - queue.write(newByte); - } else if (newByte >= 0x35 && newByte <= 0x3F) { - state = EXTENSION_ML; - queue.write(newByte); - } else { - return escapeSequenceOther(newByte); - } - break; - case CHARSET_NONSTANDARD_ML: - ext_count = (newByte & 0x7F) * 0x80; - state = CHARSET_NONSTANDARD_L; - break; - case CHARSET_NONSTANDARD_L: - ext_count = ext_count + (newByte & 0x7F); - state = (ext_count > 0) ? CHARSET_NONSTANDARD : NORMAL_BYTES; - break; - case CHARSET_NONSTANDARD: - if (newByte == 0x3F || newByte == 0x2A) { - queue.reset(); // In this case, only current byte is bad. - return malformedInput(ERR_ENCODINGBYTE); - } - ext_offset++; - if (ext_offset >= ext_count) { - ext_offset = ext_count = 0; - state = NORMAL_BYTES; - queue.reset(); - encodingQueue.reset(); - } else if (newByte == 0x02) { - // encoding name terminator - return switchDecoder((short)0, cb); - } else { - encodingQueue.write(newByte); - } - break; - default: - error(ERR_ILLSTATE); - } - return CoderResult.UNDERFLOW; - } - - private CoderResult extension(short newByte) { - switch (state) { - case EXTENSION_ML: - ext_count = (newByte & 0x7F) * 0x80; - state = EXTENSION_L; - break; - case EXTENSION_L: - ext_count = ext_count + (newByte & 0x7F); - state = (ext_count > 0) ? EXTENSION : NORMAL_BYTES; - break; - case EXTENSION: - // Consume 'count' bytes. Don't bother putting them on the queue. - // There may be too many and we can't do anything with them anyway. - ext_offset++; - if (ext_offset >= ext_count) { - ext_offset = ext_count = 0; - state = NORMAL_BYTES; - queue.reset(); - } - break; - default: - error(ERR_ILLSTATE); - } - return CoderResult.UNDERFLOW; - } - - /** - * Preconditions: - * 1. 'queue' contains ControlSequence.escSequence - * 2. 'encodingQueue' contains ControlSequence.encoding - */ - private CoderResult switchDecoder(short lastByte, CharBuffer cb) { - CoderResult cr = CoderResult.UNDERFLOW; - CharsetDecoder decoder = null; - boolean high = false; - byte[] escSequence; - byte[] encoding = null; - - if (lastByte != 0) { - queue.write(lastByte); - } - - escSequence = queue.toByteArray(); - queue.reset(); - - if (state == CHARSET_NONSTANDARD) { - encoding = encodingQueue.toByteArray(); - encodingQueue.reset(); - decoder = CompoundTextSupport. - getNonStandardDecoder(escSequence, encoding); - } else { - decoder = CompoundTextSupport.getStandardDecoder(escSequence); - high = CompoundTextSupport.getHighBit(escSequence); - } - if (decoder != null) { - initDecoder(decoder); - } else if (unmappableCharacterAction() == CodingErrorAction.REPORT) { - int badInputLength = 1; - if (encoding != null) { - badInputLength = encoding.length; - } else if (escSequence.length > 0) { - badInputLength = escSequence.length; - } - return CoderResult.unmappableForLength(badInputLength); - } - - if (state == CHARSET_NLIIF || state == CHARSET_NLIF || - state == CHARSET_LIIF || state == CHARSET_LIF) - { - if (lastDecoder == glDecoder) { - cr = flushDecoder(glDecoder, cb); - } - glDecoder = lastDecoder = decoder; - glHigh = high; - state = NORMAL_BYTES; - } else if (state == CHARSET_NRIIF || state == CHARSET_NRIF || - state == CHARSET_RIIF || state == CHARSET_RIF) { - if (lastDecoder == grDecoder) { - cr = flushDecoder(grDecoder, cb); - } - grDecoder = lastDecoder = decoder; - grHigh = high; - state = NORMAL_BYTES; - } else if (state == CHARSET_NONSTANDARD) { - if (lastDecoder != null) { - cr = flushDecoder(lastDecoder, cb); - lastDecoder = null; - } - nonStandardDecoder = decoder; - state = NONSTANDARD_BYTES; - } else { - error(ERR_ILLSTATE); - } - return cr; - } - - private ByteBuffer fbb= ByteBuffer.allocate(0); - private CoderResult flushDecoder(CharsetDecoder dec, CharBuffer cb) { - dec.decode(fbb, cb, true); - CoderResult cr = dec.flush(cb); - dec.reset(); //reuse - return cr; - } - - private CoderResult malformedInput(String msg) { - int badInputLength = queue.size() + 1 /* current byte */ ; - queue.reset(); - //TBD: nowhere to put the msg in CoderResult - return CoderResult.malformedForLength(badInputLength); - } - - private void error(String msg) { - // For now, throw InternalError. Convert to 'assert' keyword later. - throw new InternalError(msg); - } - - protected CoderResult implFlush(CharBuffer out) { - CoderResult cr = CoderResult.UNDERFLOW; - if (lastDecoder != null) - cr = flushDecoder(lastDecoder, out); - if (state != NORMAL_BYTES) - //TBD message ERR_FLUSH; - cr = CoderResult.malformedForLength(0); - reset(); - return cr; - } - - /** - * Resets the decoder. - * Call this method to reset the decoder to its initial state - */ - protected void implReset() { - state = NORMAL_BYTES; - ext_count = ext_offset = 0; - versionSequenceAllowed = true; - queue.reset(); - encodingQueue.reset(); - nonStandardDecoder = lastDecoder = null; - glHigh = false; - grHigh = true; - try { - // Initial state in ISO 2022 designates Latin-1 charset. - glDecoder = Charset.forName("ASCII").newDecoder(); - grDecoder = Charset.forName("ISO8859_1").newDecoder(); - } catch (IllegalArgumentException e) { - error(ERR_LATIN1); - } - initDecoder(glDecoder); - initDecoder(grDecoder); - } - - protected void implOnMalformedInput(CodingErrorAction newAction) { - if (glDecoder != null) - glDecoder.onMalformedInput(newAction); - if (grDecoder != null) - grDecoder.onMalformedInput(newAction); - if (nonStandardDecoder != null) - nonStandardDecoder.onMalformedInput(newAction); - } - - protected void implOnUnmappableCharacter(CodingErrorAction newAction) { - if (glDecoder != null) - glDecoder.onUnmappableCharacter(newAction); - if (grDecoder != null) - grDecoder.onUnmappableCharacter(newAction); - if (nonStandardDecoder != null) - nonStandardDecoder.onUnmappableCharacter(newAction); - } - - protected void implReplaceWith(String newReplacement) { - if (glDecoder != null) - glDecoder.replaceWith(newReplacement); - if (grDecoder != null) - grDecoder.replaceWith(newReplacement); - if (nonStandardDecoder != null) - nonStandardDecoder.replaceWith(newReplacement); - } - - private void initDecoder(CharsetDecoder dec) { - dec.onUnmappableCharacter(CodingErrorAction.REPLACE) - .replaceWith(replacement()); - } -}