New src/java.xml/share/classes/com/sun/org/apache/xml/internal/serializer/Encodings.java

   1 /*
   2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.xml.internal.serializer;
  22 
  23 import java.io.BufferedWriter;
  24 import java.io.IOException;
  25 import java.io.InputStream;
  26 import java.io.OutputStream;
  27 import java.io.OutputStreamWriter;
  28 import java.io.UnsupportedEncodingException;
  29 import java.io.Writer;
  30 import java.net.MalformedURLException;
  31 import java.net.URL;
  32 import java.nio.charset.Charset;
  33 import java.nio.charset.IllegalCharsetNameException;
  34 import java.nio.charset.UnsupportedCharsetException;
  35 import java.util.Collections;
  36 import java.util.Enumeration;
  37 import java.util.HashMap;
  38 import java.util.Map.Entry;
  39 import java.util.Map;
  40 import java.util.Properties;
  41 import java.util.StringTokenizer;
  42 import jdk.xml.internal.SecuritySupport;
  43 
  44 /**
  45  * Provides information about encodings. Depends on the Java runtime
  46  * to provide writers for the different encodings, but can be used
  47  * to override encoding names and provide the last printable character
  48  * for each encoding.
  49  *
  50  * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
  51  * @LastModified: Oct 2017
  52  */
  53 
  54 public final class Encodings extends Object
  55 {
  56 
  57     /**
  58      * The last printable character for unknown encodings.
  59      */
  60     private static final int m_defaultLastPrintable = 0x7F;
  61 
  62     /**
  63      * Resource name of the default properties file with encodings data.
  64      */
  65     private static final String ENCODINGS_FILE = "com/sun/org/apache/xml/internal/serializer/Encodings.properties";
  66 
  67     /**
  68      * Name of the system property that may hold the URL of an alternate encodings file.
  69      */
  70     private static final String ENCODINGS_PROP = "com.sun.org.apache.xalan.internal.serialize.encodings";
  71 
  72 
  73     /**
  74      * Returns a writer for the specified encoding based on
  75      * an output stream.
  76      *
  77      * @param output The output stream
  78      * @param encoding The encoding
  79      * @return A suitable writer
  80      * @throws UnsupportedEncodingException There is no convertor
  81      *  to support this encoding
  82      */
  83     static Writer getWriter(OutputStream output, String encoding)
  84         throws UnsupportedEncodingException
  85     {
  86 
  87         final EncodingInfo ei = _encodingInfos.findEncoding(toUpperCaseFast(encoding));
  88         if (ei != null) {
  89             try {
  90                 return new BufferedWriter(new OutputStreamWriter(
  91                         output, ei.javaName));
  92             } catch (UnsupportedEncodingException usee) {
  93                 // keep trying
  94             }
  95         }
  96 
  97         return new BufferedWriter(new OutputStreamWriter(output, encoding));
  98     }
  99 
 100 
 101     /**
 102      * Returns the last printable character for an unspecified
 103      * encoding.
 104      *
 105      * @return the default size
 106      */
 107     public static int getLastPrintable()
 108     {
 109         return m_defaultLastPrintable;
 110     }
 111 
 112 
 113 
 114     /**
 115      * Returns the EncodingInfo object for the specified
 116      * encoding.
 117      * <p>
 118      * This is not a public API.
 119      *
 120      * @param encoding The encoding
 121      * @return The object that is used to determine if
 122      * characters are in the given encoding.
 123      * @xsl.usage internal
 124      */
 125     static EncodingInfo getEncodingInfo(String encoding)
 126     {
 127         EncodingInfo ei;
 128 
 129         String normalizedEncoding = toUpperCaseFast(encoding);
 130         ei = _encodingInfos.findEncoding(normalizedEncoding);
 131         if (ei == null) {
 132             // We shouldn't have to do this, but just in case.
 133             try {
 134                 // This may happen if the caller tries to use
 135                 // an encoding that wasn't registered in the
 136                 // (java name)->(preferred mime name) mapping file.
 137                 // In that case we attempt to load the charset for the
 138                 // given encoding, and if that succeeds - we create a new
 139                 // EncodingInfo instance - assuming the canonical name
 140                 // of the charset can be used as the mime name.
 141                 final Charset c = Charset.forName(encoding);
 142                 final String name = c.name();
 143                 ei = new EncodingInfo(name, name);
 144                 _encodingInfos.putEncoding(normalizedEncoding, ei);
 145             } catch (IllegalCharsetNameException | UnsupportedCharsetException x) {
 146                 ei = new EncodingInfo(null,null);
 147             }
 148         }
 149 
 150         return ei;
 151     }
 152 
 153     /**
 154      * Determines if the encoding specified was recognized by the
 155      * serializer or not.
 156      *
 157      * @param encoding The encoding
 158      * @return boolean - true if the encoding was recognized else false
 159      */
 160     public static boolean isRecognizedEncoding(String encoding)
 161     {
 162         EncodingInfo ei;
 163 
 164         String normalizedEncoding = toUpperCaseFast(encoding);
 165         ei = _encodingInfos.findEncoding(normalizedEncoding);
 166         if (ei != null)
 167             return true;
 168         return false;
 169     }
 170 
 171     /**
 172      * A fast and cheap way to uppercase a String that is
 173      * only made of printable ASCII characters.
 174      * <p>
 175      * This is not a public API.
 176      * @param s a String of ASCII characters
 177      * @return an uppercased version of the input String,
 178      * possibly the same String.
 179      * @xsl.usage internal
 180      */
 181     static private String toUpperCaseFast(final String s) {
 182 
 183         boolean different = false;
 184         final int mx = s.length();
 185                 char[] chars = new char[mx];
 186         for (int i=0; i < mx; i++) {
 187                 char ch = s.charAt(i);
 188             // is the character a lower case ASCII one?
 189                 if ('a' <= ch && ch <= 'z') {
 190                 // a cheap and fast way to uppercase that is good enough
 191                         ch = (char) (ch + ('A' - 'a'));
 192                         different = true; // the uppercased String is different
 193                 }
 194                 chars[i] = ch;
 195         }
 196 
 197         // A little optimization, don't call String.valueOf() if
 198         // the uppercased string is the same as the input string.
 199         final String upper;
 200         if (different)
 201                 upper = String.valueOf(chars);
 202         else
 203                 upper = s;
 204 
 205         return upper;
 206     }
 207 
 208     /** The default encoding, ISO style; used when no other mime encoding can be determined. */
 209     static final String DEFAULT_MIME_ENCODING = "UTF-8";
 210 
 211     /**
 212      * Get the proper mime encoding.  From the XSLT recommendation: "The encoding
 213      * attribute specifies the preferred encoding to use for outputting the result
 214      * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
 215      * For other values, if the XSLT processor does not support the specified
 216      * encoding it may signal an error; if it does not signal an error it should
 217      * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
 218      * whose name does not match the EncName production of the XML Recommendation
 219      * [XML]. If no encoding attribute is specified, then the XSLT processor should
 220      * use either UTF-8 or UTF-16."
 221      *
 222      * @param encoding Reference to java-style encoding string, which may be null,
 223      * in which case a default will be found.
 224      *
 225      * @return The ISO-style encoding string, or null if failure.
 226      */
 227     static String getMimeEncoding(String encoding)
 228     {
 229 
 230         if (null == encoding)
 231         {
 232             try
 233             {
 234 
 235                 // Get the default system character encoding.  This may be
 236                 // incorrect if they passed in a writer, but right now there
 237                 // seems to be no way to get the encoding from a writer.
 238                 encoding = SecuritySupport.getSystemProperty("file.encoding", "UTF8");
 239 
 240                 if (null != encoding)
 241                 {
 242 
 243                     /*
 244                     * See if the mime type is equal to UTF8.  If you don't
 245                     * do that, then  convertJava2MimeEncoding will convert
 246                     * 8859_1 to "ISO-8859-1", which is not what we want,
 247                     * I think, and I don't think I want to alter the tables
 248                     * to convert everything to UTF-8.
 249                     */
 250                     String jencoding =
 251                         (encoding.equalsIgnoreCase("Cp1252")
 252                             || encoding.equalsIgnoreCase("ISO8859_1")
 253                             || encoding.equalsIgnoreCase("8859_1")
 254                             || encoding.equalsIgnoreCase("UTF8"))
 255                             ? DEFAULT_MIME_ENCODING
 256                             : convertJava2MimeEncoding(encoding);
 257 
 258                     encoding =
 259                         (null != jencoding) ? jencoding : DEFAULT_MIME_ENCODING;
 260                 }
 261                 else
 262                 {
 263                     encoding = DEFAULT_MIME_ENCODING;
 264                 }
 265             }
 266             catch (SecurityException se)
 267             {
 268                 encoding = DEFAULT_MIME_ENCODING;
 269             }
 270         }
 271         else
 272         {
 273             encoding = convertJava2MimeEncoding(encoding);
 274         }
 275 
 276         return encoding;
 277     }
 278 
 279     /**
 280      * Try the best we can to convert a Java encoding to a XML-style encoding.
 281      *
 282      * @param encoding non-null reference to encoding string, java style.
 283      *
 284      * @return ISO-style encoding string.
 285      */
 286     private static String convertJava2MimeEncoding(String encoding)
 287     {
 288         final EncodingInfo enc =
 289              _encodingInfos.getEncodingFromJavaKey(toUpperCaseFast(encoding));
 290         if (null != enc)
 291             return enc.name;
 292         return encoding;
 293     }
 294 
 295     /**
 296      * Try the best we can to convert a Java encoding to a XML-style encoding.
 297      *
 298      * @param encoding non-null reference to encoding string, java style.
 299      *
 300      * @return ISO-style encoding string.
 301      */
 302     public static String convertMime2JavaEncoding(String encoding)
 303     {
 304         final EncodingInfo info = _encodingInfos.findEncoding(toUpperCaseFast(encoding));
 305         return info != null ? info.javaName : encoding;
 306     }
 307 
 308     // Using a static nested class here prevents initialization races
 309     // where the hash maps could be used before they were populated.
 310     //
 311     private final static class EncodingInfos {
 312         // These maps are final and not modified after initialization.
             // loadEncodingInfo() fills both of them, using upper-cased names as keys:
             //   _encodingTableKeyJava - java names and charset names
             //   _encodingTableKeyMime - mime names
 313         private final Map<String, EncodingInfo> _encodingTableKeyJava = new HashMap<>();
 314         private final Map<String, EncodingInfo> _encodingTableKeyMime = new HashMap<>();
 315         // This map will be added to after initialization: make sure it's
 316         // thread-safe. This map should not be used frequently - only in cases
 317         // where the mapping requested was not declared in the Encodings.properties
 318         // file.
 319         private final Map<String, EncodingInfo> _encodingDynamicTable =
 320                 Collections.synchronizedMap(new HashMap<String, EncodingInfo>());
 321 
             // Fills the two static tables by loading the encodings mapping.
 322         private EncodingInfos() {
 323             loadEncodingInfo();
 324         }
 325 
 326         // Opens the file/resource containing java charset name -> preferred mime
 327         // name mapping and returns it as an InputStream.
 328         private InputStream openEncodingsFileStream() throws MalformedURLException, IOException {
 329             String urlString = null;
 330             InputStream is = null;
 331 
 332             try {
 333                 urlString = SecuritySupport.getSystemProperty(ENCODINGS_PROP, "");
 334             } catch (SecurityException e) {
 335             }
 336 
 337             if (urlString != null && urlString.length() > 0) {
 338                 URL url = new URL(urlString);
 339                 is = url.openStream();
 340             }
 341 
 342             if (is == null) {
 343                 is = SecuritySupport.getResourceAsStream(ENCODINGS_FILE);
 344             }
 345             return is;
 346         }
 347 
 348         // Loads the Properties resource containing the mapping:
 349         //    java charset name -> preferred mime name
 350         // and returns it.
 351         private Properties loadProperties() throws MalformedURLException, IOException {
 352             Properties props = new Properties();
 353             try (InputStream is = openEncodingsFileStream()) {
 354                 if (is != null) {
 355                     props.load(is);
 356                 } else {
 357                     // Seems to be no real need to force failure here, let the
 358                     // system do its best... The issue is not really very critical,
 359                     // and the output will be in any case _correct_ though maybe not
 360                     // always human-friendly... :)
 361                     // But maybe report/log the resource problem?
 362                     // Any standard ways to report/log errors (in static context)?
 363                 }
 364             }
 365             return props;
 366         }
 367 
 368         // Parses the mime list associated to a java charset name.
 369         // The first mime name in the list is supposed to be the preferred
 370         // mime name.
 371         private String[] parseMimeTypes(String val) {
 372             int pos = val.indexOf(' ');
 373             //int lastPrintable;
 374             if (pos < 0) {
 375                 // Maybe report/log this problem?
 376                 //  "Last printable character not defined for encoding " +
 377                 //  mimeName + " (" + val + ")" ...
 378                 return new String[] { val };
 379                 //lastPrintable = 0x00FF;
 380             }
 381             //lastPrintable =
 382             //    Integer.decode(val.substring(pos).trim()).intValue();
 383             StringTokenizer st =
 384                     new StringTokenizer(val.substring(0, pos), ",");
 385             String[] values = new String[st.countTokens()];
 386             for (int i=0; st.hasMoreTokens(); i++) {
 387                 values[i] = st.nextToken();
 388             }
 389             return values;
 390         }
 391 
 392         // This method here attempts to find the canonical charset name for the
 393         // the given name - which is supposed to be either a java name or a mime
 394         // name.
 395         // For that, it attempts to load the charset using the given name, and
 396         // then returns the charset's canonical name.
 397         // If the charset could not be loaded from the given name,
 398         // the method returns null.
 399         private String findCharsetNameFor(String name) {
 400             try {
 401                 return Charset.forName(name).name();
 402             } catch (Exception x) {
 403                 return null;
 404             }
 405         }
 406 
 407         // This method here attempts to find the canonical charset name for the
 408         // the set javaName+mimeNames - which are supposed to all refer to the
 409         // same charset.
 410         // For that it attempts to load the charset using the javaName, and if
 411         // not found, attempts again using each of the mime names in turn.
 412         // If the charset could be loaded from the javaName, then the javaName
 413         // itself is returned as charset name. Otherwise, each of the mime names
 414         // is tried in turn, until a charset can be loaded from one of the names,
 415         // and the loaded charset's canonical name is returned.
 416         // If no charset can be loaded from either the javaName or one of the
 417         // mime names, then null is returned.
 418         //
 419         // Note that the returned name is the 'java' name that will be used in
 420         // instances of EncodingInfo.
 421         // This is important because EncodingInfo uses that 'java name' later on
 422         // in calls to String.getBytes(javaName).
 423         // As it happens, sometimes only one element of the set mime names/javaName
 424         // is known by Charset: sometimes only one of the mime names is known,
 425         // sometime only the javaName is known, sometimes all are known.
 426         //
 427         // By using this method here, we fix the problem where one of the mime
 428         // names is known but the javaName is unknown, by associating the charset
 429         // loaded from one of the mime names with the unrecognized javaName.
 430         //
 431         // When none of the mime names or javaName are known - there's not much we can
 432         // do... It can mean that this encoding is not supported for this
 433         // OS. If such a charset is ever use it will result in having all characters
 434         // escaped.
 435         //
 436         private String findCharsetNameFor(String javaName, String[] mimes) {
 437             String cs = findCharsetNameFor(javaName);
 438             if (cs != null) return javaName;
 439             for (String m : mimes) {
 440                 cs = findCharsetNameFor(m);
 441                 if (cs != null) break;
 442             }
 443             return cs;
 444         }
 445 
 446         /**
 447          * Loads a list of all the supported encodings.
 448          *
 449          * System property "encodings" formatted using URL syntax may define an
 450          * external encodings list. Thanks to Sergey Ushakov for the code
 451          * contribution!
 452          */
 453         private void loadEncodingInfo() {
 454             try {
 455                 // load (java name)->(preferred mime name) mapping.
 456                 final Properties props = loadProperties();
 457 
 458                 // create instances of EncodingInfo from the loaded mapping
 459                 Enumeration<Object> keys = props.keys();
 460                 Map<String, EncodingInfo> canonicals = new HashMap<>();
 461                 while (keys.hasMoreElements()) {
 462                     final String javaName = (String) keys.nextElement();
 463                     final String[] mimes = parseMimeTypes(props.getProperty(javaName));
 464 
 465                     final String charsetName = findCharsetNameFor(javaName, mimes);
 466                     if (charsetName != null) {
 467                         final String kj = toUpperCaseFast(javaName);
 468                         final String kc = toUpperCaseFast(charsetName);
 469                         for (int i = 0; i < mimes.length; ++i) {
 470                             final String mimeName = mimes[i];
 471                             final String km = toUpperCaseFast(mimeName);
 472                             EncodingInfo info = new EncodingInfo(mimeName, charsetName);
 473                             _encodingTableKeyMime.put(km, info);
 474                             if (!canonicals.containsKey(kc)) {
 475                                 // canonicals will map the charset name to
 476                                 //   the info containing the prefered mime name
 477                                 //   (the preferred mime name is the first mime
 478                                 //   name in the list).
 479                                 canonicals.put(kc, info);
 480                                 _encodingTableKeyJava.put(kc, info);
 481                             }
 482                             _encodingTableKeyJava.put(kj, info);
 483                         }
 484                     } else {
 485                         // None of the java or mime names on the line were
 486                         // recognized => this charset is not supported?
 487                     }
 488                 }
 489 
 490                 // Fix up the _encodingTableKeyJava so that the info mapped to
 491                 // the java name contains the preferred mime name.
 492                 // (a given java name can correspond to several mime name,
 493                 //  but we want the _encodingTableKeyJava to point to the
 494                 //  preferred mime name).
 495                 for (Entry<String, EncodingInfo> e : _encodingTableKeyJava.entrySet()) {
 496                     e.setValue(canonicals.get(toUpperCaseFast(e.getValue().javaName)));
 497                 }
 498 
 499             } catch (java.net.MalformedURLException mue) {
 500                 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(mue);
 501             } catch (java.io.IOException ioe) {
 502                 throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(ioe);
 503             }
 504         }
 505 
 506         EncodingInfo findEncoding(String normalizedEncoding) {
 507             EncodingInfo info = _encodingTableKeyJava.get(normalizedEncoding);
 508             if (info == null) {
 509                 info = _encodingTableKeyMime.get(normalizedEncoding);
 510             }
 511             if (info == null) {
 512                 info = _encodingDynamicTable.get(normalizedEncoding);
 513             }
 514             return info;
 515         }
 516 
             // Looks the name up in the mime table only; null when absent.
             // The table is keyed by upper-cased mime names.
 517         EncodingInfo getEncodingFromMimeKey(String normalizedMimeName) {
 518             return _encodingTableKeyMime.get(normalizedMimeName);
 519         }
 520 
             // Looks the name up in the java table only; null when absent.
             // The table is keyed by upper-cased java names and charset names.
 521         EncodingInfo getEncodingFromJavaKey(String normalizedJavaName) {
 522             return _encodingTableKeyJava.get(normalizedJavaName);
 523         }
 524 
             // Records an encoding found after initialization. It goes into the
             // synchronized dynamic table; the two static tables are left alone.
 525         void putEncoding(String key, EncodingInfo info) {
 526             _encodingDynamicTable.put(key, info);
 527         }
 528     }
 529 
 530     /**
 531      * Return true if the character is the high member of a surrogate pair.
 532      * <p>
 533      * This is not a public API.
 534      * @param ch the character to test
 535      * @xsl.usage internal
 536      */
 537     static boolean isHighUTF16Surrogate(char ch) {
 538         return ('\uD800' <= ch && ch <= '\uDBFF');
 539     }
 540     /**
 541      * Return true if the character is the low member of a surrogate pair.
 542      * <p>
 543      * This is not a public API.
 544      * @param ch the character to test
 545      * @xsl.usage internal
 546      */
 547     static boolean isLowUTF16Surrogate(char ch) {
 548         return ('\uDC00' <= ch && ch <= '\uDFFF');
 549     }
 550     /**
 551      * Return the unicode code point represented by the high/low surrogate pair.
 552      * <p>
 553      * This is not a public API.
 554      * @param highSurrogate the high char of the high/low pair
 555      * @param lowSurrogate the low char of the high/low pair
 556      * @xsl.usage internal
 557      */
 558     static int toCodePoint(char highSurrogate, char lowSurrogate) {
 559         int codePoint =
 560             ((highSurrogate - 0xd800) << 10)
 561                 + (lowSurrogate - 0xdc00)
 562                 + 0x10000;
 563         return codePoint;
 564     }
 565     /**
 566      * Return the unicode code point represented by the char.
 567      * A bit of a dummy method, since all it does is return the char,
 568      * but as an int value.
 569      * <p>
 570      * This is not a public API.
 571      * @param ch the char.
 572      * @xsl.usage internal
 573      */
 574     static int toCodePoint(char ch) {
 575         int codePoint = ch;
 576         return codePoint;
 577     }
 578 
         // The single set of encoding tables used by the static methods of
         // this class; its constructor loads the encodings mapping.
 579     private final static EncodingInfos _encodingInfos = new EncodingInfos();
 580 
 581 }