/*
 * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.sun.org.apache.xml.internal.serializer;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;
import jdk.xml.internal.SecuritySupport;

/**
 * Provides information about encodings. Depends on the Java runtime
 * to provide writers for the different encodings, but can be used
 * to override encoding names and provide the last printable character
 * for each encoding.
 *
 * @author <a href="mailto:arkin@intalio.com">Assaf Arkin</a>
 * @LastModified: Oct 2017
 */
public final class Encodings {

    /**
     * The last printable character for unknown encodings.
     */
    private static final int DEFAULT_LAST_PRINTABLE = 0x7F;

    /**
     * Resource name of the bundled properties file with encodings data.
     */
    private static final String ENCODINGS_FILE =
            "com/sun/org/apache/xml/internal/serializer/Encodings.properties";

    /**
     * Name of the system property that may hold the URL of an external
     * encodings properties file, used instead of {@link #ENCODINGS_FILE}.
     */
    private static final String ENCODINGS_PROP =
            "com.sun.org.apache.xalan.internal.serialize.encodings";

    /** The default encoding, ISO style. */
    static final String DEFAULT_MIME_ENCODING = "UTF-8";

    /**
     * Returns a writer for the specified encoding based on
     * an output stream.
     * <p>
     * The encoding is first looked up in the encoding tables and the
     * registered Java name is tried; if that name is rejected by the
     * runtime, the encoding name is used exactly as given.
     *
     * @param output The output stream
     * @param encoding The encoding
     * @return A suitable writer
     * @throws UnsupportedEncodingException There is no convertor
     *  to support this encoding
     */
    static Writer getWriter(OutputStream output, String encoding)
            throws UnsupportedEncodingException {
        final EncodingInfo ei =
                _encodingInfos.findEncoding(toUpperCaseFast(encoding));
        if (ei != null) {
            try {
                return new BufferedWriter(
                        new OutputStreamWriter(output, ei.javaName));
            } catch (UnsupportedEncodingException ignored) {
                // The registered Java name is not usable on this runtime:
                // fall through and retry with the name the caller supplied.
            }
        }
        return new BufferedWriter(new OutputStreamWriter(output, encoding));
    }

    /**
     * Returns the last printable character for an unspecified
     * encoding.
     *
     * @return the default last printable character
     */
    public static int getLastPrintable() {
        return DEFAULT_LAST_PRINTABLE;
    }

    /**
     * Returns the EncodingInfo object for the specified
     * encoding.
     * <p>
     * This is not a public API.
     *
     * @param encoding The encoding
     * @return The object that is used to determine if
     * characters are in the given encoding. If the encoding is neither
     * registered nor loadable as a charset, an EncodingInfo built from
     * {@code (null, null)} is returned.
     * @xsl.usage internal
     */
    static EncodingInfo getEncodingInfo(String encoding) {
        final String normalizedEncoding = toUpperCaseFast(encoding);
        EncodingInfo ei = _encodingInfos.findEncoding(normalizedEncoding);
        if (ei != null) {
            return ei;
        }

        // We shouldn't have to do this, but just in case.
        // This may happen if the caller tries to use an encoding that
        // wasn't registered in the (java name)->(preferred mime name)
        // mapping file. In that case we attempt to load the charset for
        // the given encoding, and if that succeeds we create a new
        // EncodingInfo instance, assuming the canonical name of the
        // charset can be used as the mime name.
        try {
            final String name = Charset.forName(encoding).name();
            ei = new EncodingInfo(name, name);
            _encodingInfos.putEncoding(normalizedEncoding, ei);
        } catch (IllegalCharsetNameException | UnsupportedCharsetException x) {
            // Unknown encoding: hand back an info with no names and do
            // not remember it.
            ei = new EncodingInfo(null, null);
        }
        return ei;
    }

    /**
     * Determines if the encoding specified was recognized by the
     * serializer or not.
     *
     * @param encoding The encoding
     * @return boolean - true if the encoding was recognized else false
     */
    public static boolean isRecognizedEncoding(String encoding) {
        return _encodingInfos.findEncoding(toUpperCaseFast(encoding)) != null;
    }

    /**
     * A fast and cheap way to uppercase a String that is
     * only made of printable ASCII characters.
     * <p>
     * Only the characters 'a' to 'z' are changed; every other character
     * is copied as is.
     * <p>
     * This is not a public API.
     *
     * @param s a String of ASCII characters
     * @return an uppercased version of the input String,
     * possibly the same String.
     * @xsl.usage internal
     */
    private static String toUpperCaseFast(final String s) {
        // Allocated lazily: stays null while no lower case letter is seen,
        // so an already-uppercase input costs no allocation at all.
        char[] upper = null;
        final int length = s.length();
        for (int i = 0; i < length; i++) {
            final char ch = s.charAt(i);
            // is the character a lower case ASCII one?
            if ('a' <= ch && ch <= 'z') {
                if (upper == null) {
                    upper = s.toCharArray();
                }
                // a cheap and fast way to uppercase that is good enough
                upper[i] = (char) (ch + ('A' - 'a'));
            }
        }
        return (upper == null) ? s : String.valueOf(upper);
    }

    /**
     * Get the proper mime encoding.  From the XSLT recommendation: "The encoding
     * attribute specifies the preferred encoding to use for outputting the result
     * tree. XSLT processors are required to respect values of UTF-8 and UTF-16.
     * For other values, if the XSLT processor does not support the specified
     * encoding it may signal an error; if it does not signal an error it should
     * use UTF-8 or UTF-16 instead. The XSLT processor must not use an encoding
     * whose name does not match the EncName production of the XML Recommendation
     * [XML]. If no encoding attribute is specified, then the XSLT processor should
     * use either UTF-8 or UTF-16."
     *
     * @param encoding Reference to java-style encoding string, which may be null,
     *                 in which case a default will be found.
     *
     * @return The ISO-style encoding string. When {@code encoding} is null
     *         the result is derived from the "file.encoding" system property,
     *         falling back to {@link #DEFAULT_MIME_ENCODING}.
     */
    static String getMimeEncoding(String encoding) {
        if (encoding != null) {
            return convertJava2MimeEncoding(encoding);
        }

        try {
            // Get the default system character encoding.  This may be
            // incorrect if they passed in a writer, but right now there
            // seems to be no way to get the encoding from a writer.
            final String platform =
                    SecuritySupport.getSystemProperty("file.encoding", "UTF8");
            if (platform == null) {
                return DEFAULT_MIME_ENCODING;
            }

            // See if the mime type is equal to UTF8.  If you don't
            // do that, then convertJava2MimeEncoding will convert
            // 8859_1 to "ISO-8859-1", which is not what we want,
            // I think, and I don't think I want to alter the tables
            // to convert everything to UTF-8.
            if (platform.equalsIgnoreCase("Cp1252")
                    || platform.equalsIgnoreCase("ISO8859_1")
                    || platform.equalsIgnoreCase("8859_1")
                    || platform.equalsIgnoreCase("UTF8")) {
                return DEFAULT_MIME_ENCODING;
            }

            final String mime = convertJava2MimeEncoding(platform);
            return (mime != null) ? mime : DEFAULT_MIME_ENCODING;
        } catch (SecurityException ignored) {
            // Not allowed to read the system property: use the default.
            return DEFAULT_MIME_ENCODING;
        }
    }

    /**
     * Try the best we can to convert a Java encoding to a XML-style encoding.
     *
     * @param encoding non-null reference to encoding string, java style.
     *
     * @return ISO-style encoding string, or the input string itself when
     *         no mapping is registered for it.
     */
    private static String convertJava2MimeEncoding(String encoding) {
        final EncodingInfo enc =
                _encodingInfos.getEncodingFromJavaKey(toUpperCaseFast(encoding));
        return (enc != null) ? enc.name : encoding;
    }

    /**
     * Try the best we can to convert a XML-style (mime) encoding to a
     * Java encoding.
     *
     * @param encoding non-null reference to encoding string, ISO style.
     *
     * @return Java-style encoding string, or the input string itself when
     *         the encoding is not known.
     */
    public static String convertMime2JavaEncoding(String encoding) {
        final EncodingInfo info =
                _encodingInfos.findEncoding(toUpperCaseFast(encoding));
        return (info != null) ? info.javaName : encoding;
    }

    // Using an inner static class here prevent initialization races
    // where the hash maps could be used before they were populated.
    //
    private static final class EncodingInfos {
        // These maps are final and not modified after initialization.
        private final Map<String, EncodingInfo> _encodingTableKeyJava = new HashMap<>();
        private final Map<String, EncodingInfo> _encodingTableKeyMime = new HashMap<>();
        // This map will be added to after initialization: make sure it's
        // thread-safe. This map should not be used frequently - only in cases
        // where the mapping requested was not declared in the Encodings.properties
        // file.
        private final Map<String, EncodingInfo> _encodingDynamicTable =
                Collections.synchronizedMap(new HashMap<String, EncodingInfo>());

        private EncodingInfos() {
            loadEncodingInfo();
        }

        // Opens the file/resource containing java charset name -> preferred mime
        // name mapping and returns it as an InputStream.
        // The URL named by the ENCODINGS_PROP system property is used when it
        // is set and non-empty; otherwise the bundled ENCODINGS_FILE resource.
        private InputStream openEncodingsFileStream()
                throws MalformedURLException, IOException {
            String urlString = null;
            try {
                urlString = SecuritySupport.getSystemProperty(ENCODINGS_PROP, "");
            } catch (SecurityException ignored) {
                // Not allowed to read the property: behave as if it was
                // not set and use the bundled resource.
            }

            InputStream is = null;
            if (urlString != null && urlString.length() > 0) {
                is = new URL(urlString).openStream();
            }
            if (is == null) {
                is = SecuritySupport.getResourceAsStream(ENCODINGS_FILE);
            }
            return is;
        }

        // Loads the Properties resource containing the mapping:
        //    java charset name -> preferred mime name
        // and returns it. The result is empty when no stream could be opened.
        private Properties loadProperties()
                throws MalformedURLException, IOException {
            final Properties props = new Properties();
            try (InputStream is = openEncodingsFileStream()) {
                if (is != null) {
                    props.load(is);
                }
                // else:
                // Seems to be no real need to force failure here, let the
                // system do its best... The issue is not really very critical,
                // and the output will be in any case _correct_ though maybe not
                // always human-friendly... :)
                // But maybe report/log the resource problem?
                // Any standard ways to report/log errors (in static context)?
            }
            return props;
        }

        // Parses the mime list associated to a java charset name.
        // The value has the form "mime1,mime2,... lastPrintable"; only the
        // part before the first space is used. The first mime name in the
        // list is supposed to be the preferred mime name.
        private String[] parseMimeTypes(String val) {
            final int pos = val.indexOf(' ');
            if (pos < 0 && val.indexOf(',') < 0) {
                // No last printable character and a single name: the whole
                // value is the mime name.
                // Maybe report/log this problem?
                //  "Last printable character not defined for encoding " +
                //  mimeName + " (" + val + ")" ...
                return new String[] { val };
            }

            // A value without the last-printable suffix may still list
            // several names; split it the same way as the suffixed form
            // rather than registering "a,b" as one bogus mime name.
            final String mimeList = (pos < 0) ? val : val.substring(0, pos);
            final StringTokenizer st = new StringTokenizer(mimeList, ",");
            final String[] values = new String[st.countTokens()];
            for (int i = 0; st.hasMoreTokens(); i++) {
                values[i] = st.nextToken();
            }
            return values;
        }

        // This method here attempts to find the canonical charset name for the
        // the given name - which is supposed to be either a java name or a mime
        // name.
        // For that, it attempts to load the charset using the given name, and
        // then returns the charset's canonical name.
        // If the charset could not be loaded from the given name,
        // the method returns null.
        private String findCharsetNameFor(String name) {
            try {
                return Charset.forName(name).name();
            } catch (Exception ignored) {
                // Deliberately broad: this runs while the class is being
                // initialized, and any failure to resolve one name must
                // only mean "not found", never abort the loading.
                return null;
            }
        }

        // This method here attempts to find the canonical charset name for the
        // the set javaName+mimeNames - which are supposed to all refer to the
        // same charset.
        // For that it attempts to load the charset using the javaName, and if
        // not found, attempts again using each of the mime names in turn.
        // If the charset could be loaded from the javaName, then the javaName
        // itself is returned as charset name. Otherwise, each of the mime names
        // is tried in turn, until a charset can be loaded from one of the names,
        // and the loaded charset's canonical name is returned.
        // If no charset can be loaded from either the javaName or one of the
        // mime names, then null is returned.
        //
        // Note that the returned name is the 'java' name that will be used in
        // instances of EncodingInfo.
        // This is important because EncodingInfo uses that 'java name' later on
        // in calls to String.getBytes(javaName).
        // As it happens, sometimes only one element of the set mime names/javaName
        // is known by Charset: sometimes only one of the mime names is known,
        // sometime only the javaName is known, sometimes all are known.
        //
        // By using this method here, we fix the problem where one of the mime
        // names is known but the javaName is unknown, by associating the charset
        // loaded from one of the mime names with the unrecognized javaName.
        //
        // When none of the mime names or javaName are known - there's not much we can
        // do... It can mean that this encoding is not supported for this
        // OS. If such a charset is ever use it will result in having all characters
        // escaped.
        //
        private String findCharsetNameFor(String javaName, String[] mimes) {
            if (findCharsetNameFor(javaName) != null) {
                return javaName;
            }
            for (String mime : mimes) {
                final String cs = findCharsetNameFor(mime);
                if (cs != null) {
                    return cs;
                }
            }
            return null;
        }

        /**
         * Loads a list of all the supported encodings.
         *
         * The system property named by {@code ENCODINGS_PROP}, formatted
         * using URL syntax, may define an external encodings list. Thanks
         * to Sergey Ushakov for the code contribution!
         */
        private void loadEncodingInfo() {
            try {
                // load (java name)->(preferred mime name) mapping.
                final Properties props = loadProperties();

                // create instances of EncodingInfo from the loaded mapping
                final Map<String, EncodingInfo> canonicals = new HashMap<>();
                final Enumeration<Object> keys = props.keys();
                while (keys.hasMoreElements()) {
                    final String javaName = (String) keys.nextElement();
                    final String[] mimes = parseMimeTypes(props.getProperty(javaName));

                    final String charsetName = findCharsetNameFor(javaName, mimes);
                    if (charsetName == null) {
                        // None of the java or mime names on the line were
                        // recognized => this charset is not supported?
                        continue;
                    }

                    final String kj = toUpperCaseFast(javaName);
                    final String kc = toUpperCaseFast(charsetName);
                    for (String mimeName : mimes) {
                        final EncodingInfo info = new EncodingInfo(mimeName, charsetName);
                        _encodingTableKeyMime.put(toUpperCaseFast(mimeName), info);
                        if (!canonicals.containsKey(kc)) {
                            // canonicals will map the charset name to
                            //   the info containing the prefered mime name
                            //   (the preferred mime name is the first mime
                            //   name in the list).
                            canonicals.put(kc, info);
                            _encodingTableKeyJava.put(kc, info);
                        }
                        _encodingTableKeyJava.put(kj, info);
                    }
                }

                // Fix up the _encodingTableKeyJava so that the info mapped to
                // the java name contains the preferred mime name.
                // (a given java name can correspond to several mime name,
                //  but we want the _encodingTableKeyJava to point to the
                //  preferred mime name).
                for (Entry<String, EncodingInfo> e : _encodingTableKeyJava.entrySet()) {
                    e.setValue(canonicals.get(toUpperCaseFast(e.getValue().javaName)));
                }
            } catch (IOException ioe) {
                // Also covers MalformedURLException, which is an IOException.
                throw new com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException(ioe);
            }
        }

        // Looks the key up in the java-name table, then the mime-name table,
        // then the table of encodings registered after initialization.
        EncodingInfo findEncoding(String normalizedEncoding) {
            EncodingInfo info = _encodingTableKeyJava.get(normalizedEncoding);
            if (info == null) {
                info = _encodingTableKeyMime.get(normalizedEncoding);
            }
            if (info == null) {
                info = _encodingDynamicTable.get(normalizedEncoding);
            }
            return info;
        }

        EncodingInfo getEncodingFromMimeKey(String normalizedMimeName) {
            return _encodingTableKeyMime.get(normalizedMimeName);
        }

        EncodingInfo getEncodingFromJavaKey(String normalizedJavaName) {
            return _encodingTableKeyJava.get(normalizedJavaName);
        }

        void putEncoding(String key, EncodingInfo info) {
            _encodingDynamicTable.put(key, info);
        }
    }

    /**
     * Return true if the character is the high member of a surrogate pair.
     * <p>
     * This is not a public API.
     * @param ch the character to test
     * @return true if {@code ch} is in the range U+D800 to U+DBFF
     * @xsl.usage internal
     */
    static boolean isHighUTF16Surrogate(char ch) {
        return Character.isHighSurrogate(ch);
    }

    /**
     * Return true if the character is the low member of a surrogate pair.
     * <p>
     * This is not a public API.
     * @param ch the character to test
     * @return true if {@code ch} is in the range U+DC00 to U+DFFF
     * @xsl.usage internal
     */
    static boolean isLowUTF16Surrogate(char ch) {
        return Character.isLowSurrogate(ch);
    }

    /**
     * Return the unicode code point represented by the high/low surrogate pair.
     * The arguments are not validated.
     * <p>
     * This is not a public API.
     * @param highSurrogate the high char of the high/low pair
     * @param lowSurrogate the low char of the high/low pair
     * @return the code point computed from the pair
     * @xsl.usage internal
     */
    static int toCodePoint(char highSurrogate, char lowSurrogate) {
        return Character.toCodePoint(highSurrogate, lowSurrogate);
    }

    /**
     * Return the unicode code point represented by the char.
     * A bit of a dummy method, since all it does is return the char,
     * but as an int value.
     * <p>
     * This is not a public API.
     * @param ch the char.
     * @return the value of {@code ch} as an int
     * @xsl.usage internal
     */
    static int toCodePoint(char ch) {
        return ch;
    }

    // Declared last, as in the original layout; populated once when this
    // class is initialized.
    private static final EncodingInfos _encodingInfos = new EncodingInfos();

}