1 /* 2 * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. 3 */ 4 /* 5 * Licensed to the Apache Software Foundation (ASF) under one or more 6 * contributor license agreements. See the NOTICE file distributed with 7 * this work for additional information regarding copyright ownership. 8 * The ASF licenses this file to You under the Apache License, Version 2.0 9 * (the "License"); you may not use this file except in compliance with 10 * the License. You may obtain a copy of the License at 11 * 12 * http://www.apache.org/licenses/LICENSE-2.0 13 * 14 * Unless required by applicable law or agreed to in writing, software 15 * distributed under the License is distributed on an "AS IS" BASIS, 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * See the License for the specific language governing permissions and 18 * limitations under the License. 19 */ 20 21 package com.sun.org.apache.xml.internal.serializer; 22 23 import com.sun.org.apache.xml.internal.serializer.utils.MsgKey; 24 import com.sun.org.apache.xml.internal.serializer.utils.SystemIDResolver; 25 import com.sun.org.apache.xml.internal.serializer.utils.Utils; 26 import com.sun.org.apache.xml.internal.serializer.utils.WrappedRuntimeException; 27 import java.io.BufferedReader; 28 import java.io.InputStream; 29 import java.io.InputStreamReader; 30 import java.io.UnsupportedEncodingException; 31 import java.net.URL; 32 import java.util.Enumeration; 33 import java.util.HashMap; 34 import java.util.Locale; 35 import java.util.Map; 36 import java.util.PropertyResourceBundle; 37 import java.util.ResourceBundle; 38 import javax.xml.transform.TransformerException; 39 import jdk.xml.internal.SecuritySupport; 40 41 /** 42 * This class provides services that tell if a character should have 43 * special treatement, such as entity reference substitution or normalization 44 * of a newline character. It also provides character to entity reference 45 * lookup. 46 * 47 * DEVELOPERS: See Known Issue in the constructor. 48 * 49 * @xsl.usage internal 50 * @LastModified: Oct 2017 51 */ 52 final class CharInfo 53 { 54 /** Given a character, lookup a String to output (e.g. a decorated entity reference). */ 55 private Map<CharKey, String> m_charToString = new HashMap<>(); 56 57 /** 58 * The name of the HTML entities file. 59 * If specified, the file will be resource loaded with the default class loader. 60 */ 61 public static final String HTML_ENTITIES_RESOURCE = 62 "com.sun.org.apache.xml.internal.serializer.HTMLEntities"; 63 64 /** 65 * The name of the XML entities file. 66 * If specified, the file will be resource loaded with the default class loader. 67 */ 68 public static final String XML_ENTITIES_RESOURCE = 69 "com.sun.org.apache.xml.internal.serializer.XMLEntities"; 70 71 /** The horizontal tab character, which the parser should always normalize. */ 72 public static final char S_HORIZONAL_TAB = 0x09; 73 74 /** The linefeed character, which the parser should always normalize. */ 75 public static final char S_LINEFEED = 0x0A; 76 77 /** The carriage return character, which the parser should always normalize. */ 78 public static final char S_CARRIAGERETURN = 0x0D; 79 80 /** This flag is an optimization for HTML entities. It false if entities 81 * other than quot (34), amp (38), lt (60) and gt (62) are defined 82 * in the range 0 to 127. 83 * @xsl.usage internal 84 */ 85 final boolean onlyQuotAmpLtGt; 86 87 /** Copy the first 0,1 ... ASCII_MAX values into an array */ 88 private static final int ASCII_MAX = 128; 89 90 /** Array of values is faster access than a set of bits 91 * to quickly check ASCII characters in attribute values. 92 */ 93 private boolean[] isSpecialAttrASCII = new boolean[ASCII_MAX]; 94 95 /** Array of values is faster access than a set of bits 96 * to quickly check ASCII characters in text nodes. 97 */ 98 private boolean[] isSpecialTextASCII = new boolean[ASCII_MAX]; 99 100 private boolean[] isCleanTextASCII = new boolean[ASCII_MAX]; 101 102 /** An array of bits to record if the character is in the set. 103 * Although information in this array is complete, the 104 * isSpecialAttrASCII array is used first because access to its values 105 * is common and faster. 106 */ 107 private int array_of_bits[] = createEmptySetOfIntegers(65535); 108 109 110 // 5 for 32 bit words, 6 for 64 bit words ... 111 /* 112 * This constant is used to shift an integer to quickly 113 * calculate which element its bit is stored in. 114 * 5 for 32 bit words (int) , 6 for 64 bit words (long) 115 */ 116 private static final int SHIFT_PER_WORD = 5; 117 118 /* 119 * A mask to get the low order bits which are used to 120 * calculate the value of the bit within a given word, 121 * that will represent the presence of the integer in the 122 * set. 123 * 124 * 0x1F for 32 bit words (int), 125 * or 0x3F for 64 bit words (long) 126 */ 127 private static final int LOW_ORDER_BITMASK = 0x1f; 128 129 /* 130 * This is used for optimizing the lookup of bits representing 131 * the integers in the set. It is the index of the first element 132 * in the array array_of_bits[] that is not used. 133 */ 134 private int firstWordNotUsed; 135 136 137 /** 138 * Constructor that reads in a resource file that describes the mapping of 139 * characters to entity references. 140 * This constructor is private, just to force the use 141 * of the getCharInfo(entitiesResource) factory 142 * 143 * Resource files must be encoded in UTF-8 and can either be properties 144 * files with a .properties extension assumed. Alternatively, they can 145 * have the following form, with no particular extension assumed: 146 * 147 * <pre> 148 * # First char # is a comment 149 * Entity numericValue 150 * quot 34 151 * amp 38 152 * </pre> 153 * 154 * @param entitiesResource Name of properties or resource file that should 155 * be loaded, which describes that mapping of characters to entity 156 * references. 157 */ 158 private CharInfo(String entitiesResource, String method) 159 { 160 this(entitiesResource, method, false); 161 } 162 163 private CharInfo(String entitiesResource, String method, boolean internal) 164 { 165 ResourceBundle entities = null; 166 boolean noExtraEntities = true; 167 168 // Make various attempts to interpret the parameter as a properties 169 // file or resource file, as follows: 170 // 171 // 1) attempt to load .properties file using ResourceBundle 172 // 2) try using the class loader to find the specified file a resource 173 // file 174 // 3) try treating the resource a URI 175 176 try { 177 if (internal) { 178 // Load entity property files by using PropertyResourceBundle, 179 // cause of security issure for applets 180 entities = PropertyResourceBundle.getBundle(entitiesResource); 181 } else { 182 ClassLoader cl = SecuritySupport.getContextClassLoader(); 183 if (cl != null) { 184 entities = PropertyResourceBundle.getBundle(entitiesResource, 185 Locale.getDefault(), cl); 186 } 187 } 188 } catch (Exception e) {} 189 190 if (entities != null) { 191 Enumeration<String> keys = entities.getKeys(); 192 while (keys.hasMoreElements()){ 193 String name = keys.nextElement(); 194 String value = entities.getString(name); 195 int code = Integer.parseInt(value); 196 defineEntity(name, (char) code); 197 if (extraEntity(code)) 198 noExtraEntities = false; 199 } 200 set(S_LINEFEED); 201 set(S_CARRIAGERETURN); 202 } else { 203 InputStream is = null; 204 String err = null; 205 206 // Load user specified resource file by using URL loading, it 207 // requires a valid URI as parameter 208 try { 209 if (internal) { 210 is = CharInfo.class.getResourceAsStream(entitiesResource); 211 } else { 212 ClassLoader cl = SecuritySupport.getContextClassLoader(); 213 if (cl != null) { 214 try { 215 is = cl.getResourceAsStream(entitiesResource); 216 } catch (Exception e) { 217 err = e.getMessage(); 218 } 219 } 220 221 if (is == null) { 222 try { 223 URL url = new URL(entitiesResource); 224 is = url.openStream(); 225 } catch (Exception e) { 226 err = e.getMessage(); 227 } 228 } 229 } 230 231 if (is == null) { 232 throw new RuntimeException( 233 Utils.messages.createMessage( 234 MsgKey.ER_RESOURCE_COULD_NOT_FIND, 235 new Object[] {entitiesResource, err})); 236 } 237 238 // Fix Bugzilla#4000: force reading in UTF-8 239 // This creates the de facto standard that Xalan's resource 240 // files must be encoded in UTF-8. This should work in all 241 // JVMs. 242 // 243 // %REVIEW% KNOWN ISSUE: IT FAILS IN MICROSOFT VJ++, which 244 // didn't implement the UTF-8 encoding. Theoretically, we should 245 // simply let it fail in that case, since the JVM is obviously 246 // broken if it doesn't support such a basic standard. But 247 // since there are still some users attempting to use VJ++ for 248 // development, we have dropped in a fallback which makes a 249 // second attempt using the platform's default encoding. In VJ++ 250 // this is apparently ASCII, which is subset of UTF-8... and 251 // since the strings we'll be reading here are also primarily 252 // limited to the 7-bit ASCII range (at least, in English 253 // versions of Xalan), this should work well enough to keep us 254 // on the air until we're ready to officially decommit from 255 // VJ++. 256 257 BufferedReader reader; 258 try { 259 reader = new BufferedReader(new InputStreamReader(is, "UTF-8")); 260 } catch (UnsupportedEncodingException e) { 261 reader = new BufferedReader(new InputStreamReader(is)); 262 } 263 264 String line = reader.readLine(); 265 266 while (line != null) { 267 if (line.length() == 0 || line.charAt(0) == '#') { 268 line = reader.readLine(); 269 270 continue; 271 } 272 273 int index = line.indexOf(' '); 274 275 if (index > 1) { 276 String name = line.substring(0, index); 277 278 ++index; 279 280 if (index < line.length()) { 281 String value = line.substring(index); 282 index = value.indexOf(' '); 283 284 if (index > 0) { 285 value = value.substring(0, index); 286 } 287 288 int code = Integer.parseInt(value); 289 290 defineEntity(name, (char) code); 291 if (extraEntity(code)) 292 noExtraEntities = false; 293 } 294 } 295 296 line = reader.readLine(); 297 } 298 299 is.close(); 300 set(S_LINEFEED); 301 set(S_CARRIAGERETURN); 302 } catch (Exception e) { 303 throw new RuntimeException( 304 Utils.messages.createMessage( 305 MsgKey.ER_RESOURCE_COULD_NOT_LOAD, 306 new Object[] { entitiesResource, 307 e.toString(), 308 entitiesResource, 309 e.toString()})); 310 } finally { 311 if (is != null) { 312 try { 313 is.close(); 314 } catch (Exception except) {} 315 } 316 } 317 } 318 319 /* initialize the array isCleanTextASCII[] with a cache of values 320 * for use by ToStream.character(char[], int , int) 321 * and the array isSpecialTextASCII[] with the opposite values 322 * (all in the name of performance!) 323 */ 324 for (int ch = 0; ch <ASCII_MAX; ch++) 325 if((((0x20 <= ch || (0x0A == ch || 0x0D == ch || 0x09 == ch))) 326 && (!get(ch))) || ('"' == ch)) 327 { 328 isCleanTextASCII[ch] = true; 329 isSpecialTextASCII[ch] = false; 330 } 331 else { 332 isCleanTextASCII[ch] = false; 333 isSpecialTextASCII[ch] = true; 334 } 335 336 337 338 onlyQuotAmpLtGt = noExtraEntities; 339 340 // initialize the array with a cache of the BitSet values 341 for (int i=0; i<ASCII_MAX; i++) 342 isSpecialAttrASCII[i] = get(i); 343 344 /* Now that we've used get(ch) just above to initialize the 345 * two arrays we will change by adding a tab to the set of 346 * special chars for XML (but not HTML!). 347 * We do this because a tab is always a 348 * special character in an XML attribute, 349 * but only a special character in XML text 350 * if it has an entity defined for it. 351 * This is the reason for this delay. 352 */ 353 if (Method.XML.equals(method)) 354 { 355 isSpecialAttrASCII[S_HORIZONAL_TAB] = true; 356 } 357 } 358 359 /** 360 * Defines a new character reference. The reference's name and value are 361 * supplied. Nothing happens if the character reference is already defined. 362 * <p>Unlike internal entities, character references are a string to single 363 * character mapping. They are used to map non-ASCII characters both on 364 * parsing and printing, primarily for HTML documents. '<amp;' is an 365 * example of a character reference.</p> 366 * 367 * @param name The entity's name 368 * @param value The entity's value 369 */ 370 private void defineEntity(String name, char value) 371 { 372 StringBuilder sb = new StringBuilder("&"); 373 sb.append(name); 374 sb.append(';'); 375 String entityString = sb.toString(); 376 377 defineChar2StringMapping(entityString, value); 378 } 379 380 /** 381 * Map a character to a String. For example given 382 * the character '>' this method would return the fully decorated 383 * entity name "<". 384 * Strings for entity references are loaded from a properties file, 385 * but additional mappings defined through calls to defineChar2String() 386 * are possible. Such entity reference mappings could be over-ridden. 387 * 388 * This is reusing a stored key object, in an effort to avoid 389 * heap activity. Unfortunately, that introduces a threading risk. 390 * Simplest fix for now is to make it a synchronized method, or to give 391 * up the reuse; I see very little performance difference between them. 392 * Long-term solution would be to replace the hashtable with a sparse array 393 * keyed directly from the character's integer value; see DTM's 394 * string pool for a related solution. 395 * 396 * @param value The character that should be resolved to 397 * a String, e.g. resolve '>' to "<". 398 * 399 * @return The String that the character is mapped to, or null if not found. 400 * @xsl.usage internal 401 */ 402 String getOutputStringForChar(char value) 403 { 404 CharKey charKey = new CharKey(); 405 charKey.setChar(value); 406 return m_charToString.get(charKey); 407 } 408 409 /** 410 * Tell if the character argument that is from 411 * an attribute value should have special treatment. 412 * 413 * @param value the value of a character that is in an attribute value 414 * @return true if the character should have any special treatment, 415 * such as when writing out attribute values, 416 * or entity references. 417 * @xsl.usage internal 418 */ 419 final boolean isSpecialAttrChar(int value) 420 { 421 // for performance try the values in the boolean array first, 422 // this is faster access than the BitSet for common ASCII values 423 424 if (value < ASCII_MAX) 425 return isSpecialAttrASCII[value]; 426 427 // rather than java.util.BitSet, our private 428 // implementation is faster (and less general). 429 return get(value); 430 } 431 432 /** 433 * Tell if the character argument that is from a 434 * text node should have special treatment. 435 * 436 * @param value the value of a character that is in a text node 437 * @return true if the character should have any special treatment, 438 * such as when writing out attribute values, 439 * or entity references. 440 * @xsl.usage internal 441 */ 442 final boolean isSpecialTextChar(int value) 443 { 444 // for performance try the values in the boolean array first, 445 // this is faster access than the BitSet for common ASCII values 446 447 if (value < ASCII_MAX) 448 return isSpecialTextASCII[value]; 449 450 // rather than java.util.BitSet, our private 451 // implementation is faster (and less general). 452 return get(value); 453 } 454 455 /** 456 * This method is used to determine if an ASCII character in 457 * a text node (not an attribute value) is "clean". 458 * @param value the character to check (0 to 127). 459 * @return true if the character can go to the writer as-is 460 * @xsl.usage internal 461 */ 462 final boolean isTextASCIIClean(int value) 463 { 464 return isCleanTextASCII[value]; 465 } 466 467 468 /** 469 * Read an internal resource file that describes the mapping of 470 * characters to entity references; Construct a CharInfo object. 471 * 472 * @param entitiesFileName Name of entities resource file that should 473 * be loaded, which describes the mapping of characters to entity references. 474 * @param method the output method type, which should be one of "xml", "html", and "text". 475 * @return an instance of CharInfo 476 * 477 * @xsl.usage internal 478 */ 479 static CharInfo getCharInfoInternal(String entitiesFileName, String method) 480 { 481 CharInfo charInfo = m_getCharInfoCache.get(entitiesFileName); 482 if (charInfo != null) { 483 return charInfo; 484 } 485 486 charInfo = new CharInfo(entitiesFileName, method, true); 487 m_getCharInfoCache.put(entitiesFileName, charInfo); 488 return charInfo; 489 } 490 491 /** 492 * Constructs a CharInfo object using the following process to try reading 493 * the entitiesFileName parameter: 494 * 495 * 1) attempt to load it as a ResourceBundle 496 * 2) try using the class loader to find the specified file 497 * 3) try opening it as an URI 498 * 499 * In case of 2 and 3, the resource file must be encoded in UTF-8 and have the 500 * following format: 501 * <pre> 502 * # First char # is a comment 503 * Entity numericValue 504 * quot 34 505 * amp 38 506 * </pre> 507 * 508 * @param entitiesFileName Name of entities resource file that should 509 * be loaded, which describes the mapping of characters to entity references. 510 * @param method the output method type, which should be one of "xml", "html", and "text". 511 * @return an instance of CharInfo 512 */ 513 static CharInfo getCharInfo(String entitiesFileName, String method) 514 { 515 try { 516 return new CharInfo(entitiesFileName, method, false); 517 } catch (Exception e) {} 518 519 String absoluteEntitiesFileName; 520 521 if (entitiesFileName.indexOf(':') < 0) { 522 absoluteEntitiesFileName = 523 SystemIDResolver.getAbsoluteURIFromRelative(entitiesFileName); 524 } else { 525 try { 526 absoluteEntitiesFileName = 527 SystemIDResolver.getAbsoluteURI(entitiesFileName, null); 528 } catch (TransformerException te) { 529 throw new WrappedRuntimeException(te); 530 } 531 } 532 533 return new CharInfo(absoluteEntitiesFileName, method, false); 534 } 535 536 /** Table of user-specified char infos. */ 537 private static Map<String, CharInfo> m_getCharInfoCache = new HashMap<>(); 538 539 /** 540 * Returns the array element holding the bit value for the 541 * given integer 542 * @param i the integer that might be in the set of integers 543 * 544 */ 545 private static int arrayIndex(int i) { 546 return (i >> SHIFT_PER_WORD); 547 } 548 549 /** 550 * For a given integer in the set it returns the single bit 551 * value used within a given word that represents whether 552 * the integer is in the set or not. 553 */ 554 private static int bit(int i) { 555 int ret = (1 << (i & LOW_ORDER_BITMASK)); 556 return ret; 557 } 558 559 /** 560 * Creates a new empty set of integers (characters) 561 * @param max the maximum integer to be in the set. 562 */ 563 private int[] createEmptySetOfIntegers(int max) { 564 firstWordNotUsed = 0; // an optimization 565 566 int[] arr = new int[arrayIndex(max - 1) + 1]; 567 return arr; 568 569 } 570 571 /** 572 * Adds the integer (character) to the set of integers. 573 * @param i the integer to add to the set, valid values are 574 * 0, 1, 2 ... up to the maximum that was specified at 575 * the creation of the set. 576 */ 577 private final void set(int i) { 578 setASCIIdirty(i); 579 580 int j = (i >> SHIFT_PER_WORD); // this word is used 581 int k = j + 1; 582 583 if(firstWordNotUsed < k) // for optimization purposes. 584 firstWordNotUsed = k; 585 586 array_of_bits[j] |= (1 << (i & LOW_ORDER_BITMASK)); 587 } 588 589 590 /** 591 * Return true if the integer (character)is in the set of integers. 592 * 593 * This implementation uses an array of integers with 32 bits per 594 * integer. If a bit is set to 1 the corresponding integer is 595 * in the set of integers. 596 * 597 * @param i an integer that is tested to see if it is the 598 * set of integers, or not. 599 */ 600 private final boolean get(int i) { 601 602 boolean in_the_set = false; 603 int j = (i >> SHIFT_PER_WORD); // wordIndex(i) 604 // an optimization here, ... a quick test to see 605 // if this integer is beyond any of the words in use 606 if(j < firstWordNotUsed) 607 in_the_set = (array_of_bits[j] & 608 (1 << (i & LOW_ORDER_BITMASK)) 609 ) != 0; // 0L for 64 bit words 610 return in_the_set; 611 } 612 613 // record if there are any entities other than 614 // quot, amp, lt, gt (probably user defined) 615 /** 616 * @return true if the entity 617 * @param code The value of the character that has an entity defined 618 * for it. 619 */ 620 private boolean extraEntity(int entityValue) 621 { 622 boolean extra = false; 623 if (entityValue < 128) 624 { 625 switch (entityValue) 626 { 627 case 34 : // quot 628 case 38 : // amp 629 case 60 : // lt 630 case 62 : // gt 631 break; 632 default : // other entity in range 0 to 127 633 extra = true; 634 } 635 } 636 return extra; 637 } 638 639 /** 640 * If the character is a printable ASCII character then 641 * mark it as not clean and needing replacement with 642 * a String on output. 643 * @param ch 644 */ 645 private void setASCIIdirty(int j) 646 { 647 if (0 <= j && j < ASCII_MAX) 648 { 649 isCleanTextASCII[j] = false; 650 isSpecialTextASCII[j] = true; 651 } 652 } 653 654 /** 655 * If the character is a printable ASCII character then 656 * mark it as and not needing replacement with 657 * a String on output. 658 * @param ch 659 */ 660 private void setASCIIclean(int j) 661 { 662 if (0 <= j && j < ASCII_MAX) 663 { 664 isCleanTextASCII[j] = true; 665 isSpecialTextASCII[j] = false; 666 } 667 } 668 669 private void defineChar2StringMapping(String outputString, char inputChar) 670 { 671 CharKey character = new CharKey(inputChar); 672 m_charToString.put(character, outputString); 673 set(inputChar); 674 } 675 676 /** 677 * Simple class for fast lookup of char values, when used with 678 * hashtables. You can set the char, then use it as a key. 679 * 680 * This class is a copy of the one in com.sun.org.apache.xml.internal.utils. 681 * It exists to cut the serializers dependancy on that package. 682 * 683 * @xsl.usage internal 684 */ 685 private static class CharKey extends Object 686 { 687 688 /** String value */ 689 private char m_char; 690 691 /** 692 * Constructor CharKey 693 * 694 * @param key char value of this object. 695 */ 696 public CharKey(char key) 697 { 698 m_char = key; 699 } 700 701 /** 702 * Default constructor for a CharKey. 703 * 704 * @param key char value of this object. 705 */ 706 public CharKey() 707 { 708 } 709 710 /** 711 * Get the hash value of the character. 712 * 713 * @return hash value of the character. 714 */ 715 public final void setChar(char c) 716 { 717 m_char = c; 718 } 719 720 721 722 /** 723 * Get the hash value of the character. 724 * 725 * @return hash value of the character. 726 */ 727 public final int hashCode() 728 { 729 return (int)m_char; 730 } 731 732 /** 733 * Override of equals() for this object 734 * 735 * @param obj to compare to 736 * 737 * @return True if this object equals this string value 738 */ 739 public final boolean equals(Object obj) 740 { 741 return ((CharKey)obj).m_char == m_char; 742 } 743 } 744 745 746 }