1 /*
   2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
   3  */
   4 /*
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *      http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20 
  21 package com.sun.org.apache.xerces.internal.impl.dtd.models;
  22 
  23 import com.sun.org.apache.xerces.internal.impl.dtd.XMLContentSpec;
  24 import com.sun.org.apache.xerces.internal.xni.QName;
  25 import java.util.HashMap;
  26 import java.util.Map;
  27 
  28 /**
  29 
  30  * DFAContentModel is the derivative of ContentModel that does
  31  * all of the non-trivial element content validation. This class does
  32  * the conversion from the regular expression to the DFA that
  33  * it then uses in its validation algorithm.
  34  * <p>
  35  * <b>Note:</b> Upstream work insures that this class will never see
  36  * a content model with PCDATA in it. Any model with PCDATA is 'mixed'
  37  * and is handled via the MixedContentModel class since mixed models
  38  * are very constrained in form and easily handled via a special case.
  39  * This also makes implementation of this class much easier.
  40  *
  41  * @xerces.internal
  42  *
  43  * @LastModified: Oct 2017
  44  */
  45 public class DFAContentModel
  46     implements ContentModelValidator {
  47 
  48     //
  49     // Constants
  50     //
  51     // special strings
  52 
  53     /** Epsilon string. */
  54     private static String fEpsilonString = "<<CMNODE_EPSILON>>";
  55 
  56     /** End-of-content string. */
  57     private static String fEOCString = "<<CMNODE_EOC>>";
  58 
  59     /** initializing static members **/
  60     static {
  61         fEpsilonString = fEpsilonString.intern();
  62         fEOCString = fEOCString.intern();
  63     }
  64 
  65     // debugging
  66 
  67     /** Set to true to debug content model validation. */
  68     private static final boolean DEBUG_VALIDATE_CONTENT = false;
  69 
  70     //
  71     // Data
  72     //
  73 
  74     /* this is the EquivClassComparator object */
  75     //private EquivClassComparator comparator = null;
  76 
  77     /**
  78      * This is the map of unique input symbol elements to indices into
  79      * each state's per-input symbol transition table entry. This is part
  80      * of the built DFA information that must be kept around to do the
  81      * actual validation.
  82      */
  83     private QName fElemMap[] = null;
  84 
  85     /**
  86      * This is a map of whether the element map contains information
  87      * related to ANY models.
  88      */
  89     private int fElemMapType[] = null;
  90 
  91     /** The element map size. */
  92     private int fElemMapSize = 0;
  93 
  94     /** Boolean to distinguish Schema Mixed Content */
  95     private boolean fMixed;
  96 
  97     /**
  98      * The NFA position of the special EOC (end of content) node. This
  99      * is saved away since it's used during the DFA build.
 100      */
 101     private int fEOCPos = 0;
 102 
 103 
 104     /**
 105      * This is an array of booleans, one per state (there are
 106      * fTransTableSize states in the DFA) that indicates whether that
 107      * state is a final state.
 108      */
 109     private boolean fFinalStateFlags[] = null;
 110 
 111     /**
 112      * The list of follow positions for each NFA position (i.e. for each
 113      * non-epsilon leaf node.) This is only used during the building of
 114      * the DFA, and is let go afterwards.
 115      */
 116     private CMStateSet fFollowList[] = null;
 117 
 118     /**
 119      * This is the head node of our intermediate representation. It is
 120      * only non-null during the building of the DFA (just so that it
 121      * does not have to be passed all around.) Once the DFA is built,
 122      * this is no longer required so its nulled out.
 123      */
 124     private CMNode fHeadNode = null;
 125 
 126     /**
 127      * The count of leaf nodes. This is an important number that set some
 128      * limits on the sizes of data structures in the DFA process.
 129      */
 130     private int fLeafCount = 0;
 131 
 132     /**
 133      * An array of non-epsilon leaf nodes, which is used during the DFA
 134      * build operation, then dropped.
 135      */
 136     private CMLeaf fLeafList[] = null;
 137 
 138     /** Array mapping ANY types to the leaf list. */
 139     private int fLeafListType[] = null;
 140 
 141     //private ContentLeafNameTypeVector fLeafNameTypeVector = null;
 142 
 143     /**
 144      * The string pool of our parser session. This is set during construction
 145      * and kept around.
 146      */
 147     //private StringPool fStringPool = null;
 148 
 149     /**
 150      * This is the transition table that is the main by product of all
 151      * of the effort here. It is an array of arrays of ints. The first
 152      * dimension is the number of states we end up with in the DFA. The
 153      * second dimensions is the number of unique elements in the content
 154      * model (fElemMapSize). Each entry in the second dimension indicates
 155      * the new state given that input for the first dimension's start
 156      * state.
 157      * <p>
 158      * The fElemMap array handles mapping from element indexes to
 159      * positions in the second dimension of the transition table.
 160      */
 161     private int fTransTable[][] = null;
 162 
 163     /**
 164      * The number of valid entries in the transition table, and in the other
 165      * related tables such as fFinalStateFlags.
 166      */
 167     private int fTransTableSize = 0;
 168 
 169     /**
 170      * Flag that indicates that even though we have a "complicated"
 171      * content model, it is valid to have no content. In other words,
 172      * all parts of the content model are optional. For example:
 173      * <pre>
 174      *      &lt;!ELEMENT AllOptional (Optional*,NotRequired?)&gt;
 175      * </pre>
 176      */
 177     private boolean fEmptyContentIsValid = false;
 178 
 179     // temp variables
 180 
 181     /** Temporary qualified name. */
 182     private final QName fQName = new QName();
 183 
 184     //
 185     // Constructors
 186     //
 187 
 188 
 189     //
 190     // Constructors
 191     //
 192 
 193     /**
 194      * Constructs a DFA content model.
 195      *
 196      * @param syntaxTree    The syntax tree of the content model.
 197      * @param leafCount     The number of leaves.
 198      * @param mixed
 199      *
 200      */
 201     public DFAContentModel(CMNode syntaxTree, int leafCount, boolean mixed) {
 202         // Store away our index and pools in members
 203         //fStringPool = stringPool;
 204         fLeafCount = leafCount;
 205 
 206 
 207         // this is for Schema Mixed Content
 208         fMixed = mixed;
 209 
 210         //
 211         //  Ok, so lets grind through the building of the DFA. This method
 212         //  handles the high level logic of the algorithm, but it uses a
 213         //  number of helper classes to do its thing.
 214         //
 215         //  In order to avoid having hundreds of references to the error and
 216         //  string handlers around, this guy and all of his helper classes
 217         //  just throw a simple exception and we then pass it along.
 218         //
 219         buildDFA(syntaxTree);
 220     }
 221 
 222     //
 223     // ContentModelValidator methods
 224     //
 225 
 226     /**
 227      * Check that the specified content is valid according to this
 228      * content model. This method can also be called to do 'what if'
 229      * testing of content models just to see if they would be valid.
 230      * <p>
 231      * A value of -1 in the children array indicates a PCDATA node. All other
 232      * indexes will be positive and represent child elements. The count can be
 233      * zero, since some elements have the EMPTY content model and that must be
 234      * confirmed.
 235      *
 236      * @param children The children of this element.  Each integer is an index within
 237      *                 the <code>StringPool</code> of the child element name.  An index
 238      *                 of -1 is used to indicate an occurrence of non-whitespace character
 239      *                 data.
 240      * @param offset Offset into the array where the children starts.
 241      * @param length The number of entries in the <code>children</code> array.
 242      *
 243      * @return The value -1 if fully valid, else the 0 based index of the child
 244      *         that first failed. If the value returned is equal to the number
 245      *         of children, then the specified children are valid but additional
 246      *         content is required to reach a valid ending state.
 247      *
 248      */
 249     public int validate(QName[] children, int offset, int length) {
 250 
 251         if (DEBUG_VALIDATE_CONTENT)
 252             System.out.println("DFAContentModel#validateContent");
 253 
 254         //
 255         // A DFA content model must *always* have at least 1 child
 256         // so a failure is given if no children present.
 257         //
 258         // Defect 782: This is an incorrect statement because a DFA
 259         // content model is also used for constructions such as:
 260         //
 261         //     (Optional*,NotRequired?)
 262         //
 263         // where a perfectly valid content would be NO CHILDREN.
 264         // Therefore, if there are no children, we must check to
 265         // see if the CMNODE_EOC marker is a valid start state! -Ac
 266         //
 267         if (length == 0) {
 268             if (DEBUG_VALIDATE_CONTENT) {
 269                 System.out.println("!!! no children");
 270                 System.out.println("elemMap="+fElemMap);
 271                 for (int i = 0; i < fElemMap.length; i++) {
 272                     String uri = fElemMap[i].uri;
 273                     String localpart = fElemMap[i].localpart;
 274 
 275                     System.out.println("fElemMap["+i+"]="+uri+","+
 276                                        localpart+" ("+
 277                                        uri+", "+
 278                                        localpart+
 279                                        ')');
 280 
 281                 }
 282                 System.out.println("EOCIndex="+fEOCString);
 283             }
 284 
 285             return fEmptyContentIsValid ? -1 : 0;
 286 
 287         } // if child count == 0
 288 
 289         //
 290         //  Lets loop through the children in the array and move our way
 291         //  through the states. Note that we use the fElemMap array to map
 292         //  an element index to a state index.
 293         //
 294         int curState = 0;
 295         for (int childIndex = 0; childIndex < length; childIndex++)
 296         {
 297             // Get the current element index out
 298             final QName curElem = children[offset + childIndex];
 299             // ignore mixed text
 300             if (fMixed && curElem.localpart == null) {
 301                 continue;
 302             }
 303 
 304             // Look up this child in our element map
 305             int elemIndex = 0;
 306             for (; elemIndex < fElemMapSize; elemIndex++)
 307             {
 308                 int type = fElemMapType[elemIndex] & 0x0f ;
 309                 if (type == XMLContentSpec.CONTENTSPECNODE_LEAF) {
 310                     //System.out.println("fElemMap["+elemIndex+"]: "+fElemMap[elemIndex]);
 311                     if (fElemMap[elemIndex].rawname == curElem.rawname) {
 312                         break;
 313                     }
 314                 }
 315                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY) {
 316                     String uri = fElemMap[elemIndex].uri;
 317                     if (uri == null || uri == curElem.uri) {
 318                         break;
 319                     }
 320                 }
 321                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL) {
 322                     if (curElem.uri == null) {
 323                         break;
 324                     }
 325                 }
 326                 else if (type == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
 327                     if (fElemMap[elemIndex].uri != curElem.uri) {
 328                         break;
 329                     }
 330                 }
 331             }
 332 
 333             // If we didn't find it, then obviously not valid
 334             if (elemIndex == fElemMapSize) {
 335                 if (DEBUG_VALIDATE_CONTENT) {
 336                     System.out.println("!!! didn't find it");
 337 
 338                     System.out.println("curElem : " +curElem );
 339                     for (int i=0; i<fElemMapSize; i++) {
 340                         System.out.println("fElemMap["+i+"] = " +fElemMap[i] );
 341                         System.out.println("fElemMapType["+i+"] = " +fElemMapType[i] );
 342                     }
 343                 }
 344 
 345                 return childIndex;
 346             }
 347 
 348             //
 349             //  Look up the next state for this input symbol when in the
 350             //  current state.
 351             //
 352             curState = fTransTable[curState][elemIndex];
 353 
 354             // If its not a legal transition, then invalid
 355             if (curState == -1) {
 356                 if (DEBUG_VALIDATE_CONTENT)
 357                     System.out.println("!!! not a legal transition");
 358                 return childIndex;
 359             }
 360         }
 361 
 362         //
 363         //  We transitioned all the way through the input list. However, that
 364         //  does not mean that we ended in a final state. So check whether
 365         //  our ending state is a final state.
 366         //
 367         if (DEBUG_VALIDATE_CONTENT)
 368             System.out.println("curState="+curState+", childCount="+length);
 369         if (!fFinalStateFlags[curState])
 370             return length;
 371 
 372         // success!
 373         return -1;
 374     } // validate
 375 
 376 
 377     //
 378     // Private methods
 379     //
 380 
 381     /**
 382      * Builds the internal DFA transition table from the given syntax tree.
 383      *
 384      * @param syntaxTree The syntax tree.
 385      *
 386      * @exception CMException Thrown if DFA cannot be built.
 387      */
 388     private void buildDFA(CMNode syntaxTree)
 389     {
 390         //
 391         //  The first step we need to take is to rewrite the content model
 392         //  using our CMNode objects, and in the process get rid of any
 393         //  repetition short cuts, converting them into '*' style repetitions
 394         //  or getting rid of repetitions altogether.
 395         //
 396         //  The conversions done are:
 397         //
 398         //  x+ -> (x|x*)
 399         //  x? -> (x|epsilon)
 400         //
 401         //  This is a relatively complex scenario. What is happening is that
 402         //  we create a top level binary node of which the special EOC value
 403         //  is set as the right side node. The the left side is set to the
 404         //  rewritten syntax tree. The source is the original content model
 405         //  info from the decl pool. The rewrite is done by buildSyntaxTree()
 406         //  which recurses the decl pool's content of the element and builds
 407         //  a new tree in the process.
 408         //
 409         //  Note that, during this operation, we set each non-epsilon leaf
 410         //  node's DFA state position and count the number of such leafs, which
 411         //  is left in the fLeafCount member.
 412         //
 413         //  The nodeTmp object is passed in just as a temp node to use during
 414         //  the recursion. Otherwise, we'd have to create a new node on every
 415         //  level of recursion, which would be piggy in Java (as is everything
 416         //  for that matter.)
 417         //
 418 
 419         /* MODIFIED (Jan, 2001)
 420          *
 421          * Use following rules.
 422          *   nullable(x+) := nullable(x), first(x+) := first(x),  last(x+) := last(x)
 423          *   nullable(x?) := true, first(x?) := first(x),  last(x?) := last(x)
 424          *
 425          * The same computation of follow as x* is applied to x+
 426          *
 427          * The modification drastically reduces computation time of
 428          * "(a, (b, a+, (c, (b, a+)+, a+, (d,  (c, (b, a+)+, a+)+, (b, a+)+, a+)+)+)+)+"
 429          */
 430 
 431         fQName.setValues(null, fEOCString, fEOCString, null);
 432         CMLeaf nodeEOC = new CMLeaf(fQName);
 433         fHeadNode = new CMBinOp
 434         (
 435             XMLContentSpec.CONTENTSPECNODE_SEQ
 436             , syntaxTree
 437             , nodeEOC
 438         );
 439 
 440         //
 441         //  And handle specially the EOC node, which also must be numbered
 442         //  and counted as a non-epsilon leaf node. It could not be handled
 443         //  in the above tree build because it was created before all that
 444         //  started. We save the EOC position since its used during the DFA
 445         //  building loop.
 446         //
 447         fEOCPos = fLeafCount;
 448         nodeEOC.setPosition(fLeafCount++);
 449 
 450         //
 451         //  Ok, so now we have to iterate the new tree and do a little more
 452         //  work now that we know the leaf count. One thing we need to do is
 453         //  to calculate the first and last position sets of each node. This
 454         //  is cached away in each of the nodes.
 455         //
 456         //  Along the way we also set the leaf count in each node as the
 457         //  maximum state count. They must know this in order to create their
 458         //  first/last pos sets.
 459         //
 460         //  We also need to build an array of references to the non-epsilon
 461         //  leaf nodes. Since we iterate it in the same way as before, this
 462         //  will put them in the array according to their position values.
 463         //
 464         fLeafList = new CMLeaf[fLeafCount];
 465         fLeafListType = new int[fLeafCount];
 466         postTreeBuildInit(fHeadNode, 0);
 467 
 468         //
 469         //  And, moving onward... We now need to build the follow position
 470         //  sets for all the nodes. So we allocate an array of state sets,
 471         //  one for each leaf node (i.e. each DFA position.)
 472         //
 473         fFollowList = new CMStateSet[fLeafCount];
 474         for (int index = 0; index < fLeafCount; index++)
 475             fFollowList[index] = new CMStateSet(fLeafCount);
 476         calcFollowList(fHeadNode);
 477         //
 478         //  And finally the big push... Now we build the DFA using all the
 479         //  states and the tree we've built up. First we set up the various
 480         //  data structures we are going to use while we do this.
 481         //
 482         //  First of all we need an array of unique element names in our
 483         //  content model. For each transition table entry, we need a set of
 484         //  contiguous indices to represent the transitions for a particular
 485         //  input element. So we need to a zero based range of indexes that
 486         //  map to element types. This element map provides that mapping.
 487         //
 488         fElemMap = new QName[fLeafCount];
 489         fElemMapType = new int[fLeafCount];
 490         fElemMapSize = 0;
 491         for (int outIndex = 0; outIndex < fLeafCount; outIndex++)
 492         {
 493             fElemMap[outIndex] = new QName();
 494 
 495             /****
 496             if ( (fLeafListType[outIndex] & 0x0f) != 0 ) {
 497                 if (fLeafNameTypeVector == null) {
 498                     fLeafNameTypeVector = new ContentLeafNameTypeVector();
 499                 }
 500             }
 501             /****/
 502 
 503             // Get the current leaf's element index
 504             final QName element = fLeafList[outIndex].getElement();
 505 
 506             // See if the current leaf node's element index is in the list
 507             int inIndex = 0;
 508             for (; inIndex < fElemMapSize; inIndex++)
 509             {
 510                 if (fElemMap[inIndex].rawname == element.rawname) {
 511                     break;
 512                 }
 513             }
 514 
 515             // If it was not in the list, then add it, if not the EOC node
 516             if (inIndex == fElemMapSize) {
 517                 fElemMap[fElemMapSize].setValues(element);
 518                 fElemMapType[fElemMapSize] = fLeafListType[outIndex];
 519                 fElemMapSize++;
 520             }
 521         }
 522         // set up the fLeafNameTypeVector object if there is one.
 523         /*****
 524         if (fLeafNameTypeVector != null) {
 525             fLeafNameTypeVector.setValues(fElemMap, fElemMapType, fElemMapSize);
 526         }
 527 
 528         /***
 529         * Optimization(Jan, 2001); We sort fLeafList according to
 530         * elemIndex which is *uniquely* associated to each leaf.
 531         * We are *assuming* that each element appears in at least one leaf.
 532         **/
 533 
 534         int[] fLeafSorter = new int[fLeafCount + fElemMapSize];
 535         int fSortCount = 0;
 536 
 537         for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++) {
 538             for (int leafIndex = 0; leafIndex < fLeafCount; leafIndex++) {
 539                     final QName leaf = fLeafList[leafIndex].getElement();
 540                     final QName element = fElemMap[elemIndex];
 541                     if (leaf.rawname == element.rawname) {
 542                             fLeafSorter[fSortCount++] = leafIndex;
 543                     }
 544             }
 545             fLeafSorter[fSortCount++] = -1;
 546         }
 547 
 548         /* Optimization(Jan, 2001) */
 549 
 550         //
 551         //  Next lets create some arrays, some that that hold transient
 552         //  information during the DFA build and some that are permament.
 553         //  These are kind of sticky since we cannot know how big they will
 554         //  get, but we don't want to use any Java collections because of
 555         //  performance.
 556         //
 557         //  Basically they will probably be about fLeafCount*2 on average,
 558         //  but can be as large as 2^(fLeafCount*2), worst case. So we start
 559         //  with fLeafCount*4 as a middle ground. This will be very unlikely
 560         //  to ever have to expand, though it if does, the overhead will be
 561         //  somewhat ugly.
 562         //
 563         int curArraySize = fLeafCount * 4;
 564         CMStateSet[] statesToDo = new CMStateSet[curArraySize];
 565         fFinalStateFlags = new boolean[curArraySize];
 566         fTransTable = new int[curArraySize][];
 567 
 568         //
 569         //  Ok we start with the initial set as the first pos set of the
 570         //  head node (which is the seq node that holds the content model
 571         //  and the EOC node.)
 572         //
 573         CMStateSet setT = fHeadNode.firstPos();
 574 
 575         //
 576         //  Init our two state flags. Basically the unmarked state counter
 577         //  is always chasing the current state counter. When it catches up,
 578         //  that means we made a pass through that did not add any new states
 579         //  to the lists, at which time we are done. We could have used a
 580         //  expanding array of flags which we used to mark off states as we
 581         //  complete them, but this is easier though less readable maybe.
 582         //
 583         int unmarkedState = 0;
 584         int curState = 0;
 585 
 586         //
 587         //  Init the first transition table entry, and put the initial state
 588         //  into the states to do list, then bump the current state.
 589         //
 590         fTransTable[curState] = makeDefStateList();
 591         statesToDo[curState] = setT;
 592         curState++;
 593 
 594             /* Optimization(Jan, 2001); This is faster for
 595              * a large content model such as, "(t001+|t002+|.... |t500+)".
 596              */
 597 
 598         Map<CMStateSet, Integer> stateTable = new HashMap<>();
 599 
 600             /* Optimization(Jan, 2001) */
 601 
 602         //
 603         //  Ok, almost done with the algorithm... We now enter the
 604         //  loop where we go until the states done counter catches up with
 605         //  the states to do counter.
 606         //
 607         while (unmarkedState < curState)
 608         {
 609             //
 610             //  Get the first unmarked state out of the list of states to do.
 611             //  And get the associated transition table entry.
 612             //
 613             setT = statesToDo[unmarkedState];
 614             int[] transEntry = fTransTable[unmarkedState];
 615 
 616             // Mark this one final if it contains the EOC state
 617             fFinalStateFlags[unmarkedState] = setT.getBit(fEOCPos);
 618 
 619             // Bump up the unmarked state count, marking this state done
 620             unmarkedState++;
 621 
 622             // Loop through each possible input symbol in the element map
 623             CMStateSet newSet = null;
 624             /* Optimization(Jan, 2001) */
 625             int sorterIndex = 0;
 626             /* Optimization(Jan, 2001) */
 627             for (int elemIndex = 0; elemIndex < fElemMapSize; elemIndex++)
 628             {
 629                 //
 630                 //  Build up a set of states which is the union of all of
 631                 //  the follow sets of DFA positions that are in the current
 632                 //  state. If we gave away the new set last time through then
 633                 //  create a new one. Otherwise, zero out the existing one.
 634                 //
 635                 if (newSet == null)
 636                     newSet = new CMStateSet(fLeafCount);
 637                 else
 638                     newSet.zeroBits();
 639 
 640             /* Optimization(Jan, 2001) */
 641                 int leafIndex = fLeafSorter[sorterIndex++];
 642 
 643                 while (leafIndex != -1) {
 644                 // If this leaf index (DFA position) is in the current set...
 645                     if (setT.getBit(leafIndex))
 646                     {
 647                         //
 648                         //  If this leaf is the current input symbol, then we
 649                         //  want to add its follow list to the set of states to
 650                         //  transition to from the current state.
 651                         //
 652                                 newSet.union(fFollowList[leafIndex]);
 653                             }
 654 
 655                    leafIndex = fLeafSorter[sorterIndex++];
 656         }
 657             /* Optimization(Jan, 2001) */
 658 
 659                 //
 660                 //  If this new set is not empty, then see if its in the list
 661                 //  of states to do. If not, then add it.
 662                 //
 663                 if (!newSet.isEmpty())
 664                 {
 665                     //
 666                     //  Search the 'states to do' list to see if this new
 667                     //  state set is already in there.
 668                     //
 669 
 670             /* Optimization(Jan, 2001) */
 671             Integer stateObj = stateTable.get(newSet);
 672             int stateIndex = (stateObj == null ? curState : stateObj.intValue());
 673             /* Optimization(Jan, 2001) */
 674 
 675                     // If we did not find it, then add it
 676                     if (stateIndex == curState)
 677                     {
 678                         //
 679                         //  Put this new state into the states to do and init
 680                         //  a new entry at the same index in the transition
 681                         //  table.
 682                         //
 683                         statesToDo[curState] = newSet;
 684                         fTransTable[curState] = makeDefStateList();
 685 
 686             /* Optimization(Jan, 2001) */
 687                         stateTable.put(newSet, curState);
 688             /* Optimization(Jan, 2001) */
 689 
 690                         // We now have a new state to do so bump the count
 691                         curState++;
 692 
 693                         //
 694                         //  Null out the new set to indicate we adopted it.
 695                         //  This will cause the creation of a new set on the
 696                         //  next time around the loop.
 697                         //
 698                         newSet = null;
 699                     }
 700 
 701                     //
 702                     //  Now set this state in the transition table's entry
 703                     //  for this element (using its index), with the DFA
 704                     //  state we will move to from the current state when we
 705                     //  see this input element.
 706                     //
 707                     transEntry[elemIndex] = stateIndex;
 708 
 709                     // Expand the arrays if we're full
 710                     if (curState == curArraySize)
 711                     {
 712                         //
 713                         //  Yikes, we overflowed the initial array size, so
 714                         //  we've got to expand all of these arrays. So adjust
 715                         //  up the size by 50% and allocate new arrays.
 716                         //
 717                         final int newSize = (int)(curArraySize * 1.5);
 718                         CMStateSet[] newToDo = new CMStateSet[newSize];
 719                         boolean[] newFinalFlags = new boolean[newSize];
 720                         int[][] newTransTable = new int[newSize][];
 721 
 722                         // Copy over all of the existing content
 723                         System.arraycopy(statesToDo, 0, newToDo, 0, curArraySize);
 724                         System.arraycopy(fFinalStateFlags, 0, newFinalFlags, 0, curArraySize);
 725                         System.arraycopy(fTransTable, 0, newTransTable, 0, curArraySize);
 726 
 727                         // Store the new array size
 728                         curArraySize = newSize;
 729                         statesToDo = newToDo;
 730                         fFinalStateFlags = newFinalFlags;
 731                         fTransTable = newTransTable;
 732                     }
 733                 }
 734             }
 735         }
 736 
 737         // Check to see if we can set the fEmptyContentIsValid flag.
 738         fEmptyContentIsValid = ((CMBinOp)fHeadNode).getLeft().isNullable();
 739 
 740         //
 741         //  And now we can say bye bye to the temp representation since we've
 742         //  built the DFA.
 743         //
 744         if (DEBUG_VALIDATE_CONTENT)
 745             dumpTree(fHeadNode, 0);
 746         fHeadNode = null;
 747         fLeafList = null;
 748         fFollowList = null;
 749 
 750     }
 751 
 752     /**
 753      * Calculates the follow list of the current node.
 754      *
 755      * @param nodeCur The curent node.
 756      *
 757      * @exception CMException Thrown if follow list cannot be calculated.
 758      */
 759     private void calcFollowList(CMNode nodeCur)
 760     {
 761         // Recurse as required
 762         if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 763         {
 764             // Recurse only
 765             calcFollowList(((CMBinOp)nodeCur).getLeft());
 766             calcFollowList(((CMBinOp)nodeCur).getRight());
 767         }
 768          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ)
 769         {
 770             // Recurse first
 771             calcFollowList(((CMBinOp)nodeCur).getLeft());
 772             calcFollowList(((CMBinOp)nodeCur).getRight());
 773 
 774             //
 775             //  Now handle our level. We use our left child's last pos
 776             //  set and our right child's first pos set, so go ahead and
 777             //  get them ahead of time.
 778             //
 779             final CMStateSet last  = ((CMBinOp)nodeCur).getLeft().lastPos();
 780             final CMStateSet first = ((CMBinOp)nodeCur).getRight().firstPos();
 781 
 782             //
 783             //  Now, for every position which is in our left child's last set
 784             //  add all of the states in our right child's first set to the
 785             //  follow set for that position.
 786             //
 787             for (int index = 0; index < fLeafCount; index++)
 788             {
 789                 if (last.getBit(index))
 790                     fFollowList[index].union(first);
 791             }
 792         }
 793          /***
 794          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
 795         {
 796             // Recurse first
 797             calcFollowList(((CMUniOp)nodeCur).getChild());
 798 
 799             //
 800             //  Now handle our level. We use our own first and last position
 801             //  sets, so get them up front.
 802             //
 803             final CMStateSet first = nodeCur.firstPos();
 804             final CMStateSet last  = nodeCur.lastPos();
 805 
 806             //
 807             //  For every position which is in our last position set, add all
 808             //  of our first position states to the follow set for that
 809             //  position.
 810             //
 811             for (int index = 0; index < fLeafCount; index++)
 812             {
 813                 if (last.getBit(index))
 814                     fFollowList[index].union(first);
 815             }
 816         }
 817          else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
 818               ||  (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE))
 819         {
 820             throw new RuntimeException("ImplementationMessages.VAL_NIICM");
 821         }
 822         /***/
 823          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
 824             || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE)
 825         {
 826             // Recurse first
 827             calcFollowList(((CMUniOp)nodeCur).getChild());
 828 
 829             //
 830             //  Now handle our level. We use our own first and last position
 831             //  sets, so get them up front.
 832             //
 833             final CMStateSet first = nodeCur.firstPos();
 834             final CMStateSet last  = nodeCur.lastPos();
 835 
 836             //
 837             //  For every position which is in our last position set, add all
 838             //  of our first position states to the follow set for that
 839             //  position.
 840             //
 841             for (int index = 0; index < fLeafCount; index++)
 842             {
 843                 if (last.getBit(index))
 844                     fFollowList[index].union(first);
 845             }
 846         }
 847 
 848         else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE) {
 849             // Recurse only
 850             calcFollowList(((CMUniOp)nodeCur).getChild());
 851         }
 852          /***/
 853     }
 854 
 855     /**
 856      * Dumps the tree of the current node to standard output.
 857      *
 858      * @param nodeCur The current node.
 859      * @param level   The maximum levels to output.
 860      *
 861      * @exception CMException Thrown on error.
 862      */
 863     private void dumpTree(CMNode nodeCur, int level)
 864     {
 865         for (int index = 0; index < level; index++)
 866             System.out.print("   ");
 867 
 868         int type = nodeCur.type();
 869         if ((type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 870         ||  (type == XMLContentSpec.CONTENTSPECNODE_SEQ))
 871         {
 872             if (type == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 873                 System.out.print("Choice Node ");
 874             else
 875                 System.out.print("Seq Node ");
 876 
 877             if (nodeCur.isNullable())
 878                 System.out.print("Nullable ");
 879 
 880             System.out.print("firstPos=");
 881             System.out.print(nodeCur.firstPos().toString());
 882             System.out.print(" lastPos=");
 883             System.out.println(nodeCur.lastPos().toString());
 884 
 885             dumpTree(((CMBinOp)nodeCur).getLeft(), level+1);
 886             dumpTree(((CMBinOp)nodeCur).getRight(), level+1);
 887         }
 888          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE)
 889         {
 890             System.out.print("Rep Node ");
 891 
 892             if (nodeCur.isNullable())
 893                 System.out.print("Nullable ");
 894 
 895             System.out.print("firstPos=");
 896             System.out.print(nodeCur.firstPos().toString());
 897             System.out.print(" lastPos=");
 898             System.out.println(nodeCur.lastPos().toString());
 899 
 900             dumpTree(((CMUniOp)nodeCur).getChild(), level+1);
 901         }
 902          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
 903         {
 904             System.out.print
 905             (
 906                 "Leaf: (pos="
 907                 + ((CMLeaf)nodeCur).getPosition()
 908                 + "), "
 909                 + ((CMLeaf)nodeCur).getElement()
 910                 + "(elemIndex="
 911                 + ((CMLeaf)nodeCur).getElement()
 912                 + ") "
 913             );
 914 
 915             if (nodeCur.isNullable())
 916                 System.out.print(" Nullable ");
 917 
 918             System.out.print("firstPos=");
 919             System.out.print(nodeCur.firstPos().toString());
 920             System.out.print(" lastPos=");
 921             System.out.println(nodeCur.lastPos().toString());
 922         }
 923          else
 924         {
 925             throw new RuntimeException("ImplementationMessages.VAL_NIICM");
 926         }
 927     }
 928 
 929 
 930     /**
 931      * -1 is used to represent bad transitions in the transition table
 932      * entry for each state. So each entry is initialized to an all -1
 933      * array. This method creates a new entry and initializes it.
 934      */
 935     private int[] makeDefStateList()
 936     {
 937         int[] retArray = new int[fElemMapSize];
 938         for (int index = 0; index < fElemMapSize; index++)
 939             retArray[index] = -1;
 940         return retArray;
 941     }
 942 
 943     /** Post tree build initialization. */
 944     private int postTreeBuildInit(CMNode nodeCur, int curIndex)
 945     {
 946         // Set the maximum states on this node
 947         nodeCur.setMaxStates(fLeafCount);
 948 
 949         // Recurse as required
 950         if ((nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY ||
 951             (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_LOCAL ||
 952             (nodeCur.type() & 0x0f) == XMLContentSpec.CONTENTSPECNODE_ANY_OTHER) {
 953             // REVISIT: Don't waste these structures.
 954             QName qname = new QName(null, null, null, ((CMAny)nodeCur).getURI());
 955             fLeafList[curIndex] = new CMLeaf(qname, ((CMAny)nodeCur).getPosition());
 956             fLeafListType[curIndex] = nodeCur.type();
 957             curIndex++;
 958         }
 959         else if ((nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_CHOICE)
 960         ||  (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_SEQ))
 961         {
 962             curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getLeft(), curIndex);
 963             curIndex = postTreeBuildInit(((CMBinOp)nodeCur).getRight(), curIndex);
 964         }
 965          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_MORE
 966              || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ONE_OR_MORE
 967              || nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_ZERO_OR_ONE)
 968         {
 969             curIndex = postTreeBuildInit(((CMUniOp)nodeCur).getChild(), curIndex);
 970         }
 971          else if (nodeCur.type() == XMLContentSpec.CONTENTSPECNODE_LEAF)
 972         {
 973             //
 974             //  Put this node in the leaf list at the current index if its
 975             //  a non-epsilon leaf.
 976             //
 977              final QName node = ((CMLeaf)nodeCur).getElement();
 978             if (node.localpart != fEpsilonString) {
 979                 fLeafList[curIndex] = (CMLeaf)nodeCur;
 980                 fLeafListType[curIndex] = XMLContentSpec.CONTENTSPECNODE_LEAF;
 981                 curIndex++;
 982             }
 983         }
 984          else
 985         {
 986             throw new RuntimeException("ImplementationMessages.VAL_NIICM: type="+nodeCur.type());
 987         }
 988         return curIndex;
 989     }
 990 
 991 } // class DFAContentModel