1 /*
   2  * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "libadt/vectset.hpp"
  27 #include "memory/allocation.inline.hpp"
  28 #include "opto/block.hpp"
  29 #include "opto/c2compiler.hpp"
  30 #include "opto/callnode.hpp"
  31 #include "opto/cfgnode.hpp"
  32 #include "opto/machnode.hpp"
  33 #include "opto/opcodes.hpp"
  34 #include "opto/phaseX.hpp"
  35 #include "opto/rootnode.hpp"
  36 #include "opto/runtime.hpp"
  37 #include "runtime/deoptimization.hpp"
  38 #ifdef TARGET_ARCH_MODEL_x86_32
  39 # include "adfiles/ad_x86_32.hpp"
  40 #endif
  41 #ifdef TARGET_ARCH_MODEL_x86_64
  42 # include "adfiles/ad_x86_64.hpp"
  43 #endif
  44 #ifdef TARGET_ARCH_MODEL_sparc
  45 # include "adfiles/ad_sparc.hpp"
  46 #endif
  47 #ifdef TARGET_ARCH_MODEL_zero
  48 # include "adfiles/ad_zero.hpp"
  49 #endif
  50 #ifdef TARGET_ARCH_MODEL_arm
  51 # include "adfiles/ad_arm.hpp"
  52 #endif
  53 #ifdef TARGET_ARCH_MODEL_ppc_32
  54 # include "adfiles/ad_ppc_32.hpp"
  55 #endif
  56 #ifdef TARGET_ARCH_MODEL_ppc_64
  57 # include "adfiles/ad_ppc_64.hpp"
  58 #endif
  59 
  60 
  61 // Portions of code courtesy of Clifford Click
  62 
  63 // Optimization - Graph Style
  64 
  65 // To avoid float value underflow
  66 #define MIN_BLOCK_FREQUENCY 1.e-35f
  67 
  68 //----------------------------schedule_node_into_block-------------------------
  69 // Insert node n into block b. Look for projections of n and make sure they
  70 // are in b also.
  71 void PhaseCFG::schedule_node_into_block( Node *n, Block *b ) {
  // Set the basic block of n and add n to b.
  73   map_node_to_block(n, b);
  74   b->add_inst(n);
  75 
  76   // After Matching, nearly any old Node may have projections trailing it.
  77   // These are usually machine-dependent flags.  In any case, they might
  78   // float to another block below this one.  Move them up.
  79   for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
  80     Node*  use  = n->fast_out(i);
  81     if (use->is_Proj()) {
  82       Block* buse = get_block_for_node(use);
  83       if (buse != b) {              // In wrong block?
  84         if (buse != NULL) {
  85           buse->find_remove(use);   // Remove from wrong block
  86         }
  87         map_node_to_block(use, b);
  88         b->add_inst(use);
  89       }
  90     }
  91   }
  92 }
  93 
  94 //----------------------------replace_block_proj_ctrl-------------------------
  95 // Nodes that have is_block_proj() nodes as their control need to use
  96 // the appropriate Region for their actual block as their control since
  97 // the projection will be in a predecessor block.
  98 void PhaseCFG::replace_block_proj_ctrl( Node *n ) {
  99   const Node *in0 = n->in(0);
 100   assert(in0 != NULL, "Only control-dependent");
 101   const Node *p = in0->is_block_proj();
 102   if (p != NULL && p != n) {    // Control from a block projection?
 103     assert(!n->pinned() || n->is_MachConstantBase(), "only pinned MachConstantBase node is expected here");
 104     // Find trailing Region
 105     Block *pb = get_block_for_node(in0); // Block-projection already has basic block
 106     uint j = 0;
    if (pb->_num_succs != 1) {  // More than 1 successor?
 108       // Search for successor
 109       uint max = pb->number_of_nodes();
 110       assert( max > 1, "" );
 111       uint start = max - pb->_num_succs;
 112       // Find which output path belongs to projection
 113       for (j = start; j < max; j++) {
 114         if( pb->get_node(j) == in0 )
 115           break;
 116       }
 117       assert( j < max, "must find" );
 118       // Change control to match head of successor basic block
 119       j -= start;
 120     }
 121     n->set_req(0, pb->_succs[j]->head());
 122   }
 123 }
 124 
 125 
 126 //------------------------------schedule_pinned_nodes--------------------------
 127 // Set the basic block for Nodes pinned into blocks
 128 void PhaseCFG::schedule_pinned_nodes(VectorSet &visited) {
 129   // Allocate node stack of size C->unique()+8 to avoid frequent realloc
 130   GrowableArray <Node *> spstack(C->unique() + 8);
 131   spstack.push(_root);
 132   while (spstack.is_nonempty()) {
 133     Node* node = spstack.pop();
 134     if (!visited.test_set(node->_idx)) { // Test node and flag it as visited
 135       if (node->pinned() && !has_block(node)) {  // Pinned?  Nail it down!
 136         assert(node->in(0), "pinned Node must have Control");
 137         // Before setting block replace block_proj control edge
 138         replace_block_proj_ctrl(node);
 139         Node* input = node->in(0);
 140         while (!input->is_block_start()) {
 141           input = input->in(0);
 142         }
 143         Block* block = get_block_for_node(input); // Basic block of controlling input
 144         schedule_node_into_block(node, block);
 145       }
 146 
      // Process all inputs that are non-NULL
 148       for (int i = node->req() - 1; i >= 0; --i) {
 149         if (node->in(i) != NULL) {
 150           spstack.push(node->in(i));
 151         }
 152       }
 153     }
 154   }
 155 }
 156 
 157 #ifdef ASSERT
 158 // Assert that new input b2 is dominated by all previous inputs.
// Check this by seeing that it is dominated by b1, the deepest
// input observed before b2.
 161 static void assert_dom(Block* b1, Block* b2, Node* n, const PhaseCFG* cfg) {
 162   if (b1 == NULL)  return;
 163   assert(b1->_dom_depth < b2->_dom_depth, "sanity");
 164   Block* tmp = b2;
 165   while (tmp != b1 && tmp != NULL) {
 166     tmp = tmp->_idom;
 167   }
 168   if (tmp != b1) {
 169     // Detected an unschedulable graph.  Print some nice stuff and die.
 170     tty->print_cr("!!! Unschedulable graph !!!");
 171     for (uint j=0; j<n->len(); j++) { // For all inputs
 172       Node* inn = n->in(j); // Get input
 173       if (inn == NULL)  continue;  // Ignore NULL, missing inputs
 174       Block* inb = cfg->get_block_for_node(inn);
 175       tty->print("B%d idom=B%d depth=%2d ",inb->_pre_order,
 176                  inb->_idom ? inb->_idom->_pre_order : 0, inb->_dom_depth);
 177       inn->dump();
 178     }
 179     tty->print("Failing node: ");
 180     n->dump();
 181     assert(false, "unscheduable graph");
 182   }
 183 }
 184 #endif
 185 
 186 static Block* find_deepest_input(Node* n, const PhaseCFG* cfg) {
 187   // Find the last input dominated by all other inputs.
 188   Block* deepb           = NULL;        // Deepest block so far
 189   int    deepb_dom_depth = 0;
 190   for (uint k = 0; k < n->len(); k++) { // For all inputs
 191     Node* inn = n->in(k);               // Get input
 192     if (inn == NULL)  continue;         // Ignore NULL, missing inputs
 193     Block* inb = cfg->get_block_for_node(inn);
 194     assert(inb != NULL, "must already have scheduled this input");
 195     if (deepb_dom_depth < (int) inb->_dom_depth) {
 196       // The new inb must be dominated by the previous deepb.
 197       // The various inputs must be linearly ordered in the dom
 198       // tree, or else there will not be a unique deepest block.
 199       DEBUG_ONLY(assert_dom(deepb, inb, n, cfg));
 200       deepb = inb;                      // Save deepest block
 201       deepb_dom_depth = deepb->_dom_depth;
 202     }
 203   }
 204   assert(deepb != NULL, "must be at least one input to n");
 205   return deepb;
 206 }
 207 
 208 
 209 //------------------------------schedule_early---------------------------------
 210 // Find the earliest Block any instruction can be placed in.  Some instructions
// are pinned into Blocks.  Unpinned instructions can appear in the last block in
 212 // which all their inputs occur.
 213 bool PhaseCFG::schedule_early(VectorSet &visited, Node_List &roots) {
 214   // Allocate stack with enough space to avoid frequent realloc
 215   Node_Stack nstack(roots.Size() + 8);
 216   // _root will be processed among C->top() inputs
 217   roots.push(C->top());
 218   visited.set(C->top()->_idx);
 219 
 220   while (roots.size() != 0) {
    // Use local variables parent_node & input_index to cache the values
    // at the top of the stack.
 223     Node* parent_node = roots.pop();
 224     uint  input_index = 0;
 225 
 226     while (true) {
 227       if (input_index == 0) {
 228         // Fixup some control.  Constants without control get attached
 229         // to root and nodes that use is_block_proj() nodes should be attached
 230         // to the region that starts their block.
 231         const Node* control_input = parent_node->in(0);
 232         if (control_input != NULL) {
 233           replace_block_proj_ctrl(parent_node);
 234         } else {
 235           // Is a constant with NO inputs?
 236           if (parent_node->req() == 1) {
 237             parent_node->set_req(0, _root);
 238           }
 239         }
 240       }
 241 
 242       // First, visit all inputs and force them to get a block.  If an
 243       // input is already in a block we quit following inputs (to avoid
 244       // cycles). Instead we put that Node on a worklist to be handled
      // later (since ITS inputs may not have a block yet).
 246 
 247       // Assume all n's inputs will be processed
 248       bool done = true;
 249 
 250       while (input_index < parent_node->len()) {
 251         Node* in = parent_node->in(input_index++);
 252         if (in == NULL) {
 253           continue;
 254         }
 255 
 256         int is_visited = visited.test_set(in->_idx);
 257         if (!has_block(in)) {
 258           if (is_visited) {
 259             return false;
 260           }
 261           // Save parent node and next input's index.
 262           nstack.push(parent_node, input_index);
 263           // Process current input now.
 264           parent_node = in;
 265           input_index = 0;
 266           // Not all n's inputs processed.
 267           done = false;
 268           break;
 269         } else if (!is_visited) {
 270           // Visit this guy later, using worklist
 271           roots.push(in);
 272         }
 273       }
 274 
 275       if (done) {
 276         // All of n's inputs have been processed, complete post-processing.
 277 
 278         // Some instructions are pinned into a block.  These include Region,
 279         // Phi, Start, Return, and other control-dependent instructions and
 280         // any projections which depend on them.
 281         if (!parent_node->pinned()) {
 282           // Set earliest legal block.
 283           Block* earliest_block = find_deepest_input(parent_node, this);
 284           map_node_to_block(parent_node, earliest_block);
 285         } else {
 286           assert(get_block_for_node(parent_node) == get_block_for_node(parent_node->in(0)), "Pinned Node should be at the same block as its control edge");
 287         }
 288 
 289         if (nstack.is_empty()) {
 290           // Finished all nodes on stack.
 291           // Process next node on the worklist 'roots'.
 292           break;
 293         }
 294         // Get saved parent node and next input's index.
 295         parent_node = nstack.node();
 296         input_index = nstack.index();
 297         nstack.pop();
 298       }
 299     }
 300   }
 301   return true;
 302 }
 303 
 304 //------------------------------dom_lca----------------------------------------
 305 // Find least common ancestor in dominator tree
 306 // LCA is a current notion of LCA, to be raised above 'this'.
 307 // As a convenient boundary condition, return 'this' if LCA is NULL.
// Find the LCA of these two blocks.
 309 Block* Block::dom_lca(Block* LCA) {
 310   if (LCA == NULL || LCA == this)  return this;
 311 
 312   Block* anc = this;
 313   while (anc->_dom_depth > LCA->_dom_depth)
 314     anc = anc->_idom;           // Walk up till anc is as high as LCA
 315 
 316   while (LCA->_dom_depth > anc->_dom_depth)
 317     LCA = LCA->_idom;           // Walk up till LCA is as high as anc
 318 
 319   while (LCA != anc) {          // Walk both up till they are the same
 320     LCA = LCA->_idom;
 321     anc = anc->_idom;
 322   }
 323 
 324   return LCA;
 325 }
 326 
 327 //--------------------------raise_LCA_above_use--------------------------------
 328 // We are placing a definition, and have been given a def->use edge.
 329 // The definition must dominate the use, so move the LCA upward in the
 330 // dominator tree to dominate the use.  If the use is a phi, adjust
 331 // the LCA only with the phi input paths which actually use this def.
 332 static Block* raise_LCA_above_use(Block* LCA, Node* use, Node* def, const PhaseCFG* cfg) {
 333   Block* buse = cfg->get_block_for_node(use);
 334   if (buse == NULL)    return LCA;   // Unused killing Projs have no use block
 335   if (!use->is_Phi())  return buse->dom_lca(LCA);
 336   uint pmax = use->req();       // Number of Phi inputs
  // Why doesn't this loop just break after finding the matching input to
 338   // the Phi?  Well...it's like this.  I do not have true def-use/use-def
 339   // chains.  Means I cannot distinguish, from the def-use direction, which
 340   // of many use-defs lead from the same use to the same def.  That is, this
 341   // Phi might have several uses of the same def.  Each use appears in a
 342   // different predecessor block.  But when I enter here, I cannot distinguish
 343   // which use-def edge I should find the predecessor block for.  So I find
 344   // them all.  Means I do a little extra work if a Phi uses the same value
 345   // more than once.
 346   for (uint j=1; j<pmax; j++) { // For all inputs
 347     if (use->in(j) == def) {    // Found matching input?
 348       Block* pred = cfg->get_block_for_node(buse->pred(j));
 349       LCA = pred->dom_lca(LCA);
 350     }
 351   }
 352   return LCA;
 353 }
 354 
 355 //----------------------------raise_LCA_above_marks----------------------------
 356 // Return a new LCA that dominates LCA and any of its marked predecessors.
 357 // Search all my parents up to 'early' (exclusive), looking for predecessors
 358 // which are marked with the given index.  Return the LCA (in the dom tree)
 359 // of all marked blocks.  If there are none marked, return the original
 360 // LCA.
 361 static Block* raise_LCA_above_marks(Block* LCA, node_idx_t mark, Block* early, const PhaseCFG* cfg) {
 362   Block_List worklist;
 363   worklist.push(LCA);
 364   while (worklist.size() > 0) {
 365     Block* mid = worklist.pop();
 366     if (mid == early)  continue;  // stop searching here
 367 
 368     // Test and set the visited bit.
 369     if (mid->raise_LCA_visited() == mark)  continue;  // already visited
 370 
 371     // Don't process the current LCA, otherwise the search may terminate early
 372     if (mid != LCA && mid->raise_LCA_mark() == mark) {
 373       // Raise the LCA.
 374       LCA = mid->dom_lca(LCA);
 375       if (LCA == early)  break;   // stop searching everywhere
 376       assert(early->dominates(LCA), "early is high enough");
 377       // Resume searching at that point, skipping intermediate levels.
 378       worklist.push(LCA);
 379       if (LCA == mid)
 380         continue; // Don't mark as visited to avoid early termination.
 381     } else {
 382       // Keep searching through this block's predecessors.
 383       for (uint j = 1, jmax = mid->num_preds(); j < jmax; j++) {
 384         Block* mid_parent = cfg->get_block_for_node(mid->pred(j));
 385         worklist.push(mid_parent);
 386       }
 387     }
 388     mid->set_raise_LCA_visited(mark);
 389   }
 390   return LCA;
 391 }
 392 
 393 //--------------------------memory_early_block--------------------------------
 394 // This is a variation of find_deepest_input, the heart of schedule_early.
 395 // Find the "early" block for a load, if we considered only memory and
 396 // address inputs, that is, if other data inputs were ignored.
 397 //
// Because only a subset of edges is considered, the resulting block will
 399 // be earlier (at a shallower dom_depth) than the true schedule_early
 400 // point of the node. We compute this earlier block as a more permissive
 401 // site for anti-dependency insertion, but only if subsume_loads is enabled.
 402 static Block* memory_early_block(Node* load, Block* early, const PhaseCFG* cfg) {
 403   Node* base;
 404   Node* index;
 405   Node* store = load->in(MemNode::Memory);
 406   load->as_Mach()->memory_inputs(base, index);
 407 
 408   assert(base != NodeSentinel && index != NodeSentinel,
 409          "unexpected base/index inputs");
 410 
 411   Node* mem_inputs[4];
 412   int mem_inputs_length = 0;
 413   if (base != NULL)  mem_inputs[mem_inputs_length++] = base;
 414   if (index != NULL) mem_inputs[mem_inputs_length++] = index;
 415   if (store != NULL) mem_inputs[mem_inputs_length++] = store;
 416 
  // In the comparison below, add one to account for the control input,
 418   // which may be null, but always takes up a spot in the in array.
 419   if (mem_inputs_length + 1 < (int) load->req()) {
 420     // This "load" has more inputs than just the memory, base and index inputs.
 421     // For purposes of checking anti-dependences, we need to start
 422     // from the early block of only the address portion of the instruction,
 423     // and ignore other blocks that may have factored into the wider
 424     // schedule_early calculation.
 425     if (load->in(0) != NULL) mem_inputs[mem_inputs_length++] = load->in(0);
 426 
 427     Block* deepb           = NULL;        // Deepest block so far
 428     int    deepb_dom_depth = 0;
 429     for (int i = 0; i < mem_inputs_length; i++) {
 430       Block* inb = cfg->get_block_for_node(mem_inputs[i]);
 431       if (deepb_dom_depth < (int) inb->_dom_depth) {
 432         // The new inb must be dominated by the previous deepb.
 433         // The various inputs must be linearly ordered in the dom
 434         // tree, or else there will not be a unique deepest block.
 435         DEBUG_ONLY(assert_dom(deepb, inb, load, cfg));
 436         deepb = inb;                      // Save deepest block
 437         deepb_dom_depth = deepb->_dom_depth;
 438       }
 439     }
 440     early = deepb;
 441   }
 442 
 443   return early;
 444 }
 445 
 446 //--------------------------insert_anti_dependences---------------------------
 447 // A load may need to witness memory that nearby stores can overwrite.
 448 // For each nearby store, either insert an "anti-dependence" edge
 449 // from the load to the store, or else move LCA upward to force the
 450 // load to (eventually) be scheduled in a block above the store.
 451 //
 452 // Do not add edges to stores on distinct control-flow paths;
 453 // only add edges to stores which might interfere.
 454 //
 455 // Return the (updated) LCA.  There will not be any possibly interfering
 456 // store between the load's "early block" and the updated LCA.
 457 // Any stores in the updated LCA will have new precedence edges
 458 // back to the load.  The caller is expected to schedule the load
 459 // in the LCA, in which case the precedence edges will make LCM
 460 // preserve anti-dependences.  The caller may also hoist the load
 461 // above the LCA, if it is not the early block.
 462 Block* PhaseCFG::insert_anti_dependences(Block* LCA, Node* load, bool verify) {
 463   assert(load->needs_anti_dependence_check(), "must be a load of some sort");
 464   assert(LCA != NULL, "");
 465   DEBUG_ONLY(Block* LCA_orig = LCA);
 466 
 467   // Compute the alias index.  Loads and stores with different alias indices
 468   // do not need anti-dependence edges.
 469   uint load_alias_idx = C->get_alias_index(load->adr_type());
 470 #ifdef ASSERT
 471   if (load_alias_idx == Compile::AliasIdxBot && C->AliasLevel() > 0 &&
 472       (PrintOpto || VerifyAliases ||
       (PrintMiscellaneous && (WizardMode || Verbose)))) {
 474     // Load nodes should not consume all of memory.
 475     // Reporting a bottom type indicates a bug in adlc.
 476     // If some particular type of node validly consumes all of memory,
 477     // sharpen the preceding "if" to exclude it, so we can catch bugs here.
 478     tty->print_cr("*** Possible Anti-Dependence Bug:  Load consumes all of memory.");
 479     load->dump(2);
 480     if (VerifyAliases)  assert(load_alias_idx != Compile::AliasIdxBot, "");
 481   }
 482 #endif
 483   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrComp),
 484          "String compare is only known 'load' that does not conflict with any stores");
 485   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrEquals),
 486          "String equals is a 'load' that does not conflict with any stores");
 487   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_StrIndexOf),
 488          "String indexOf is a 'load' that does not conflict with any stores");
 489   assert(load_alias_idx || (load->is_Mach() && load->as_Mach()->ideal_Opcode() == Op_AryEq),
 490          "Arrays equals is a 'load' that do not conflict with any stores");
 491 
 492   if (!C->alias_type(load_alias_idx)->is_rewritable()) {
 493     // It is impossible to spoil this load by putting stores before it,
 494     // because we know that the stores will never update the value
 495     // which 'load' must witness.
 496     return LCA;
 497   }
 498 
 499   node_idx_t load_index = load->_idx;
 500 
  // Note the earliest legal placement of 'load', as determined
  // by the unique point in the dom tree where all memory effects
 503   // and other inputs are first available.  (Computed by schedule_early.)
 504   // For normal loads, 'early' is the shallowest place (dom graph wise)
 505   // to look for anti-deps between this load and any store.
 506   Block* early = get_block_for_node(load);
 507 
 508   // If we are subsuming loads, compute an "early" block that only considers
 509   // memory or address inputs. This block may be different than the
 510   // schedule_early block in that it could be at an even shallower depth in the
 511   // dominator tree, and allow for a broader discovery of anti-dependences.
 512   if (C->subsume_loads()) {
 513     early = memory_early_block(load, early, this);
 514   }
 515 
 516   ResourceArea *area = Thread::current()->resource_area();
 517   Node_List worklist_mem(area);     // prior memory state to store
 518   Node_List worklist_store(area);   // possible-def to explore
 519   Node_List worklist_visited(area); // visited mergemem nodes
 520   Node_List non_early_stores(area); // all relevant stores outside of early
 521   bool must_raise_LCA = false;
 522 
 523 #ifdef TRACK_PHI_INPUTS
 524   // %%% This extra checking fails because MergeMem nodes are not GVNed.
 525   // Provide "phi_inputs" to check if every input to a PhiNode is from the
  // original memory state.  This indicates a PhiNode which should not
  // prevent the load from sinking.  For such a block, set_raise_LCA_mark
 528   // may be overly conservative.
 529   // Mechanism: count inputs seen for each Phi encountered in worklist_store.
 530   DEBUG_ONLY(GrowableArray<uint> phi_inputs(area, C->unique(),0,0));
 531 #endif
 532 
 533   // 'load' uses some memory state; look for users of the same state.
 534   // Recurse through MergeMem nodes to the stores that use them.
 535 
 536   // Each of these stores is a possible definition of memory
 537   // that 'load' needs to use.  We need to force 'load'
 538   // to occur before each such store.  When the store is in
 539   // the same block as 'load', we insert an anti-dependence
 540   // edge load->store.
 541 
 542   // The relevant stores "nearby" the load consist of a tree rooted
 543   // at initial_mem, with internal nodes of type MergeMem.
 544   // Therefore, the branches visited by the worklist are of this form:
 545   //    initial_mem -> (MergeMem ->)* store
 546   // The anti-dependence constraints apply only to the fringe of this tree.
 547 
 548   Node* initial_mem = load->in(MemNode::Memory);
 549   worklist_store.push(initial_mem);
 550   worklist_visited.push(initial_mem);
 551   worklist_mem.push(NULL);
 552   while (worklist_store.size() > 0) {
 553     // Examine a nearby store to see if it might interfere with our load.
 554     Node* mem   = worklist_mem.pop();
 555     Node* store = worklist_store.pop();
 556     uint op = store->Opcode();
 557 
 558     // MergeMems do not directly have anti-deps.
 559     // Treat them as internal nodes in a forward tree of memory states,
 560     // the leaves of which are each a 'possible-def'.
 561     if (store == initial_mem    // root (exclusive) of tree we are searching
 562         || op == Op_MergeMem    // internal node of tree we are searching
 563         ) {
 564       mem = store;   // It's not a possibly interfering store.
 565       if (store == initial_mem)
 566         initial_mem = NULL;  // only process initial memory once
 567 
 568       for (DUIterator_Fast imax, i = mem->fast_outs(imax); i < imax; i++) {
 569         store = mem->fast_out(i);
 570         if (store->is_MergeMem()) {
 571           // Be sure we don't get into combinatorial problems.
 572           // (Allow phis to be repeated; they can merge two relevant states.)
 573           uint j = worklist_visited.size();
 574           for (; j > 0; j--) {
 575             if (worklist_visited.at(j-1) == store)  break;
 576           }
 577           if (j > 0)  continue; // already on work list; do not repeat
 578           worklist_visited.push(store);
 579         }
 580         worklist_mem.push(mem);
 581         worklist_store.push(store);
 582       }
 583       continue;
 584     }
 585 
 586     if (op == Op_MachProj || op == Op_Catch)   continue;
 587     if (store->needs_anti_dependence_check())  continue;  // not really a store
 588 
 589     // Compute the alias index.  Loads and stores with different alias
 590     // indices do not need anti-dependence edges.  Wide MemBar's are
 591     // anti-dependent on everything (except immutable memories).
 592     const TypePtr* adr_type = store->adr_type();
 593     if (!C->can_alias(adr_type, load_alias_idx))  continue;
 594 
 595     // Most slow-path runtime calls do NOT modify Java memory, but
 596     // they can block and so write Raw memory.
 597     if (store->is_Mach()) {
 598       MachNode* mstore = store->as_Mach();
 599       if (load_alias_idx != Compile::AliasIdxRaw) {
 600         // Check for call into the runtime using the Java calling
 601         // convention (and from there into a wrapper); it has no
 602         // _method.  Can't do this optimization for Native calls because
 603         // they CAN write to Java memory.
 604         if (mstore->ideal_Opcode() == Op_CallStaticJava) {
 605           assert(mstore->is_MachSafePoint(), "");
 606           MachSafePointNode* ms = (MachSafePointNode*) mstore;
 607           assert(ms->is_MachCallJava(), "");
 608           MachCallJavaNode* mcj = (MachCallJavaNode*) ms;
 609           if (mcj->_method == NULL) {
 610             // These runtime calls do not write to Java visible memory
 611             // (other than Raw) and so do not require anti-dependence edges.
 612             continue;
 613           }
 614         }
 615         // Same for SafePoints: they read/write Raw but only read otherwise.
 616         // This is basically a workaround for SafePoints only defining control
 617         // instead of control + memory.
 618         if (mstore->ideal_Opcode() == Op_SafePoint)
 619           continue;
 620       } else {
 621         // Some raw memory, such as the load of "top" at an allocation,
 622         // can be control dependent on the previous safepoint. See
 623         // comments in GraphKit::allocate_heap() about control input.
 624         // Inserting an anti-dep between such a safepoint and a use
 625         // creates a cycle, and will cause a subsequent failure in
 626         // local scheduling.  (BugId 4919904)
 627         // (%%% How can a control input be a safepoint and not a projection??)
 628         if (mstore->ideal_Opcode() == Op_SafePoint && load->in(0) == mstore)
 629           continue;
 630       }
 631     }
 632 
 633     // Identify a block that the current load must be above,
 634     // or else observe that 'store' is all the way up in the
 635     // earliest legal block for 'load'.  In the latter case,
 636     // immediately insert an anti-dependence edge.
 637     Block* store_block = get_block_for_node(store);
 638     assert(store_block != NULL, "unused killing projections skipped above");
 639 
 640     if (store->is_Phi()) {
 641       // 'load' uses memory which is one (or more) of the Phi's inputs.
 642       // It must be scheduled not before the Phi, but rather before
 643       // each of the relevant Phi inputs.
 644       //
 645       // Instead of finding the LCA of all inputs to a Phi that match 'mem',
 646       // we mark each corresponding predecessor block and do a combined
 647       // hoisting operation later (raise_LCA_above_marks).
 648       //
      // Do not assert(store_block != early, "Phi merging memory after access"):
      // the PhiNode may be at the start of block 'early' with a backedge to 'early'.
 651       DEBUG_ONLY(bool found_match = false);
 652       for (uint j = PhiNode::Input, jmax = store->req(); j < jmax; j++) {
 653         if (store->in(j) == mem) {   // Found matching input?
 654           DEBUG_ONLY(found_match = true);
 655           Block* pred_block = get_block_for_node(store_block->pred(j));
 656           if (pred_block != early) {
 657             // If any predecessor of the Phi matches the load's "early block",
 658             // we do not need a precedence edge between the Phi and 'load'
 659             // since the load will be forced into a block preceding the Phi.
 660             pred_block->set_raise_LCA_mark(load_index);
 661             assert(!LCA_orig->dominates(pred_block) ||
 662                    early->dominates(pred_block), "early is high enough");
 663             must_raise_LCA = true;
 664           } else {
 665             // anti-dependent upon PHI pinned below 'early', no edge needed
 666             LCA = early;             // but can not schedule below 'early'
 667           }
 668         }
 669       }
 670       assert(found_match, "no worklist bug");
 671 #ifdef TRACK_PHI_INPUTS
 672 #ifdef ASSERT
 673       // This assert asks about correct handling of PhiNodes, which may not
 674       // have all input edges directly from 'mem'. See BugId 4621264
 675       int num_mem_inputs = phi_inputs.at_grow(store->_idx,0) + 1;
 676       // Increment by exactly one even if there are multiple copies of 'mem'
 677       // coming into the phi, because we will run this block several times
 678       // if there are several copies of 'mem'.  (That's how DU iterators work.)
 679       phi_inputs.at_put(store->_idx, num_mem_inputs);
 680       assert(PhiNode::Input + num_mem_inputs < store->req(),
 681              "Expect at least one phi input will not be from original memory state");
 682 #endif //ASSERT
 683 #endif //TRACK_PHI_INPUTS
 684     } else if (store_block != early) {
 685       // 'store' is between the current LCA and earliest possible block.
 686       // Label its block, and decide later on how to raise the LCA
 687       // to include the effect on LCA of this store.
 688       // If this store's block gets chosen as the raised LCA, we
 689       // will find him on the non_early_stores list and stick him
 690       // with a precedence edge.
 691       // (But, don't bother if LCA is already raised all the way.)
 692       if (LCA != early) {
 693         store_block->set_raise_LCA_mark(load_index);
 694         must_raise_LCA = true;
 695         non_early_stores.push(store);
 696       }
 697     } else {
 698       // Found a possibly-interfering store in the load's 'early' block.
 699       // This means 'load' cannot sink at all in the dominator tree.
 700       // Add an anti-dep edge, and squeeze 'load' into the highest block.
 701       assert(store != load->in(0), "dependence cycle found");
 702       if (verify) {
 703         assert(store->find_edge(load) != -1, "missing precedence edge");
 704       } else {
 705         store->add_prec(load);
 706       }
 707       LCA = early;
 708       // This turns off the process of gathering non_early_stores.
 709     }
 710   }
 711   // (Worklist is now empty; all nearby stores have been visited.)
 712 
 713   // Finished if 'load' must be scheduled in its 'early' block.
 714   // If we found any stores there, they have already been given
 715   // precedence edges.
 716   if (LCA == early)  return LCA;
 717 
 718   // We get here only if there are no possibly-interfering stores
 719   // in the load's 'early' block.  Move LCA up above all predecessors
 720   // which contain stores we have noted.
 721   //
 722   // The raised LCA block can be a home to such interfering stores,
 723   // but its predecessors must not contain any such stores.
 724   //
 725   // The raised LCA will be a lower bound for placing the load,
 726   // preventing the load from sinking past any block containing
 727   // a store that may invalidate the memory state required by 'load'.
 728   if (must_raise_LCA)
 729     LCA = raise_LCA_above_marks(LCA, load->_idx, early, this);
 730   if (LCA == early)  return LCA;
 731 
 732   // Insert anti-dependence edges from 'load' to each store
 733   // in the non-early LCA block.
 734   // Mine the non_early_stores list for such stores.
 735   if (LCA->raise_LCA_mark() == load_index) {
 736     while (non_early_stores.size() > 0) {
 737       Node* store = non_early_stores.pop();
 738       Block* store_block = get_block_for_node(store);
 739       if (store_block == LCA) {
 740         // add anti_dependence from store to load in its own block
 741         assert(store != load->in(0), "dependence cycle found");
 742         if (verify) {
 743           assert(store->find_edge(load) != -1, "missing precedence edge");
 744         } else {
 745           store->add_prec(load);
 746         }
 747       } else {
 748         assert(store_block->raise_LCA_mark() == load_index, "block was marked");
 749         // Any other stores we found must be either inside the new LCA
 750         // or else outside the original LCA.  In the latter case, they
 751         // did not interfere with any use of 'load'.
 752         assert(LCA->dominates(store_block)
 753                || !LCA_orig->dominates(store_block), "no stray stores");
 754       }
 755     }
 756   }
 757 
 758   // Return the highest block containing stores; any stores
 759   // within that block have been given anti-dependence edges.
 760   return LCA;
 761 }
 762 
 763 // This class is used to iterate backwards over the nodes in the graph.
 764 
 765 class Node_Backward_Iterator {
 766 
 767 private:
 768   Node_Backward_Iterator();
 769 
 770 public:
 771   // Constructor for the iterator
 772   Node_Backward_Iterator(Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg);
 773 
  // Return the next node in the backward walk, or NULL when the walk is done
 775   Node *next();
 776 
 777 private:
 778   VectorSet   &_visited;
 779   Node_List   &_stack;
 780   PhaseCFG &_cfg;
 781 };
 782 
 783 // Constructor for the Node_Backward_Iterator
 784 Node_Backward_Iterator::Node_Backward_Iterator( Node *root, VectorSet &visited, Node_List &stack, PhaseCFG &cfg)
 785   : _visited(visited), _stack(stack), _cfg(cfg) {
 786   // The stack should contain exactly the root
 787   stack.clear();
 788   stack.push(root);
 789 
 790   // Clear the visited bits
 791   visited.Clear();
 792 }
 793 
// Return the next node for the Node_Backward_Iterator
 795 Node *Node_Backward_Iterator::next() {
 796 
 797   // If the _stack is empty, then just return NULL: finished.
 798   if ( !_stack.size() )
 799     return NULL;
 800 
  // '_stack' is emulating the recursion stack.  The 'visit-all-users' loop has been
 802   // made stateless, so I do not need to record the index 'i' on my _stack.
 803   // Instead I visit all users each time, scanning for unvisited users.
 804   // I visit unvisited not-anti-dependence users first, then anti-dependent
 805   // children next.
 806   Node *self = _stack.pop();
 807 
 808   // I cycle here when I am entering a deeper level of recursion.
 809   // The key variable 'self' was set prior to jumping here.
 810   while( 1 ) {
 811 
 812     _visited.set(self->_idx);
 813 
 814     // Now schedule all uses as late as possible.
 815     const Node* src = self->is_Proj() ? self->in(0) : self;
 816     uint src_rpo = _cfg.get_block_for_node(src)->_rpo;
 817 
 818     // Schedule all nodes in a post-order visit
 819     Node *unvisited = NULL;  // Unvisited anti-dependent Node, if any
 820 
 821     // Scan for unvisited nodes
 822     for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
 823       // For all uses, schedule late
 824       Node* n = self->fast_out(i); // Use
 825 
 826       // Skip already visited children
 827       if ( _visited.test(n->_idx) )
 828         continue;
 829 
 830       // do not traverse backward control edges
 831       Node *use = n->is_Proj() ? n->in(0) : n;
 832       uint use_rpo = _cfg.get_block_for_node(use)->_rpo;
 833 
 834       if ( use_rpo < src_rpo )
 835         continue;
 836 
 837       // Phi nodes always precede uses in a basic block
 838       if ( use_rpo == src_rpo && use->is_Phi() )
 839         continue;
 840 
 841       unvisited = n;      // Found unvisited
 842 
 843       // Check for possible-anti-dependent
 844       if( !n->needs_anti_dependence_check() )
 845         break;            // Not visited, not anti-dep; schedule it NOW
 846     }
 847 
 848     // Did I find an unvisited not-anti-dependent Node?
 849     if ( !unvisited )
 850       break;                  // All done with children; post-visit 'self'
 851 
 852     // Visit the unvisited Node.  Contains the obvious push to
 853     // indicate I'm entering a deeper level of recursion.  I push the
 854     // old state onto the _stack and set a new state and loop (recurse).
 855     _stack.push(self);
 856     self = unvisited;
 857   } // End recursion loop
 858 
 859   return self;
 860 }
 861 
 862 //------------------------------ComputeLatenciesBackwards----------------------
 863 // Compute the latency of all the instructions.
 864 void PhaseCFG::compute_latencies_backwards(VectorSet &visited, Node_List &stack) {
 865 #ifndef PRODUCT
 866   if (trace_opto_pipelining())
 867     tty->print("\n#---- ComputeLatenciesBackwards ----\n");
 868 #endif
 869 
 870   Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
 871   Node *n;
 872 
 873   // Walk over all the nodes from last to first
  while ((n = iter.next()) != NULL) {
 875     // Set the latency for the definitions of this instruction
 876     partial_latency_of_defs(n);
 877   }
 878 } // end ComputeLatenciesBackwards
 879 
 880 //------------------------------partial_latency_of_defs------------------------
 881 // Compute the latency impact of this node on all defs.  This computes
 882 // a number that increases as we approach the beginning of the routine.
 883 void PhaseCFG::partial_latency_of_defs(Node *n) {
 884   // Set the latency for this instruction
 885 #ifndef PRODUCT
 886   if (trace_opto_pipelining()) {
 887     tty->print("# latency_to_inputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
 888     dump();
 889   }
 890 #endif
 891 
 892   if (n->is_Proj()) {
 893     n = n->in(0);
 894   }
 895 
 896   if (n->is_Root()) {
 897     return;
 898   }
 899 
 900   uint nlen = n->len();
 901   uint use_latency = get_latency_for_node(n);
 902   uint use_pre_order = get_block_for_node(n)->_pre_order;
 903 
 904   for (uint j = 0; j < nlen; j++) {
 905     Node *def = n->in(j);
 906 
 907     if (!def || def == n) {
 908       continue;
 909     }
 910 
 911     // Walk backwards thru projections
 912     if (def->is_Proj()) {
 913       def = def->in(0);
 914     }
 915 
 916 #ifndef PRODUCT
 917     if (trace_opto_pipelining()) {
 918       tty->print("#    in(%2d): ", j);
 919       def->dump();
 920     }
 921 #endif
 922 
 923     // If the defining block is not known, assume it is ok
 924     Block *def_block = get_block_for_node(def);
 925     uint def_pre_order = def_block ? def_block->_pre_order : 0;
 926 
 927     if ((use_pre_order <  def_pre_order) || (use_pre_order == def_pre_order && n->is_Phi())) {
 928       continue;
 929     }
 930 
 931     uint delta_latency = n->latency(j);
 932     uint current_latency = delta_latency + use_latency;
 933 
 934     if (get_latency_for_node(def) < current_latency) {
 935       set_latency_for_node(def, current_latency);
 936     }
 937 
 938 #ifndef PRODUCT
 939     if (trace_opto_pipelining()) {
 940       tty->print_cr("#      %d + edge_latency(%d) == %d -> %d, node_latency[%d] = %d", use_latency, j, delta_latency, current_latency, def->_idx, get_latency_for_node(def));
 941     }
 942 #endif
 943   }
 944 }
 945 
 946 //------------------------------latency_from_use-------------------------------
 947 // Compute the latency of a specific use
 948 int PhaseCFG::latency_from_use(Node *n, const Node *def, Node *use) {
 949   // If self-reference, return no latency
 950   if (use == n || use->is_Root()) {
 951     return 0;
 952   }
 953 
 954   uint def_pre_order = get_block_for_node(def)->_pre_order;
 955   uint latency = 0;
 956 
 957   // If the use is not a projection, then it is simple...
 958   if (!use->is_Proj()) {
 959 #ifndef PRODUCT
 960     if (trace_opto_pipelining()) {
 961       tty->print("#    out(): ");
 962       use->dump();
 963     }
 964 #endif
 965 
 966     uint use_pre_order = get_block_for_node(use)->_pre_order;
 967 
 968     if (use_pre_order < def_pre_order)
 969       return 0;
 970 
 971     if (use_pre_order == def_pre_order && use->is_Phi())
 972       return 0;
 973 
 974     uint nlen = use->len();
 975     uint nl = get_latency_for_node(use);
 976 
 977     for ( uint j=0; j<nlen; j++ ) {
 978       if (use->in(j) == n) {
 979         // Change this if we want local latencies
 980         uint ul = use->latency(j);
 981         uint  l = ul + nl;
 982         if (latency < l) latency = l;
 983 #ifndef PRODUCT
 984         if (trace_opto_pipelining()) {
 985           tty->print_cr("#      %d + edge_latency(%d) == %d -> %d, latency = %d",
 986                         nl, j, ul, l, latency);
 987         }
 988 #endif
 989       }
 990     }
 991   } else {
 992     // This is a projection, just grab the latency of the use(s)
 993     for (DUIterator_Fast jmax, j = use->fast_outs(jmax); j < jmax; j++) {
 994       uint l = latency_from_use(use, def, use->fast_out(j));
 995       if (latency < l) latency = l;
 996     }
 997   }
 998 
 999   return latency;
1000 }
1001 
1002 //------------------------------latency_from_uses------------------------------
// Compute the latency of this instruction relative to all of its uses.
1004 // This computes a number that increases as we approach the beginning of the
1005 // routine.
1006 void PhaseCFG::latency_from_uses(Node *n) {
1007   // Set the latency for this instruction
1008 #ifndef PRODUCT
1009   if (trace_opto_pipelining()) {
1010     tty->print("# latency_from_outputs: node_latency[%d] = %d for node", n->_idx, get_latency_for_node(n));
1011     dump();
1012   }
1013 #endif
1014   uint latency=0;
1015   const Node *def = n->is_Proj() ? n->in(0): n;
1016 
1017   for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
1018     uint l = latency_from_use(n, def, n->fast_out(i));
1019 
1020     if (latency < l) latency = l;
1021   }
1022 
1023   set_latency_for_node(n, latency);
1024 }
1025 
1026 //------------------------------hoist_to_cheaper_block-------------------------
1027 // Pick a block for node self, between early and LCA, that is a cheaper
1028 // alternative to LCA.
1029 Block* PhaseCFG::hoist_to_cheaper_block(Block* LCA, Block* early, Node* self) {
1030   const double delta = 1+PROB_UNLIKELY_MAG(4);
1031   Block* least       = LCA;
1032   double least_freq  = least->_freq;
1033   uint target        = get_latency_for_node(self);
1034   uint start_latency = get_latency_for_node(LCA->head());
1035   uint end_latency   = get_latency_for_node(LCA->get_node(LCA->end_idx()));
1036   bool in_latency    = (target <= start_latency);
1037   const Block* root_block = get_block_for_node(_root);
1038 
1039   // Turn off latency scheduling if scheduling is just plain off
1040   if (!C->do_scheduling())
1041     in_latency = true;
1042 
1043   // Do not hoist (to cover latency) instructions which target a
1044   // single register.  Hoisting stretches the live range of the
1045   // single register and may force spilling.
1046   MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1047   if (mach && mach->out_RegMask().is_bound1() && mach->out_RegMask().is_NotEmpty())
1048     in_latency = true;
1049 
1050 #ifndef PRODUCT
1051   if (trace_opto_pipelining()) {
1052     tty->print("# Find cheaper block for latency %d: ", get_latency_for_node(self));
1053     self->dump();
1054     tty->print_cr("#   B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1055       LCA->_pre_order,
1056       LCA->head()->_idx,
1057       start_latency,
1058       LCA->get_node(LCA->end_idx())->_idx,
1059       end_latency,
1060       least_freq);
1061   }
1062 #endif
1063 
1064   int cand_cnt = 0;  // number of candidates tried
1065 
1066   // Walk up the dominator tree from LCA (Lowest common ancestor) to
1067   // the earliest legal location.  Capture the least execution frequency.
1068   while (LCA != early) {
1069     LCA = LCA->_idom;         // Follow up the dominator tree
1070 
1071     if (LCA == NULL) {
1072       // Bailout without retry
1073       C->record_method_not_compilable("late schedule failed: LCA == NULL");
1074       return least;
1075     }
1076 
1077     // Don't hoist machine instructions to the root basic block
1078     if (mach && LCA == root_block)
1079       break;
1080 
1081     uint start_lat = get_latency_for_node(LCA->head());
1082     uint end_idx   = LCA->end_idx();
1083     uint end_lat   = get_latency_for_node(LCA->get_node(end_idx));
1084     double LCA_freq = LCA->_freq;
1085 #ifndef PRODUCT
1086     if (trace_opto_pipelining()) {
1087       tty->print_cr("#   B%d: start latency for [%4d]=%d, end latency for [%4d]=%d, freq=%g",
1088         LCA->_pre_order, LCA->head()->_idx, start_lat, end_idx, end_lat, LCA_freq);
1089     }
1090 #endif
1091     cand_cnt++;
1092     if (LCA_freq < least_freq              || // Better Frequency
1093         (StressGCM && Compile::randomized_select(cand_cnt)) || // Should be randomly accepted in stress mode
1094          (!StressGCM                    &&    // Otherwise, choose with latency
1095           !in_latency                   &&    // No block containing latency
1096           LCA_freq < least_freq * delta &&    // No worse frequency
1097           target >= end_lat             &&    // within latency range
1098           !self->is_iteratively_computed() )  // But don't hoist IV increments
1099              // because they may end up above other uses of their phi forcing
1100              // their result register to be different from their input.
1101        ) {
1102       least = LCA;            // Found cheaper block
1103       least_freq = LCA_freq;
1104       start_latency = start_lat;
1105       end_latency = end_lat;
1106       if (target <= start_lat)
1107         in_latency = true;
1108     }
1109   }
1110 
1111 #ifndef PRODUCT
1112   if (trace_opto_pipelining()) {
1113     tty->print_cr("#  Choose block B%d with start latency=%d and freq=%g",
1114       least->_pre_order, start_latency, least_freq);
1115   }
1116 #endif
1117 
1118   // See if the latency needs to be updated
1119   if (target < end_latency) {
1120 #ifndef PRODUCT
1121     if (trace_opto_pipelining()) {
1122       tty->print_cr("#  Change latency for [%4d] from %d to %d", self->_idx, target, end_latency);
1123     }
1124 #endif
1125     set_latency_for_node(self, end_latency);
1126     partial_latency_of_defs(self);
1127   }
1128 
1129   return least;
1130 }
1131 
1132 
1133 //------------------------------schedule_late-----------------------------------
1134 // Now schedule all codes as LATE as possible.  This is the LCA in the
1135 // dominator tree of all USES of a value.  Pick the block with the least
1136 // loop nesting depth that is lowest in the dominator tree.
1137 extern const char must_clone[];
1138 void PhaseCFG::schedule_late(VectorSet &visited, Node_List &stack) {
1139 #ifndef PRODUCT
1140   if (trace_opto_pipelining())
1141     tty->print("\n#---- schedule_late ----\n");
1142 #endif
1143 
1144   Node_Backward_Iterator iter((Node *)_root, visited, stack, *this);
1145   Node *self;
1146 
1147   // Walk over all the nodes from last to first
  while ((self = iter.next()) != NULL) {
1149     Block* early = get_block_for_node(self); // Earliest legal placement
1150 
1151     if (self->is_top()) {
1152       // Top node goes in bb #2 with other constants.
1153       // It must be special-cased, because it has no out edges.
1154       early->add_inst(self);
1155       continue;
1156     }
1157 
1158     // No uses, just terminate
1159     if (self->outcnt() == 0) {
1160       assert(self->is_MachProj(), "sanity");
1161       continue;                   // Must be a dead machine projection
1162     }
1163 
1164     // If node is pinned in the block, then no scheduling can be done.
1165     if( self->pinned() )          // Pinned in block?
1166       continue;
1167 
1168     MachNode* mach = self->is_Mach() ? self->as_Mach() : NULL;
1169     if (mach) {
1170       switch (mach->ideal_Opcode()) {
1171       case Op_CreateEx:
1172         // Don't move exception creation
1173         early->add_inst(self);
1174         continue;
1175         break;
1176       case Op_CheckCastPP:
1177         // Don't move CheckCastPP nodes away from their input, if the input
1178         // is a rawptr (5071820).
1179         Node *def = self->in(1);
1180         if (def != NULL && def->bottom_type()->base() == Type::RawPtr) {
1181           early->add_inst(self);
1182 #ifdef ASSERT
1183           _raw_oops.push(def);
1184 #endif
1185           continue;
1186         }
1187         break;
1188       }
1189     }
1190 
1191     // Gather LCA of all uses
1192     Block *LCA = NULL;
1193     {
1194       for (DUIterator_Fast imax, i = self->fast_outs(imax); i < imax; i++) {
1195         // For all uses, find LCA
1196         Node* use = self->fast_out(i);
1197         LCA = raise_LCA_above_use(LCA, use, self, this);
1198       }
1199     }  // (Hide defs of imax, i from rest of block.)
1200 
1201     // Place temps in the block of their use.  This isn't a
1202     // requirement for correctness but it reduces useless
1203     // interference between temps and other nodes.
1204     if (mach != NULL && mach->is_MachTemp()) {
1205       map_node_to_block(self, LCA);
1206       LCA->add_inst(self);
1207       continue;
1208     }
1209 
1210     // Check if 'self' could be anti-dependent on memory
1211     if (self->needs_anti_dependence_check()) {
1212       // Hoist LCA above possible-defs and insert anti-dependences to
1213       // defs in new LCA block.
1214       LCA = insert_anti_dependences(LCA, self);
1215     }
1216 
1217     if (early->_dom_depth > LCA->_dom_depth) {
1218       // Somehow the LCA has moved above the earliest legal point.
1219       // (One way this can happen is via memory_early_block.)
1220       if (C->subsume_loads() == true && !C->failing()) {
1221         // Retry with subsume_loads == false
1222         // If this is the first failure, the sentinel string will "stick"
1223         // to the Compile object, and the C2Compiler will see it and retry.
1224         C->record_failure(C2Compiler::retry_no_subsuming_loads());
1225       } else {
1226         // Bailout without retry when (early->_dom_depth > LCA->_dom_depth)
1227         C->record_method_not_compilable("late schedule failed: incorrect graph");
1228       }
1229       return;
1230     }
1231 
1232     // If there is no opportunity to hoist, then we're done.
    // In stress mode, try to hoist even single operations.
1234     bool try_to_hoist = StressGCM || (LCA != early);
1235 
    // Must-clone guys stay next to their use; no hoisting allowed.
1237     // Also cannot hoist guys that alter memory or are otherwise not
1238     // allocatable (hoisting can make a value live longer, leading to
1239     // anti and output dependency problems which are normally resolved
1240     // by the register allocator giving everyone a different register).
1241     if (mach != NULL && must_clone[mach->ideal_Opcode()])
1242       try_to_hoist = false;
1243 
1244     Block* late = NULL;
1245     if (try_to_hoist) {
1246       // Now find the block with the least execution frequency.
1247       // Start at the latest schedule and work up to the earliest schedule
1248       // in the dominator tree.  Thus the Node will dominate all its uses.
1249       late = hoist_to_cheaper_block(LCA, early, self);
1250     } else {
1251       // Just use the LCA of the uses.
1252       late = LCA;
1253     }
1254 
1255     // Put the node into target block
1256     schedule_node_into_block(self, late);
1257 
1258 #ifdef ASSERT
1259     if (self->needs_anti_dependence_check()) {
1260       // since precedence edges are only inserted when we're sure they
1261       // are needed make sure that after placement in a block we don't
1262       // need any new precedence edges.
1263       verify_anti_dependences(late, self);
1264     }
1265 #endif
1266   } // Loop until all nodes have been visited
1267 
1268 } // end ScheduleLate
1269 
1270 //------------------------------GlobalCodeMotion-------------------------------
1271 void PhaseCFG::global_code_motion() {
1272   ResourceMark rm;
1273 
1274 #ifndef PRODUCT
1275   if (trace_opto_pipelining()) {
1276     tty->print("\n---- Start GlobalCodeMotion ----\n");
1277   }
1278 #endif
1279 
1280   // Initialize the node to block mapping for things on the proj_list
1281   for (uint i = 0; i < _matcher.number_of_projections(); i++) {
1282     unmap_node_from_block(_matcher.get_projection(i));
1283   }
1284 
1285   // Set the basic block for Nodes pinned into blocks
1286   Arena* arena = Thread::current()->resource_area();
1287   VectorSet visited(arena);
1288   schedule_pinned_nodes(visited);
1289 
1290   // Find the earliest Block any instruction can be placed in.  Some
1291   // instructions are pinned into Blocks.  Unpinned instructions can
  // appear in the last block in which all their inputs occur.
1293   visited.Clear();
1294   Node_List stack(arena);
1295   // Pre-grow the list
1296   stack.map((C->unique() >> 1) + 16, NULL);
1297   if (!schedule_early(visited, stack)) {
1298     // Bailout without retry
1299     C->record_method_not_compilable("early schedule failed");
1300     return;
1301   }
1302 
1303   // Build Def-Use edges.
1304   // Compute the latency information (via backwards walk) for all the
1305   // instructions in the graph
1306   _node_latency = new GrowableArray<uint>(); // resource_area allocation
1307 
1308   if (C->do_scheduling()) {
1309     compute_latencies_backwards(visited, stack);
1310   }
1311 
1312   // Now schedule all codes as LATE as possible.  This is the LCA in the
1313   // dominator tree of all USES of a value.  Pick the block with the least
1314   // loop nesting depth that is lowest in the dominator tree.
1315   // ( visited.Clear() called in schedule_late()->Node_Backward_Iterator() )
1316   schedule_late(visited, stack);
1317   if (C->failing()) {
1318     // schedule_late fails only when the graph is incorrect.
1319     assert(!VerifyGraphEdges, "verification should have failed");
1320     return;
1321   }
1322 
1323 #ifndef PRODUCT
1324   if (trace_opto_pipelining()) {
1325     tty->print("\n---- Detect implicit null checks ----\n");
1326   }
1327 #endif
1328 
1329   // Detect implicit-null-check opportunities.  Basically, find NULL checks
1330   // with suitable memory ops nearby.  Use the memory op to do the NULL check.
1331   // I can generate a memory op if there is not one nearby.
1332   if (C->is_method_compilation()) {
1333     // By reversing the loop direction we get a very minor gain on mpegaudio.
1334     // Feel free to revert to a forward loop for clarity.
1335     // for( int i=0; i < (int)matcher._null_check_tests.size(); i+=2 ) {
1336     for (int i = _matcher._null_check_tests.size() - 2; i >= 0; i -= 2) {
1337       Node* proj = _matcher._null_check_tests[i];
1338       Node* val  = _matcher._null_check_tests[i + 1];
1339       Block* block = get_block_for_node(proj);
1340       implicit_null_check(block, proj, val, C->allowed_deopt_reasons());
1341       // The implicit_null_check will only perform the transformation
1342       // if the null branch is truly uncommon, *and* it leads to an
1343       // uncommon trap.  Combined with the too_many_traps guards
1344       // above, this prevents SEGV storms reported in 6366351,
1345       // by recompiling offending methods without this optimization.
1346     }
1347   }
1348 
1349 #ifndef PRODUCT
1350   if (trace_opto_pipelining()) {
1351     tty->print("\n---- Start Local Scheduling ----\n");
1352   }
1353 #endif
1354 
1355   // Schedule locally.  Right now a simple topological sort.
1356   // Later, do a real latency aware scheduler.
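       // ready_cnt is indexed by node _idx; schedule_local() uses it to count
       // each node's not-yet-scheduled inputs within a block and emits a node
       // once its count drops to zero.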
1357   GrowableArray<int> ready_cnt(C->unique(), C->unique(), -1);
1358   visited.Clear();
1359   for (uint i = 0; i < number_of_blocks(); i++) {
1360     Block* block = get_block(i);
1361     if (!schedule_local(block, ready_cnt, visited)) {
1362       if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
1363         C->record_method_not_compilable("local schedule failed");
1364       }
1365       return;
1366     }
1367   }
1368 
1369   // If we inserted any instructions between a Call and its CatchNode,
1370   // clone the instructions on all paths below the Catch.
1371   for (uint i = 0; i < number_of_blocks(); i++) {
1372     Block* block = get_block(i);
1373     call_catch_cleanup(block);
1374   }
1375 
1376 #ifndef PRODUCT
1377   if (trace_opto_pipelining()) {
1378     tty->print("\n---- After GlobalCodeMotion ----\n");
1379     for (uint i = 0; i < number_of_blocks(); i++) {
1380       Block* block = get_block(i);
1381       block->dump();
1382     }
1383   }
1384 #endif
1385   // Dead.  Poison the pointer so any later use of _node_latency is caught.
1386   _node_latency = (GrowableArray<uint> *)0xdeadbeef;
1387 }
1388 
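     //---------------------------do_global_code_motion----------------------------
     // Driver for global code motion: build the dominator tree, estimate block
     // frequencies, then run global_code_motion().  Returns false if the
     // compilation failed or bailed out along the way.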
1389 bool PhaseCFG::do_global_code_motion() {
1390 
1391   build_dominator_tree();
1392   if (C->failing()) {
1393     return false;
1394   }
1395 
1396   NOT_PRODUCT( C->verify_graph_edges(); )
1397 
1398   estimate_block_frequency();
1399 
1400   global_code_motion();
1401 
1402   if (C->failing()) {
1403     return false;
1404   }
1405 
1406   return true;
1407 }
1408 
1409 //------------------------------Estimate_Block_Frequency-----------------------
1410 // Estimate block frequencies based on IfNode probabilities.
1411 void PhaseCFG::estimate_block_frequency() {
1412 
1413   // Force conditional branches leading to uncommon traps to be unlikely,
1414   // not because we get to the uncommon_trap with less relative frequency,
1415   // but because an uncommon_trap typically causes a deopt, so we only get
1416   // there once.
1417   if (C->do_freq_based_layout()) {
1418     Block_List worklist;
1419     Block* root_blk = get_block(0);
1420     for (uint i = 1; i < root_blk->num_preds(); i++) {
1421       Block *pb = get_block_for_node(root_blk->pred(i));
1422       if (pb->has_uncommon_code()) {
1423         worklist.push(pb);
1424       }
1425     }
1426     while (worklist.size() > 0) {
1427       Block* uct = worklist.pop();
1428       if (uct == get_root_block()) {
1429         continue;
1430       }
1431       for (uint i = 1; i < uct->num_preds(); i++) {
1432         Block *pb = get_block_for_node(uct->pred(i));
1433         if (pb->_num_succs == 1) {
1434           worklist.push(pb);
1435         } else if (pb->num_fall_throughs() == 2) {
1436           pb->update_uncommon_branch(uct);
1437         }
1438       }
1439     }
1440   }
1441 
1442   // Create the loop tree and calculate loop depth.
1443   _root_loop = create_loop_tree();
1444   _root_loop->compute_loop_depth(0);
1445 
1446   // Compute block frequency of each block, relative to a single loop entry.
1447   _root_loop->compute_freq();
1448 
1449   // Adjust all frequencies to be relative to a single method entry
1450   _root_loop->_freq = 1.0;
1451   _root_loop->scale_freq();
1452 
1453   // Save the outermost loop frequency for the LRG frequency threshold
1454   _outer_loop_frequency = _root_loop->outer_loop_freq();
1455 
1456   // Force paths ending at uncommon traps to be infrequent
1457   if (!C->do_freq_based_layout()) {
1458     Block_List worklist;
1459     Block* root_blk = get_block(0);
1460     for (uint i = 1; i < root_blk->num_preds(); i++) {
1461       Block *pb = get_block_for_node(root_blk->pred(i));
1462       if (pb->has_uncommon_code()) {
1463         worklist.push(pb);
1464       }
1465     }
1466     while (worklist.size() > 0) {
1467       Block* uct = worklist.pop();
1468       uct->_freq = PROB_MIN;
1469       for (uint i = 1; i < uct->num_preds(); i++) {
1470         Block *pb = get_block_for_node(uct->pred(i));
1471         if (pb->_num_succs == 1 && pb->_freq > PROB_MIN) {
1472           worklist.push(pb);
1473         }
1474       }
1475     }
1476   }
1477 
1478 #ifdef ASSERT
1479   for (uint i = 0; i < number_of_blocks(); i++) {
1480     Block* b = get_block(i);
1481     assert(b->_freq >= MIN_BLOCK_FREQUENCY, "Register Allocator requires meaningful block frequency");
1482   }
1483 #endif
1484 
1485 #ifndef PRODUCT
1486   if (PrintCFGBlockFreq) {
1487     tty->print_cr("CFG Block Frequencies");
1488     _root_loop->dump_tree();
1489     if (Verbose) {
1490       tty->print_cr("PhaseCFG dump");
1491       dump();
1492       tty->print_cr("Node dump");
1493       _root->dump(99999);
1494     }
1495   }
1496 #endif
1497 }
1498 
1499 //----------------------------create_loop_tree--------------------------------
1500 // Create a loop tree from the CFG
1501 CFGLoop* PhaseCFG::create_loop_tree() {
1502 
1503 #ifdef ASSERT
1504   assert(get_block(0) == get_root_block(), "first block should be root block");
1505   for (uint i = 0; i < number_of_blocks(); i++) {
1506     Block* block = get_block(i);
1507     // Check that the _loop fields are clear...we could clear them if not.
1508     assert(block->_loop == NULL, "clear _loop expected");
1509     // Sanity check that the RPO numbering is reflected in the _blocks array.
1510     // It doesn't have to be for the loop tree to be built, but if it is not,
1511     // then the blocks have been reordered since dom graph building...which
1512     // may call the RPO numbering into question.
1513     assert(block->_rpo == i, "unexpected reverse post order number");
1514   }
1515 #endif
1516 
1517   int idct = 0;
1518   CFGLoop* root_loop = new CFGLoop(idct++);
1519 
1520   Block_List worklist;
1521 
1522   // Assign blocks to loops
1523   for(uint i = number_of_blocks() - 1; i > 0; i-- ) { // skip Root block
1524     Block* block = get_block(i);
1525 
1526     if (block->head()->is_Loop()) {
1527       Block* loop_head = block;
1528       assert(loop_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1529       Node* tail_n = loop_head->pred(LoopNode::LoopBackControl);
1530       Block* tail = get_block_for_node(tail_n);
1531 
1532       // Defensively filter out Loop nodes for non-single-entry loops.
1533       // For all reasonable loops, the head occurs before the tail in RPO.
1534       if (i <= tail->_rpo) {
1535 
1536         // The tail and (recursive) predecessors of the tail
1537         // are made members of a new loop.
1538 
1539         assert(worklist.size() == 0, "nonempty worklist");
1540         CFGLoop* nloop = new CFGLoop(idct++);
1541         assert(loop_head->_loop == NULL, "just checking");
1542         loop_head->_loop = nloop;
1543         // Add to nloop so push_pred() will skip over inner loops
1544         nloop->add_member(loop_head);
1545         nloop->push_pred(loop_head, LoopNode::LoopBackControl, worklist, this);
1546 
1547         while (worklist.size() > 0) {
1548           Block* member = worklist.pop();
1549           if (member != loop_head) {
1550             for (uint j = 1; j < member->num_preds(); j++) {
1551               nloop->push_pred(member, j, worklist, this);
1552             }
1553           }
1554         }
1555       }
1556     }
1557   }
1558 
1559   // Create a member list for each loop consisting
1560   // of both blocks and (immediate child) loops.
1561   for (uint i = 0; i < number_of_blocks(); i++) {
1562     Block* block = get_block(i);
1563     CFGLoop* lp = block->_loop;
1564     if (lp == NULL) {
1565       // Not assigned to a loop. Add it to the method's pseudo loop.
1566       block->_loop = root_loop;
1567       lp = root_loop;
1568     }
1569     if (lp == root_loop || block != lp->head()) { // loop heads are already members
1570       lp->add_member(block);
1571     }
1572     if (lp != root_loop) {
1573       if (lp->parent() == NULL) {
1574         // Not a nested loop. Make it a child of the method's pseudo loop.
1575         root_loop->add_nested_loop(lp);
1576       }
1577       if (block == lp->head()) {
1578         // Add nested loop to member list of parent loop.
1579         lp->parent()->add_member(lp);
1580       }
1581     }
1582   }
1583 
1584   return root_loop;
1585 }
1586 
1587 //------------------------------push_pred--------------------------------------
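     // Add the predecessor at index i of blk to the receiver loop.  If that
     // predecessor has no loop yet and comes after this loop's head in RPO,
     // it joins this loop and goes on the worklist.  If it already belongs to
     // a nested loop, hook the outermost enclosing loop in as a child (if not
     // already parented) and continue from that loop's entry predecessor.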
1588 void CFGLoop::push_pred(Block* blk, int i, Block_List& worklist, PhaseCFG* cfg) {
1589   Node* pred_n = blk->pred(i);
1590   Block* pred = cfg->get_block_for_node(pred_n);
1591   CFGLoop *pred_loop = pred->_loop;
1592   if (pred_loop == NULL) {
1593     // Filter out blocks for non-single-entry loops.
1594     // For all reasonable loops, the head occurs before the tail in RPO.
1595     if (pred->_rpo > head()->_rpo) {
1596       pred->_loop = this;
1597       worklist.push(pred);
1598     }
1599   } else if (pred_loop != this) {
1600     // Nested loop.
1601     while (pred_loop->_parent != NULL && pred_loop->_parent != this) {
1602       pred_loop = pred_loop->_parent;
1603     }
1604     // Make pred's loop be a child
1605     if (pred_loop->_parent == NULL) {
1606       add_nested_loop(pred_loop);
1607       // Continue with loop entry predecessor.
1608       Block* pred_head = pred_loop->head();
1609       assert(pred_head->num_preds() - 1 == 2, "loop must have 2 predecessors");
1610       assert(pred_head != head(), "loop head in only one loop");
1611       push_pred(pred_head, LoopNode::EntryControl, worklist, cfg);
1612     } else {
1613       assert(pred_loop->_parent == this && _parent == NULL, "just checking");
1614     }
1615   }
1616 }
1617 
1618 //------------------------------add_nested_loop--------------------------------
1619 // Make cl a child of the current loop in the loop tree.
1620 void CFGLoop::add_nested_loop(CFGLoop* cl) {
1621   assert(_parent == NULL, "no parent yet");
1622   assert(cl != this, "not my own parent");
1623   cl->_parent = this;
1624   CFGLoop* ch = _child;
1625   if (ch == NULL) {
1626     _child = cl;
1627   } else {
1628     while (ch->_sibling != NULL) { ch = ch->_sibling; }
1629     ch->_sibling = cl;
1630   }
1631 }
1632 
1633 //------------------------------compute_loop_depth-----------------------------
1634 // Store the loop depth in each CFGLoop object.
1635 // Recursively walk the children to do the same for them.
1636 void CFGLoop::compute_loop_depth(int depth) {
1637   _depth = depth;
1638   CFGLoop* ch = _child;
1639   while (ch != NULL) {
1640     ch->compute_loop_depth(depth + 1);
1641     ch = ch->_sibling;
1642   }
1643 }
1644 
1645 //------------------------------compute_freq-----------------------------------
1646 // Compute the frequency of each block and loop, relative to a single entry
1647 // into the dominating loop head.
1648 void CFGLoop::compute_freq() {
1649   // Bottom up traversal of loop tree (visit inner loops first.)
1650   // Set loop head frequency to 1.0, then transitively
1651   // compute frequency for all successors in the loop,
1652   // as well as for each exit edge.  Inner loops are
1653   // treated as single blocks with loop exit targets
1654   // as the successor blocks.
1655 
1656   // Nested loops first
1657   CFGLoop* ch = _child;
1658   while (ch != NULL) {
1659     ch->compute_freq();
1660     ch = ch->_sibling;
1661   }
1662   assert (_members.length() > 0, "no empty loops");
1663   Block* hd = head();
1664   hd->_freq = 1.0f;
1665   for (int i = 0; i < _members.length(); i++) {
1666     CFGElement* s = _members.at(i);
1667     float freq = s->_freq;
1668     if (s->is_block()) {
1669       Block* b = s->as_Block();
1670       for (uint j = 0; j < b->_num_succs; j++) {
1671         Block* sb = b->_succs[j];
1672         update_succ_freq(sb, freq * b->succ_prob(j));
1673       }
1674     } else {
1675       CFGLoop* lp = s->as_CFGLoop();
1676       assert(lp->_parent == this, "immediate child");
1677       for (int k = 0; k < lp->_exits.length(); k++) {
1678         Block* eb = lp->_exits.at(k).get_target();
1679         float prob = lp->_exits.at(k).get_prob();
1680         update_succ_freq(eb, freq * prob);
1681       }
1682     }
1683   }
1684 
1685   // For all loops other than the outer, "method" loop,
1686   // sum and normalize the exit probability. The "method" loop
1687   // should keep the initial exit probability of 1, so that
1688   // inner blocks do not get erroneously scaled.
1689   if (_depth != 0) {
1690     // Total the exit probabilities for this loop.
1691     float exits_sum = 0.0f;
1692     for (int i = 0; i < _exits.length(); i++) {
1693       exits_sum += _exits.at(i).get_prob();
1694     }
1695 
1696     // Normalize the exit probabilities. Until now, each
1697     // probability estimates the chance of exit on any
1698     // single loop iteration; afterward, it estimates the
1699     // probability of exit per loop entry.
1700     for (int i = 0; i < _exits.length(); i++) {
1701       Block* et = _exits.at(i).get_target();
1702       float new_prob = 0.0f;
1703       if (_exits.at(i).get_prob() > 0.0f) {
1704         new_prob = _exits.at(i).get_prob() / exits_sum;
1705       }
1706       BlockProbPair bpp(et, new_prob);
1707       _exits.at_put(i, bpp);
1708     }
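         // For example, two exits with per-iteration probabilities 0.05 and
         // 0.15 sum to 0.20 and are rescaled to 0.25 and 0.75; the 0.20 kept
         // below as _exit_prob corresponds to an estimated trip count of
         // roughly 1/0.20 = 5.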
1709 
1710     // Save the total, but guard against unreasonable probability,
1711     // as the value is used to estimate the loop trip count.
1712     // An infinite trip count would blur relative block
1713     // frequencies.
1714     if (exits_sum > 1.0f) exits_sum = 1.0f;
1715     if (exits_sum < PROB_MIN) exits_sum = PROB_MIN;
1716     _exit_prob = exits_sum;
1717   }
1718 }
1719 
1720 //------------------------------succ_prob-------------------------------------
1721 // Determine the probability of reaching successor 'i' from the receiver block.
1722 float Block::succ_prob(uint i) {
1723   int eidx = end_idx();
1724   Node *n = get_node(eidx);  // Get ending Node
1725 
1726   int op = n->Opcode();
1727   if (n->is_Mach()) {
1728     if (n->is_MachNullCheck()) {
1729       // Can only reach here if called after lcm. The original Op_If is gone,
1730       // so we attempt to infer the probability from one or both of the
1731       // successor blocks.
1732       assert(_num_succs == 2, "expecting 2 successors of a null check");
1733       // If either successor has only one predecessor, then the
1734       // probability estimate can be derived using the
1735       // relative frequency of the successor and this block.
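           // For example, if this block's frequency is 10.0 and the
           // single-predecessor successor's frequency is 9.9, that successor
           // gets probability 0.99 and the other path the remaining 0.01.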
1736       if (_succs[i]->num_preds() == 2) {
1737         return _succs[i]->_freq / _freq;
1738       } else if (_succs[1-i]->num_preds() == 2) {
1739         return 1 - (_succs[1-i]->_freq / _freq);
1740       } else {
1741         // Estimate using both successor frequencies
1742         float freq = _succs[i]->_freq;
1743         return freq / (freq + _succs[1-i]->_freq);
1744       }
1745     }
1746     op = n->as_Mach()->ideal_Opcode();
1747   }
1748 
1749 
1750   // Switch on branch type
1751   switch( op ) {
1752   case Op_CountedLoopEnd:
1753   case Op_If: {
1754     assert (i < 2, "just checking");
1755     // Conditionals pass on only part of their frequency
1756     float prob  = n->as_MachIf()->_prob;
1757     assert(prob >= 0.0 && prob <= 1.0, "out of range probability");
1758     // If succ[i] is the FALSE branch, invert path info
1759     if( get_node(i + eidx + 1)->Opcode() == Op_IfFalse ) {
1760       return 1.0f - prob; // not taken
1761     } else {
1762       return prob; // taken
1763     }
1764   }
1765 
1766   case Op_Jump:
1767     // Divide the frequency between all successors evenly
1768     return 1.0f/_num_succs;
1769 
1770   case Op_Catch: {
1771     const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
1772     if (ci->_con == CatchProjNode::fall_through_index) {
1773       // Fall-thru path gets the lion's share.
1774       return 1.0f - PROB_UNLIKELY_MAG(5)*_num_succs;
1775     } else {
1776       // Presume exceptional paths are equally unlikely
1777       return PROB_UNLIKELY_MAG(5);
1778     }
1779   }
1780 
1781   case Op_Root:
1782   case Op_Goto:
1783     // Pass frequency straight thru to target
1784     return 1.0f;
1785 
1786   case Op_NeverBranch:
1787     return 0.0f;
1788 
1789   case Op_TailCall:
1790   case Op_TailJump:
1791   case Op_Return:
1792   case Op_Halt:
1793   case Op_Rethrow:
1794     // Do not push out freq to root block
1795     return 0.0f;
1796 
1797   default:
1798     ShouldNotReachHere();
1799   }
1800 
1801   return 0.0f;
1802 }
1803 
1804 //------------------------------num_fall_throughs-----------------------------
1805 // Return the number of fall-through candidates for a block
1806 int Block::num_fall_throughs() {
1807   int eidx = end_idx();
1808   Node *n = get_node(eidx);  // Get ending Node
1809 
1810   int op = n->Opcode();
1811   if (n->is_Mach()) {
1812     if (n->is_MachNullCheck()) {
1813       // In theory, either side can fall through; for simplicity's sake,
1814       // let's say only the false branch can for now.
1815       return 1;
1816     }
1817     op = n->as_Mach()->ideal_Opcode();
1818   }
1819 
1820   // Switch on branch type
1821   switch( op ) {
1822   case Op_CountedLoopEnd:
1823   case Op_If:
1824     return 2;
1825 
1826   case Op_Root:
1827   case Op_Goto:
1828     return 1;
1829 
1830   case Op_Catch: {
1831     for (uint i = 0; i < _num_succs; i++) {
1832       const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
1833       if (ci->_con == CatchProjNode::fall_through_index) {
1834         return 1;
1835       }
1836     }
1837     return 0;
1838   }
1839 
1840   case Op_Jump:
1841   case Op_NeverBranch:
1842   case Op_TailCall:
1843   case Op_TailJump:
1844   case Op_Return:
1845   case Op_Halt:
1846   case Op_Rethrow:
1847     return 0;
1848 
1849   default:
1850     ShouldNotReachHere();
1851   }
1852 
1853   return 0;
1854 }
1855 
1856 //------------------------------succ_fall_through-----------------------------
1857 // Return true if a specific successor could be fall-through target.
1858 bool Block::succ_fall_through(uint i) {
1859   int eidx = end_idx();
1860   Node *n = get_node(eidx);  // Get ending Node
1861 
1862   int op = n->Opcode();
1863   if (n->is_Mach()) {
1864     if (n->is_MachNullCheck()) {
1865       // In theory, either side can fall through; for simplicity's sake,
1866       // let's say only the false branch can for now.
1867       return get_node(i + eidx + 1)->Opcode() == Op_IfFalse;
1868     }
1869     op = n->as_Mach()->ideal_Opcode();
1870   }
1871 
1872   // Switch on branch type
1873   switch( op ) {
1874   case Op_CountedLoopEnd:
1875   case Op_If:
1876   case Op_Root:
1877   case Op_Goto:
1878     return true;
1879 
1880   case Op_Catch: {
1881     const CatchProjNode *ci = get_node(i + eidx + 1)->as_CatchProj();
1882     return ci->_con == CatchProjNode::fall_through_index;
1883   }
1884 
1885   case Op_Jump:
1886   case Op_NeverBranch:
1887   case Op_TailCall:
1888   case Op_TailJump:
1889   case Op_Return:
1890   case Op_Halt:
1891   case Op_Rethrow:
1892     return false;
1893 
1894   default:
1895     ShouldNotReachHere();
1896   }
1897 
1898   return false;
1899 }
1900 
1901 //------------------------------update_uncommon_branch------------------------
1902 // Update the probability of a two-branch to be uncommon
1903 void Block::update_uncommon_branch(Block* ub) {
1904   int eidx = end_idx();
1905   Node *n = get_node(eidx);  // Get ending Node
1906 
1907   int op = n->as_Mach()->ideal_Opcode();
1908 
1909   assert(op == Op_CountedLoopEnd || op == Op_If, "must be an If");
1910   assert(num_fall_throughs() == 2, "must be a two way branch block");
1911 
1912   // Which successor is ub?
1913   uint s;
1914   for (s = 0; s < _num_succs; s++) {
1915     if (_succs[s] == ub) break;
1916   }
1917   assert(s < 2, "uncommon successor must be found");
1918 
1919   // If ub is on the true path, make the probability small; otherwise
1920   // ub is on the false path, so make the (taken) probability large.
1921   bool invert = (get_node(s + eidx + 1)->Opcode() == Op_IfFalse);
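       // For example, if ub is reached via the IfFalse projection and the
       // taken probability is 0.3: invert to 0.7, clamp at PROB_MIN, and
       // invert back, so the stored probability becomes 1 - PROB_MIN and
       // ub's path is left with only PROB_MIN.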
1922 
1923   // Get existing probability
1924   float p = n->as_MachIf()->_prob;
1925 
1926   if (invert) p = 1.0 - p;
1927   if (p > PROB_MIN) {
1928     p = PROB_MIN;
1929   }
1930   if (invert) p = 1.0 - p;
1931 
1932   n->as_MachIf()->_prob = p;
1933 }
1934 
1935 //------------------------------update_succ_freq-------------------------------
1936 // Update the appropriate frequency associated with block 'b', a successor of
1937 // a block in this loop.
1938 void CFGLoop::update_succ_freq(Block* b, float freq) {
1939   if (b->_loop == this) {
1940     if (b == head()) {
1941       // back branch within the loop
1942       // Do nothing now; the loop-carried frequency will be
1943       // adjusted later in scale_freq().
1944     } else {
1945       // simple branch within the loop
1946       b->_freq += freq;
1947     }
1948   } else if (!in_loop_nest(b)) {
1949     // branch is exit from this loop
1950     BlockProbPair bpp(b, freq);
1951     _exits.append(bpp);
1952   } else {
1953     // branch into nested loop
1954     CFGLoop* ch = b->_loop;
1955     ch->_freq += freq;
1956   }
1957 }
1958 
1959 //------------------------------in_loop_nest-----------------------------------
1960 // Determine if block b is in the receiver's loop nest.
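     // Walk up from b's loop until reaching this loop's depth; b is in the
     // nest if that walk arrives at the receiver (blocks at the same depth
     // are treated as being in the nest).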
1961 bool CFGLoop::in_loop_nest(Block* b) {
1962   int depth = _depth;
1963   CFGLoop* b_loop = b->_loop;
1964   int b_depth = b_loop->_depth;
1965   if (depth == b_depth) {
1966     return true;
1967   }
1968   while (b_depth > depth) {
1969     b_loop = b_loop->_parent;
1970     b_depth = b_loop->_depth;
1971   }
1972   return b_loop == this;
1973 }
1974 
1975 //------------------------------scale_freq-------------------------------------
1976 // Scale frequency of loops and blocks by trip counts from outer loops
1977 // Do a top down traversal of loop tree (visit outer loops first.)
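     // For example, a member block with relative frequency 0.5 inside a loop
     // whose scaled frequency is 20 ends up with frequency 10 (clamped below
     // at MIN_BLOCK_FREQUENCY).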
1978 void CFGLoop::scale_freq() {
1979   float loop_freq = _freq * trip_count();
1980   _freq = loop_freq;
1981   for (int i = 0; i < _members.length(); i++) {
1982     CFGElement* s = _members.at(i);
1983     float block_freq = s->_freq * loop_freq;
1984     if (g_isnan(block_freq) || block_freq < MIN_BLOCK_FREQUENCY)
1985       block_freq = MIN_BLOCK_FREQUENCY;
1986     s->_freq = block_freq;
1987   }
1988   CFGLoop* ch = _child;
1989   while (ch != NULL) {
1990     ch->scale_freq();
1991     ch = ch->_sibling;
1992   }
1993 }
1994 
1995 // Frequency of the outer loop: the first child loop's frequency if one exists, otherwise this loop's own.
1996 float CFGLoop::outer_loop_freq() const {
1997   if (_child != NULL) {
1998     return _child->_freq;
1999   }
2000   return _freq;
2001 }
2002 
2003 #ifndef PRODUCT
2004 //------------------------------dump_tree--------------------------------------
2005 void CFGLoop::dump_tree() const {
2006   dump();
2007   if (_child != NULL)   _child->dump_tree();
2008   if (_sibling != NULL) _sibling->dump_tree();
2009 }
2010 
2011 //------------------------------dump-------------------------------------------
2012 void CFGLoop::dump() const {
2013   for (int i = 0; i < _depth; i++) tty->print("   ");
2014   tty->print("%s: %d  trip_count: %6.0f freq: %6.0f\n",
2015              _depth == 0 ? "Method" : "Loop", _id, trip_count(), _freq);
2016   for (int i = 0; i < _depth; i++) tty->print("   ");
2017   tty->print("         members:");
2018   int k = 0;
2019   for (int i = 0; i < _members.length(); i++) {
2020     if (k++ >= 6) {
2021       tty->print("\n              ");
2022       for (int j = 0; j < _depth+1; j++) tty->print("   ");
2023       k = 0;
2024     }
2025     CFGElement *s = _members.at(i);
2026     if (s->is_block()) {
2027       Block *b = s->as_Block();
2028       tty->print(" B%d(%6.3f)", b->_pre_order, b->_freq);
2029     } else {
2030       CFGLoop* lp = s->as_CFGLoop();
2031       tty->print(" L%d(%6.3f)", lp->_id, lp->_freq);
2032     }
2033   }
2034   tty->print("\n");
2035   for (int i = 0; i < _depth; i++) tty->print("   ");
2036   tty->print("         exits:  ");
2037   k = 0;
2038   for (int i = 0; i < _exits.length(); i++) {
2039     if (k++ >= 7) {
2040       tty->print("\n              ");
2041       for (int j = 0; j < _depth+1; j++) tty->print("   ");
2042       k = 0;
2043     }
2044     Block *blk = _exits.at(i).get_target();
2045     float prob = _exits.at(i).get_prob();
2046     tty->print(" ->%d@%d%%", blk->_pre_order, (int)(prob*100));
2047   }
2048   tty->print("\n");
2049 }
2050 #endif