1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "compiler/disassembler.hpp"
  29 #include "gc/shared/collectedHeap.inline.hpp"
  30 #include "gc/shared/barrierSet.hpp"
  31 #include "gc/shared/barrierSetAssembler.hpp"
  32 #include "interpreter/interpreter.hpp"
  33 #include "memory/resourceArea.hpp"
  34 #include "nativeInst_ppc.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/biasedLocking.hpp"
  37 #include "runtime/icache.hpp"
  38 #include "runtime/interfaceSupport.inline.hpp"
  39 #include "runtime/objectMonitor.hpp"
  40 #include "runtime/os.hpp"
  41 #include "runtime/safepoint.hpp"
  42 #include "runtime/safepointMechanism.hpp"
  43 #include "runtime/sharedRuntime.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "utilities/macros.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/intrinsicnode.hpp"
  48 #endif
  49 
  50 #ifdef PRODUCT
  51 #define BLOCK_COMMENT(str) // nothing
  52 #else
  53 #define BLOCK_COMMENT(str) block_comment(str)
  54 #endif
  55 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  56 
  57 #ifdef ASSERT
  58 // On RISC, there's no benefit to verifying instruction boundaries.
  59 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
  60 #endif
  61 
  62 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  63   assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  64   if (Assembler::is_simm(si31, 16)) {
  65     ld(d, si31, a);
  66     if (emit_filler_nop) nop();
  67   } else {
  68     const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
  69     const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
  70     addis(d, a, hi);
  71     ld(d, lo, d);
  72   }
  73 }
  74 
  75 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  76   assert_different_registers(d, a);
  77   ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
  78 }
  79 
  80 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
  81                                       size_t size_in_bytes, bool is_signed) {
  82   switch (size_in_bytes) {
  83   case  8:              ld(dst, offs, base);                         break;
  84   case  4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  85   case  2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  86   case  1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst);    break; // lba doesn't exist :(
  87   default:  ShouldNotReachHere();
  88   }
  89 }
  90 
  91 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
  92                                        size_t size_in_bytes) {
  93   switch (size_in_bytes) {
  94   case  8:  std(dst, offs, base); break;
  95   case  4:  stw(dst, offs, base); break;
  96   case  2:  sth(dst, offs, base); break;
  97   case  1:  stb(dst, offs, base); break;
  98   default:  ShouldNotReachHere();
  99   }
 100 }
 101 
 102 void MacroAssembler::align(int modulus, int max, int rem) {
 103   int padding = (rem + modulus - (offset() % modulus)) % modulus;
 104   if (padding > max) return;
 105   for (int c = (padding >> 2); c > 0; --c) { nop(); }
 106 }
 107 
 108 // Issue instructions that calculate given TOC from global TOC.
 109 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
 110                                                        bool add_relocation, bool emit_dummy_addr) {
 111   int offset = -1;
 112   if (emit_dummy_addr) {
 113     offset = -128; // dummy address
 114   } else if (addr != (address)(intptr_t)-1) {
 115     offset = MacroAssembler::offset_to_global_toc(addr);
 116   }
 117 
 118   if (hi16) {
 119     addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
 120   }
 121   if (lo16) {
 122     if (add_relocation) {
 123       // Relocate at the addi to avoid confusion with a load from the method's TOC.
 124       relocate(internal_word_Relocation::spec(addr));
 125     }
 126     addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
 127   }
 128 }
 129 
 130 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
 131   const int offset = MacroAssembler::offset_to_global_toc(addr);
 132 
 133   const address inst2_addr = a;
 134   const int inst2 = *(int *)inst2_addr;
 135 
 136   // The relocation points to the second instruction, the addi,
 137   // and the addi reads and writes the same register dst.
 138   const int dst = inv_rt_field(inst2);
 139   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 140 
 141   // Now, find the preceding addis which writes to dst.
 142   int inst1 = 0;
 143   address inst1_addr = inst2_addr - BytesPerInstWord;
 144   while (inst1_addr >= bound) {
 145     inst1 = *(int *) inst1_addr;
 146     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 147       // Stop, found the addis which writes dst.
 148       break;
 149     }
 150     inst1_addr -= BytesPerInstWord;
 151   }
 152 
 153   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 154   set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
 155   set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
 156   return inst1_addr;
 157 }
 158 
 159 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
 160   const address inst2_addr = a;
 161   const int inst2 = *(int *)inst2_addr;
 162 
 163   // The relocation points to the second instruction, the addi,
 164   // and the addi reads and writes the same register dst.
 165   const int dst = inv_rt_field(inst2);
 166   assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
 167 
 168   // Now, find the preceding addis which writes to dst.
 169   int inst1 = 0;
 170   address inst1_addr = inst2_addr - BytesPerInstWord;
 171   while (inst1_addr >= bound) {
 172     inst1 = *(int *) inst1_addr;
 173     if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
 174       // stop, found the addis which writes dst
 175       break;
 176     }
 177     inst1_addr -= BytesPerInstWord;
 178   }
 179 
 180   assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
 181 
 182   int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
 183   // -1 is a special case
 184   if (offset == -1) {
 185     return (address)(intptr_t)-1;
 186   } else {
 187     return global_toc() + offset;
 188   }
 189 }
 190 
 191 #ifdef _LP64
 192 // Patch compressed oops or klass constants.
 193 // Assembler sequence is
 194 // 1) compressed oops:
 195 //    lis  rx = const.hi
 196 //    ori rx = rx | const.lo
 197 // 2) compressed klass:
 198 //    lis  rx = const.hi
 199 //    clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
 200 //    ori rx = rx | const.lo
 201 // Clrldi will be passed by.
 202 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
 203   assert(UseCompressedOops, "Should only patch compressed oops");
 204 
 205   const address inst2_addr = a;
 206   const int inst2 = *(int *)inst2_addr;
 207 
 208   // The relocation points to the second instruction, the ori,
 209   // and the ori reads and writes the same register dst.
 210   const int dst = inv_rta_field(inst2);
 211   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 212   // Now, find the preceding addis which writes to dst.
 213   int inst1 = 0;
 214   address inst1_addr = inst2_addr - BytesPerInstWord;
 215   bool inst1_found = false;
 216   while (inst1_addr >= bound) {
 217     inst1 = *(int *)inst1_addr;
 218     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
 219     inst1_addr -= BytesPerInstWord;
 220   }
 221   assert(inst1_found, "inst is not lis");
 222 
 223   int xc = (data >> 16) & 0xffff;
 224   int xd = (data >>  0) & 0xffff;
 225 
 226   set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
 227   set_imm((int *)inst2_addr,        (xd)); // unsigned int
 228   return inst1_addr;
 229 }
 230 
 231 // Get compressed oop or klass constant.
 232 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
 233   assert(UseCompressedOops, "Should only patch compressed oops");
 234 
 235   const address inst2_addr = a;
 236   const int inst2 = *(int *)inst2_addr;
 237 
 238   // The relocation points to the second instruction, the ori,
 239   // and the ori reads and writes the same register dst.
 240   const int dst = inv_rta_field(inst2);
 241   assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
 242   // Now, find the preceding lis which writes to dst.
 243   int inst1 = 0;
 244   address inst1_addr = inst2_addr - BytesPerInstWord;
 245   bool inst1_found = false;
 246 
 247   while (inst1_addr >= bound) {
 248     inst1 = *(int *) inst1_addr;
 249     if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
 250     inst1_addr -= BytesPerInstWord;
 251   }
 252   assert(inst1_found, "inst is not lis");
 253 
 254   uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
 255   uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
 256 
 257   return (int) (xl | xh);
 258 }
 259 #endif // _LP64
 260 
 261 // Returns true if successful.
 262 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
 263                                                 Register toc, bool fixed_size) {
 264   int toc_offset = 0;
 265   // Use RelocationHolder::none for the constant pool entry, otherwise
 266   // we will end up with a failing NativeCall::verify(x) where x is
 267   // the address of the constant pool entry.
 268   // FIXME: We should insert relocation information for oops at the constant
 269   // pool entries instead of inserting it at the loads; patching of a constant
 270   // pool entry should be less expensive.
 271   address const_address = address_constant((address)a.value(), RelocationHolder::none);
 272   if (const_address == NULL) { return false; } // allocation failure
 273   // Relocate at the pc of the load.
 274   relocate(a.rspec());
 275   toc_offset = (int)(const_address - code()->consts()->start());
 276   ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
 277   return true;
 278 }
 279 
 280 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
 281   const address inst1_addr = a;
 282   const int inst1 = *(int *)inst1_addr;
 283 
 284    // The relocation points to the ld or the addis.
 285    return (is_ld(inst1)) ||
 286           (is_addis(inst1) && inv_ra_field(inst1) != 0);
 287 }
 288 
 289 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
 290   assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
 291 
 292   const address inst1_addr = a;
 293   const int inst1 = *(int *)inst1_addr;
 294 
 295   if (is_ld(inst1)) {
 296     return inv_d1_field(inst1);
 297   } else if (is_addis(inst1)) {
 298     const int dst = inv_rt_field(inst1);
 299 
 300     // Now, find the succeeding ld which reads and writes to dst.
 301     address inst2_addr = inst1_addr + BytesPerInstWord;
 302     int inst2 = 0;
 303     while (true) {
 304       inst2 = *(int *) inst2_addr;
 305       if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
 306         // Stop, found the ld which reads and writes dst.
 307         break;
 308       }
 309       inst2_addr += BytesPerInstWord;
 310     }
 311     return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
 312   }
 313   ShouldNotReachHere();
 314   return 0;
 315 }
 316 
 317 // Get the constant from a `load_const' sequence.
 318 long MacroAssembler::get_const(address a) {
 319   assert(is_load_const_at(a), "not a load of a constant");
 320   const int *p = (const int*) a;
 321   unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
 322   if (is_ori(*(p+1))) {
 323     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
 324     x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
 325     x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
 326   } else if (is_lis(*(p+1))) {
 327     x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
 328     x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
 329     x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
 330   } else {
 331     ShouldNotReachHere();
 332     return (long) 0;
 333   }
 334   return (long) x;
 335 }
 336 
 337 // Patch the 64 bit constant of a `load_const' sequence. This is a low
 338 // level procedure. It neither flushes the instruction cache nor is it
 339 // mt safe.
 340 void MacroAssembler::patch_const(address a, long x) {
 341   assert(is_load_const_at(a), "not a load of a constant");
 342   int *p = (int*) a;
 343   if (is_ori(*(p+1))) {
 344     set_imm(0 + p, (x >> 48) & 0xffff);
 345     set_imm(1 + p, (x >> 32) & 0xffff);
 346     set_imm(3 + p, (x >> 16) & 0xffff);
 347     set_imm(4 + p, x & 0xffff);
 348   } else if (is_lis(*(p+1))) {
 349     set_imm(0 + p, (x >> 48) & 0xffff);
 350     set_imm(2 + p, (x >> 32) & 0xffff);
 351     set_imm(1 + p, (x >> 16) & 0xffff);
 352     set_imm(3 + p, x & 0xffff);
 353   } else {
 354     ShouldNotReachHere();
 355   }
 356 }
 357 
 358 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
 359   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 360   int index = oop_recorder()->allocate_metadata_index(obj);
 361   RelocationHolder rspec = metadata_Relocation::spec(index);
 362   return AddressLiteral((address)obj, rspec);
 363 }
 364 
 365 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
 366   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
 367   int index = oop_recorder()->find_index(obj);
 368   RelocationHolder rspec = metadata_Relocation::spec(index);
 369   return AddressLiteral((address)obj, rspec);
 370 }
 371 
 372 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
 373   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 374   int oop_index = oop_recorder()->allocate_oop_index(obj);
 375   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 376 }
 377 
 378 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
 379   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
 380   int oop_index = oop_recorder()->find_index(obj);
 381   return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
 382 }
 383 
 384 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 385                                                       Register tmp, int offset) {
 386   intptr_t value = *delayed_value_addr;
 387   if (value != 0) {
 388     return RegisterOrConstant(value + offset);
 389   }
 390 
 391   // Load indirectly to solve generation ordering problem.
 392   // static address, no relocation
 393   int simm16_offset = load_const_optimized(tmp, delayed_value_addr, noreg, true);
 394   ld(tmp, simm16_offset, tmp); // must be aligned ((xa & 3) == 0)
 395 
 396   if (offset != 0) {
 397     addi(tmp, tmp, offset);
 398   }
 399 
 400   return RegisterOrConstant(tmp);
 401 }
 402 
 403 #ifndef PRODUCT
 404 void MacroAssembler::pd_print_patched_instruction(address branch) {
 405   Unimplemented(); // TODO: PPC port
 406 }
 407 #endif // ndef PRODUCT
 408 
 409 // Conditional far branch for destinations encodable in 24+2 bits.
 410 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
 411 
 412   // If requested by flag optimize, relocate the bc_far as a
 413   // runtime_call and prepare for optimizing it when the code gets
 414   // relocated.
 415   if (optimize == bc_far_optimize_on_relocate) {
 416     relocate(relocInfo::runtime_call_type);
 417   }
 418 
 419   // variant 2:
 420   //
 421   //    b!cxx SKIP
 422   //    bxx   DEST
 423   //  SKIP:
 424   //
 425 
 426   const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 427                                                 opposite_bcond(inv_boint_bcond(boint)));
 428 
 429   // We emit two branches.
 430   // First, a conditional branch which jumps around the far branch.
 431   const address not_taken_pc = pc() + 2 * BytesPerInstWord;
 432   const address bc_pc        = pc();
 433   bc(opposite_boint, biint, not_taken_pc);
 434 
 435   const int bc_instr = *(int*)bc_pc;
 436   assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
 437   assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
 438   assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
 439                                      opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
 440          "postcondition");
 441   assert(biint == inv_bi_field(bc_instr), "postcondition");
 442 
 443   // Second, an unconditional far branch which jumps to dest.
 444   // Note: target(dest) remembers the current pc (see CodeSection::target)
 445   //       and returns the current pc if the label is not bound yet; when
 446   //       the label gets bound, the unconditional far branch will be patched.
 447   const address target_pc = target(dest);
 448   const address b_pc  = pc();
 449   b(target_pc);
 450 
 451   assert(not_taken_pc == pc(),                     "postcondition");
 452   assert(dest.is_bound() || target_pc == b_pc, "postcondition");
 453 }
 454 
 455 // 1 or 2 instructions
 456 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
 457   if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
 458     bc(boint, biint, dest);
 459   } else {
 460     bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
 461   }
 462 }
 463 
 464 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
 465   return is_bc_far_variant1_at(instruction_addr) ||
 466          is_bc_far_variant2_at(instruction_addr) ||
 467          is_bc_far_variant3_at(instruction_addr);
 468 }
 469 
 470 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
 471   if (is_bc_far_variant1_at(instruction_addr)) {
 472     const address instruction_1_addr = instruction_addr;
 473     const int instruction_1 = *(int*)instruction_1_addr;
 474     return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
 475   } else if (is_bc_far_variant2_at(instruction_addr)) {
 476     const address instruction_2_addr = instruction_addr + 4;
 477     return bxx_destination(instruction_2_addr);
 478   } else if (is_bc_far_variant3_at(instruction_addr)) {
 479     return instruction_addr + 8;
 480   }
 481   // variant 4 ???
 482   ShouldNotReachHere();
 483   return NULL;
 484 }
 485 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
 486 
 487   if (is_bc_far_variant3_at(instruction_addr)) {
 488     // variant 3, far cond branch to the next instruction, already patched to nops:
 489     //
 490     //    nop
 491     //    endgroup
 492     //  SKIP/DEST:
 493     //
 494     return;
 495   }
 496 
 497   // first, extract boint and biint from the current branch
 498   int boint = 0;
 499   int biint = 0;
 500 
 501   ResourceMark rm;
 502   const int code_size = 2 * BytesPerInstWord;
 503   CodeBuffer buf(instruction_addr, code_size);
 504   MacroAssembler masm(&buf);
 505   if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
 506     // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
 507     masm.nop();
 508     masm.endgroup();
 509   } else {
 510     if (is_bc_far_variant1_at(instruction_addr)) {
 511       // variant 1, the 1st instruction contains the destination address:
 512       //
 513       //    bcxx  DEST
 514       //    nop
 515       //
 516       const int instruction_1 = *(int*)(instruction_addr);
 517       boint = inv_bo_field(instruction_1);
 518       biint = inv_bi_field(instruction_1);
 519     } else if (is_bc_far_variant2_at(instruction_addr)) {
 520       // variant 2, the 2nd instruction contains the destination address:
 521       //
 522       //    b!cxx SKIP
 523       //    bxx   DEST
 524       //  SKIP:
 525       //
 526       const int instruction_1 = *(int*)(instruction_addr);
 527       boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
 528           opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
 529       biint = inv_bi_field(instruction_1);
 530     } else {
 531       // variant 4???
 532       ShouldNotReachHere();
 533     }
 534 
 535     // second, set the new branch destination and optimize the code
 536     if (dest != instruction_addr + 4 && // the bc_far is still unbound!
 537         masm.is_within_range_of_bcxx(dest, instruction_addr)) {
 538       // variant 1:
 539       //
 540       //    bcxx  DEST
 541       //    nop
 542       //
 543       masm.bc(boint, biint, dest);
 544       masm.nop();
 545     } else {
 546       // variant 2:
 547       //
 548       //    b!cxx SKIP
 549       //    bxx   DEST
 550       //  SKIP:
 551       //
 552       const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
 553                                                     opposite_bcond(inv_boint_bcond(boint)));
 554       const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
 555       masm.bc(opposite_boint, biint, not_taken_pc);
 556       masm.b(dest);
 557     }
 558   }
 559   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 560 }
 561 
 562 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
 563 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
 564   // get current pc
 565   uint64_t start_pc = (uint64_t) pc();
 566 
 567   const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
 568   const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first
 569 
 570   // relocate here
 571   if (rt != relocInfo::none) {
 572     relocate(rt);
 573   }
 574 
 575   if ( ReoptimizeCallSequences &&
 576        (( link && is_within_range_of_b(dest, pc_of_bl)) ||
 577         (!link && is_within_range_of_b(dest, pc_of_b)))) {
 578     // variant 2:
 579     // Emit an optimized, pc-relative call/jump.
 580 
 581     if (link) {
 582       // some padding
 583       nop();
 584       nop();
 585       nop();
 586       nop();
 587       nop();
 588       nop();
 589 
 590       // do the call
 591       assert(pc() == pc_of_bl, "just checking");
 592       bl(dest, relocInfo::none);
 593     } else {
 594       // do the jump
 595       assert(pc() == pc_of_b, "just checking");
 596       b(dest, relocInfo::none);
 597 
 598       // some padding
 599       nop();
 600       nop();
 601       nop();
 602       nop();
 603       nop();
 604       nop();
 605     }
 606 
 607     // Assert that we can identify the emitted call/jump.
 608     assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
 609            "can't identify emitted call");
 610   } else {
 611     // variant 1:
 612     mr(R0, R11);  // spill R11 -> R0.
 613 
 614     // Load the destination address into CTR,
 615     // calculate destination relative to global toc.
 616     calculate_address_from_global_toc(R11, dest, true, true, false);
 617 
 618     mtctr(R11);
 619     mr(R11, R0);  // spill R11 <- R0.
 620     nop();
 621 
 622     // do the call/jump
 623     if (link) {
 624       bctrl();
 625     } else{
 626       bctr();
 627     }
 628     // Assert that we can identify the emitted call/jump.
 629     assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
 630            "can't identify emitted call");
 631   }
 632 
 633   // Assert that we can identify the emitted call/jump.
 634   assert(is_bxx64_patchable_at((address)start_pc, link),
 635          "can't identify emitted call");
 636   assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
 637          "wrong encoding of dest address");
 638 }
 639 
 640 // Identify a bxx64_patchable instruction.
 641 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
 642   return is_bxx64_patchable_variant1b_at(instruction_addr, link)
 643     //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
 644       || is_bxx64_patchable_variant2_at(instruction_addr, link);
 645 }
 646 
 647 // Does the call64_patchable instruction use a pc-relative encoding of
 648 // the call destination?
 649 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
 650   // variant 2 is pc-relative
 651   return is_bxx64_patchable_variant2_at(instruction_addr, link);
 652 }
 653 
 654 // Identify variant 1.
 655 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
 656   unsigned int* instr = (unsigned int*) instruction_addr;
 657   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 658       && is_mtctr(instr[5]) // mtctr
 659     && is_load_const_at(instruction_addr);
 660 }
 661 
 662 // Identify variant 1b: load destination relative to global toc.
 663 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
 664   unsigned int* instr = (unsigned int*) instruction_addr;
 665   return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
 666     && is_mtctr(instr[3]) // mtctr
 667     && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
 668 }
 669 
 670 // Identify variant 2.
 671 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
 672   unsigned int* instr = (unsigned int*) instruction_addr;
 673   if (link) {
 674     return is_bl (instr[6])  // bl dest is last
 675       && is_nop(instr[0])  // nop
 676       && is_nop(instr[1])  // nop
 677       && is_nop(instr[2])  // nop
 678       && is_nop(instr[3])  // nop
 679       && is_nop(instr[4])  // nop
 680       && is_nop(instr[5]); // nop
 681   } else {
 682     return is_b  (instr[0])  // b  dest is first
 683       && is_nop(instr[1])  // nop
 684       && is_nop(instr[2])  // nop
 685       && is_nop(instr[3])  // nop
 686       && is_nop(instr[4])  // nop
 687       && is_nop(instr[5])  // nop
 688       && is_nop(instr[6]); // nop
 689   }
 690 }
 691 
 692 // Set dest address of a bxx64_patchable instruction.
 693 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
 694   ResourceMark rm;
 695   int code_size = MacroAssembler::bxx64_patchable_size;
 696   CodeBuffer buf(instruction_addr, code_size);
 697   MacroAssembler masm(&buf);
 698   masm.bxx64_patchable(dest, relocInfo::none, link);
 699   ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
 700 }
 701 
 702 // Get dest address of a bxx64_patchable instruction.
 703 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
 704   if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
 705     return (address) (unsigned long) get_const(instruction_addr);
 706   } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
 707     unsigned int* instr = (unsigned int*) instruction_addr;
 708     if (link) {
 709       const int instr_idx = 6; // bl is last
 710       int branchoffset = branch_destination(instr[instr_idx], 0);
 711       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 712     } else {
 713       const int instr_idx = 0; // b is first
 714       int branchoffset = branch_destination(instr[instr_idx], 0);
 715       return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
 716     }
 717   // Load dest relative to global toc.
 718   } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
 719     return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
 720                                                                instruction_addr);
 721   } else {
 722     ShouldNotReachHere();
 723     return NULL;
 724   }
 725 }
 726 
 727 // Uses ordering which corresponds to ABI:
 728 //    _savegpr0_14:  std  r14,-144(r1)
 729 //    _savegpr0_15:  std  r15,-136(r1)
 730 //    _savegpr0_16:  std  r16,-128(r1)
 731 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
 732   std(R14, offset, dst);   offset += 8;
 733   std(R15, offset, dst);   offset += 8;
 734   std(R16, offset, dst);   offset += 8;
 735   std(R17, offset, dst);   offset += 8;
 736   std(R18, offset, dst);   offset += 8;
 737   std(R19, offset, dst);   offset += 8;
 738   std(R20, offset, dst);   offset += 8;
 739   std(R21, offset, dst);   offset += 8;
 740   std(R22, offset, dst);   offset += 8;
 741   std(R23, offset, dst);   offset += 8;
 742   std(R24, offset, dst);   offset += 8;
 743   std(R25, offset, dst);   offset += 8;
 744   std(R26, offset, dst);   offset += 8;
 745   std(R27, offset, dst);   offset += 8;
 746   std(R28, offset, dst);   offset += 8;
 747   std(R29, offset, dst);   offset += 8;
 748   std(R30, offset, dst);   offset += 8;
 749   std(R31, offset, dst);   offset += 8;
 750 
 751   stfd(F14, offset, dst);   offset += 8;
 752   stfd(F15, offset, dst);   offset += 8;
 753   stfd(F16, offset, dst);   offset += 8;
 754   stfd(F17, offset, dst);   offset += 8;
 755   stfd(F18, offset, dst);   offset += 8;
 756   stfd(F19, offset, dst);   offset += 8;
 757   stfd(F20, offset, dst);   offset += 8;
 758   stfd(F21, offset, dst);   offset += 8;
 759   stfd(F22, offset, dst);   offset += 8;
 760   stfd(F23, offset, dst);   offset += 8;
 761   stfd(F24, offset, dst);   offset += 8;
 762   stfd(F25, offset, dst);   offset += 8;
 763   stfd(F26, offset, dst);   offset += 8;
 764   stfd(F27, offset, dst);   offset += 8;
 765   stfd(F28, offset, dst);   offset += 8;
 766   stfd(F29, offset, dst);   offset += 8;
 767   stfd(F30, offset, dst);   offset += 8;
 768   stfd(F31, offset, dst);
 769 }
 770 
 771 // Uses ordering which corresponds to ABI:
 772 //    _restgpr0_14:  ld   r14,-144(r1)
 773 //    _restgpr0_15:  ld   r15,-136(r1)
 774 //    _restgpr0_16:  ld   r16,-128(r1)
 775 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
 776   ld(R14, offset, src);   offset += 8;
 777   ld(R15, offset, src);   offset += 8;
 778   ld(R16, offset, src);   offset += 8;
 779   ld(R17, offset, src);   offset += 8;
 780   ld(R18, offset, src);   offset += 8;
 781   ld(R19, offset, src);   offset += 8;
 782   ld(R20, offset, src);   offset += 8;
 783   ld(R21, offset, src);   offset += 8;
 784   ld(R22, offset, src);   offset += 8;
 785   ld(R23, offset, src);   offset += 8;
 786   ld(R24, offset, src);   offset += 8;
 787   ld(R25, offset, src);   offset += 8;
 788   ld(R26, offset, src);   offset += 8;
 789   ld(R27, offset, src);   offset += 8;
 790   ld(R28, offset, src);   offset += 8;
 791   ld(R29, offset, src);   offset += 8;
 792   ld(R30, offset, src);   offset += 8;
 793   ld(R31, offset, src);   offset += 8;
 794 
 795   // FP registers
 796   lfd(F14, offset, src);   offset += 8;
 797   lfd(F15, offset, src);   offset += 8;
 798   lfd(F16, offset, src);   offset += 8;
 799   lfd(F17, offset, src);   offset += 8;
 800   lfd(F18, offset, src);   offset += 8;
 801   lfd(F19, offset, src);   offset += 8;
 802   lfd(F20, offset, src);   offset += 8;
 803   lfd(F21, offset, src);   offset += 8;
 804   lfd(F22, offset, src);   offset += 8;
 805   lfd(F23, offset, src);   offset += 8;
 806   lfd(F24, offset, src);   offset += 8;
 807   lfd(F25, offset, src);   offset += 8;
 808   lfd(F26, offset, src);   offset += 8;
 809   lfd(F27, offset, src);   offset += 8;
 810   lfd(F28, offset, src);   offset += 8;
 811   lfd(F29, offset, src);   offset += 8;
 812   lfd(F30, offset, src);   offset += 8;
 813   lfd(F31, offset, src);
 814 }
 815 
 816 // For verify_oops.
 817 void MacroAssembler::save_volatile_gprs(Register dst, int offset) {
 818   std(R2,  offset, dst);   offset += 8;
 819   std(R3,  offset, dst);   offset += 8;
 820   std(R4,  offset, dst);   offset += 8;
 821   std(R5,  offset, dst);   offset += 8;
 822   std(R6,  offset, dst);   offset += 8;
 823   std(R7,  offset, dst);   offset += 8;
 824   std(R8,  offset, dst);   offset += 8;
 825   std(R9,  offset, dst);   offset += 8;
 826   std(R10, offset, dst);   offset += 8;
 827   std(R11, offset, dst);   offset += 8;
 828   std(R12, offset, dst);   offset += 8;
 829 
 830   stfd(F0, offset, dst);   offset += 8;
 831   stfd(F1, offset, dst);   offset += 8;
 832   stfd(F2, offset, dst);   offset += 8;
 833   stfd(F3, offset, dst);   offset += 8;
 834   stfd(F4, offset, dst);   offset += 8;
 835   stfd(F5, offset, dst);   offset += 8;
 836   stfd(F6, offset, dst);   offset += 8;
 837   stfd(F7, offset, dst);   offset += 8;
 838   stfd(F8, offset, dst);   offset += 8;
 839   stfd(F9, offset, dst);   offset += 8;
 840   stfd(F10, offset, dst);  offset += 8;
 841   stfd(F11, offset, dst);  offset += 8;
 842   stfd(F12, offset, dst);  offset += 8;
 843   stfd(F13, offset, dst);
 844 }
 845 
 846 // For verify_oops.
 847 void MacroAssembler::restore_volatile_gprs(Register src, int offset) {
 848   ld(R2,  offset, src);   offset += 8;
 849   ld(R3,  offset, src);   offset += 8;
 850   ld(R4,  offset, src);   offset += 8;
 851   ld(R5,  offset, src);   offset += 8;
 852   ld(R6,  offset, src);   offset += 8;
 853   ld(R7,  offset, src);   offset += 8;
 854   ld(R8,  offset, src);   offset += 8;
 855   ld(R9,  offset, src);   offset += 8;
 856   ld(R10, offset, src);   offset += 8;
 857   ld(R11, offset, src);   offset += 8;
 858   ld(R12, offset, src);   offset += 8;
 859 
 860   lfd(F0, offset, src);   offset += 8;
 861   lfd(F1, offset, src);   offset += 8;
 862   lfd(F2, offset, src);   offset += 8;
 863   lfd(F3, offset, src);   offset += 8;
 864   lfd(F4, offset, src);   offset += 8;
 865   lfd(F5, offset, src);   offset += 8;
 866   lfd(F6, offset, src);   offset += 8;
 867   lfd(F7, offset, src);   offset += 8;
 868   lfd(F8, offset, src);   offset += 8;
 869   lfd(F9, offset, src);   offset += 8;
 870   lfd(F10, offset, src);  offset += 8;
 871   lfd(F11, offset, src);  offset += 8;
 872   lfd(F12, offset, src);  offset += 8;
 873   lfd(F13, offset, src);
 874 }
 875 
 876 void MacroAssembler::save_LR_CR(Register tmp) {
 877   mfcr(tmp);
 878   std(tmp, _abi(cr), R1_SP);
 879   mflr(tmp);
 880   std(tmp, _abi(lr), R1_SP);
 881   // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
 882 }
 883 
 884 void MacroAssembler::restore_LR_CR(Register tmp) {
 885   assert(tmp != R1_SP, "must be distinct");
 886   ld(tmp, _abi(lr), R1_SP);
 887   mtlr(tmp);
 888   ld(tmp, _abi(cr), R1_SP);
 889   mtcr(tmp);
 890 }
 891 
 892 address MacroAssembler::get_PC_trash_LR(Register result) {
 893   Label L;
 894   bl(L);
 895   bind(L);
 896   address lr_pc = pc();
 897   mflr(result);
 898   return lr_pc;
 899 }
 900 
 901 void MacroAssembler::resize_frame(Register offset, Register tmp) {
 902 #ifdef ASSERT
 903   assert_different_registers(offset, tmp, R1_SP);
 904   andi_(tmp, offset, frame::alignment_in_bytes-1);
 905   asm_assert_eq("resize_frame: unaligned", 0x204);
 906 #endif
 907 
 908   // tmp <- *(SP)
 909   ld(tmp, _abi(callers_sp), R1_SP);
 910   // addr <- SP + offset;
 911   // *(addr) <- tmp;
 912   // SP <- addr
 913   stdux(tmp, R1_SP, offset);
 914 }
 915 
 916 void MacroAssembler::resize_frame(int offset, Register tmp) {
 917   assert(is_simm(offset, 16), "too big an offset");
 918   assert_different_registers(tmp, R1_SP);
 919   assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
 920   // tmp <- *(SP)
 921   ld(tmp, _abi(callers_sp), R1_SP);
 922   // addr <- SP + offset;
 923   // *(addr) <- tmp;
 924   // SP <- addr
 925   stdu(tmp, offset, R1_SP);
 926 }
 927 
 928 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
 929   // (addr == tmp1) || (addr == tmp2) is allowed here!
 930   assert(tmp1 != tmp2, "must be distinct");
 931 
 932   // compute offset w.r.t. current stack pointer
 933   // tmp_1 <- addr - SP (!)
 934   subf(tmp1, R1_SP, addr);
 935 
 936   // atomically update SP keeping back link.
 937   resize_frame(tmp1/* offset */, tmp2/* tmp */);
 938 }
 939 
 940 void MacroAssembler::push_frame(Register bytes, Register tmp) {
 941 #ifdef ASSERT
 942   assert(bytes != R0, "r0 not allowed here");
 943   andi_(R0, bytes, frame::alignment_in_bytes-1);
 944   asm_assert_eq("push_frame(Reg, Reg): unaligned", 0x203);
 945 #endif
 946   neg(tmp, bytes);
 947   stdux(R1_SP, R1_SP, tmp);
 948 }
 949 
 950 // Push a frame of size `bytes'.
 951 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
 952   long offset = align_addr(bytes, frame::alignment_in_bytes);
 953   if (is_simm(-offset, 16)) {
 954     stdu(R1_SP, -offset, R1_SP);
 955   } else {
 956     load_const_optimized(tmp, -offset);
 957     stdux(R1_SP, R1_SP, tmp);
 958   }
 959 }
 960 
 961 // Push a frame of size `bytes' plus abi_reg_args on top.
 962 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
 963   push_frame(bytes + frame::abi_reg_args_size, tmp);
 964 }
 965 
 966 // Setup up a new C frame with a spill area for non-volatile GPRs and
 967 // additional space for local variables.
 968 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
 969                                                       Register tmp) {
 970   push_frame(bytes + frame::abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
 971 }
 972 
 973 // Pop current C frame.
 974 void MacroAssembler::pop_frame() {
 975   ld(R1_SP, _abi(callers_sp), R1_SP);
 976 }
 977 
 978 #if defined(ABI_ELFv2)
 979 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
 980   // TODO(asmundak): make sure the caller uses R12 as function descriptor
 981   // most of the times.
 982   if (R12 != r_function_entry) {
 983     mr(R12, r_function_entry);
 984   }
 985   mtctr(R12);
 986   // Do a call or a branch.
 987   if (and_link) {
 988     bctrl();
 989   } else {
 990     bctr();
 991   }
 992   _last_calls_return_pc = pc();
 993 
 994   return _last_calls_return_pc;
 995 }
 996 
 997 // Call a C function via a function descriptor and use full C
 998 // calling conventions. Updates and returns _last_calls_return_pc.
 999 address MacroAssembler::call_c(Register r_function_entry) {
1000   return branch_to(r_function_entry, /*and_link=*/true);
1001 }
1002 
1003 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1004 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1005   return branch_to(r_function_entry, /*and_link=*/false);
1006 }
1007 
1008 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1009   load_const(R12, function_entry, R0);
1010   return branch_to(R12,  /*and_link=*/true);
1011 }
1012 
1013 #else
1014 // Generic version of a call to C function via a function descriptor
1015 // with variable support for C calling conventions (TOC, ENV, etc.).
1016 // Updates and returns _last_calls_return_pc.
1017 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1018                                   bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1019   // we emit standard ptrgl glue code here
1020   assert((function_descriptor != R0), "function_descriptor cannot be R0");
1021 
1022   // retrieve necessary entries from the function descriptor
1023   ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1024   mtctr(R0);
1025 
1026   if (load_toc_of_callee) {
1027     ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1028   }
1029   if (load_env_of_callee) {
1030     ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1031   } else if (load_toc_of_callee) {
1032     li(R11, 0);
1033   }
1034 
1035   // do a call or a branch
1036   if (and_link) {
1037     bctrl();
1038   } else {
1039     bctr();
1040   }
1041   _last_calls_return_pc = pc();
1042 
1043   return _last_calls_return_pc;
1044 }
1045 
1046 // Call a C function via a function descriptor and use full C calling
1047 // conventions.
1048 // We don't use the TOC in generated code, so there is no need to save
1049 // and restore its value.
1050 address MacroAssembler::call_c(Register fd) {
1051   return branch_to(fd, /*and_link=*/true,
1052                        /*save toc=*/false,
1053                        /*restore toc=*/false,
1054                        /*load toc=*/true,
1055                        /*load env=*/true);
1056 }
1057 
1058 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1059   return branch_to(fd, /*and_link=*/false,
1060                        /*save toc=*/false,
1061                        /*restore toc=*/false,
1062                        /*load toc=*/true,
1063                        /*load env=*/true);
1064 }
1065 
1066 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1067   if (rt != relocInfo::none) {
1068     // this call needs to be relocatable
1069     if (!ReoptimizeCallSequences
1070         || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1071         || fd == NULL   // support code-size estimation
1072         || !fd->is_friend_function()
1073         || fd->entry() == NULL) {
1074       // it's not a friend function as defined by class FunctionDescriptor,
1075       // so do a full call-c here.
1076       load_const(R11, (address)fd, R0);
1077 
1078       bool has_env = (fd != NULL && fd->env() != NULL);
1079       return branch_to(R11, /*and_link=*/true,
1080                             /*save toc=*/false,
1081                             /*restore toc=*/false,
1082                             /*load toc=*/true,
1083                             /*load env=*/has_env);
1084     } else {
1085       // It's a friend function. Load the entry point and don't care about
1086       // toc and env. Use an optimizable call instruction, but ensure the
1087       // same code-size as in the case of a non-friend function.
1088       nop();
1089       nop();
1090       nop();
1091       bl64_patchable(fd->entry(), rt);
1092       _last_calls_return_pc = pc();
1093       return _last_calls_return_pc;
1094     }
1095   } else {
1096     // This call does not need to be relocatable, do more aggressive
1097     // optimizations.
1098     if (!ReoptimizeCallSequences
1099       || !fd->is_friend_function()) {
1100       // It's not a friend function as defined by class FunctionDescriptor,
1101       // so do a full call-c here.
1102       load_const(R11, (address)fd, R0);
1103       return branch_to(R11, /*and_link=*/true,
1104                             /*save toc=*/false,
1105                             /*restore toc=*/false,
1106                             /*load toc=*/true,
1107                             /*load env=*/true);
1108     } else {
1109       // it's a friend function, load the entry point and don't care about
1110       // toc and env.
1111       address dest = fd->entry();
1112       if (is_within_range_of_b(dest, pc())) {
1113         bl(dest);
1114       } else {
1115         bl64_patchable(dest, rt);
1116       }
1117       _last_calls_return_pc = pc();
1118       return _last_calls_return_pc;
1119     }
1120   }
1121 }
1122 
1123 // Call a C function.  All constants needed reside in TOC.
1124 //
1125 // Read the address to call from the TOC.
1126 // Read env from TOC, if fd specifies an env.
1127 // Read new TOC from TOC.
1128 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1129                                          relocInfo::relocType rt, Register toc) {
1130   if (!ReoptimizeCallSequences
1131     || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1132     || !fd->is_friend_function()) {
1133     // It's not a friend function as defined by class FunctionDescriptor,
1134     // so do a full call-c here.
1135     assert(fd->entry() != NULL, "function must be linked");
1136 
1137     AddressLiteral fd_entry(fd->entry());
1138     bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1139     mtctr(R11);
1140     if (fd->env() == NULL) {
1141       li(R11, 0);
1142       nop();
1143     } else {
1144       AddressLiteral fd_env(fd->env());
1145       success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1146     }
1147     AddressLiteral fd_toc(fd->toc());
1148     // Set R2_TOC (load from toc)
1149     success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1150     bctrl();
1151     _last_calls_return_pc = pc();
1152     if (!success) { return NULL; }
1153   } else {
1154     // It's a friend function, load the entry point and don't care about
1155     // toc and env. Use an optimizable call instruction, but ensure the
1156     // same code-size as in the case of a non-friend function.
1157     nop();
1158     bl64_patchable(fd->entry(), rt);
1159     _last_calls_return_pc = pc();
1160   }
1161   return _last_calls_return_pc;
1162 }
1163 #endif // ABI_ELFv2
1164 
1165 void MacroAssembler::call_VM_base(Register oop_result,
1166                                   Register last_java_sp,
1167                                   address  entry_point,
1168                                   bool     check_exceptions) {
1169   BLOCK_COMMENT("call_VM {");
1170   // Determine last_java_sp register.
1171   if (!last_java_sp->is_valid()) {
1172     last_java_sp = R1_SP;
1173   }
1174   set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1175 
1176   // ARG1 must hold thread address.
1177   mr(R3_ARG1, R16_thread);
1178 #if defined(ABI_ELFv2)
1179   address return_pc = call_c(entry_point, relocInfo::none);
1180 #else
1181   address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1182 #endif
1183 
1184   reset_last_Java_frame();
1185 
1186   // Check for pending exceptions.
1187   if (check_exceptions) {
1188     // We don't check for exceptions here.
1189     ShouldNotReachHere();
1190   }
1191 
1192   // Get oop result if there is one and reset the value in the thread.
1193   if (oop_result->is_valid()) {
1194     get_vm_result(oop_result);
1195   }
1196 
1197   _last_calls_return_pc = return_pc;
1198   BLOCK_COMMENT("} call_VM");
1199 }
1200 
1201 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1202   BLOCK_COMMENT("call_VM_leaf {");
1203 #if defined(ABI_ELFv2)
1204   call_c(entry_point, relocInfo::none);
1205 #else
1206   call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1207 #endif
1208   BLOCK_COMMENT("} call_VM_leaf");
1209 }
1210 
1211 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1212   call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1213 }
1214 
1215 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1216                              bool check_exceptions) {
1217   // R3_ARG1 is reserved for the thread.
1218   mr_if_needed(R4_ARG2, arg_1);
1219   call_VM(oop_result, entry_point, check_exceptions);
1220 }
1221 
1222 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1223                              bool check_exceptions) {
1224   // R3_ARG1 is reserved for the thread
1225   mr_if_needed(R4_ARG2, arg_1);
1226   assert(arg_2 != R4_ARG2, "smashed argument");
1227   mr_if_needed(R5_ARG3, arg_2);
1228   call_VM(oop_result, entry_point, check_exceptions);
1229 }
1230 
1231 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1232                              bool check_exceptions) {
1233   // R3_ARG1 is reserved for the thread
1234   mr_if_needed(R4_ARG2, arg_1);
1235   assert(arg_2 != R4_ARG2, "smashed argument");
1236   mr_if_needed(R5_ARG3, arg_2);
1237   mr_if_needed(R6_ARG4, arg_3);
1238   call_VM(oop_result, entry_point, check_exceptions);
1239 }
1240 
1241 void MacroAssembler::call_VM_leaf(address entry_point) {
1242   call_VM_leaf_base(entry_point);
1243 }
1244 
1245 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1246   mr_if_needed(R3_ARG1, arg_1);
1247   call_VM_leaf(entry_point);
1248 }
1249 
1250 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1251   mr_if_needed(R3_ARG1, arg_1);
1252   assert(arg_2 != R3_ARG1, "smashed argument");
1253   mr_if_needed(R4_ARG2, arg_2);
1254   call_VM_leaf(entry_point);
1255 }
1256 
1257 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1258   mr_if_needed(R3_ARG1, arg_1);
1259   assert(arg_2 != R3_ARG1, "smashed argument");
1260   mr_if_needed(R4_ARG2, arg_2);
1261   assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1262   mr_if_needed(R5_ARG3, arg_3);
1263   call_VM_leaf(entry_point);
1264 }
1265 
1266 // Check whether instruction is a read access to the polling page
1267 // which was emitted by load_from_polling_page(..).
1268 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1269                                                address* polling_address_ptr) {
1270   if (!is_ld(instruction))
1271     return false; // It's not a ld. Fail.
1272 
1273   int rt = inv_rt_field(instruction);
1274   int ra = inv_ra_field(instruction);
1275   int ds = inv_ds_field(instruction);
1276   if (!(ds == 0 && ra != 0 && rt == 0)) {
1277     return false; // It's not a ld(r0, X, ra). Fail.
1278   }
1279 
1280   if (!ucontext) {
1281     // Set polling address.
1282     if (polling_address_ptr != NULL) {
1283       *polling_address_ptr = NULL;
1284     }
1285     return true; // No ucontext given. Can't check value of ra. Assume true.
1286   }
1287 
1288 #ifdef LINUX
1289   // Ucontext given. Check that register ra contains the address of
1290   // the safepoing polling page.
1291   ucontext_t* uc = (ucontext_t*) ucontext;
1292   // Set polling address.
1293   address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1294   if (polling_address_ptr != NULL) {
1295     *polling_address_ptr = addr;
1296   }
1297   return os::is_poll_address(addr);
1298 #else
1299   // Not on Linux, ucontext must be NULL.
1300   ShouldNotReachHere();
1301   return false;
1302 #endif
1303 }
1304 
1305 void MacroAssembler::bang_stack_with_offset(int offset) {
1306   // When increasing the stack, the old stack pointer will be written
1307   // to the new top of stack according to the PPC64 abi.
1308   // Therefore, stack banging is not necessary when increasing
1309   // the stack by <= os::vm_page_size() bytes.
1310   // When increasing the stack by a larger amount, this method is
1311   // called repeatedly to bang the intermediate pages.
1312 
1313   // Stack grows down, caller passes positive offset.
1314   assert(offset > 0, "must bang with positive offset");
1315 
1316   long stdoffset = -offset;
1317 
1318   if (is_simm(stdoffset, 16)) {
1319     // Signed 16 bit offset, a simple std is ok.
1320     if (UseLoadInstructionsForStackBangingPPC64) {
1321       ld(R0, (int)(signed short)stdoffset, R1_SP);
1322     } else {
1323       std(R0,(int)(signed short)stdoffset, R1_SP);
1324     }
1325   } else if (is_simm(stdoffset, 31)) {
1326     const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1327     const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1328 
1329     Register tmp = R11;
1330     addis(tmp, R1_SP, hi);
1331     if (UseLoadInstructionsForStackBangingPPC64) {
1332       ld(R0,  lo, tmp);
1333     } else {
1334       std(R0, lo, tmp);
1335     }
1336   } else {
1337     ShouldNotReachHere();
1338   }
1339 }
1340 
1341 // If instruction is a stack bang of the form
1342 //    std    R0,    x(Ry),       (see bang_stack_with_offset())
1343 //    stdu   R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
1344 // or stdux  R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
1345 // return the banged address. Otherwise, return 0.
1346 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1347 #ifdef LINUX
1348   ucontext_t* uc = (ucontext_t*) ucontext;
1349   int rs = inv_rs_field(instruction);
1350   int ra = inv_ra_field(instruction);
1351   if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
1352       || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1353       || (is_stdu(instruction) && rs == 1)) {
1354     int ds = inv_ds_field(instruction);
1355     // return banged address
1356     return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1357   } else if (is_stdux(instruction) && rs == 1) {
1358     int rb = inv_rb_field(instruction);
1359     address sp = (address)uc->uc_mcontext.regs->gpr[1];
1360     long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1361     return ra != 1 || rb_val >= 0 ? NULL         // not a stack bang
1362                                   : sp + rb_val; // banged address
1363   }
1364   return NULL; // not a stack bang
1365 #else
1366   // workaround not needed on !LINUX :-)
1367   ShouldNotCallThis();
1368   return NULL;
1369 #endif
1370 }
1371 
1372 void MacroAssembler::reserved_stack_check(Register return_pc) {
1373   // Test if reserved zone needs to be enabled.
1374   Label no_reserved_zone_enabling;
1375 
1376   ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1377   cmpld(CCR0, R1_SP, R0);
1378   blt_predict_taken(CCR0, no_reserved_zone_enabling);
1379 
1380   // Enable reserved zone again, throw stack overflow exception.
1381   push_frame_reg_args(0, R0);
1382   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1383   pop_frame();
1384   mtlr(return_pc);
1385   load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1386   mtctr(R0);
1387   bctr();
1388 
1389   should_not_reach_here();
1390 
1391   bind(no_reserved_zone_enabling);
1392 }
1393 
1394 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1395                                 bool cmpxchgx_hint) {
1396   Label retry;
1397   bind(retry);
1398   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1399   stdcx_(exchange_value, addr_base);
1400   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1401     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1402   } else {
1403     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1404   }
1405 }
1406 
1407 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1408                                 Register tmp, bool cmpxchgx_hint) {
1409   Label retry;
1410   bind(retry);
1411   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1412   add(tmp, dest_current_value, inc_value);
1413   stdcx_(tmp, addr_base);
1414   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1415     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1416   } else {
1417     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1418   }
1419 }
1420 
1421 // Word/sub-word atomic helper functions
1422 
1423 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1424 // Only signed types are supported with size < 4.
1425 // Atomic add always kills tmp1.
1426 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1427                                                    Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1428                                                    bool cmpxchgx_hint, bool is_add, int size) {
1429   // Sub-word instructions are available since Power 8.
1430   // For older processors, instruction_type != size holds, and we
1431   // emulate the sub-word instructions by constructing a 4-byte value
1432   // that leaves the other bytes unchanged.
1433   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1434 
1435   Label retry;
1436   Register shift_amount = noreg,
1437            val32 = dest_current_value,
1438            modval = is_add ? tmp1 : exchange_value;
1439 
1440   if (instruction_type != size) {
1441     assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1442     modval = tmp1;
1443     shift_amount = tmp2;
1444     val32 = tmp3;
1445     // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1446 #ifdef VM_LITTLE_ENDIAN
1447     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1448     clrrdi(addr_base, addr_base, 2);
1449 #else
1450     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1451     clrrdi(addr_base, addr_base, 2);
1452     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1453 #endif
1454   }
1455 
1456   // atomic emulation loop
1457   bind(retry);
1458 
1459   switch (instruction_type) {
1460     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1461     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1462     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1463     default: ShouldNotReachHere();
1464   }
1465 
1466   if (instruction_type != size) {
1467     srw(dest_current_value, val32, shift_amount);
1468   }
1469 
1470   if (is_add) { add(modval, dest_current_value, exchange_value); }
1471 
1472   if (instruction_type != size) {
1473     // Transform exchange value such that the replacement can be done by one xor instruction.
1474     xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1475     clrldi(modval, modval, (size == 1) ? 56 : 48);
1476     slw(modval, modval, shift_amount);
1477     xorr(modval, val32, modval);
1478   }
1479 
1480   switch (instruction_type) {
1481     case 4: stwcx_(modval, addr_base); break;
1482     case 2: sthcx_(modval, addr_base); break;
1483     case 1: stbcx_(modval, addr_base); break;
1484     default: ShouldNotReachHere();
1485   }
1486 
1487   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1488     bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1489   } else {
1490     bne(                  CCR0, retry); // StXcx_ sets CCR0.
1491   }
1492 
1493   // l?arx zero-extends, but Java wants byte/short values sign-extended.
1494   if (size == 1) {
1495     extsb(dest_current_value, dest_current_value);
1496   } else if (size == 2) {
1497     extsh(dest_current_value, dest_current_value);
1498   };
1499 }
1500 
1501 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1502 // Only signed types are supported with size < 4.
1503 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1504                                        Register compare_value, Register exchange_value,
1505                                        Register addr_base, Register tmp1, Register tmp2,
1506                                        Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1507   // Sub-word instructions are available since Power 8.
1508   // For older processors, instruction_type != size holds, and we
1509   // emulate the sub-word instructions by constructing a 4-byte value
1510   // that leaves the other bytes unchanged.
1511   const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1512 
1513   Register shift_amount = noreg,
1514            val32 = dest_current_value,
1515            modval = exchange_value;
1516 
1517   if (instruction_type != size) {
1518     assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1519     shift_amount = tmp1;
1520     val32 = tmp2;
1521     modval = tmp2;
1522     // Need some preperation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1523 #ifdef VM_LITTLE_ENDIAN
1524     rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1525     clrrdi(addr_base, addr_base, 2);
1526 #else
1527     xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1528     clrrdi(addr_base, addr_base, 2);
1529     rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1530 #endif
1531     // Transform exchange value such that the replacement can be done by one xor instruction.
1532     xorr(exchange_value, compare_value, exchange_value);
1533     clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1534     slw(exchange_value, exchange_value, shift_amount);
1535   }
1536 
1537   // atomic emulation loop
1538   bind(retry);
1539 
1540   switch (instruction_type) {
1541     case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1542     case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1543     case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1544     default: ShouldNotReachHere();
1545   }
1546 
1547   if (instruction_type != size) {
1548     srw(dest_current_value, val32, shift_amount);
1549   }
1550   if (size == 1) {
1551     extsb(dest_current_value, dest_current_value);
1552   } else if (size == 2) {
1553     extsh(dest_current_value, dest_current_value);
1554   };
1555 
1556   cmpw(flag, dest_current_value, compare_value);
1557   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1558     bne_predict_not_taken(flag, failed);
1559   } else {
1560     bne(                  flag, failed);
1561   }
1562   // branch to done  => (flag == ne), (dest_current_value != compare_value)
1563   // fall through    => (flag == eq), (dest_current_value == compare_value)
1564 
1565   if (instruction_type != size) {
1566     xorr(modval, val32, exchange_value);
1567   }
1568 
1569   switch (instruction_type) {
1570     case 4: stwcx_(modval, addr_base); break;
1571     case 2: sthcx_(modval, addr_base); break;
1572     case 1: stbcx_(modval, addr_base); break;
1573     default: ShouldNotReachHere();
1574   }
1575 }
1576 
1577 // CmpxchgX sets condition register to cmpX(current, compare).
1578 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1579                                      Register compare_value, Register exchange_value,
1580                                      Register addr_base, Register tmp1, Register tmp2,
1581                                      int semantics, bool cmpxchgx_hint,
1582                                      Register int_flag_success, bool contention_hint, bool weak, int size) {
1583   Label retry;
1584   Label failed;
1585   Label done;
1586 
1587   // Save one branch if result is returned via register and
1588   // result register is different from the other ones.
1589   bool use_result_reg    = (int_flag_success != noreg);
1590   bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1591                             int_flag_success != exchange_value && int_flag_success != addr_base &&
1592                             int_flag_success != tmp1 && int_flag_success != tmp2);
1593   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1594   assert(size == 1 || size == 2 || size == 4, "unsupported");
1595 
1596   if (use_result_reg && preset_result_reg) {
1597     li(int_flag_success, 0); // preset (assume cas failed)
1598   }
1599 
1600   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1601   if (contention_hint) { // Don't try to reserve if cmp fails.
1602     switch (size) {
1603       case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1604       case 2: lha(dest_current_value, 0, addr_base); break;
1605       case 4: lwz(dest_current_value, 0, addr_base); break;
1606       default: ShouldNotReachHere();
1607     }
1608     cmpw(flag, dest_current_value, compare_value);
1609     bne(flag, failed);
1610   }
1611 
1612   // release/fence semantics
1613   if (semantics & MemBarRel) {
1614     release();
1615   }
1616 
1617   cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1618                     retry, failed, cmpxchgx_hint, size);
1619   if (!weak || use_result_reg) {
1620     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1621       bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1622     } else {
1623       bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1624     }
1625   }
1626   // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)
1627 
1628   // Result in register (must do this at the end because int_flag_success can be the
1629   // same register as one above).
1630   if (use_result_reg) {
1631     li(int_flag_success, 1);
1632   }
1633 
1634   if (semantics & MemBarFenceAfter) {
1635     fence();
1636   } else if (semantics & MemBarAcq) {
1637     isync();
1638   }
1639 
1640   if (use_result_reg && !preset_result_reg) {
1641     b(done);
1642   }
1643 
1644   bind(failed);
1645   if (use_result_reg && !preset_result_reg) {
1646     li(int_flag_success, 0);
1647   }
1648 
1649   bind(done);
1650   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1651   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1652 }
1653 
1654 // Preforms atomic compare exchange:
1655 //   if (compare_value == *addr_base)
1656 //     *addr_base = exchange_value
1657 //     int_flag_success = 1;
1658 //   else
1659 //     int_flag_success = 0;
1660 //
1661 // ConditionRegister flag       = cmp(compare_value, *addr_base)
1662 // Register dest_current_value  = *addr_base
1663 // Register compare_value       Used to compare with value in memory
1664 // Register exchange_value      Written to memory if compare_value == *addr_base
1665 // Register addr_base           The memory location to compareXChange
1666 // Register int_flag_success    Set to 1 if exchange_value was written to *addr_base
1667 //
1668 // To avoid the costly compare exchange the value is tested beforehand.
1669 // Several special cases exist to avoid that unnecessary information is generated.
1670 //
1671 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1672                               Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1673                               Register addr_base, int semantics, bool cmpxchgx_hint,
1674                               Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1675   Label retry;
1676   Label failed_int;
1677   Label& failed = (failed_ext != NULL) ? *failed_ext : failed_int;
1678   Label done;
1679 
1680   // Save one branch if result is returned via register and result register is different from the other ones.
1681   bool use_result_reg    = (int_flag_success!=noreg);
1682   bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1683                             int_flag_success!=exchange_value && int_flag_success!=addr_base);
1684   assert(!weak || flag == CCR0, "weak only supported with CCR0");
1685   assert(int_flag_success == noreg || failed_ext == NULL, "cannot have both");
1686 
1687   if (use_result_reg && preset_result_reg) {
1688     li(int_flag_success, 0); // preset (assume cas failed)
1689   }
1690 
1691   // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1692   if (contention_hint) { // Don't try to reserve if cmp fails.
1693     ld(dest_current_value, 0, addr_base);
1694     cmpd(flag, compare_value, dest_current_value);
1695     bne(flag, failed);
1696   }
1697 
1698   // release/fence semantics
1699   if (semantics & MemBarRel) {
1700     release();
1701   }
1702 
1703   // atomic emulation loop
1704   bind(retry);
1705 
1706   ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1707   cmpd(flag, compare_value, dest_current_value);
1708   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1709     bne_predict_not_taken(flag, failed);
1710   } else {
1711     bne(                  flag, failed);
1712   }
1713 
1714   stdcx_(exchange_value, addr_base);
1715   if (!weak || use_result_reg || failed_ext) {
1716     if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1717       bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1718     } else {
1719       bne(                  CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1720     }
1721   }
1722 
1723   // result in register (must do this at the end because int_flag_success can be the same register as one above)
1724   if (use_result_reg) {
1725     li(int_flag_success, 1);
1726   }
1727 
1728   if (semantics & MemBarFenceAfter) {
1729     fence();
1730   } else if (semantics & MemBarAcq) {
1731     isync();
1732   }
1733 
1734   if (use_result_reg && !preset_result_reg) {
1735     b(done);
1736   }
1737 
1738   bind(failed_int);
1739   if (use_result_reg && !preset_result_reg) {
1740     li(int_flag_success, 0);
1741   }
1742 
1743   bind(done);
1744   // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1745   // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1746 }
1747 
1748 // Look up the method for a megamorphic invokeinterface call.
1749 // The target method is determined by <intf_klass, itable_index>.
1750 // The receiver klass is in recv_klass.
1751 // On success, the result will be in method_result, and execution falls through.
1752 // On failure, execution transfers to the given label.
1753 void MacroAssembler::lookup_interface_method(Register recv_klass,
1754                                              Register intf_klass,
1755                                              RegisterOrConstant itable_index,
1756                                              Register method_result,
1757                                              Register scan_temp,
1758                                              Register temp2,
1759                                              Label& L_no_such_interface,
1760                                              bool return_method) {
1761   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1762 
1763   // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1764   int vtable_base = in_bytes(Klass::vtable_start_offset());
1765   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1766   int logMEsize   = exact_log2(itableMethodEntry::size() * wordSize);
1767   int scan_step   = itableOffsetEntry::size() * wordSize;
1768   int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1769 
1770   lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1771   // %%% We should store the aligned, prescaled offset in the klassoop.
1772   // Then the next several instructions would fold away.
1773 
1774   sldi(scan_temp, scan_temp, log_vte_size);
1775   addi(scan_temp, scan_temp, vtable_base);
1776   add(scan_temp, recv_klass, scan_temp);
1777 
1778   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1779   if (return_method) {
1780     if (itable_index.is_register()) {
1781       Register itable_offset = itable_index.as_register();
1782       sldi(method_result, itable_offset, logMEsize);
1783       if (itentry_off) { addi(method_result, method_result, itentry_off); }
1784       add(method_result, method_result, recv_klass);
1785     } else {
1786       long itable_offset = (long)itable_index.as_constant();
1787       // static address, no relocation
1788       add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1789     }
1790   }
1791 
1792   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1793   //   if (scan->interface() == intf) {
1794   //     result = (klass + scan->offset() + itable_index);
1795   //   }
1796   // }
1797   Label search, found_method;
1798 
1799   for (int peel = 1; peel >= 0; peel--) {
1800     // %%%% Could load both offset and interface in one ldx, if they were
1801     // in the opposite order. This would save a load.
1802     ld(temp2, itableOffsetEntry::interface_offset_in_bytes(), scan_temp);
1803 
1804     // Check that this entry is non-null. A null entry means that
1805     // the receiver class doesn't implement the interface, and wasn't the
1806     // same as when the caller was compiled.
1807     cmpd(CCR0, temp2, intf_klass);
1808 
1809     if (peel) {
1810       beq(CCR0, found_method);
1811     } else {
1812       bne(CCR0, search);
1813       // (invert the test to fall through to found_method...)
1814     }
1815 
1816     if (!peel) break;
1817 
1818     bind(search);
1819 
1820     cmpdi(CCR0, temp2, 0);
1821     beq(CCR0, L_no_such_interface);
1822     addi(scan_temp, scan_temp, scan_step);
1823   }
1824 
1825   bind(found_method);
1826 
1827   // Got a hit.
1828   if (return_method) {
1829     int ito_offset = itableOffsetEntry::offset_offset_in_bytes();
1830     lwz(scan_temp, ito_offset, scan_temp);
1831     ldx(method_result, scan_temp, method_result);
1832   }
1833 }
1834 
1835 // virtual method calling
1836 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1837                                            RegisterOrConstant vtable_index,
1838                                            Register method_result) {
1839 
1840   assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1841 
1842   const int base = in_bytes(Klass::vtable_start_offset());
1843   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1844 
1845   if (vtable_index.is_register()) {
1846     sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1847     add(recv_klass, vtable_index.as_register(), recv_klass);
1848   } else {
1849     addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1850   }
1851   ld(R19_method, base + vtableEntry::method_offset_in_bytes(), recv_klass);
1852 }
1853 
1854 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1855 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1856                                                    Register super_klass,
1857                                                    Register temp1_reg,
1858                                                    Register temp2_reg,
1859                                                    Label* L_success,
1860                                                    Label* L_failure,
1861                                                    Label* L_slow_path,
1862                                                    RegisterOrConstant super_check_offset) {
1863 
1864   const Register check_cache_offset = temp1_reg;
1865   const Register cached_super       = temp2_reg;
1866 
1867   assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1868 
1869   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1870   int sc_offset  = in_bytes(Klass::secondary_super_cache_offset());
1871 
1872   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1873   bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1874 
1875   Label L_fallthrough;
1876   int label_nulls = 0;
1877   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1878   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1879   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1880   assert(label_nulls <= 1 ||
1881          (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1882          "at most one NULL in the batch, usually");
1883 
1884   // If the pointers are equal, we are done (e.g., String[] elements).
1885   // This self-check enables sharing of secondary supertype arrays among
1886   // non-primary types such as array-of-interface. Otherwise, each such
1887   // type would need its own customized SSA.
1888   // We move this check to the front of the fast path because many
1889   // type checks are in fact trivially successful in this manner,
1890   // so we get a nicely predicted branch right at the start of the check.
1891   cmpd(CCR0, sub_klass, super_klass);
1892   beq(CCR0, *L_success);
1893 
1894   // Check the supertype display:
1895   if (must_load_sco) {
1896     // The super check offset is always positive...
1897     lwz(check_cache_offset, sco_offset, super_klass);
1898     super_check_offset = RegisterOrConstant(check_cache_offset);
1899     // super_check_offset is register.
1900     assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1901   }
1902   // The loaded value is the offset from KlassOopDesc.
1903 
1904   ld(cached_super, super_check_offset, sub_klass);
1905   cmpd(CCR0, cached_super, super_klass);
1906 
1907   // This check has worked decisively for primary supers.
1908   // Secondary supers are sought in the super_cache ('super_cache_addr').
1909   // (Secondary supers are interfaces and very deeply nested subtypes.)
1910   // This works in the same check above because of a tricky aliasing
1911   // between the super_cache and the primary super display elements.
1912   // (The 'super_check_addr' can address either, as the case requires.)
1913   // Note that the cache is updated below if it does not help us find
1914   // what we need immediately.
1915   // So if it was a primary super, we can just fail immediately.
1916   // Otherwise, it's the slow path for us (no success at this point).
1917 
1918 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1919 
1920   if (super_check_offset.is_register()) {
1921     beq(CCR0, *L_success);
1922     cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1923     if (L_failure == &L_fallthrough) {
1924       beq(CCR0, *L_slow_path);
1925     } else {
1926       bne(CCR0, *L_failure);
1927       FINAL_JUMP(*L_slow_path);
1928     }
1929   } else {
1930     if (super_check_offset.as_constant() == sc_offset) {
1931       // Need a slow path; fast failure is impossible.
1932       if (L_slow_path == &L_fallthrough) {
1933         beq(CCR0, *L_success);
1934       } else {
1935         bne(CCR0, *L_slow_path);
1936         FINAL_JUMP(*L_success);
1937       }
1938     } else {
1939       // No slow path; it's a fast decision.
1940       if (L_failure == &L_fallthrough) {
1941         beq(CCR0, *L_success);
1942       } else {
1943         bne(CCR0, *L_failure);
1944         FINAL_JUMP(*L_success);
1945       }
1946     }
1947   }
1948 
1949   bind(L_fallthrough);
1950 #undef FINAL_JUMP
1951 }
1952 
1953 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1954                                                    Register super_klass,
1955                                                    Register temp1_reg,
1956                                                    Register temp2_reg,
1957                                                    Label* L_success,
1958                                                    Register result_reg) {
1959   const Register array_ptr = temp1_reg; // current value from cache array
1960   const Register temp      = temp2_reg;
1961 
1962   assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1963 
1964   int source_offset = in_bytes(Klass::secondary_supers_offset());
1965   int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1966 
1967   int length_offset = Array<Klass*>::length_offset_in_bytes();
1968   int base_offset   = Array<Klass*>::base_offset_in_bytes();
1969 
1970   Label hit, loop, failure, fallthru;
1971 
1972   ld(array_ptr, source_offset, sub_klass);
1973 
1974   // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
1975   lwz(temp, length_offset, array_ptr);
1976   cmpwi(CCR0, temp, 0);
1977   beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
1978 
1979   mtctr(temp); // load ctr
1980 
1981   bind(loop);
1982   // Oops in table are NO MORE compressed.
1983   ld(temp, base_offset, array_ptr);
1984   cmpd(CCR0, temp, super_klass);
1985   beq(CCR0, hit);
1986   addi(array_ptr, array_ptr, BytesPerWord);
1987   bdnz(loop);
1988 
1989   bind(failure);
1990   if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
1991   b(fallthru);
1992 
1993   bind(hit);
1994   std(super_klass, target_offset, sub_klass); // save result to cache
1995   if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
1996   if (L_success != NULL) { b(*L_success); }
1997   else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
1998 
1999   bind(fallthru);
2000 }
2001 
2002 // Try fast path, then go to slow one if not successful
2003 void MacroAssembler::check_klass_subtype(Register sub_klass,
2004                          Register super_klass,
2005                          Register temp1_reg,
2006                          Register temp2_reg,
2007                          Label& L_success) {
2008   Label L_failure;
2009   check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2010   check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2011   bind(L_failure); // Fallthru if not successful.
2012 }
2013 
2014 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2015   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
2016 
2017   Label L_fallthrough;
2018   if (L_fast_path == NULL) {
2019     L_fast_path = &L_fallthrough;
2020   } else if (L_slow_path == NULL) {
2021     L_slow_path = &L_fallthrough;
2022   }
2023 
2024   // Fast path check: class is fully initialized
2025   lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2026   cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2027   beq(CCR0, *L_fast_path);
2028 
2029   // Fast path check: current thread is initializer thread
2030   ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2031   cmpd(CCR0, thread, R0);
2032   if (L_slow_path == &L_fallthrough) {
2033     beq(CCR0, *L_fast_path);
2034   } else if (L_fast_path == &L_fallthrough) {
2035     bne(CCR0, *L_slow_path);
2036   } else {
2037     Unimplemented();
2038   }
2039 
2040   bind(L_fallthrough);
2041 }
2042 
2043 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2044                                                    Register temp_reg,
2045                                                    int extra_slot_offset) {
2046   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2047   int stackElementSize = Interpreter::stackElementSize;
2048   int offset = extra_slot_offset * stackElementSize;
2049   if (arg_slot.is_constant()) {
2050     offset += arg_slot.as_constant() * stackElementSize;
2051     return offset;
2052   } else {
2053     assert(temp_reg != noreg, "must specify");
2054     sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2055     if (offset != 0)
2056       addi(temp_reg, temp_reg, offset);
2057     return temp_reg;
2058   }
2059 }
2060 
2061 // Supports temp2_reg = R0.
2062 void MacroAssembler::biased_locking_enter(ConditionRegister cr_reg, Register obj_reg,
2063                                           Register mark_reg, Register temp_reg,
2064                                           Register temp2_reg, Label& done, Label* slow_case) {
2065   assert(UseBiasedLocking, "why call this otherwise?");
2066 
2067 #ifdef ASSERT
2068   assert_different_registers(obj_reg, mark_reg, temp_reg, temp2_reg);
2069 #endif
2070 
2071   Label cas_label;
2072 
2073   // Branch to done if fast path fails and no slow_case provided.
2074   Label *slow_case_int = (slow_case != NULL) ? slow_case : &done;
2075 
2076   // Biased locking
2077   // See whether the lock is currently biased toward our thread and
2078   // whether the epoch is still valid
2079   // Note that the runtime guarantees sufficient alignment of JavaThread
2080   // pointers to allow age to be placed into low bits
2081   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits,
2082          "biased locking makes assumptions about bit layout");
2083 
2084   if (PrintBiasedLockingStatistics) {
2085     load_const(temp2_reg, (address) BiasedLocking::total_entry_count_addr(), temp_reg);
2086     lwzx(temp_reg, temp2_reg);
2087     addi(temp_reg, temp_reg, 1);
2088     stwx(temp_reg, temp2_reg);
2089   }
2090 
2091   andi(temp_reg, mark_reg, markWord::biased_lock_mask_in_place);
2092   cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2093   bne(cr_reg, cas_label);
2094 
2095   load_klass(temp_reg, obj_reg);
2096 
2097   load_const_optimized(temp2_reg, ~((int) markWord::age_mask_in_place));
2098   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2099   orr(temp_reg, R16_thread, temp_reg);
2100   xorr(temp_reg, mark_reg, temp_reg);
2101   andr(temp_reg, temp_reg, temp2_reg);
2102   cmpdi(cr_reg, temp_reg, 0);
2103   if (PrintBiasedLockingStatistics) {
2104     Label l;
2105     bne(cr_reg, l);
2106     load_const(temp2_reg, (address) BiasedLocking::biased_lock_entry_count_addr());
2107     lwzx(mark_reg, temp2_reg);
2108     addi(mark_reg, mark_reg, 1);
2109     stwx(mark_reg, temp2_reg);
2110     // restore mark_reg
2111     ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2112     bind(l);
2113   }
2114   beq(cr_reg, done);
2115 
2116   Label try_revoke_bias;
2117   Label try_rebias;
2118 
2119   // At this point we know that the header has the bias pattern and
2120   // that we are not the bias owner in the current epoch. We need to
2121   // figure out more details about the state of the header in order to
2122   // know what operations can be legally performed on the object's
2123   // header.
2124 
2125   // If the low three bits in the xor result aren't clear, that means
2126   // the prototype header is no longer biased and we have to revoke
2127   // the bias on this object.
2128   andi(temp2_reg, temp_reg, markWord::biased_lock_mask_in_place);
2129   cmpwi(cr_reg, temp2_reg, 0);
2130   bne(cr_reg, try_revoke_bias);
2131 
2132   // Biasing is still enabled for this data type. See whether the
2133   // epoch of the current bias is still valid, meaning that the epoch
2134   // bits of the mark word are equal to the epoch bits of the
2135   // prototype header. (Note that the prototype header's epoch bits
2136   // only change at a safepoint.) If not, attempt to rebias the object
2137   // toward the current thread. Note that we must be absolutely sure
2138   // that the current epoch is invalid in order to do this because
2139   // otherwise the manipulations it performs on the mark word are
2140   // illegal.
2141 
2142   int shift_amount = 64 - markWord::epoch_shift;
2143   // rotate epoch bits to right (little) end and set other bits to 0
2144   // [ big part | epoch | little part ] -> [ 0..0 | epoch ]
2145   rldicl_(temp2_reg, temp_reg, shift_amount, 64 - markWord::epoch_bits);
2146   // branch if epoch bits are != 0, i.e. they differ, because the epoch has been incremented
2147   bne(CCR0, try_rebias);
2148 
2149   // The epoch of the current bias is still valid but we know nothing
2150   // about the owner; it might be set or it might be clear. Try to
2151   // acquire the bias of the object using an atomic operation. If this
2152   // fails we will go in to the runtime to revoke the object's bias.
2153   // Note that we first construct the presumed unbiased header so we
2154   // don't accidentally blow away another thread's valid bias.
2155   andi(mark_reg, mark_reg, (markWord::biased_lock_mask_in_place |
2156                                 markWord::age_mask_in_place |
2157                                 markWord::epoch_mask_in_place));
2158   orr(temp_reg, R16_thread, mark_reg);
2159 
2160   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2161 
2162   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2163   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2164            /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2165            /*where=*/obj_reg,
2166            MacroAssembler::MemBarAcq,
2167            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2168            noreg, slow_case_int); // bail out if failed
2169 
2170   // If the biasing toward our thread failed, this means that
2171   // another thread succeeded in biasing it toward itself and we
2172   // need to revoke that bias. The revocation will occur in the
2173   // interpreter runtime in the slow case.
2174   if (PrintBiasedLockingStatistics) {
2175     load_const(temp2_reg, (address) BiasedLocking::anonymously_biased_lock_entry_count_addr(), temp_reg);
2176     lwzx(temp_reg, temp2_reg);
2177     addi(temp_reg, temp_reg, 1);
2178     stwx(temp_reg, temp2_reg);
2179   }
2180   b(done);
2181 
2182   bind(try_rebias);
2183   // At this point we know the epoch has expired, meaning that the
2184   // current "bias owner", if any, is actually invalid. Under these
2185   // circumstances _only_, we are allowed to use the current header's
2186   // value as the comparison value when doing the cas to acquire the
2187   // bias in the current epoch. In other words, we allow transfer of
2188   // the bias from one thread to another directly in this situation.
2189   load_klass(temp_reg, obj_reg);
2190   andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2191   orr(temp2_reg, R16_thread, temp2_reg);
2192   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2193   orr(temp_reg, temp2_reg, temp_reg);
2194 
2195   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2196 
2197   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2198                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2199                  /*where=*/obj_reg,
2200                  MacroAssembler::MemBarAcq,
2201                  MacroAssembler::cmpxchgx_hint_acquire_lock(),
2202                  noreg, slow_case_int); // bail out if failed
2203 
2204   // If the biasing toward our thread failed, this means that
2205   // another thread succeeded in biasing it toward itself and we
2206   // need to revoke that bias. The revocation will occur in the
2207   // interpreter runtime in the slow case.
2208   if (PrintBiasedLockingStatistics) {
2209     load_const(temp2_reg, (address) BiasedLocking::rebiased_lock_entry_count_addr(), temp_reg);
2210     lwzx(temp_reg, temp2_reg);
2211     addi(temp_reg, temp_reg, 1);
2212     stwx(temp_reg, temp2_reg);
2213   }
2214   b(done);
2215 
2216   bind(try_revoke_bias);
2217   // The prototype mark in the klass doesn't have the bias bit set any
2218   // more, indicating that objects of this data type are not supposed
2219   // to be biased any more. We are going to try to reset the mark of
2220   // this object to the prototype value and fall through to the
2221   // CAS-based locking scheme. Note that if our CAS fails, it means
2222   // that another thread raced us for the privilege of revoking the
2223   // bias of this particular object, so it's okay to continue in the
2224   // normal locking code.
2225   load_klass(temp_reg, obj_reg);
2226   ld(temp_reg, in_bytes(Klass::prototype_header_offset()), temp_reg);
2227   andi(temp2_reg, mark_reg, markWord::age_mask_in_place);
2228   orr(temp_reg, temp_reg, temp2_reg);
2229 
2230   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2231 
2232   // CmpxchgX sets cr_reg to cmpX(temp2_reg, mark_reg).
2233   cmpxchgd(/*flag=*/cr_reg, /*current_value=*/temp2_reg,
2234                  /*compare_value=*/mark_reg, /*exchange_value=*/temp_reg,
2235                  /*where=*/obj_reg,
2236                  MacroAssembler::MemBarAcq,
2237                  MacroAssembler::cmpxchgx_hint_acquire_lock());
2238 
2239   // reload markWord in mark_reg before continuing with lightweight locking
2240   ld(mark_reg, oopDesc::mark_offset_in_bytes(), obj_reg);
2241 
2242   // Fall through to the normal CAS-based lock, because no matter what
2243   // the result of the above CAS, some thread must have succeeded in
2244   // removing the bias bit from the object's header.
2245   if (PrintBiasedLockingStatistics) {
2246     Label l;
2247     bne(cr_reg, l);
2248     load_const(temp2_reg, (address) BiasedLocking::revoked_lock_entry_count_addr(), temp_reg);
2249     lwzx(temp_reg, temp2_reg);
2250     addi(temp_reg, temp_reg, 1);
2251     stwx(temp_reg, temp2_reg);
2252     bind(l);
2253   }
2254 
2255   bind(cas_label);
2256 }
2257 
2258 void MacroAssembler::biased_locking_exit (ConditionRegister cr_reg, Register mark_addr, Register temp_reg, Label& done) {
2259   // Check for biased locking unlock case, which is a no-op
2260   // Note: we do not have to check the thread ID for two reasons.
2261   // First, the interpreter checks for IllegalMonitorStateException at
2262   // a higher level. Second, if the bias was revoked while we held the
2263   // lock, the object could not be rebiased toward another thread, so
2264   // the bias bit would be clear.
2265 
2266   ld(temp_reg, 0, mark_addr);
2267   andi(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
2268 
2269   cmpwi(cr_reg, temp_reg, markWord::biased_lock_pattern);
2270   beq(cr_reg, done);
2271 }
2272 
2273 // allocation (for C1)
2274 void MacroAssembler::eden_allocate(
2275   Register obj,                      // result: pointer to object after successful allocation
2276   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2277   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2278   Register t1,                       // temp register
2279   Register t2,                       // temp register
2280   Label&   slow_case                 // continuation point if fast allocation fails
2281 ) {
2282   b(slow_case);
2283 }
2284 
2285 void MacroAssembler::tlab_allocate(
2286   Register obj,                      // result: pointer to object after successful allocation
2287   Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
2288   int      con_size_in_bytes,        // object size in bytes if   known at compile time
2289   Register t1,                       // temp register
2290   Label&   slow_case                 // continuation point if fast allocation fails
2291 ) {
2292   // make sure arguments make sense
2293   assert_different_registers(obj, var_size_in_bytes, t1);
2294   assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2295   assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2296 
2297   const Register new_top = t1;
2298   //verify_tlab(); not implemented
2299 
2300   ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2301   ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2302   if (var_size_in_bytes == noreg) {
2303     addi(new_top, obj, con_size_in_bytes);
2304   } else {
2305     add(new_top, obj, var_size_in_bytes);
2306   }
2307   cmpld(CCR0, new_top, R0);
2308   bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2309 
2310 #ifdef ASSERT
2311   // make sure new free pointer is properly aligned
2312   {
2313     Label L;
2314     andi_(R0, new_top, MinObjAlignmentInBytesMask);
2315     beq(CCR0, L);
2316     stop("updated TLAB free is not properly aligned", 0x934);
2317     bind(L);
2318   }
2319 #endif // ASSERT
2320 
2321   // update the tlab top pointer
2322   std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2323   //verify_tlab(); not implemented
2324 }
2325 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2326   unimplemented("incr_allocated_bytes");
2327 }
2328 
2329 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2330                                              int insts_call_instruction_offset, Register Rtoc) {
2331   // Start the stub.
2332   address stub = start_a_stub(64);
2333   if (stub == NULL) { return NULL; } // CodeCache full: bail out
2334 
2335   // Create a trampoline stub relocation which relates this trampoline stub
2336   // with the call instruction at insts_call_instruction_offset in the
2337   // instructions code-section.
2338   relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2339   const int stub_start_offset = offset();
2340 
2341   // For java_to_interp stubs we use R11_scratch1 as scratch register
2342   // and in call trampoline stubs we use R12_scratch2. This way we
2343   // can distinguish them (see is_NativeCallTrampolineStub_at()).
2344   Register reg_scratch = R12_scratch2;
2345 
2346   // Now, create the trampoline stub's code:
2347   // - load the TOC
2348   // - load the call target from the constant pool
2349   // - call
2350   if (Rtoc == noreg) {
2351     calculate_address_from_global_toc(reg_scratch, method_toc());
2352     Rtoc = reg_scratch;
2353   }
2354 
2355   ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2356   mtctr(reg_scratch);
2357   bctr();
2358 
2359   const address stub_start_addr = addr_at(stub_start_offset);
2360 
2361   // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2362   assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2363          "encoded offset into the constant pool must match");
2364   // Trampoline_stub_size should be good.
2365   assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2366   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2367 
2368   // End the stub.
2369   end_a_stub();
2370   return stub;
2371 }
2372 
2373 // TM on PPC64.
2374 void MacroAssembler::atomic_inc_ptr(Register addr, Register result, int simm16) {
2375   Label retry;
2376   bind(retry);
2377   ldarx(result, addr, /*hint*/ false);
2378   addi(result, result, simm16);
2379   stdcx_(result, addr);
2380   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2381     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2382   } else {
2383     bne(                  CCR0, retry); // stXcx_ sets CCR0
2384   }
2385 }
2386 
2387 void MacroAssembler::atomic_ori_int(Register addr, Register result, int uimm16) {
2388   Label retry;
2389   bind(retry);
2390   lwarx(result, addr, /*hint*/ false);
2391   ori(result, result, uimm16);
2392   stwcx_(result, addr);
2393   if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
2394     bne_predict_not_taken(CCR0, retry); // stXcx_ sets CCR0
2395   } else {
2396     bne(                  CCR0, retry); // stXcx_ sets CCR0
2397   }
2398 }
2399 
2400 #if INCLUDE_RTM_OPT
2401 
2402 // Update rtm_counters based on abort status
2403 // input: abort_status
2404 //        rtm_counters_Reg (RTMLockingCounters*)
2405 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters_Reg) {
2406   // Mapping to keep PreciseRTMLockingStatistics similar to x86.
2407   // x86 ppc (! means inverted, ? means not the same)
2408   //  0   31  Set if abort caused by XABORT instruction.
2409   //  1  ! 7  If set, the transaction may succeed on a retry. This bit is always clear if bit 0 is set.
2410   //  2   13  Set if another logical processor conflicted with a memory address that was part of the transaction that aborted.
2411   //  3   10  Set if an internal buffer overflowed.
2412   //  4  ?12  Set if a debug breakpoint was hit.
2413   //  5  ?32  Set if an abort occurred during execution of a nested transaction.
2414   const int failure_bit[] = {tm_tabort, // Signal handler will set this too.
2415                              tm_failure_persistent,
2416                              tm_non_trans_cf,
2417                              tm_trans_cf,
2418                              tm_footprint_of,
2419                              tm_failure_code,
2420                              tm_transaction_level};
2421 
2422   const int num_failure_bits = sizeof(failure_bit) / sizeof(int);
2423   const int num_counters = RTMLockingCounters::ABORT_STATUS_LIMIT;
2424 
2425   const int bit2counter_map[][num_counters] =
2426   // 0 = no map; 1 = mapped, no inverted logic; -1 = mapped, inverted logic
2427   // Inverted logic means that if a bit is set don't count it, or vice-versa.
2428   // Care must be taken when mapping bits to counters as bits for a given
2429   // counter must be mutually exclusive. Otherwise, the counter will be
2430   // incremented more than once.
2431   // counters:
2432   // 0        1        2         3         4         5
2433   // abort  , persist, conflict, overflow, debug   , nested         bits:
2434   {{ 1      , 0      , 0       , 0       , 0       , 0      },   // abort
2435    { 0      , -1     , 0       , 0       , 0       , 0      },   // failure_persistent
2436    { 0      , 0      , 1       , 0       , 0       , 0      },   // non_trans_cf
2437    { 0      , 0      , 1       , 0       , 0       , 0      },   // trans_cf
2438    { 0      , 0      , 0       , 1       , 0       , 0      },   // footprint_of
2439    { 0      , 0      , 0       , 0       , -1      , 0      },   // failure_code = 0xD4
2440    { 0      , 0      , 0       , 0       , 0       , 1      }};  // transaction_level > 1
2441   // ...
2442 
2443   // Move abort_status value to R0 and use abort_status register as a
2444   // temporary register because R0 as third operand in ld/std is treated
2445   // as base address zero (value). Likewise, R0 as second operand in addi
2446   // is problematic because it amounts to li.
2447   const Register temp_Reg = abort_status;
2448   const Register abort_status_R0 = R0;
2449   mr(abort_status_R0, abort_status);
2450 
2451   // Increment total abort counter.
2452   int counters_offs = RTMLockingCounters::abort_count_offset();
2453   ld(temp_Reg, counters_offs, rtm_counters_Reg);
2454   addi(temp_Reg, temp_Reg, 1);
2455   std(temp_Reg, counters_offs, rtm_counters_Reg);
2456 
2457   // Increment specific abort counters.
2458   if (PrintPreciseRTMLockingStatistics) {
2459 
2460     // #0 counter offset.
2461     int abortX_offs = RTMLockingCounters::abortX_count_offset();
2462 
2463     for (int nbit = 0; nbit < num_failure_bits; nbit++) {
2464       for (int ncounter = 0; ncounter < num_counters; ncounter++) {
2465         if (bit2counter_map[nbit][ncounter] != 0) {
2466           Label check_abort;
2467           int abort_counter_offs = abortX_offs + (ncounter << 3);
2468 
2469           if (failure_bit[nbit] == tm_transaction_level) {
2470             // Don't check outer transaction, TL = 1 (bit 63). Hence only
2471             // 11 bits in the TL field are checked to find out if failure
2472             // occured in a nested transaction. This check also matches
2473             // the case when nesting_of = 1 (nesting overflow).
2474             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 10);
2475           } else if (failure_bit[nbit] == tm_failure_code) {
2476             // Check failure code for trap or illegal caught in TM.
2477             // Bits 0:7 are tested as bit 7 (persistent) is copied from
2478             // tabort or treclaim source operand.
2479             // On Linux: trap or illegal is TM_CAUSE_SIGNAL (0xD4).
2480             rldicl(temp_Reg, abort_status_R0, 8, 56);
2481             cmpdi(CCR0, temp_Reg, 0xD4);
2482           } else {
2483             rldicr_(temp_Reg, abort_status_R0, failure_bit[nbit], 0);
2484           }
2485 
2486           if (bit2counter_map[nbit][ncounter] == 1) {
2487             beq(CCR0, check_abort);
2488           } else {
2489             bne(CCR0, check_abort);
2490           }
2491 
2492           // We don't increment atomically.
2493           ld(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2494           addi(temp_Reg, temp_Reg, 1);
2495           std(temp_Reg, abort_counter_offs, rtm_counters_Reg);
2496 
2497           bind(check_abort);
2498         }
2499       }
2500     }
2501   }
2502   // Restore abort_status.
2503   mr(abort_status, abort_status_R0);
2504 }
2505 
2506 // Branch if (random & (count-1) != 0), count is 2^n
2507 // tmp and CR0 are killed
2508 void MacroAssembler::branch_on_random_using_tb(Register tmp, int count, Label& brLabel) {
2509   mftb(tmp);
2510   andi_(tmp, tmp, count-1);
2511   bne(CCR0, brLabel);
2512 }
2513 
2514 // Perform abort ratio calculation, set no_rtm bit if high ratio.
2515 // input:  rtm_counters_Reg (RTMLockingCounters* address) - KILLED
2516 void MacroAssembler::rtm_abort_ratio_calculation(Register rtm_counters_Reg,
2517                                                  RTMLockingCounters* rtm_counters,
2518                                                  Metadata* method_data) {
2519   Label L_done, L_check_always_rtm1, L_check_always_rtm2;
2520 
2521   if (RTMLockingCalculationDelay > 0) {
2522     // Delay calculation.
2523     ld(rtm_counters_Reg, (RegisterOrConstant)(intptr_t)RTMLockingCounters::rtm_calculation_flag_addr());
2524     cmpdi(CCR0, rtm_counters_Reg, 0);
2525     beq(CCR0, L_done);
2526     load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2527   }
2528   // Abort ratio calculation only if abort_count > RTMAbortThreshold.
2529   //   Aborted transactions = abort_count * 100
2530   //   All transactions = total_count *  RTMTotalCountIncrRate
2531   //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
2532   ld(R0, RTMLockingCounters::abort_count_offset(), rtm_counters_Reg);
2533   if (is_simm(RTMAbortThreshold, 16)) {   // cmpdi can handle 16bit immediate only.
2534     cmpdi(CCR0, R0, RTMAbortThreshold);
2535     blt(CCR0, L_check_always_rtm2);  // reload of rtm_counters_Reg not necessary
2536   } else {
2537     load_const_optimized(rtm_counters_Reg, RTMAbortThreshold);
2538     cmpd(CCR0, R0, rtm_counters_Reg);
2539     blt(CCR0, L_check_always_rtm1);  // reload of rtm_counters_Reg required
2540   }
2541   mulli(R0, R0, 100);
2542 
2543   const Register tmpReg = rtm_counters_Reg;
2544   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2545   mulli(tmpReg, tmpReg, RTMTotalCountIncrRate); // allowable range: int16
2546   mulli(tmpReg, tmpReg, RTMAbortRatio);         // allowable range: int16
2547   cmpd(CCR0, R0, tmpReg);
2548   blt(CCR0, L_check_always_rtm1); // jump to reload
2549   if (method_data != NULL) {
2550     // Set rtm_state to "no rtm" in MDO.
2551     // Not using a metadata relocation. Method and Class Loader are kept alive anyway.
2552     // (See nmethod::metadata_do and CodeBuffer::finalize_oop_references.)
2553     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2554     atomic_ori_int(R0, tmpReg, NoRTM);
2555   }
2556   b(L_done);
2557 
2558   bind(L_check_always_rtm1);
2559   load_const_optimized(rtm_counters_Reg, (address)rtm_counters, R0); // reload
2560   bind(L_check_always_rtm2);
2561   ld(tmpReg, RTMLockingCounters::total_count_offset(), rtm_counters_Reg);
2562   int64_t thresholdValue = RTMLockingThreshold / RTMTotalCountIncrRate;
2563   if (is_simm(thresholdValue, 16)) {   // cmpdi can handle 16bit immediate only.
2564     cmpdi(CCR0, tmpReg, thresholdValue);
2565   } else {
2566     load_const_optimized(R0, thresholdValue);
2567     cmpd(CCR0, tmpReg, R0);
2568   }
2569   blt(CCR0, L_done);
2570   if (method_data != NULL) {
2571     // Set rtm_state to "always rtm" in MDO.
2572     // Not using a metadata relocation. See above.
2573     load_const(R0, (address)method_data + MethodData::rtm_state_offset_in_bytes(), tmpReg);
2574     atomic_ori_int(R0, tmpReg, UseRTM);
2575   }
2576   bind(L_done);
2577 }
2578 
2579 // Update counters and perform abort ratio calculation.
2580 // input: abort_status_Reg
2581 void MacroAssembler::rtm_profiling(Register abort_status_Reg, Register temp_Reg,
2582                                    RTMLockingCounters* rtm_counters,
2583                                    Metadata* method_data,
2584                                    bool profile_rtm) {
2585 
2586   assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2587   // Update rtm counters based on state at abort.
2588   // Reads abort_status_Reg, updates flags.
2589   assert_different_registers(abort_status_Reg, temp_Reg);
2590   load_const_optimized(temp_Reg, (address)rtm_counters, R0);
2591   rtm_counters_update(abort_status_Reg, temp_Reg);
2592   if (profile_rtm) {
2593     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2594     rtm_abort_ratio_calculation(temp_Reg, rtm_counters, method_data);
2595   }
2596 }
2597 
2598 // Retry on abort if abort's status indicates non-persistent failure.
2599 // inputs: retry_count_Reg
2600 //       : abort_status_Reg
2601 // output: retry_count_Reg decremented by 1
2602 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg,
2603                                              Label& retryLabel, Label* checkRetry) {
2604   Label doneRetry;
2605 
2606   // Don't retry if failure is persistent.
2607   // The persistent bit is set when a (A) Disallowed operation is performed in
2608   // transactional state, like for instance trying to write the TFHAR after a
2609   // transaction is started; or when there is (B) a Nesting Overflow (too many
2610   // nested transactions); or when (C) the Footprint overflows (too many
2611   // addressess touched in TM state so there is no more space in the footprint
2612   // area to track them); or in case of (D) a Self-Induced Conflict, i.e. a
2613   // store is performed to a given address in TM state, then once in suspended
2614   // state the same address is accessed. Failure (A) is very unlikely to occur
2615   // in the JVM. Failure (D) will never occur because Suspended state is never
2616   // used in the JVM. Thus mostly (B) a Nesting Overflow or (C) a Footprint
2617   // Overflow will set the persistent bit.
2618   rldicr_(R0, abort_status_Reg, tm_failure_persistent, 0);
2619   bne(CCR0, doneRetry);
2620 
2621   // Don't retry if transaction was deliberately aborted, i.e. caused by a
2622   // tabort instruction.
2623   rldicr_(R0, abort_status_Reg, tm_tabort, 0);
2624   bne(CCR0, doneRetry);
2625 
2626   // Retry if transaction aborted due to a conflict with another thread.
2627   if (checkRetry) { bind(*checkRetry); }
2628   addic_(retry_count_Reg, retry_count_Reg, -1);
2629   blt(CCR0, doneRetry);
2630   b(retryLabel);
2631   bind(doneRetry);
2632 }
2633 
2634 // Spin and retry if lock is busy.
2635 // inputs: owner_addr_Reg (monitor address)
2636 //       : retry_count_Reg
2637 // output: retry_count_Reg decremented by 1
2638 // CTR is killed
2639 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register owner_addr_Reg, Label& retryLabel) {
2640   Label SpinLoop, doneRetry, doRetry;
2641   addic_(retry_count_Reg, retry_count_Reg, -1);
2642   blt(CCR0, doneRetry);
2643 
2644   if (RTMSpinLoopCount > 1) {
2645     li(R0, RTMSpinLoopCount);
2646     mtctr(R0);
2647   }
2648 
2649   // low thread priority
2650   smt_prio_low();
2651   bind(SpinLoop);
2652 
2653   if (RTMSpinLoopCount > 1) {
2654     bdz(doRetry);
2655     ld(R0, 0, owner_addr_Reg);
2656     cmpdi(CCR0, R0, 0);
2657     bne(CCR0, SpinLoop);
2658   }
2659 
2660   bind(doRetry);
2661 
2662   // restore thread priority to default in userspace
2663 #ifdef LINUX
2664   smt_prio_medium_low();
2665 #else
2666   smt_prio_medium();
2667 #endif
2668 
2669   b(retryLabel);
2670 
2671   bind(doneRetry);
2672 }
2673 
2674 // Use RTM for normal stack locks.
2675 // Input: objReg (object to lock)
2676 void MacroAssembler::rtm_stack_locking(ConditionRegister flag,
2677                                        Register obj, Register mark_word, Register tmp,
2678                                        Register retry_on_abort_count_Reg,
2679                                        RTMLockingCounters* stack_rtm_counters,
2680                                        Metadata* method_data, bool profile_rtm,
2681                                        Label& DONE_LABEL, Label& IsInflated) {
2682   assert(UseRTMForStackLocks, "why call this otherwise?");
2683   assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2684   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2685 
2686   if (RTMRetryCount > 0) {
2687     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
2688     bind(L_rtm_retry);
2689   }
2690   andi_(R0, mark_word, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
2691   bne(CCR0, IsInflated);
2692 
2693   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2694     Label L_noincrement;
2695     if (RTMTotalCountIncrRate > 1) {
2696       branch_on_random_using_tb(tmp, RTMTotalCountIncrRate, L_noincrement);
2697     }
2698     assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
2699     load_const_optimized(tmp, (address)stack_rtm_counters->total_count_addr(), R0);
2700     //atomic_inc_ptr(tmp, /*temp, will be reloaded*/mark_word); We don't increment atomically
2701     ldx(mark_word, tmp);
2702     addi(mark_word, mark_word, 1);
2703     stdx(mark_word, tmp);
2704     bind(L_noincrement);
2705   }
2706   tbegin_();
2707   beq(CCR0, L_on_abort);
2708   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);      // Reload in transaction, conflicts need to be tracked.
2709   andi(R0, mark_word, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2710   cmpwi(flag, R0, markWord::unlocked_value);                // bits = 001 unlocked
2711   beq(flag, DONE_LABEL);                                    // all done if unlocked
2712 
2713   if (UseRTMXendForLockBusy) {
2714     tend_();
2715     b(L_decrement_retry);
2716   } else {
2717     tabort_();
2718   }
2719   bind(L_on_abort);
2720   const Register abort_status_Reg = tmp;
2721   mftexasr(abort_status_Reg);
2722   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2723     rtm_profiling(abort_status_Reg, /*temp*/mark_word, stack_rtm_counters, method_data, profile_rtm);
2724   }
2725   ld(mark_word, oopDesc::mark_offset_in_bytes(), obj); // reload
2726   if (RTMRetryCount > 0) {
2727     // Retry on lock abort if abort status is not permanent.
2728     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry, &L_decrement_retry);
2729   } else {
2730     bind(L_decrement_retry);
2731   }
2732 }
2733 
2734 // Use RTM for inflating locks
2735 // inputs: obj       (object to lock)
2736 //         mark_word (current header - KILLED)
2737 //         boxReg    (on-stack box address (displaced header location) - KILLED)
2738 void MacroAssembler::rtm_inflated_locking(ConditionRegister flag,
2739                                           Register obj, Register mark_word, Register boxReg,
2740                                           Register retry_on_busy_count_Reg, Register retry_on_abort_count_Reg,
2741                                           RTMLockingCounters* rtm_counters,
2742                                           Metadata* method_data, bool profile_rtm,
2743                                           Label& DONE_LABEL) {
2744   assert(UseRTMLocking, "why call this otherwise?");
2745   Label L_rtm_retry, L_decrement_retry, L_on_abort;
2746   // Clean monitor_value bit to get valid pointer.
2747   int owner_offset = ObjectMonitor::owner_offset_in_bytes() - markWord::monitor_value;
2748 
2749   // Store non-null, using boxReg instead of (intptr_t)markWord::unused_mark().
2750   std(boxReg, BasicLock::displaced_header_offset_in_bytes(), boxReg);
2751   const Register tmpReg = boxReg;
2752   const Register owner_addr_Reg = mark_word;
2753   addi(owner_addr_Reg, mark_word, owner_offset);
2754 
2755   if (RTMRetryCount > 0) {
2756     load_const_optimized(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy.
2757     load_const_optimized(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort.
2758     bind(L_rtm_retry);
2759   }
2760   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2761     Label L_noincrement;
2762     if (RTMTotalCountIncrRate > 1) {
2763       branch_on_random_using_tb(R0, RTMTotalCountIncrRate, L_noincrement);
2764     }
2765     assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
2766     load_const(R0, (address)rtm_counters->total_count_addr(), tmpReg);
2767     //atomic_inc_ptr(R0, tmpReg); We don't increment atomically
2768     ldx(tmpReg, R0);
2769     addi(tmpReg, tmpReg, 1);
2770     stdx(tmpReg, R0);
2771     bind(L_noincrement);
2772   }
2773   tbegin_();
2774   beq(CCR0, L_on_abort);
2775   // We don't reload mark word. Will only be reset at safepoint.
2776   ld(R0, 0, owner_addr_Reg); // Load in transaction, conflicts need to be tracked.
2777   cmpdi(flag, R0, 0);
2778   beq(flag, DONE_LABEL);
2779 
2780   if (UseRTMXendForLockBusy) {
2781     tend_();
2782     b(L_decrement_retry);
2783   } else {
2784     tabort_();
2785   }
2786   bind(L_on_abort);
2787   const Register abort_status_Reg = tmpReg;
2788   mftexasr(abort_status_Reg);
2789   if (PrintPreciseRTMLockingStatistics || profile_rtm) {
2790     rtm_profiling(abort_status_Reg, /*temp*/ owner_addr_Reg, rtm_counters, method_data, profile_rtm);
2791     // Restore owner_addr_Reg
2792     ld(mark_word, oopDesc::mark_offset_in_bytes(), obj);
2793 #ifdef ASSERT
2794     andi_(R0, mark_word, markWord::monitor_value);
2795     asm_assert_ne("must be inflated", 0xa754); // Deflating only allowed at safepoint.
2796 #endif
2797     addi(owner_addr_Reg, mark_word, owner_offset);
2798   }
2799   if (RTMRetryCount > 0) {
2800     // Retry on lock abort if abort status is not permanent.
2801     rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
2802   }
2803 
2804   // Appears unlocked - try to swing _owner from null to non-null.
2805   cmpxchgd(flag, /*current val*/ R0, (intptr_t)0, /*new val*/ R16_thread, owner_addr_Reg,
2806            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2807            MacroAssembler::cmpxchgx_hint_acquire_lock(), noreg, &L_decrement_retry, true);
2808 
2809   if (RTMRetryCount > 0) {
2810     // success done else retry
2811     b(DONE_LABEL);
2812     bind(L_decrement_retry);
2813     // Spin and retry if lock is busy.
2814     rtm_retry_lock_on_busy(retry_on_busy_count_Reg, owner_addr_Reg, L_rtm_retry);
2815   } else {
2816     bind(L_decrement_retry);
2817   }
2818 }
2819 
2820 #endif //  INCLUDE_RTM_OPT
2821 
2822 // "The box" is the space on the stack where we copy the object mark.
2823 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2824                                                Register temp, Register displaced_header, Register current_header,
2825                                                bool try_bias,
2826                                                RTMLockingCounters* rtm_counters,
2827                                                RTMLockingCounters* stack_rtm_counters,
2828                                                Metadata* method_data,
2829                                                bool use_rtm, bool profile_rtm) {
2830   assert_different_registers(oop, box, temp, displaced_header, current_header);
2831   assert(flag != CCR0, "bad condition register");
2832   Label cont;
2833   Label object_has_monitor;
2834   Label cas_failed;
2835 
2836   // Load markWord from object into displaced_header.
2837   ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2838 
2839 
2840   if (try_bias) {
2841     biased_locking_enter(flag, oop, displaced_header, temp, current_header, cont);
2842   }
2843 
2844 #if INCLUDE_RTM_OPT
2845   if (UseRTMForStackLocks && use_rtm) {
2846     rtm_stack_locking(flag, oop, displaced_header, temp, /*temp*/ current_header,
2847                       stack_rtm_counters, method_data, profile_rtm,
2848                       cont, object_has_monitor);
2849   }
2850 #endif // INCLUDE_RTM_OPT
2851 
2852   // Handle existing monitor.
2853   // The object has an existing monitor iff (mark & monitor_value) != 0.
2854   andi_(temp, displaced_header, markWord::monitor_value);
2855   bne(CCR0, object_has_monitor);
2856 
2857   // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2858   ori(displaced_header, displaced_header, markWord::unlocked_value);
2859 
2860   // Load Compare Value application register.
2861 
2862   // Initialize the box. (Must happen before we update the object mark!)
2863   std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2864 
2865   // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2866   // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2867   cmpxchgd(/*flag=*/flag,
2868            /*current_value=*/current_header,
2869            /*compare_value=*/displaced_header,
2870            /*exchange_value=*/box,
2871            /*where=*/oop,
2872            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2873            MacroAssembler::cmpxchgx_hint_acquire_lock(),
2874            noreg,
2875            &cas_failed,
2876            /*check without membar and ldarx first*/true);
2877   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2878 
2879   // If the compare-and-exchange succeeded, then we found an unlocked
2880   // object and we have now locked it.
2881   b(cont);
2882 
2883   bind(cas_failed);
2884   // We did not see an unlocked object so try the fast recursive case.
2885 
2886   // Check if the owner is self by comparing the value in the markWord of object
2887   // (current_header) with the stack pointer.
2888   sub(current_header, current_header, R1_SP);
2889   load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2890 
2891   and_(R0/*==0?*/, current_header, temp);
2892   // If condition is true we are cont and hence we can store 0 as the
2893   // displaced header in the box, which indicates that it is a recursive lock.
2894   mcrf(flag,CCR0);
2895   std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2896 
2897   // Handle existing monitor.
2898   b(cont);
2899 
2900   bind(object_has_monitor);
2901   // The object's monitor m is unlocked iff m->owner == NULL,
2902   // otherwise m->owner may contain a thread or a stack address.
2903 
2904 #if INCLUDE_RTM_OPT
2905   // Use the same RTM locking code in 32- and 64-bit VM.
2906   if (use_rtm) {
2907     rtm_inflated_locking(flag, oop, displaced_header, box, temp, /*temp*/ current_header,
2908                          rtm_counters, method_data, profile_rtm, cont);
2909   } else {
2910 #endif // INCLUDE_RTM_OPT
2911 
2912   // Try to CAS m->owner from NULL to current thread.
2913   addi(temp, displaced_header, ObjectMonitor::owner_offset_in_bytes()-markWord::monitor_value);
2914   cmpxchgd(/*flag=*/flag,
2915            /*current_value=*/current_header,
2916            /*compare_value=*/(intptr_t)0,
2917            /*exchange_value=*/R16_thread,
2918            /*where=*/temp,
2919            MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2920            MacroAssembler::cmpxchgx_hint_acquire_lock());
2921 
2922   // Store a non-null value into the box.
2923   std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2924 
2925 # ifdef ASSERT
2926   bne(flag, cont);
2927   // We have acquired the monitor, check some invariants.
2928   addi(/*monitor=*/temp, temp, -ObjectMonitor::owner_offset_in_bytes());
2929   // Invariant 1: _recursions should be 0.
2930   //assert(ObjectMonitor::recursions_size_in_bytes() == 8, "unexpected size");
2931   asm_assert_mem8_is_zero(ObjectMonitor::recursions_offset_in_bytes(), temp,
2932                             "monitor->_recursions should be 0", -1);
2933 # endif
2934 
2935 #if INCLUDE_RTM_OPT
2936   } // use_rtm()
2937 #endif
2938 
2939   bind(cont);
2940   // flag == EQ indicates success
2941   // flag == NE indicates failure
2942 }
2943 
2944 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2945                                                  Register temp, Register displaced_header, Register current_header,
2946                                                  bool try_bias, bool use_rtm) {
2947   assert_different_registers(oop, box, temp, displaced_header, current_header);
2948   assert(flag != CCR0, "bad condition register");
2949   Label cont;
2950   Label object_has_monitor;
2951 
2952   if (try_bias) {
2953     biased_locking_exit(flag, oop, current_header, cont);
2954   }
2955 
2956 #if INCLUDE_RTM_OPT
2957   if (UseRTMForStackLocks && use_rtm) {
2958     assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
2959     Label L_regular_unlock;
2960     ld(current_header, oopDesc::mark_offset_in_bytes(), oop);      // fetch markword
2961     andi(R0, current_header, markWord::biased_lock_mask_in_place); // look at 3 lock bits
2962     cmpwi(flag, R0, markWord::unlocked_value);                     // bits = 001 unlocked
2963     bne(flag, L_regular_unlock);                                   // else RegularLock
2964     tend_();                                                       // otherwise end...
2965     b(cont);                                                       // ... and we're done
2966     bind(L_regular_unlock);
2967   }
2968 #endif
2969 
2970   // Find the lock address and load the displaced header from the stack.
2971   ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2972 
2973   // If the displaced header is 0, we have a recursive unlock.
2974   cmpdi(flag, displaced_header, 0);
2975   beq(flag, cont);
2976 
2977   // Handle existing monitor.
2978   // The object has an existing monitor iff (mark & monitor_value) != 0.
2979   RTM_OPT_ONLY( if (!(UseRTMForStackLocks && use_rtm)) ) // skip load if already done
2980   ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2981   andi_(R0, current_header, markWord::monitor_value);
2982   bne(CCR0, object_has_monitor);
2983 
2984   // Check if it is still a light weight lock, this is is true if we see
2985   // the stack address of the basicLock in the markWord of the object.
2986   // Cmpxchg sets flag to cmpd(current_header, box).
2987   cmpxchgd(/*flag=*/flag,
2988            /*current_value=*/current_header,
2989            /*compare_value=*/box,
2990            /*exchange_value=*/displaced_header,
2991            /*where=*/oop,
2992            MacroAssembler::MemBarRel,
2993            MacroAssembler::cmpxchgx_hint_release_lock(),
2994            noreg,
2995            &cont);
2996 
2997   assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2998 
2999   // Handle existing monitor.
3000   b(cont);
3001 
3002   bind(object_has_monitor);
3003   STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
3004   addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
3005   ld(temp,             ObjectMonitor::owner_offset_in_bytes(), current_header);
3006 
3007     // It's inflated.
3008 #if INCLUDE_RTM_OPT
3009   if (use_rtm) {
3010     Label L_regular_inflated_unlock;
3011     // Clean monitor_value bit to get valid pointer
3012     cmpdi(flag, temp, 0);
3013     bne(flag, L_regular_inflated_unlock);
3014     tend_();
3015     b(cont);
3016     bind(L_regular_inflated_unlock);
3017   }
3018 #endif
3019 
3020   ld(displaced_header, ObjectMonitor::recursions_offset_in_bytes(), current_header);
3021   xorr(temp, R16_thread, temp);      // Will be 0 if we are the owner.
3022   orr(temp, temp, displaced_header); // Will be 0 if there are 0 recursions.
3023   cmpdi(flag, temp, 0);
3024   bne(flag, cont);
3025 
3026   ld(temp,             ObjectMonitor::EntryList_offset_in_bytes(), current_header);
3027   ld(displaced_header, ObjectMonitor::cxq_offset_in_bytes(), current_header);
3028   orr(temp, temp, displaced_header); // Will be 0 if both are 0.
3029   cmpdi(flag, temp, 0);
3030   bne(flag, cont);
3031   release();
3032   std(temp, ObjectMonitor::owner_offset_in_bytes(), current_header);
3033 
3034   bind(cont);
3035   // flag == EQ indicates success
3036   // flag == NE indicates failure
3037 }
3038 
3039 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp_reg) {
3040   if (SafepointMechanism::uses_thread_local_poll()) {
3041     ld(temp_reg, in_bytes(Thread::polling_page_offset()), R16_thread);
3042     // Armed page has poll_bit set.
3043     andi_(temp_reg, temp_reg, SafepointMechanism::poll_bit());
3044   } else {
3045     lwz(temp_reg, (RegisterOrConstant)(intptr_t)SafepointSynchronize::address_of_state());
3046     cmpwi(CCR0, temp_reg, SafepointSynchronize::_not_synchronized);
3047   }
3048   bne(CCR0, slow_path);
3049 }
3050 
3051 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, bool needs_frame) {
3052   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3053   bs->resolve_jobject(this, value, tmp1, tmp2, needs_frame);
3054 }
3055 
3056 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3057 // in frame_ppc.hpp.
3058 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3059   // Always set last_Java_pc and flags first because once last_Java_sp
3060   // is visible has_last_Java_frame is true and users will look at the
3061   // rest of the fields. (Note: flags should always be zero before we
3062   // get here so doesn't need to be set.)
3063 
3064   // Verify that last_Java_pc was zeroed on return to Java
3065   asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3066                           "last_Java_pc not zeroed before leaving Java", 0x200);
3067 
3068   // When returning from calling out from Java mode the frame anchor's
3069   // last_Java_pc will always be set to NULL. It is set here so that
3070   // if we are doing a call to native (not VM) that we capture the
3071   // known pc and don't have to rely on the native call having a
3072   // standard frame linkage where we can find the pc.
3073   if (last_Java_pc != noreg)
3074     std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3075 
3076   // Set last_Java_sp last.
3077   std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3078 }
3079 
3080 void MacroAssembler::reset_last_Java_frame(void) {
3081   asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3082                              R16_thread, "SP was not set, still zero", 0x202);
3083 
3084   BLOCK_COMMENT("reset_last_Java_frame {");
3085   li(R0, 0);
3086 
3087   // _last_Java_sp = 0
3088   std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3089 
3090   // _last_Java_pc = 0
3091   std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3092   BLOCK_COMMENT("} reset_last_Java_frame");
3093 }
3094 
3095 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
3096   assert_different_registers(sp, tmp1);
3097 
3098   // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
3099   // TOP_IJAVA_FRAME_ABI.
3100   // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
3101   address entry = pc();
3102   load_const_optimized(tmp1, entry);
3103 
3104   set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3105 }
3106 
3107 void MacroAssembler::get_vm_result(Register oop_result) {
3108   // Read:
3109   //   R16_thread
3110   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3111   //
3112   // Updated:
3113   //   oop_result
3114   //   R16_thread->in_bytes(JavaThread::vm_result_offset())
3115 
3116   verify_thread();
3117 
3118   ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3119   li(R0, 0);
3120   std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
3121 
3122   verify_oop(oop_result);
3123 }
3124 
3125 void MacroAssembler::get_vm_result_2(Register metadata_result) {
3126   // Read:
3127   //   R16_thread
3128   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3129   //
3130   // Updated:
3131   //   metadata_result
3132   //   R16_thread->in_bytes(JavaThread::vm_result_2_offset())
3133 
3134   ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3135   li(R0, 0);
3136   std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
3137 }
3138 
3139 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3140   Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3141   if (CompressedKlassPointers::base() != 0) {
3142     // Use dst as temp if it is free.
3143     sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3144     current = dst;
3145   }
3146   if (CompressedKlassPointers::shift() != 0) {
3147     srdi(dst, current, CompressedKlassPointers::shift());
3148     current = dst;
3149   }
3150   return current;
3151 }
3152 
3153 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3154   if (UseCompressedClassPointers) {
3155     Register compressedKlass = encode_klass_not_null(ck, klass);
3156     stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3157   } else {
3158     std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
3159   }
3160 }
3161 
3162 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3163   if (UseCompressedClassPointers) {
3164     if (val == noreg) {
3165       val = R0;
3166       li(val, 0);
3167     }
3168     stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
3169   }
3170 }
3171 
3172 int MacroAssembler::instr_size_for_decode_klass_not_null() {
3173   if (!UseCompressedClassPointers) return 0;
3174   int num_instrs = 1;  // shift or move
3175   if (CompressedKlassPointers::base() != 0) num_instrs = 7;  // shift + load const + add
3176   return num_instrs * BytesPerInstWord;
3177 }
3178 
3179 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3180   assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3181   if (src == noreg) src = dst;
3182   Register shifted_src = src;
3183   if (CompressedKlassPointers::shift() != 0 ||
3184       CompressedKlassPointers::base() == 0 && src != dst) {  // Move required.
3185     shifted_src = dst;
3186     sldi(shifted_src, src, CompressedKlassPointers::shift());
3187   }
3188   if (CompressedKlassPointers::base() != 0) {
3189     add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3190   }
3191 }
3192 
3193 void MacroAssembler::load_klass(Register dst, Register src) {
3194   if (UseCompressedClassPointers) {
3195     lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3196     // Attention: no null check here!
3197     decode_klass_not_null(dst, dst);
3198   } else {
3199     ld(dst, oopDesc::klass_offset_in_bytes(), src);
3200   }
3201 }
3202 
3203 // ((OopHandle)result).resolve();
3204 void MacroAssembler::resolve_oop_handle(Register result) {
3205   // OopHandle::resolve is an indirection.
3206   ld(result, 0, result);
3207 }
3208 
3209 void MacroAssembler::load_mirror_from_const_method(Register mirror, Register const_method) {
3210   ld(mirror, in_bytes(ConstMethod::constants_offset()), const_method);
3211   ld(mirror, ConstantPool::pool_holder_offset_in_bytes(), mirror);
3212   ld(mirror, in_bytes(Klass::java_mirror_offset()), mirror);
3213   resolve_oop_handle(mirror);
3214 }
3215 
3216 void MacroAssembler::load_method_holder(Register holder, Register method) {
3217   ld(holder, in_bytes(Method::const_offset()), method);
3218   ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3219   ld(holder, ConstantPool::pool_holder_offset_in_bytes(), holder);
3220 }
3221 
3222 // Clear Array
3223 // For very short arrays. tmp == R0 is allowed.
3224 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3225   if (cnt_dwords > 0) { li(tmp, 0); }
3226   for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3227 }
3228 
3229 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3230 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3231   if (cnt_dwords < 8) {
3232     clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3233     return;
3234   }
3235 
3236   Label loop;
3237   const long loopcnt   = cnt_dwords >> 1,
3238              remainder = cnt_dwords & 1;
3239 
3240   li(tmp, loopcnt);
3241   mtctr(tmp);
3242   li(tmp, 0);
3243   bind(loop);
3244     std(tmp, 0, base_ptr);
3245     std(tmp, 8, base_ptr);
3246     addi(base_ptr, base_ptr, 16);
3247     bdnz(loop);
3248   if (remainder) { std(tmp, 0, base_ptr); }
3249 }
3250 
3251 // Kills both input registers. tmp == R0 is allowed.
3252 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3253   // Procedure for large arrays (uses data cache block zero instruction).
3254     Label startloop, fast, fastloop, small_rest, restloop, done;
3255     const int cl_size         = VM_Version::L1_data_cache_line_size(),
3256               cl_dwords       = cl_size >> 3,
3257               cl_dw_addr_bits = exact_log2(cl_dwords),
3258               dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
3259               min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3260 
3261   if (const_cnt >= 0) {
3262     // Constant case.
3263     if (const_cnt < min_cnt) {
3264       clear_memory_constlen(base_ptr, const_cnt, tmp);
3265       return;
3266     }
3267     load_const_optimized(cnt_dwords, const_cnt, tmp);
3268   } else {
3269     // cnt_dwords already loaded in register. Need to check size.
3270     cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3271     blt(CCR1, small_rest);
3272   }
3273     rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3274     beq(CCR0, fast);                                  // Already 128byte aligned.
3275 
3276     subfic(tmp, tmp, cl_dwords);
3277     mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3278     subf(cnt_dwords, tmp, cnt_dwords); // rest.
3279     li(tmp, 0);
3280 
3281   bind(startloop);                     // Clear at the beginning to reach 128byte boundary.
3282     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3283     addi(base_ptr, base_ptr, 8);
3284     bdnz(startloop);
3285 
3286   bind(fast);                                  // Clear 128byte blocks.
3287     srdi(tmp, cnt_dwords, cl_dw_addr_bits);    // Loop count for 128byte loop (>0).
3288     andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3289     mtctr(tmp);                                // Load counter.
3290 
3291   bind(fastloop);
3292     dcbz(base_ptr);                    // Clear 128byte aligned block.
3293     addi(base_ptr, base_ptr, cl_size);
3294     bdnz(fastloop);
3295 
3296   bind(small_rest);
3297     cmpdi(CCR0, cnt_dwords, 0);        // size 0?
3298     beq(CCR0, done);                   // rest == 0
3299     li(tmp, 0);
3300     mtctr(cnt_dwords);                 // Load counter.
3301 
3302   bind(restloop);                      // Clear rest.
3303     std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
3304     addi(base_ptr, base_ptr, 8);
3305     bdnz(restloop);
3306 
3307   bind(done);
3308 }
3309 
3310 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3311 
3312 #ifdef COMPILER2
3313 // Intrinsics for CompactStrings
3314 
3315 // Compress char[] to byte[] by compressing 16 bytes at once.
3316 void MacroAssembler::string_compress_16(Register src, Register dst, Register cnt,
3317                                         Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
3318                                         Label& Lfailure) {
3319 
3320   const Register tmp0 = R0;
3321   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3322   Label Lloop, Lslow;
3323 
3324   // Check if cnt >= 8 (= 16 bytes)
3325   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF00FF00FF
3326   srwi_(tmp2, cnt, 3);
3327   beq(CCR0, Lslow);
3328   ori(tmp1, tmp1, 0xFF);
3329   rldimi(tmp1, tmp1, 32, 0);
3330   mtctr(tmp2);
3331 
3332   // 2x unrolled loop
3333   bind(Lloop);
3334   ld(tmp2, 0, src);               // _0_1_2_3 (Big Endian)
3335   ld(tmp4, 8, src);               // _4_5_6_7
3336 
3337   orr(tmp0, tmp2, tmp4);
3338   rldicl(tmp3, tmp2, 6*8, 64-24); // _____1_2
3339   rldimi(tmp2, tmp2, 2*8, 2*8);   // _0_2_3_3
3340   rldicl(tmp5, tmp4, 6*8, 64-24); // _____5_6
3341   rldimi(tmp4, tmp4, 2*8, 2*8);   // _4_6_7_7
3342 
3343   andc_(tmp0, tmp0, tmp1);
3344   bne(CCR0, Lfailure);            // Not latin1.
3345   addi(src, src, 16);
3346 
3347   rlwimi(tmp3, tmp2, 0*8, 24, 31);// _____1_3
3348   srdi(tmp2, tmp2, 3*8);          // ____0_2_
3349   rlwimi(tmp5, tmp4, 0*8, 24, 31);// _____5_7
3350   srdi(tmp4, tmp4, 3*8);          // ____4_6_
3351 
3352   orr(tmp2, tmp2, tmp3);          // ____0123
3353   orr(tmp4, tmp4, tmp5);          // ____4567
3354 
3355   stw(tmp2, 0, dst);
3356   stw(tmp4, 4, dst);
3357   addi(dst, dst, 8);
3358   bdnz(Lloop);
3359 
3360   bind(Lslow);                    // Fallback to slow version
3361 }
3362 
3363 // Compress char[] to byte[]. cnt must be positive int.
3364 void MacroAssembler::string_compress(Register src, Register dst, Register cnt, Register tmp, Label& Lfailure) {
3365   Label Lloop;
3366   mtctr(cnt);
3367 
3368   bind(Lloop);
3369   lhz(tmp, 0, src);
3370   cmplwi(CCR0, tmp, 0xff);
3371   bgt(CCR0, Lfailure);            // Not latin1.
3372   addi(src, src, 2);
3373   stb(tmp, 0, dst);
3374   addi(dst, dst, 1);
3375   bdnz(Lloop);
3376 }
3377 
3378 // Inflate byte[] to char[] by inflating 16 bytes at once.
3379 void MacroAssembler::string_inflate_16(Register src, Register dst, Register cnt,
3380                                        Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
3381   const Register tmp0 = R0;
3382   assert_different_registers(src, dst, cnt, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
3383   Label Lloop, Lslow;
3384 
3385   // Check if cnt >= 8
3386   srwi_(tmp2, cnt, 3);
3387   beq(CCR0, Lslow);
3388   lis(tmp1, 0xFF);                // tmp1 = 0x00FF00FF
3389   ori(tmp1, tmp1, 0xFF);
3390   mtctr(tmp2);
3391 
3392   // 2x unrolled loop
3393   bind(Lloop);
3394   lwz(tmp2, 0, src);              // ____0123 (Big Endian)
3395   lwz(tmp4, 4, src);              // ____4567
3396   addi(src, src, 8);
3397 
3398   rldicl(tmp3, tmp2, 7*8, 64-8);  // _______2
3399   rlwimi(tmp2, tmp2, 3*8, 16, 23);// ____0113
3400   rldicl(tmp5, tmp4, 7*8, 64-8);  // _______6
3401   rlwimi(tmp4, tmp4, 3*8, 16, 23);// ____4557
3402 
3403   andc(tmp0, tmp2, tmp1);         // ____0_1_
3404   rlwimi(tmp2, tmp3, 2*8, 0, 23); // _____2_3
3405   andc(tmp3, tmp4, tmp1);         // ____4_5_
3406   rlwimi(tmp4, tmp5, 2*8, 0, 23); // _____6_7
3407 
3408   rldimi(tmp2, tmp0, 3*8, 0*8);   // _0_1_2_3
3409   rldimi(tmp4, tmp3, 3*8, 0*8);   // _4_5_6_7
3410 
3411   std(tmp2, 0, dst);
3412   std(tmp4, 8, dst);
3413   addi(dst, dst, 16);
3414   bdnz(Lloop);
3415 
3416   bind(Lslow);                    // Fallback to slow version
3417 }
3418 
3419 // Inflate byte[] to char[]. cnt must be positive int.
3420 void MacroAssembler::string_inflate(Register src, Register dst, Register cnt, Register tmp) {
3421   Label Lloop;
3422   mtctr(cnt);
3423 
3424   bind(Lloop);
3425   lbz(tmp, 0, src);
3426   addi(src, src, 1);
3427   sth(tmp, 0, dst);
3428   addi(dst, dst, 2);
3429   bdnz(Lloop);
3430 }
3431 
3432 void MacroAssembler::string_compare(Register str1, Register str2,
3433                                     Register cnt1, Register cnt2,
3434                                     Register tmp1, Register result, int ae) {
3435   const Register tmp0 = R0,
3436                  diff = tmp1;
3437 
3438   assert_different_registers(str1, str2, cnt1, cnt2, tmp0, tmp1, result);
3439   Label Ldone, Lslow, Lloop, Lreturn_diff;
3440 
3441   // Note: Making use of the fact that compareTo(a, b) == -compareTo(b, a)
3442   // we interchange str1 and str2 in the UL case and negate the result.
3443   // Like this, str1 is always latin1 encoded, except for the UU case.
3444   // In addition, we need 0 (or sign which is 0) extend.
3445 
3446   if (ae == StrIntrinsicNode::UU) {
3447     srwi(cnt1, cnt1, 1);
3448   } else {
3449     clrldi(cnt1, cnt1, 32);
3450   }
3451 
3452   if (ae != StrIntrinsicNode::LL) {
3453     srwi(cnt2, cnt2, 1);
3454   } else {
3455     clrldi(cnt2, cnt2, 32);
3456   }
3457 
3458   // See if the lengths are different, and calculate min in cnt1.
3459   // Save diff in case we need it for a tie-breaker.
3460   subf_(diff, cnt2, cnt1); // diff = cnt1 - cnt2
3461   // if (diff > 0) { cnt1 = cnt2; }
3462   if (VM_Version::has_isel()) {
3463     isel(cnt1, CCR0, Assembler::greater, /*invert*/ false, cnt2);
3464   } else {
3465     Label Lskip;
3466     blt(CCR0, Lskip);
3467     mr(cnt1, cnt2);
3468     bind(Lskip);
3469   }
3470 
3471   // Rename registers
3472   Register chr1 = result;
3473   Register chr2 = tmp0;
3474 
3475   // Compare multiple characters in fast loop (only implemented for same encoding).
3476   int stride1 = 8, stride2 = 8;
3477   if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
3478     int log2_chars_per_iter = (ae == StrIntrinsicNode::LL) ? 3 : 2;
3479     Label Lfastloop, Lskipfast;
3480 
3481     srwi_(tmp0, cnt1, log2_chars_per_iter);
3482     beq(CCR0, Lskipfast);
3483     rldicl(cnt2, cnt1, 0, 64 - log2_chars_per_iter); // Remaining characters.
3484     li(cnt1, 1 << log2_chars_per_iter); // Initialize for failure case: Rescan characters from current iteration.
3485     mtctr(tmp0);
3486 
3487     bind(Lfastloop);
3488     ld(chr1, 0, str1);
3489     ld(chr2, 0, str2);
3490     cmpd(CCR0, chr1, chr2);
3491     bne(CCR0, Lslow);
3492     addi(str1, str1, stride1);
3493     addi(str2, str2, stride2);
3494     bdnz(Lfastloop);
3495     mr(cnt1, cnt2); // Remaining characters.
3496     bind(Lskipfast);
3497   }
3498 
3499   // Loop which searches the first difference character by character.
3500   cmpwi(CCR0, cnt1, 0);
3501   beq(CCR0, Lreturn_diff);
3502   bind(Lslow);
3503   mtctr(cnt1);
3504 
3505   switch (ae) {
3506     case StrIntrinsicNode::LL: stride1 = 1; stride2 = 1; break;
3507     case StrIntrinsicNode::UL: // fallthru (see comment above)
3508     case StrIntrinsicNode::LU: stride1 = 1; stride2 = 2; break;
3509     case StrIntrinsicNode::UU: stride1 = 2; stride2 = 2; break;
3510     default: ShouldNotReachHere(); break;
3511   }
3512 
3513   bind(Lloop);
3514   if (stride1 == 1) { lbz(chr1, 0, str1); } else { lhz(chr1, 0, str1); }
3515   if (stride2 == 1) { lbz(chr2, 0, str2); } else { lhz(chr2, 0, str2); }
3516   subf_(result, chr2, chr1); // result = chr1 - chr2
3517   bne(CCR0, Ldone);
3518   addi(str1, str1, stride1);
3519   addi(str2, str2, stride2);
3520   bdnz(Lloop);
3521 
3522   // If strings are equal up to min length, return the length difference.
3523   bind(Lreturn_diff);
3524   mr(result, diff);
3525 
3526   // Otherwise, return the difference between the first mismatched chars.
3527   bind(Ldone);
3528   if (ae == StrIntrinsicNode::UL) {
3529     neg(result, result); // Negate result (see note above).
3530   }
3531 }
3532 
3533 void MacroAssembler::array_equals(bool is_array_equ, Register ary1, Register ary2,
3534                                   Register limit, Register tmp1, Register result, bool is_byte) {
3535   const Register tmp0 = R0;
3536   assert_different_registers(ary1, ary2, limit, tmp0, tmp1, result);
3537   Label Ldone, Lskiploop, Lloop, Lfastloop, Lskipfast;
3538   bool limit_needs_shift = false;
3539 
3540   if (is_array_equ) {
3541     const int length_offset = arrayOopDesc::length_offset_in_bytes();
3542     const int base_offset   = arrayOopDesc::base_offset_in_bytes(is_byte ? T_BYTE : T_CHAR);
3543 
3544     // Return true if the same array.
3545     cmpd(CCR0, ary1, ary2);
3546     beq(CCR0, Lskiploop);
3547 
3548     // Return false if one of them is NULL.
3549     cmpdi(CCR0, ary1, 0);
3550     cmpdi(CCR1, ary2, 0);
3551     li(result, 0);
3552     cror(CCR0, Assembler::equal, CCR1, Assembler::equal);
3553     beq(CCR0, Ldone);
3554 
3555     // Load the lengths of arrays.
3556     lwz(limit, length_offset, ary1);
3557     lwz(tmp0, length_offset, ary2);
3558 
3559     // Return false if the two arrays are not equal length.
3560     cmpw(CCR0, limit, tmp0);
3561     bne(CCR0, Ldone);
3562 
3563     // Load array addresses.
3564     addi(ary1, ary1, base_offset);
3565     addi(ary2, ary2, base_offset);
3566   } else {
3567     limit_needs_shift = !is_byte;
3568     li(result, 0); // Assume not equal.
3569   }
3570 
3571   // Rename registers
3572   Register chr1 = tmp0;
3573   Register chr2 = tmp1;
3574 
3575   // Compare 8 bytes per iteration in fast loop.
3576   const int log2_chars_per_iter = is_byte ? 3 : 2;
3577 
3578   srwi_(tmp0, limit, log2_chars_per_iter + (limit_needs_shift ? 1 : 0));
3579   beq(CCR0, Lskipfast);
3580   mtctr(tmp0);
3581 
3582   bind(Lfastloop);
3583   ld(chr1, 0, ary1);
3584   ld(chr2, 0, ary2);
3585   addi(ary1, ary1, 8);
3586   addi(ary2, ary2, 8);
3587   cmpd(CCR0, chr1, chr2);
3588   bne(CCR0, Ldone);
3589   bdnz(Lfastloop);
3590 
3591   bind(Lskipfast);
3592   rldicl_(limit, limit, limit_needs_shift ? 64 - 1 : 0, 64 - log2_chars_per_iter); // Remaining characters.
3593   beq(CCR0, Lskiploop);
3594   mtctr(limit);
3595 
3596   // Character by character.
3597   bind(Lloop);
3598   if (is_byte) {
3599     lbz(chr1, 0, ary1);
3600     lbz(chr2, 0, ary2);
3601     addi(ary1, ary1, 1);
3602     addi(ary2, ary2, 1);
3603   } else {
3604     lhz(chr1, 0, ary1);
3605     lhz(chr2, 0, ary2);
3606     addi(ary1, ary1, 2);
3607     addi(ary2, ary2, 2);
3608   }
3609   cmpw(CCR0, chr1, chr2);
3610   bne(CCR0, Ldone);
3611   bdnz(Lloop);
3612 
3613   bind(Lskiploop);
3614   li(result, 1); // All characters are equal.
3615   bind(Ldone);
3616 }
3617 
3618 void MacroAssembler::string_indexof(Register result, Register haystack, Register haycnt,
3619                                     Register needle, ciTypeArray* needle_values, Register needlecnt, int needlecntval,
3620                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, int ae) {
3621 
3622   // Ensure 0<needlecnt<=haycnt in ideal graph as prerequisite!
3623   Label L_TooShort, L_Found, L_NotFound, L_End;
3624   Register last_addr = haycnt, // Kill haycnt at the beginning.
3625   addr      = tmp1,
3626   n_start   = tmp2,
3627   ch1       = tmp3,
3628   ch2       = R0;
3629 
3630   assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
3631   const int h_csize = (ae == StrIntrinsicNode::LL) ? 1 : 2;
3632   const int n_csize = (ae == StrIntrinsicNode::UU) ? 2 : 1;
3633 
3634   // **************************************************************************************************
3635   // Prepare for main loop: optimized for needle count >=2, bail out otherwise.
3636   // **************************************************************************************************
3637 
3638   // Compute last haystack addr to use if no match gets found.
3639   clrldi(haycnt, haycnt, 32);         // Ensure positive int is valid as 64 bit value.
3640   addi(addr, haystack, -h_csize);     // Accesses use pre-increment.
3641   if (needlecntval == 0) { // variable needlecnt
3642    cmpwi(CCR6, needlecnt, 2);
3643    clrldi(needlecnt, needlecnt, 32);  // Ensure positive int is valid as 64 bit value.
3644    blt(CCR6, L_TooShort);             // Variable needlecnt: handle short needle separately.
3645   }
3646 
3647   if (n_csize == 2) { lwz(n_start, 0, needle); } else { lhz(n_start, 0, needle); } // Load first 2 characters of needle.
3648 
3649   if (needlecntval == 0) { // variable needlecnt
3650    subf(ch1, needlecnt, haycnt);      // Last character index to compare is haycnt-needlecnt.
3651    addi(needlecnt, needlecnt, -2);    // Rest of needle.
3652   } else { // constant needlecnt
3653   guarantee(needlecntval != 1, "IndexOf with single-character needle must be handled separately");
3654   assert((needlecntval & 0x7fff) == needlecntval, "wrong immediate");
3655    addi(ch1, haycnt, -needlecntval);  // Last character index to compare is haycnt-needlecnt.
3656    if (needlecntval > 3) { li(needlecnt, needlecntval - 2); } // Rest of needle.
3657   }
3658 
3659   if (h_csize == 2) { slwi(ch1, ch1, 1); } // Scale to number of bytes.
3660 
3661   if (ae ==StrIntrinsicNode::UL) {
3662    srwi(tmp4, n_start, 1*8);          // ___0
3663    rlwimi(n_start, tmp4, 2*8, 0, 23); // _0_1
3664   }
3665 
3666   add(last_addr, haystack, ch1);      // Point to last address to compare (haystack+2*(haycnt-needlecnt)).
3667 
3668   // Main Loop (now we have at least 2 characters).
3669   Label L_OuterLoop, L_InnerLoop, L_FinalCheck, L_Comp1, L_Comp2;
3670   bind(L_OuterLoop); // Search for 1st 2 characters.
3671   Register addr_diff = tmp4;
3672    subf(addr_diff, addr, last_addr);  // Difference between already checked address and last address to check.
3673    addi(addr, addr, h_csize);         // This is the new address we want to use for comparing.
3674    srdi_(ch2, addr_diff, h_csize);
3675    beq(CCR0, L_FinalCheck);           // 2 characters left?
3676    mtctr(ch2);                        // num of characters / 2
3677   bind(L_InnerLoop);                  // Main work horse (2x unrolled search loop)
3678    if (h_csize == 2) {                // Load 2 characters of haystack (ignore alignment).
3679     lwz(ch1, 0, addr);
3680     lwz(ch2, 2, addr);
3681    } else {
3682     lhz(ch1, 0, addr);
3683     lhz(ch2, 1, addr);
3684    }
3685    cmpw(CCR0, ch1, n_start);          // Compare 2 characters (1 would be sufficient but try to reduce branches to CompLoop).
3686    cmpw(CCR1, ch2, n_start);
3687    beq(CCR0, L_Comp1);                // Did we find the needle start?
3688    beq(CCR1, L_Comp2);
3689    addi(addr, addr, 2 * h_csize);
3690    bdnz(L_InnerLoop);
3691   bind(L_FinalCheck);
3692    andi_(addr_diff, addr_diff, h_csize); // Remaining characters not covered by InnerLoop: (num of characters) & 1.
3693    beq(CCR0, L_NotFound);
3694    if (h_csize == 2) { lwz(ch1, 0, addr); } else { lhz(ch1, 0, addr); } // One position left at which we have to compare.
3695    cmpw(CCR1, ch1, n_start);
3696    beq(CCR1, L_Comp1);
3697   bind(L_NotFound);
3698    li(result, -1);                    // not found
3699    b(L_End);
3700 
3701    // **************************************************************************************************
3702    // Special Case: unfortunately, the variable needle case can be called with needlecnt<2
3703    // **************************************************************************************************
3704   if (needlecntval == 0) {           // We have to handle these cases separately.
3705   Label L_OneCharLoop;
3706   bind(L_TooShort);
3707    mtctr(haycnt);
3708    if (n_csize == 2) { lhz(n_start, 0, needle); } else { lbz(n_start, 0, needle); } // First character of needle
3709   bind(L_OneCharLoop);
3710    if (h_csize == 2) { lhzu(ch1, 2, addr); } else { lbzu(ch1, 1, addr); }
3711    cmpw(CCR1, ch1, n_start);
3712    beq(CCR1, L_Found);               // Did we find the one character needle?
3713    bdnz(L_OneCharLoop);
3714    li(result, -1);                   // Not found.
3715    b(L_End);
3716   }
3717 
3718   // **************************************************************************************************
3719   // Regular Case Part II: compare rest of needle (first 2 characters have been compared already)
3720   // **************************************************************************************************
3721 
3722   // Compare the rest
3723   bind(L_Comp2);
3724    addi(addr, addr, h_csize);        // First comparison has failed, 2nd one hit.
3725   bind(L_Comp1);                     // Addr points to possible needle start.
3726   if (needlecntval != 2) {           // Const needlecnt==2?
3727    if (needlecntval != 3) {
3728     if (needlecntval == 0) { beq(CCR6, L_Found); } // Variable needlecnt==2?
3729     Register n_ind = tmp4,
3730              h_ind = n_ind;
3731     li(n_ind, 2 * n_csize);          // First 2 characters are already compared, use index 2.
3732     mtctr(needlecnt);                // Decremented by 2, still > 0.
3733    Label L_CompLoop;
3734    bind(L_CompLoop);
3735     if (ae ==StrIntrinsicNode::UL) {
3736       h_ind = ch1;
3737       sldi(h_ind, n_ind, 1);
3738     }
3739     if (n_csize == 2) { lhzx(ch2, needle, n_ind); } else { lbzx(ch2, needle, n_ind); }
3740     if (h_csize == 2) { lhzx(ch1, addr, h_ind); } else { lbzx(ch1, addr, h_ind); }
3741     cmpw(CCR1, ch1, ch2);
3742     bne(CCR1, L_OuterLoop);
3743     addi(n_ind, n_ind, n_csize);
3744     bdnz(L_CompLoop);
3745    } else { // No loop required if there's only one needle character left.
3746     if (n_csize == 2) { lhz(ch2, 2 * 2, needle); } else { lbz(ch2, 2 * 1, needle); }
3747     if (h_csize == 2) { lhz(ch1, 2 * 2, addr); } else { lbz(ch1, 2 * 1, addr); }
3748     cmpw(CCR1, ch1, ch2);
3749     bne(CCR1, L_OuterLoop);
3750    }
3751   }
3752   // Return index ...
3753   bind(L_Found);
3754    subf(result, haystack, addr);     // relative to haystack, ...
3755    if (h_csize == 2) { srdi(result, result, 1); } // in characters.
3756   bind(L_End);
3757 } // string_indexof
3758 
3759 void MacroAssembler::string_indexof_char(Register result, Register haystack, Register haycnt,
3760                                          Register needle, jchar needleChar, Register tmp1, Register tmp2, bool is_byte) {
3761   assert_different_registers(haystack, haycnt, needle, tmp1, tmp2);
3762 
3763   Label L_InnerLoop, L_FinalCheck, L_Found1, L_Found2, L_NotFound, L_End;
3764   Register addr = tmp1,
3765            ch1 = tmp2,
3766            ch2 = R0;
3767 
3768   const int h_csize = is_byte ? 1 : 2;
3769 
3770 //4:
3771    srwi_(tmp2, haycnt, 1);   // Shift right by exact_log2(UNROLL_FACTOR).
3772    mr(addr, haystack);
3773    beq(CCR0, L_FinalCheck);
3774    mtctr(tmp2);              // Move to count register.
3775 //8:
3776   bind(L_InnerLoop);         // Main work horse (2x unrolled search loop).
3777    if (!is_byte) {
3778     lhz(ch1, 0, addr);
3779     lhz(ch2, 2, addr);
3780    } else {
3781     lbz(ch1, 0, addr);
3782     lbz(ch2, 1, addr);
3783    }
3784    (needle != R0) ? cmpw(CCR0, ch1, needle) : cmplwi(CCR0, ch1, (unsigned int)needleChar);
3785    (needle != R0) ? cmpw(CCR1, ch2, needle) : cmplwi(CCR1, ch2, (unsigned int)needleChar);
3786    beq(CCR0, L_Found1);      // Did we find the needle?
3787    beq(CCR1, L_Found2);
3788    addi(addr, addr, 2 * h_csize);
3789    bdnz(L_InnerLoop);
3790 //16:
3791   bind(L_FinalCheck);
3792    andi_(R0, haycnt, 1);
3793    beq(CCR0, L_NotFound);
3794    if (!is_byte) { lhz(ch1, 0, addr); } else { lbz(ch1, 0, addr); } // One position left at which we have to compare.
3795    (needle != R0) ? cmpw(CCR1, ch1, needle) : cmplwi(CCR1, ch1, (unsigned int)needleChar);
3796    beq(CCR1, L_Found1);
3797 //21:
3798   bind(L_NotFound);
3799    li(result, -1);           // Not found.
3800    b(L_End);
3801 
3802   bind(L_Found2);
3803    addi(addr, addr, h_csize);
3804 //24:
3805   bind(L_Found1);            // Return index ...
3806    subf(result, haystack, addr); // relative to haystack, ...
3807    if (!is_byte) { srdi(result, result, 1); } // in characters.
3808   bind(L_End);
3809 } // string_indexof_char
3810 
3811 
3812 void MacroAssembler::has_negatives(Register src, Register cnt, Register result,
3813                                    Register tmp1, Register tmp2) {
3814   const Register tmp0 = R0;
3815   assert_different_registers(src, result, cnt, tmp0, tmp1, tmp2);
3816   Label Lfastloop, Lslow, Lloop, Lnoneg, Ldone;
3817 
3818   // Check if cnt >= 8 (= 16 bytes)
3819   lis(tmp1, (int)(short)0x8080);  // tmp1 = 0x8080808080808080
3820   srwi_(tmp2, cnt, 4);
3821   li(result, 1);                  // Assume there's a negative byte.
3822   beq(CCR0, Lslow);
3823   ori(tmp1, tmp1, 0x8080);
3824   rldimi(tmp1, tmp1, 32, 0);
3825   mtctr(tmp2);
3826 
3827   // 2x unrolled loop
3828   bind(Lfastloop);
3829   ld(tmp2, 0, src);
3830   ld(tmp0, 8, src);
3831 
3832   orr(tmp0, tmp2, tmp0);
3833 
3834   and_(tmp0, tmp0, tmp1);
3835   bne(CCR0, Ldone);               // Found negative byte.
3836   addi(src, src, 16);
3837 
3838   bdnz(Lfastloop);
3839 
3840   bind(Lslow);                    // Fallback to slow version
3841   rldicl_(tmp0, cnt, 0, 64-4);
3842   beq(CCR0, Lnoneg);
3843   mtctr(tmp0);
3844   bind(Lloop);
3845   lbz(tmp0, 0, src);
3846   addi(src, src, 1);
3847   andi_(tmp0, tmp0, 0x80);
3848   bne(CCR0, Ldone);               // Found negative byte.
3849   bdnz(Lloop);
3850   bind(Lnoneg);
3851   li(result, 0);
3852 
3853   bind(Ldone);
3854 }
3855 
3856 #endif // Compiler2
3857 
3858 // Helpers for Intrinsic Emitters
3859 //
3860 // Revert the byte order of a 32bit value in a register
3861 //   src: 0x44556677
3862 //   dst: 0x77665544
3863 // Three steps to obtain the result:
3864 //  1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3865 //     into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3866 //     This value initializes dst.
3867 //  2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3868 //     byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3869 //     This value is mask inserted into dst with a [0..23] mask of 1s.
3870 //  3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3871 //     This value is mask inserted into dst with a [8..15] mask of 1s.
3872 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3873   assert_different_registers(dst, src);
3874 
3875   rldicl(dst, src, (4+1)*8, 56);       // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3876   rlwimi(dst, src,     3*8,  0, 23);   // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3877   rlwimi(dst, src,     1*8,  8, 15);   // Insert byte 6 into position 5, leave the rest alone.
3878 }
3879 
3880 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3881 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3882 // body size from 20 to 16 instructions.
3883 // Returns the offset that was used to calculate the address of column tc3.
3884 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3885 // at hand, the original table address can be easily reconstructed.
3886 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3887   assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
3888 
3889   // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3890   // Layout: See StubRoutines::generate_crc_constants.
3891 #ifdef VM_LITTLE_ENDIAN
3892   const int ix0 = 3 * CRC32_TABLE_SIZE;
3893   const int ix1 = 2 * CRC32_TABLE_SIZE;
3894   const int ix2 = 1 * CRC32_TABLE_SIZE;
3895   const int ix3 = 0 * CRC32_TABLE_SIZE;
3896 #else
3897   const int ix0 = 1 * CRC32_TABLE_SIZE;
3898   const int ix1 = 2 * CRC32_TABLE_SIZE;
3899   const int ix2 = 3 * CRC32_TABLE_SIZE;
3900   const int ix3 = 4 * CRC32_TABLE_SIZE;
3901 #endif
3902   assert_different_registers(table, tc0, tc1, tc2);
3903   assert(table == tc3, "must be!");
3904 
3905   addi(tc0, table, ix0);
3906   addi(tc1, table, ix1);
3907   addi(tc2, table, ix2);
3908   if (ix3 != 0) addi(tc3, table, ix3);
3909 
3910   return ix3;
3911 }
3912 
3913 /**
3914  * uint32_t crc;
3915  * table[crc & 0xFF] ^ (crc >> 8);
3916  */
3917 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3918   assert_different_registers(crc, table, tmp);
3919   assert_different_registers(val, table);
3920 
3921   if (crc == val) {                   // Must rotate first to use the unmodified value.
3922     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3923                                       // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3924     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3925   } else {
3926     srwi(crc, crc, 8);                // Unsigned shift, clear leftmost 8 bits.
3927     rlwinm(tmp, val, 2, 24-2, 31-2);  // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3928   }
3929   lwzx(tmp, table, tmp);
3930   xorr(crc, crc, tmp);
3931 }
3932 
3933 /**
3934  * Emits code to update CRC-32 with a byte value according to constants in table.
3935  *
3936  * @param [in,out]crc   Register containing the crc.
3937  * @param [in]val       Register containing the byte to fold into the CRC.
3938  * @param [in]table     Register containing the table of crc constants.
3939  *
3940  * uint32_t crc;
3941  * val = crc_table[(val ^ crc) & 0xFF];
3942  * crc = val ^ (crc >> 8);
3943  */
3944 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3945   BLOCK_COMMENT("update_byte_crc32:");
3946   xorr(val, val, crc);
3947   fold_byte_crc32(crc, val, table, val);
3948 }
3949 
3950 /**
3951  * @param crc   register containing existing CRC (32-bit)
3952  * @param buf   register pointing to input byte buffer (byte*)
3953  * @param len   register containing number of bytes
3954  * @param table register pointing to CRC table
3955  */
3956 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3957                                            Register data, bool loopAlignment) {
3958   assert_different_registers(crc, buf, len, table, data);
3959 
3960   Label L_mainLoop, L_done;
3961   const int mainLoop_stepping  = 1;
3962   const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3963 
3964   // Process all bytes in a single-byte loop.
3965   clrldi_(len, len, 32);                         // Enforce 32 bit. Anything to do?
3966   beq(CCR0, L_done);
3967 
3968   mtctr(len);
3969   align(mainLoop_alignment);
3970   BIND(L_mainLoop);
3971     lbz(data, 0, buf);                           // Byte from buffer, zero-extended.
3972     addi(buf, buf, mainLoop_stepping);           // Advance buffer position.
3973     update_byte_crc32(crc, data, table);
3974     bdnz(L_mainLoop);                            // Iterate.
3975 
3976   bind(L_done);
3977 }
3978 
3979 /**
3980  * Emits code to update CRC-32 with a 4-byte value according to constants in table
3981  * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3982  */
3983 // A note on the lookup table address(es):
3984 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3985 // To save the effort of adding the column offset to the table address each time
3986 // a table element is looked up, it is possible to pass the pre-calculated
3987 // column addresses.
3988 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3989 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3990                                         Register t0,  Register t1,  Register t2,  Register t3,
3991                                         Register tc0, Register tc1, Register tc2, Register tc3) {
3992   assert_different_registers(crc, t3);
3993 
3994   // XOR crc with next four bytes of buffer.
3995   lwz(t3, bufDisp, buf);
3996   if (bufInc != 0) {
3997     addi(buf, buf, bufInc);
3998   }
3999   xorr(t3, t3, crc);
4000 
4001   // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
4002   rlwinm(t0, t3,  2,         24-2, 31-2);  // ((t1 >>  0) & 0xff) << 2
4003   rlwinm(t1, t3,  32+(2- 8), 24-2, 31-2);  // ((t1 >>  8) & 0xff) << 2
4004   rlwinm(t2, t3,  32+(2-16), 24-2, 31-2);  // ((t1 >> 16) & 0xff) << 2
4005   rlwinm(t3, t3,  32+(2-24), 24-2, 31-2);  // ((t1 >> 24) & 0xff) << 2
4006 
4007   // Use the pre-calculated column addresses.
4008   // Load pre-calculated table values.
4009   lwzx(t0, tc0, t0);
4010   lwzx(t1, tc1, t1);
4011   lwzx(t2, tc2, t2);
4012   lwzx(t3, tc3, t3);
4013 
4014   // Calculate new crc from table values.
4015   xorr(t0,  t0, t1);
4016   xorr(t2,  t2, t3);
4017   xorr(crc, t0, t2);  // Now crc contains the final checksum value.
4018 }
4019 
4020 /**
4021  * @param crc   register containing existing CRC (32-bit)
4022  * @param buf   register pointing to input byte buffer (byte*)
4023  * @param len   register containing number of bytes
4024  * @param table register pointing to CRC table
4025  *
4026  * uses R9..R12 as work register. Must be saved/restored by caller!
4027  */
4028 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
4029                                         Register t0,  Register t1,  Register t2,  Register t3,
4030                                         Register tc0, Register tc1, Register tc2, Register tc3,
4031                                         bool invertCRC) {
4032   assert_different_registers(crc, buf, len, table);
4033 
4034   Label L_mainLoop, L_tail;
4035   Register  tmp          = t0;
4036   Register  data         = t0;
4037   Register  tmp2         = t1;
4038   const int mainLoop_stepping  = 4;
4039   const int tailLoop_stepping  = 1;
4040   const int log_stepping       = exact_log2(mainLoop_stepping);
4041   const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
4042   const int complexThreshold   = 2*mainLoop_stepping;
4043 
4044   // Don't test for len <= 0 here. This pathological case should not occur anyway.
4045   // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
4046   // for all well-behaved cases. The situation itself is detected and handled correctly
4047   // within update_byteLoop_crc32.
4048   assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
4049 
4050   BLOCK_COMMENT("kernel_crc32_1word {");
4051 
4052   if (invertCRC) {
4053     nand(crc, crc, crc);                      // 1s complement of crc
4054   }
4055 
4056   // Check for short (<mainLoop_stepping) buffer.
4057   cmpdi(CCR0, len, complexThreshold);
4058   blt(CCR0, L_tail);
4059 
4060   // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
4061   // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
4062   {
4063     // Align buf addr to mainLoop_stepping boundary.
4064     neg(tmp2, buf);                              // Calculate # preLoop iterations for alignment.
4065     rldicl(tmp2, tmp2, 0, 64-log_stepping);      // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63.
4066 
4067     if (complexThreshold > mainLoop_stepping) {
4068       sub(len, len, tmp2);                       // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4069     } else {
4070       sub(tmp, len, tmp2);                       // Remaining bytes for main loop.
4071       cmpdi(CCR0, tmp, mainLoop_stepping);
4072       blt(CCR0, L_tail);                         // For less than one mainloop_stepping left, do only tail processing
4073       mr(len, tmp);                              // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
4074     }
4075     update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
4076   }
4077 
4078   srdi(tmp2, len, log_stepping);                 // #iterations for mainLoop
4079   andi(len, len, mainLoop_stepping-1);           // remaining bytes for tailLoop
4080   mtctr(tmp2);
4081 
4082 #ifdef VM_LITTLE_ENDIAN
4083   Register crc_rv = crc;
4084 #else
4085   Register crc_rv = tmp;                         // Load_reverse needs separate registers to work on.
4086                                                  // Occupies tmp, but frees up crc.
4087   load_reverse_32(crc_rv, crc);                  // Revert byte order because we are dealing with big-endian data.
4088   tmp = crc;
4089 #endif
4090 
4091   int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
4092 
4093   align(mainLoop_alignment);                     // Octoword-aligned loop address. Shows 2% improvement.
4094   BIND(L_mainLoop);
4095     update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
4096     bdnz(L_mainLoop);
4097 
4098 #ifndef VM_LITTLE_ENDIAN
4099   load_reverse_32(crc, crc_rv);                  // Revert byte order because we are dealing with big-endian data.
4100   tmp = crc_rv;                                  // Tmp uses it's original register again.
4101 #endif
4102 
4103   // Restore original table address for tailLoop.
4104   if (reconstructTableOffset != 0) {
4105     addi(table, table, -reconstructTableOffset);
4106   }
4107 
4108   // Process last few (<complexThreshold) bytes of buffer.
4109   BIND(L_tail);
4110   update_byteLoop_crc32(crc, buf, len, table, data, false);
4111 
4112   if (invertCRC) {
4113     nand(crc, crc, crc);                      // 1s complement of crc
4114   }
4115   BLOCK_COMMENT("} kernel_crc32_1word");
4116 }
4117 
4118 /**
4119  * @param crc             register containing existing CRC (32-bit)
4120  * @param buf             register pointing to input byte buffer (byte*)
4121  * @param len             register containing number of bytes
4122  * @param constants       register pointing to precomputed constants
4123  * @param t0-t6           temp registers
4124  */
4125 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
4126                                          Register t0, Register t1, Register t2, Register t3,
4127                                          Register t4, Register t5, Register t6, bool invertCRC) {
4128   assert_different_registers(crc, buf, len, constants);
4129 
4130   Label L_tail;
4131 
4132   BLOCK_COMMENT("kernel_crc32_vpmsum {");
4133 
4134   if (invertCRC) {
4135     nand(crc, crc, crc);                      // 1s complement of crc
4136   }
4137 
4138   // Enforce 32 bit.
4139   clrldi(len, len, 32);
4140 
4141   // Align if we have enough bytes for the fast version.
4142   const int alignment = 16,
4143             threshold = 32;
4144   Register prealign = t0;
4145 
4146   neg(prealign, buf);
4147   addi(t1, len, -threshold);
4148   andi(prealign, prealign, alignment - 1);
4149   cmpw(CCR0, t1, prealign);
4150   blt(CCR0, L_tail); // len - prealign < threshold?
4151 
4152   subf(len, prealign, len);
4153   update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
4154 
4155   // Calculate from first aligned address as far as possible.
4156   addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
4157   kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
4158   addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
4159 
4160   // Remaining bytes.
4161   BIND(L_tail);
4162   update_byteLoop_crc32(crc, buf, len, constants, t2, false);
4163 
4164   if (invertCRC) {
4165     nand(crc, crc, crc);                      // 1s complement of crc
4166   }
4167 
4168   BLOCK_COMMENT("} kernel_crc32_vpmsum");
4169 }
4170 
4171 /**
4172  * @param crc             register containing existing CRC (32-bit)
4173  * @param buf             register pointing to input byte buffer (byte*)
4174  * @param len             register containing number of bytes (will get updated to remaining bytes)
4175  * @param constants       register pointing to CRC table for 128-bit aligned memory
4176  * @param t0-t6           temp registers
4177  */
4178 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
4179     Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
4180 
4181   // Save non-volatile vector registers (frameless).
4182   Register offset = t1;
4183   int offsetInt = 0;
4184   offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
4185   offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
4186   offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
4187   offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
4188   offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
4189   offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
4190 #ifndef VM_LITTLE_ENDIAN
4191   offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
4192 #endif
4193   offsetInt -= 8; std(R14, offsetInt, R1_SP);
4194   offsetInt -= 8; std(R15, offsetInt, R1_SP);
4195 
4196   // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
4197   // bytes per iteration. The basic scheme is:
4198   // lvx: load vector (Big Endian needs reversal)
4199   // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
4200   // vxor: xor partial results together to get unroll_factor2 vectors
4201 
4202   // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
4203 
4204   // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
4205   const int unroll_factor = CRC32_UNROLL_FACTOR,
4206             unroll_factor2 = CRC32_UNROLL_FACTOR2;
4207 
4208   const int outer_consts_size = (unroll_factor2 - 1) * 16,
4209             inner_consts_size = (unroll_factor / unroll_factor2) * 16;
4210 
4211   // Support registers.
4212   Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
4213   Register num_bytes = R14,
4214            loop_count = R15,
4215            cur_const = crc; // will live in VCRC
4216   // Constant array for outer loop: unroll_factor2 - 1 registers,
4217   // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
4218   VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
4219                  consts1[] = { VR23, VR24 };
4220   // Data register arrays: 2 arrays with unroll_factor2 registers.
4221   VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
4222                  data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
4223 
4224   VectorRegister VCRC = data0[0];
4225   VectorRegister Vc = VR25;
4226   VectorRegister swap_bytes = VR26; // Only for Big Endian.
4227 
4228   // We have at least 1 iteration (ensured by caller).
4229   Label L_outer_loop, L_inner_loop, L_last;
4230 
4231   // If supported set DSCR pre-fetch to deepest.
4232   if (VM_Version::has_mfdscr()) {
4233     load_const_optimized(t0, VM_Version::_dscr_val | 7);
4234     mtdscr(t0);
4235   }
4236 
4237   mtvrwz(VCRC, crc); // crc lives in VCRC, now
4238 
4239   for (int i = 1; i < unroll_factor2; ++i) {
4240     li(offs[i], 16 * i);
4241   }
4242 
4243   // Load consts for outer loop
4244   lvx(consts0[0], constants);
4245   for (int i = 1; i < unroll_factor2 - 1; ++i) {
4246     lvx(consts0[i], offs[i], constants);
4247   }
4248 
4249   load_const_optimized(num_bytes, 16 * unroll_factor);
4250 
4251   // Reuse data registers outside of the loop.
4252   VectorRegister Vtmp = data1[0];
4253   VectorRegister Vtmp2 = data1[1];
4254   VectorRegister zeroes = data1[2];
4255 
4256   vspltisb(Vtmp, 0);
4257   vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
4258 
4259   // Load vector for vpermxor (to xor both 64 bit parts together)
4260   lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
4261   vspltisb(Vc, 4);
4262   vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
4263   xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
4264   vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
4265 
4266 #ifdef VM_LITTLE_ENDIAN
4267 #define BE_swap_bytes(x)
4268 #else
4269   vspltisb(Vtmp2, 0xf);
4270   vxor(swap_bytes, Vtmp, Vtmp2);
4271 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
4272 #endif
4273 
4274   cmpd(CCR0, len, num_bytes);
4275   blt(CCR0, L_last);
4276 
4277   addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
4278   load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
4279 
4280   // ********** Main loop start **********
4281   align(32);
4282   bind(L_outer_loop);
4283 
4284   // Begin of unrolled first iteration (no xor).
4285   lvx(data1[0], buf);
4286   for (int i = 1; i < unroll_factor2 / 2; ++i) {
4287     lvx(data1[i], offs[i], buf);
4288   }
4289   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4290   lvx(consts1[0], cur_const);
4291   mtctr(loop_count);
4292   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4293     BE_swap_bytes(data1[i]);
4294     if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
4295     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4296     vpmsumw(data0[i], data1[i], consts1[0]);
4297   }
4298   addi(buf, buf, 16 * unroll_factor2);
4299   subf(len, num_bytes, len);
4300   lvx(consts1[1], offs[1], cur_const);
4301   addi(cur_const, cur_const, 32);
4302   // Begin of unrolled second iteration (head).
4303   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4304     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4305     if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
4306     vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
4307   }
4308   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4309     BE_swap_bytes(data1[i]);
4310     lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
4311     vpmsumw(data1[i], data1[i], consts1[1]);
4312   }
4313   addi(buf, buf, 16 * unroll_factor2);
4314 
4315   // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
4316   // Double-iteration allows using the 2 constant registers alternatingly.
4317   align(32);
4318   bind(L_inner_loop);
4319   for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
4320     if (j & 1) {
4321       lvx(consts1[0], cur_const);
4322     } else {
4323       lvx(consts1[1], offs[1], cur_const);
4324       addi(cur_const, cur_const, 32);
4325     }
4326     for (int i = 0; i < unroll_factor2; ++i) {
4327       int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
4328       if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
4329       BE_swap_bytes(data1[idx]);
4330       vxor(data0[i], data0[i], data1[i]);
4331       if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
4332       vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
4333     }
4334     addi(buf, buf, 16 * unroll_factor2);
4335   }
4336   bdnz(L_inner_loop);
4337 
4338   addi(cur_const, constants, outer_consts_size); // Reset
4339 
4340   // Tail of last iteration (no loads).
4341   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4342     BE_swap_bytes(data1[i + unroll_factor2 / 2]);
4343     vxor(data0[i], data0[i], data1[i]);
4344     vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
4345   }
4346   for (int i = 0; i < unroll_factor2 / 2; ++i) {
4347     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
4348     vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
4349   }
4350 
4351   // Last data register is ok, other ones need fixup shift.
4352   for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
4353     vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
4354   }
4355 
4356   // Combine to 128 bit result vector VCRC = data0[0].
4357   for (int i = 1; i < unroll_factor2; i<<=1) {
4358     for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
4359       vxor(data0[j], data0[j], data0[j+i]);
4360     }
4361   }
4362   cmpd(CCR0, len, num_bytes);
4363   bge(CCR0, L_outer_loop);
4364 
4365   // Last chance with lower num_bytes.
4366   bind(L_last);
4367   srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
4368   // Point behind last const for inner loop.
4369   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
4370   sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
4371   clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
4372   subf(cur_const, R0, cur_const); // Point to constant to be used first.
4373 
4374   addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
4375   bgt(CCR0, L_outer_loop);
4376   // ********** Main loop end **********
4377 
4378   // Restore DSCR pre-fetch value.
4379   if (VM_Version::has_mfdscr()) {
4380     load_const_optimized(t0, VM_Version::_dscr_val);
4381     mtdscr(t0);
4382   }
4383 
4384   // ********** Simple loop for remaining 16 byte blocks **********
4385   {
4386     Label L_loop, L_done;
4387 
4388     srdi_(t0, len, 4); // 16 bytes per iteration
4389     clrldi(len, len, 64-4);
4390     beq(CCR0, L_done);
4391 
4392     // Point to const (same as last const for inner loop).
4393     add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
4394     mtctr(t0);
4395     lvx(Vtmp2, cur_const);
4396 
4397     align(32);
4398     bind(L_loop);
4399 
4400     lvx(Vtmp, buf);
4401     addi(buf, buf, 16);
4402     vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4403     BE_swap_bytes(Vtmp);
4404     vxor(VCRC, VCRC, Vtmp);
4405     vpmsumw(VCRC, VCRC, Vtmp2);
4406     bdnz(L_loop);
4407 
4408     bind(L_done);
4409   }
4410   // ********** Simple loop end **********
4411 #undef BE_swap_bytes
4412 
4413   // Point to Barrett constants
4414   add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
4415 
4416   vspltisb(zeroes, 0);
4417 
4418   // Combine to 64 bit result.
4419   vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4420 
4421   // Reduce to 32 bit CRC: Remainder by multiply-high.
4422   lvx(Vtmp, cur_const);
4423   vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
4424   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
4425   vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4426   vsldoi(Vtmp, zeroes, Vtmp, 8);
4427   vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
4428   vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.
4429 
4430   // Move result. len is already updated.
4431   vsldoi(VCRC, VCRC, zeroes, 8);
4432   mfvrd(crc, VCRC);
4433 
4434   // Restore non-volatile Vector registers (frameless).
4435   offsetInt = 0;
4436   offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4437   offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4438   offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4439   offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4440   offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4441   offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4442 #ifndef VM_LITTLE_ENDIAN
4443   offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4444 #endif
4445   offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
4446   offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
4447 }
4448 
4449 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4450                            Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
4451   load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4452                                      : StubRoutines::crc_table_addr()   , R0);
4453 
4454   if (VM_Version::has_vpmsumb()) {
4455     kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4456   } else {
4457     kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
4458   }
4459 }
4460 
4461 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4462   assert_different_registers(crc, val, table);
4463 
4464   BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4465   if (invertCRC) {
4466     nand(crc, crc, crc);                // 1s complement of crc
4467   }
4468 
4469   update_byte_crc32(crc, val, table);
4470 
4471   if (invertCRC) {
4472     nand(crc, crc, crc);                // 1s complement of crc
4473   }
4474 }
4475 
4476 // dest_lo += src1 + src2
4477 // dest_hi += carry1 + carry2
4478 void MacroAssembler::add2_with_carry(Register dest_hi,
4479                                      Register dest_lo,
4480                                      Register src1, Register src2) {
4481   li(R0, 0);
4482   addc(dest_lo, dest_lo, src1);
4483   adde(dest_hi, dest_hi, R0);
4484   addc(dest_lo, dest_lo, src2);
4485   adde(dest_hi, dest_hi, R0);
4486 }
4487 
4488 // Multiply 64 bit by 64 bit first loop.
4489 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4490                                            Register x_xstart,
4491                                            Register y, Register y_idx,
4492                                            Register z,
4493                                            Register carry,
4494                                            Register product_high, Register product,
4495                                            Register idx, Register kdx,
4496                                            Register tmp) {
4497   //  jlong carry, x[], y[], z[];
4498   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4499   //    huge_128 product = y[idx] * x[xstart] + carry;
4500   //    z[kdx] = (jlong)product;
4501   //    carry  = (jlong)(product >>> 64);
4502   //  }
4503   //  z[xstart] = carry;
4504 
4505   Label L_first_loop, L_first_loop_exit;
4506   Label L_one_x, L_one_y, L_multiply;
4507 
4508   addic_(xstart, xstart, -1);
4509   blt(CCR0, L_one_x);   // Special case: length of x is 1.
4510 
4511   // Load next two integers of x.
4512   sldi(tmp, xstart, LogBytesPerInt);
4513   ldx(x_xstart, x, tmp);
4514 #ifdef VM_LITTLE_ENDIAN
4515   rldicl(x_xstart, x_xstart, 32, 0);
4516 #endif
4517 
4518   align(32, 16);
4519   bind(L_first_loop);
4520 
4521   cmpdi(CCR0, idx, 1);
4522   blt(CCR0, L_first_loop_exit);
4523   addi(idx, idx, -2);
4524   beq(CCR0, L_one_y);
4525 
4526   // Load next two integers of y.
4527   sldi(tmp, idx, LogBytesPerInt);
4528   ldx(y_idx, y, tmp);
4529 #ifdef VM_LITTLE_ENDIAN
4530   rldicl(y_idx, y_idx, 32, 0);
4531 #endif
4532 
4533 
4534   bind(L_multiply);
4535   multiply64(product_high, product, x_xstart, y_idx);
4536 
4537   li(tmp, 0);
4538   addc(product, product, carry);         // Add carry to result.
4539   adde(product_high, product_high, tmp); // Add carry of the last addition.
4540   addi(kdx, kdx, -2);
4541 
4542   // Store result.
4543 #ifdef VM_LITTLE_ENDIAN
4544   rldicl(product, product, 32, 0);
4545 #endif
4546   sldi(tmp, kdx, LogBytesPerInt);
4547   stdx(product, z, tmp);
4548   mr_if_needed(carry, product_high);
4549   b(L_first_loop);
4550 
4551 
4552   bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4553 
4554   lwz(y_idx, 0, y);
4555   b(L_multiply);
4556 
4557 
4558   bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4559 
4560   lwz(x_xstart, 0, x);
4561   b(L_first_loop);
4562 
4563   bind(L_first_loop_exit);
4564 }
4565 
4566 // Multiply 64 bit by 64 bit and add 128 bit.
4567 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4568                                             Register z, Register yz_idx,
4569                                             Register idx, Register carry,
4570                                             Register product_high, Register product,
4571                                             Register tmp, int offset) {
4572 
4573   //  huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4574   //  z[kdx] = (jlong)product;
4575 
4576   sldi(tmp, idx, LogBytesPerInt);
4577   if (offset) {
4578     addi(tmp, tmp, offset);
4579   }
4580   ldx(yz_idx, y, tmp);
4581 #ifdef VM_LITTLE_ENDIAN
4582   rldicl(yz_idx, yz_idx, 32, 0);
4583 #endif
4584 
4585   multiply64(product_high, product, x_xstart, yz_idx);
4586   ldx(yz_idx, z, tmp);
4587 #ifdef VM_LITTLE_ENDIAN
4588   rldicl(yz_idx, yz_idx, 32, 0);
4589 #endif
4590 
4591   add2_with_carry(product_high, product, carry, yz_idx);
4592 
4593   sldi(tmp, idx, LogBytesPerInt);
4594   if (offset) {
4595     addi(tmp, tmp, offset);
4596   }
4597 #ifdef VM_LITTLE_ENDIAN
4598   rldicl(product, product, 32, 0);
4599 #endif
4600   stdx(product, z, tmp);
4601 }
4602 
4603 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4604 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4605                                              Register y, Register z,
4606                                              Register yz_idx, Register idx, Register carry,
4607                                              Register product_high, Register product,
4608                                              Register carry2, Register tmp) {
4609 
4610   //  jlong carry, x[], y[], z[];
4611   //  int kdx = ystart+1;
4612   //  for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4613   //    huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4614   //    z[kdx+idx+1] = (jlong)product;
4615   //    jlong carry2 = (jlong)(product >>> 64);
4616   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4617   //    z[kdx+idx] = (jlong)product;
4618   //    carry = (jlong)(product >>> 64);
4619   //  }
4620   //  idx += 2;
4621   //  if (idx > 0) {
4622   //    product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4623   //    z[kdx+idx] = (jlong)product;
4624   //    carry = (jlong)(product >>> 64);
4625   //  }
4626 
4627   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4628   const Register jdx = R0;
4629 
4630   // Scale the index.
4631   srdi_(jdx, idx, 2);
4632   beq(CCR0, L_third_loop_exit);
4633   mtctr(jdx);
4634 
4635   align(32, 16);
4636   bind(L_third_loop);
4637 
4638   addi(idx, idx, -4);
4639 
4640   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4641   mr_if_needed(carry2, product_high);
4642 
4643   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4644   mr_if_needed(carry, product_high);
4645   bdnz(L_third_loop);
4646 
4647   bind(L_third_loop_exit);  // Handle any left-over operand parts.
4648 
4649   andi_(idx, idx, 0x3);
4650   beq(CCR0, L_post_third_loop_done);
4651 
4652   Label L_check_1;
4653 
4654   addic_(idx, idx, -2);
4655   blt(CCR0, L_check_1);
4656 
4657   multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4658   mr_if_needed(carry, product_high);
4659 
4660   bind(L_check_1);
4661 
4662   addi(idx, idx, 0x2);
4663   andi_(idx, idx, 0x1);
4664   addic_(idx, idx, -1);
4665   blt(CCR0, L_post_third_loop_done);
4666 
4667   sldi(tmp, idx, LogBytesPerInt);
4668   lwzx(yz_idx, y, tmp);
4669   multiply64(product_high, product, x_xstart, yz_idx);
4670   lwzx(yz_idx, z, tmp);
4671 
4672   add2_with_carry(product_high, product, yz_idx, carry);
4673 
4674   sldi(tmp, idx, LogBytesPerInt);
4675   stwx(product, z, tmp);
4676   srdi(product, product, 32);
4677 
4678   sldi(product_high, product_high, 32);
4679   orr(product, product, product_high);
4680   mr_if_needed(carry, product);
4681 
4682   bind(L_post_third_loop_done);
4683 }   // multiply_128_x_128_loop
4684 
4685 void MacroAssembler::muladd(Register out, Register in,
4686                             Register offset, Register len, Register k,
4687                             Register tmp1, Register tmp2, Register carry) {
4688 
4689   // Labels
4690   Label LOOP, SKIP;
4691 
4692   // Make sure length is positive.
4693   cmpdi  (CCR0,    len,     0);
4694 
4695   // Prepare variables
4696   subi   (offset,  offset,  4);
4697   li     (carry,   0);
4698   ble    (CCR0,    SKIP);
4699 
4700   mtctr  (len);
4701   subi   (len,     len,     1    );
4702   sldi   (len,     len,     2    );
4703 
4704   // Main loop
4705   bind(LOOP);
4706   lwzx   (tmp1,    len,     in   );
4707   lwzx   (tmp2,    offset,  out  );
4708   mulld  (tmp1,    tmp1,    k    );
4709   add    (tmp2,    carry,   tmp2 );
4710   add    (tmp2,    tmp1,    tmp2 );
4711   stwx   (tmp2,    offset,  out  );
4712   srdi   (carry,   tmp2,    32   );
4713   subi   (offset,  offset,  4    );
4714   subi   (len,     len,     4    );
4715   bdnz   (LOOP);
4716   bind(SKIP);
4717 }
4718 
4719 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4720                                      Register y, Register ylen,
4721                                      Register z, Register zlen,
4722                                      Register tmp1, Register tmp2,
4723                                      Register tmp3, Register tmp4,
4724                                      Register tmp5, Register tmp6,
4725                                      Register tmp7, Register tmp8,
4726                                      Register tmp9, Register tmp10,
4727                                      Register tmp11, Register tmp12,
4728                                      Register tmp13) {
4729 
4730   ShortBranchVerifier sbv(this);
4731 
4732   assert_different_registers(x, xlen, y, ylen, z, zlen,
4733                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4734   assert_different_registers(x, xlen, y, ylen, z, zlen,
4735                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4736   assert_different_registers(x, xlen, y, ylen, z, zlen,
4737                              tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4738 
4739   const Register idx = tmp1;
4740   const Register kdx = tmp2;
4741   const Register xstart = tmp3;
4742 
4743   const Register y_idx = tmp4;
4744   const Register carry = tmp5;
4745   const Register product = tmp6;
4746   const Register product_high = tmp7;
4747   const Register x_xstart = tmp8;
4748   const Register tmp = tmp9;
4749 
4750   // First Loop.
4751   //
4752   //  final static long LONG_MASK = 0xffffffffL;
4753   //  int xstart = xlen - 1;
4754   //  int ystart = ylen - 1;
4755   //  long carry = 0;
4756   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
4757   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4758   //    z[kdx] = (int)product;
4759   //    carry = product >>> 32;
4760   //  }
4761   //  z[xstart] = (int)carry;
4762 
4763   mr_if_needed(idx, ylen);        // idx = ylen
4764   mr_if_needed(kdx, zlen);        // kdx = xlen + ylen
4765   li(carry, 0);                   // carry = 0
4766 
4767   Label L_done;
4768 
4769   addic_(xstart, xlen, -1);
4770   blt(CCR0, L_done);
4771 
4772   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4773                         carry, product_high, product, idx, kdx, tmp);
4774 
4775   Label L_second_loop;
4776 
4777   cmpdi(CCR0, kdx, 0);
4778   beq(CCR0, L_second_loop);
4779 
4780   Label L_carry;
4781 
4782   addic_(kdx, kdx, -1);
4783   beq(CCR0, L_carry);
4784 
4785   // Store lower 32 bits of carry.
4786   sldi(tmp, kdx, LogBytesPerInt);
4787   stwx(carry, z, tmp);
4788   srdi(carry, carry, 32);
4789   addi(kdx, kdx, -1);
4790 
4791 
4792   bind(L_carry);
4793 
4794   // Store upper 32 bits of carry.
4795   sldi(tmp, kdx, LogBytesPerInt);
4796   stwx(carry, z, tmp);
4797 
4798   // Second and third (nested) loops.
4799   //
4800   //  for (int i = xstart-1; i >= 0; i--) { // Second loop
4801   //    carry = 0;
4802   //    for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4803   //      long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4804   //                     (z[k] & LONG_MASK) + carry;
4805   //      z[k] = (int)product;
4806   //      carry = product >>> 32;
4807   //    }
4808   //    z[i] = (int)carry;
4809   //  }
4810   //
4811   //  i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4812 
4813   bind(L_second_loop);
4814 
4815   li(carry, 0);                   // carry = 0;
4816 
4817   addic_(xstart, xstart, -1);     // i = xstart-1;
4818   blt(CCR0, L_done);
4819 
4820   Register zsave = tmp10;
4821 
4822   mr(zsave, z);
4823 
4824 
4825   Label L_last_x;
4826 
4827   sldi(tmp, xstart, LogBytesPerInt);
4828   add(z, z, tmp);                 // z = z + k - j
4829   addi(z, z, 4);
4830   addic_(xstart, xstart, -1);     // i = xstart-1;
4831   blt(CCR0, L_last_x);
4832 
4833   sldi(tmp, xstart, LogBytesPerInt);
4834   ldx(x_xstart, x, tmp);
4835 #ifdef VM_LITTLE_ENDIAN
4836   rldicl(x_xstart, x_xstart, 32, 0);
4837 #endif
4838 
4839 
4840   Label L_third_loop_prologue;
4841 
4842   bind(L_third_loop_prologue);
4843 
4844   Register xsave = tmp11;
4845   Register xlensave = tmp12;
4846   Register ylensave = tmp13;
4847 
4848   mr(xsave, x);
4849   mr(xlensave, xstart);
4850   mr(ylensave, ylen);
4851 
4852 
4853   multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4854                           carry, product_high, product, x, tmp);
4855 
4856   mr(z, zsave);
4857   mr(x, xsave);
4858   mr(xlen, xlensave);   // This is the decrement of the loop counter!
4859   mr(ylen, ylensave);
4860 
4861   addi(tmp3, xlen, 1);
4862   sldi(tmp, tmp3, LogBytesPerInt);
4863   stwx(carry, z, tmp);
4864   addic_(tmp3, tmp3, -1);
4865   blt(CCR0, L_done);
4866 
4867   srdi(carry, carry, 32);
4868   sldi(tmp, tmp3, LogBytesPerInt);
4869   stwx(carry, z, tmp);
4870   b(L_second_loop);
4871 
4872   // Next infrequent code is moved outside loops.
4873   bind(L_last_x);
4874 
4875   lwz(x_xstart, 0, x);
4876   b(L_third_loop_prologue);
4877 
4878   bind(L_done);
4879 }   // multiply_to_len
4880 
4881 void MacroAssembler::asm_assert(bool check_equal, const char *msg, int id) {
4882 #ifdef ASSERT
4883   Label ok;
4884   if (check_equal) {
4885     beq(CCR0, ok);
4886   } else {
4887     bne(CCR0, ok);
4888   }
4889   stop(msg, id);
4890   bind(ok);
4891 #endif
4892 }
4893 
4894 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
4895                                           Register mem_base, const char* msg, int id) {
4896 #ifdef ASSERT
4897   switch (size) {
4898     case 4:
4899       lwz(R0, mem_offset, mem_base);
4900       cmpwi(CCR0, R0, 0);
4901       break;
4902     case 8:
4903       ld(R0, mem_offset, mem_base);
4904       cmpdi(CCR0, R0, 0);
4905       break;
4906     default:
4907       ShouldNotReachHere();
4908   }
4909   asm_assert(check_equal, msg, id);
4910 #endif // ASSERT
4911 }
4912 
4913 void MacroAssembler::verify_thread() {
4914   if (VerifyThread) {
4915     unimplemented("'VerifyThread' currently not implemented on PPC");
4916   }
4917 }
4918 
4919 // READ: oop. KILL: R0. Volatile floats perhaps.
4920 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4921   if (!VerifyOops) {
4922     return;
4923   }
4924 
4925   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4926   const Register tmp = R11; // Will be preserved.
4927   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4928   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4929 
4930   mr_if_needed(R4_ARG2, oop);
4931   save_LR_CR(tmp); // save in old frame
4932   push_frame_reg_args(nbytes_save, tmp);
4933   // load FunctionDescriptor** / entry_address *
4934   load_const_optimized(tmp, fd, R0);
4935   // load FunctionDescriptor* / entry_address
4936   ld(tmp, 0, tmp);
4937   load_const_optimized(R3_ARG1, (address)msg, R0);
4938   // Call destination for its side effect.
4939   call_c(tmp);
4940 
4941   pop_frame();
4942   restore_LR_CR(tmp);
4943   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4944 }
4945 
4946 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4947   if (!VerifyOops) {
4948     return;
4949   }
4950 
4951   address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4952   const Register tmp = R11; // Will be preserved.
4953   const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4954   save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4955 
4956   ld(R4_ARG2, offs, base);
4957   save_LR_CR(tmp); // save in old frame
4958   push_frame_reg_args(nbytes_save, tmp);
4959   // load FunctionDescriptor** / entry_address *
4960   load_const_optimized(tmp, fd, R0);
4961   // load FunctionDescriptor* / entry_address
4962   ld(tmp, 0, tmp);
4963   load_const_optimized(R3_ARG1, (address)msg, R0);
4964   // Call destination for its side effect.
4965   call_c(tmp);
4966 
4967   pop_frame();
4968   restore_LR_CR(tmp);
4969   restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4970 }
4971 
4972 const char* stop_types[] = {
4973   "stop",
4974   "untested",
4975   "unimplemented",
4976   "shouldnotreachhere"
4977 };
4978 
4979 static void stop_on_request(int tp, const char* msg) {
4980   tty->print("PPC assembly code requires stop: (%s) %s\n", stop_types[tp%/*stop_end*/4], msg);
4981   guarantee(false, "PPC assembly code requires stop: %s", msg);
4982 }
4983 
4984 // Call a C-function that prints output.
4985 void MacroAssembler::stop(int type, const char* msg, int id) {
4986 #ifndef PRODUCT
4987   block_comment(err_msg("stop: %s %s {", stop_types[type%stop_end], msg));
4988 #else
4989   block_comment("stop {");
4990 #endif
4991 
4992   // setup arguments
4993   load_const_optimized(R3_ARG1, type);
4994   load_const_optimized(R4_ARG2, (void *)msg, /*tmp=*/R0);
4995   call_VM_leaf(CAST_FROM_FN_PTR(address, stop_on_request), R3_ARG1, R4_ARG2);
4996   illtrap();
4997   emit_int32(id);
4998   block_comment("} stop;");
4999 }
5000 
5001 #ifndef PRODUCT
5002 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
5003 // Val, addr are temp registers.
5004 // If low == addr, addr is killed.
5005 // High is preserved.
5006 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
5007   if (!ZapMemory) return;
5008 
5009   assert_different_registers(low, val);
5010 
5011   BLOCK_COMMENT("zap memory region {");
5012   load_const_optimized(val, 0x0101010101010101);
5013   int size = before + after;
5014   if (low == high && size < 5 && size > 0) {
5015     int offset = -before*BytesPerWord;
5016     for (int i = 0; i < size; ++i) {
5017       std(val, offset, low);
5018       offset += (1*BytesPerWord);
5019     }
5020   } else {
5021     addi(addr, low, -before*BytesPerWord);
5022     assert_different_registers(high, val);
5023     if (after) addi(high, high, after * BytesPerWord);
5024     Label loop;
5025     bind(loop);
5026     std(val, 0, addr);
5027     addi(addr, addr, 8);
5028     cmpd(CCR6, addr, high);
5029     ble(CCR6, loop);
5030     if (after) addi(high, high, -after * BytesPerWord);  // Correct back to old value.
5031   }
5032   BLOCK_COMMENT("} zap memory region");
5033 }
5034 
5035 #endif // !PRODUCT
5036 
5037 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
5038                                                   const bool* flag_addr, Label& label) {
5039   int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
5040   assert(sizeof(bool) == 1, "PowerPC ABI");
5041   masm->lbz(temp, simm16_offset, temp);
5042   masm->cmpwi(CCR0, temp, 0);
5043   masm->beq(CCR0, label);
5044 }
5045 
5046 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
5047   skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
5048 }
5049 
5050 SkipIfEqualZero::~SkipIfEqualZero() {
5051   _masm->bind(_label);
5052 }