1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "interpreter/interpreter.hpp"
  31 #include "nativeInst_ppc.hpp"
  32 #include "oops/instanceOop.hpp"
  33 #include "oops/method.hpp"
  34 #include "oops/objArrayKlass.hpp"
  35 #include "oops/oop.inline.hpp"
  36 #include "prims/methodHandles.hpp"
  37 #include "runtime/frame.inline.hpp"
  38 #include "runtime/handles.inline.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubCodeGenerator.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "runtime/thread.inline.hpp"
  43 #include "utilities/align.hpp"
  44 
  45 // Declaration and definition of StubGenerator (no .hpp file).
  46 // For a more detailed description of the stub routine structure
  47 // see the comment in stubRoutines.hpp.
  48 
  49 #define __ _masm->
  50 
  51 #ifdef PRODUCT
  52 #define BLOCK_COMMENT(str) // nothing
  53 #else
  54 #define BLOCK_COMMENT(str) __ block_comment(str)
  55 #endif
  56 
  57 #if defined(ABI_ELFv2)
  58 #define STUB_ENTRY(name) StubRoutines::name()
  59 #else
  60 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name())->entry()
  61 #endif
  62 
  63 class StubGenerator: public StubCodeGenerator {
  64  private:
  65 
  66   // Call stubs are used to call Java from C
  67   //
  68   // Arguments:
  69   //
  70   //   R3  - call wrapper address     : address
  71   //   R4  - result                   : intptr_t*
  72   //   R5  - result type              : BasicType
  73   //   R6  - method                   : Method
  74   //   R7  - frame mgr entry point    : address
  75   //   R8  - parameter block          : intptr_t*
  76   //   R9  - parameter count in words : int
  77   //   R10 - thread                   : Thread*
  78   //
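  // For reference, the C side calls this stub through the CallStub function
  // pointer type declared in stubRoutines.hpp; roughly (sketch only, see that
  // header for the authoritative declaration):
  //
  //   typedef void (*CallStub)(address   link,               // R3:  call wrapper
  //                            intptr_t* result,             // R4:  result slot
  //                            BasicType result_type,        // R5:  result type
  //                            Method*   method,             // R6:  method
  //                            address   entry_point,        // R7:  frame mgr / native entry
  //                            intptr_t* parameters,         // R8:  parameter block
  //                            int       size_of_parameters, // R9:  parameter count in words
  //                            TRAPS);                       // R10: thread
  //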
  79   address generate_call_stub(address& return_address) {
    // Set up a new C frame, copy the Java arguments, call the frame manager or
    // native_entry, and process the result.
  82 
  83     StubCodeMark mark(this, "StubRoutines", "call_stub");
  84 
  85     address start = __ function_entry();
  86 
  87     // some sanity checks
  88     assert((sizeof(frame::abi_minframe) % 16) == 0,           "unaligned");
  89     assert((sizeof(frame::abi_reg_args) % 16) == 0,           "unaligned");
  90     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
  91     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
  92     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
  93 
  94     Register r_arg_call_wrapper_addr        = R3;
  95     Register r_arg_result_addr              = R4;
  96     Register r_arg_result_type              = R5;
  97     Register r_arg_method                   = R6;
  98     Register r_arg_entry                    = R7;
  99     Register r_arg_thread                   = R10;
 100 
 101     Register r_temp                         = R24;
 102     Register r_top_of_arguments_addr        = R25;
 103     Register r_entryframe_fp                = R26;
 104 
 105     {
 106       // Stack on entry to call_stub:
 107       //
 108       //      F1      [C_FRAME]
 109       //              ...
 110 
 111       Register r_arg_argument_addr          = R8;
 112       Register r_arg_argument_count         = R9;
 113       Register r_frame_alignment_in_bytes   = R27;
 114       Register r_argument_addr              = R28;
 115       Register r_argumentcopy_addr          = R29;
 116       Register r_argument_size_in_bytes     = R30;
 117       Register r_frame_size                 = R23;
 118 
 119       Label arguments_copied;
 120 
 121       // Save LR/CR to caller's C_FRAME.
 122       __ save_LR_CR(R0);
 123 
 124       // Zero extend arg_argument_count.
 125       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
 126 
      // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
 128       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 129 
 130       // Keep copy of our frame pointer (caller's SP).
 131       __ mr(r_entryframe_fp, R1_SP);
 132 
 133       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
 134       // Push ENTRY_FRAME including arguments:
 135       //
 136       //      F0      [TOP_IJAVA_FRAME_ABI]
 137       //              alignment (optional)
 138       //              [outgoing Java arguments]
 139       //              [ENTRY_FRAME_LOCALS]
 140       //      F1      [C_FRAME]
 141       //              ...
 142 
 143       // calculate frame size
 144 
 145       // unaligned size of arguments
 146       __ sldi(r_argument_size_in_bytes,
 147                   r_arg_argument_count, Interpreter::logStackElementSize);
 148       // arguments alignment (max 1 slot)
 149       // FIXME: use round_to() here
 150       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
 151       __ sldi(r_frame_alignment_in_bytes,
 152               r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
 153 
 154       // size = unaligned size of arguments + top abi's size
 155       __ addi(r_frame_size, r_argument_size_in_bytes,
 156               frame::top_ijava_frame_abi_size);
 157       // size += arguments alignment
 158       __ add(r_frame_size,
 159              r_frame_size, r_frame_alignment_in_bytes);
 160       // size += size of call_stub locals
 161       __ addi(r_frame_size,
 162               r_frame_size, frame::entry_frame_locals_size);
 163 
 164       // push ENTRY_FRAME
 165       __ push_frame(r_frame_size, r_temp);
 166 
 167       // initialize call_stub locals (step 1)
 168       __ std(r_arg_call_wrapper_addr,
 169              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
 170       __ std(r_arg_result_addr,
 171              _entry_frame_locals_neg(result_address), r_entryframe_fp);
 172       __ std(r_arg_result_type,
 173              _entry_frame_locals_neg(result_type), r_entryframe_fp);
 174       // we will save arguments_tos_address later
 175 
 176 
 177       BLOCK_COMMENT("Copy Java arguments");
 178       // copy Java arguments
 179 
      // Calculate top_of_arguments_addr which will become tos (R15_esp, not prepushed) later.
 181       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
 182       __ addi(r_top_of_arguments_addr,
 183               R1_SP, frame::top_ijava_frame_abi_size);
 184       __ add(r_top_of_arguments_addr,
 185              r_top_of_arguments_addr, r_frame_alignment_in_bytes);
 186 
 187       // any arguments to copy?
 188       __ cmpdi(CCR0, r_arg_argument_count, 0);
 189       __ beq(CCR0, arguments_copied);
 190 
 191       // prepare loop and copy arguments in reverse order
 192       {
 193         // init CTR with arg_argument_count
 194         __ mtctr(r_arg_argument_count);
 195 
        // let r_argumentcopy_addr point to last outgoing Java arguments
 197         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
 198 
 199         // let r_argument_addr point to last incoming java argument
 200         __ add(r_argument_addr,
 201                    r_arg_argument_addr, r_argument_size_in_bytes);
 202         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 203 
 204         // now loop while CTR > 0 and copy arguments
 205         {
 206           Label next_argument;
 207           __ bind(next_argument);
 208 
 209           __ ld(r_temp, 0, r_argument_addr);
 210           // argument_addr--;
 211           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 212           __ std(r_temp, 0, r_argumentcopy_addr);
 213           // argumentcopy_addr++;
 214           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
 215 
 216           __ bdnz(next_argument);
 217         }
 218       }
 219 
 220       // Arguments copied, continue.
 221       __ bind(arguments_copied);
 222     }
 223 
 224     {
 225       BLOCK_COMMENT("Call frame manager or native entry.");
 226       // Call frame manager or native entry.
 227       Register r_new_arg_entry = R14;
 228       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
 229                                  r_arg_method, r_arg_thread);
 230 
 231       __ mr(r_new_arg_entry, r_arg_entry);
 232 
 233       // Register state on entry to frame manager / native entry:
 234       //
 235       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
 236       //   R19_method  -  Method
 237       //   R16_thread  -  JavaThread*
 238 
 239       // Tos must point to last argument - element_size.
 240       const Register tos = R15_esp;
 241 
 242       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 243 
 244       // initialize call_stub locals (step 2)
 245       // now save tos as arguments_tos_address
 246       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 247 
 248       // load argument registers for call
 249       __ mr(R19_method, r_arg_method);
 250       __ mr(R16_thread, r_arg_thread);
 251       assert(tos != r_arg_method, "trashed r_arg_method");
 252       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 253 
      // Load the dispatch table base (TosState 0) into R25_templateTableBase.
 255       __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
 256       // Stack on entry to frame manager / native entry:
 257       //
 258       //      F0      [TOP_IJAVA_FRAME_ABI]
 259       //              alignment (optional)
 260       //              [outgoing Java arguments]
 261       //              [ENTRY_FRAME_LOCALS]
 262       //      F1      [C_FRAME]
 263       //              ...
 264       //
 265 
 266       // global toc register
 267       __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1);
      // Remember the senderSP so the interpreter can pop c2i arguments off of
      // the stack when called via a c2i adapter.

      // Pass initial_caller_sp to the frame manager.
 272       __ mr(R21_sender_SP, R1_SP);
 273 
 274       // Do a light-weight C-call here, r_new_arg_entry holds the address
 275       // of the interpreter entry point (frame manager or native entry)
 276       // and save runtime-value of LR in return_address.
 277       assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
 278              "trashed r_new_arg_entry");
 279       return_address = __ call_stub(r_new_arg_entry);
 280     }
 281 
 282     {
 283       BLOCK_COMMENT("Returned from frame manager or native entry.");
 284       // Returned from frame manager or native entry.
 285       // Now pop frame, process result, and return to caller.
 286 
 287       // Stack on exit from frame manager / native entry:
 288       //
 289       //      F0      [ABI]
 290       //              ...
 291       //              [ENTRY_FRAME_LOCALS]
 292       //      F1      [C_FRAME]
 293       //              ...
 294       //
 295       // Just pop the topmost frame ...
 296       //
 297 
 298       Label ret_is_object;
 299       Label ret_is_long;
 300       Label ret_is_float;
 301       Label ret_is_double;
 302 
 303       Register r_entryframe_fp = R30;
 304       Register r_lr            = R7_ARG5;
 305       Register r_cr            = R8_ARG6;
 306 
 307       // Reload some volatile registers which we've spilled before the call
 308       // to frame manager / native entry.
 309       // Access all locals via frame pointer, because we know nothing about
 310       // the topmost frame's size.
 311       __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
 312       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
 313       __ ld(r_arg_result_addr,
 314             _entry_frame_locals_neg(result_address), r_entryframe_fp);
 315       __ ld(r_arg_result_type,
 316             _entry_frame_locals_neg(result_type), r_entryframe_fp);
 317       __ ld(r_cr, _abi(cr), r_entryframe_fp);
 318       __ ld(r_lr, _abi(lr), r_entryframe_fp);
 319 
 320       // pop frame and restore non-volatiles, LR and CR
 321       __ mr(R1_SP, r_entryframe_fp);
 322       __ mtcr(r_cr);
 323       __ mtlr(r_lr);
 324 
 325       // Store result depending on type. Everything that is not
 326       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
 327       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
 328       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
 329       __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
 330       __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
 331 
 332       // restore non-volatile registers
 333       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 334 
 335 
 336       // Stack on exit from call_stub:
 337       //
 338       //      0       [C_FRAME]
 339       //              ...
 340       //
 341       //  no call_stub frames left.
 342 
 343       // All non-volatiles have been restored at this point!!
 344       assert(R3_RET == R3, "R3_RET should be R3");
 345 
 346       __ beq(CCR0, ret_is_object);
 347       __ beq(CCR1, ret_is_long);
 348       __ beq(CCR5, ret_is_float);
 349       __ beq(CCR6, ret_is_double);
 350 
 351       // default:
 352       __ stw(R3_RET, 0, r_arg_result_addr);
 353       __ blr(); // return to caller
 354 
 355       // case T_OBJECT:
 356       __ bind(ret_is_object);
 357       __ std(R3_RET, 0, r_arg_result_addr);
 358       __ blr(); // return to caller
 359 
 360       // case T_LONG:
 361       __ bind(ret_is_long);
 362       __ std(R3_RET, 0, r_arg_result_addr);
 363       __ blr(); // return to caller
 364 
 365       // case T_FLOAT:
 366       __ bind(ret_is_float);
 367       __ stfs(F1_RET, 0, r_arg_result_addr);
 368       __ blr(); // return to caller
 369 
 370       // case T_DOUBLE:
 371       __ bind(ret_is_double);
 372       __ stfd(F1_RET, 0, r_arg_result_addr);
 373       __ blr(); // return to caller
 374     }
 375 
 376     return start;
 377   }
 378 
 379   // Return point for a Java call if there's an exception thrown in
 380   // Java code.  The exception is caught and transformed into a
 381   // pending exception stored in JavaThread that can be tested from
 382   // within the VM.
 383   //
 384   address generate_catch_exception() {
 385     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 386 
 387     address start = __ pc();
 388 
 389     // Registers alive
 390     //
 391     //  R16_thread
 392     //  R3_ARG1 - address of pending exception
 393     //  R4_ARG2 - return address in call stub
 394 
 395     const Register exception_file = R21_tmp1;
 396     const Register exception_line = R22_tmp2;
 397 
 398     __ load_const(exception_file, (void*)__FILE__);
 399     __ load_const(exception_line, (void*)__LINE__);
 400 
 401     __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
 402     // store into `char *'
 403     __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
 404     // store into `int'
 405     __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
 406 
 407     // complete return to VM
 408     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 409 
 410     __ mtlr(R4_ARG2);
 411     // continue in call stub
 412     __ blr();
 413 
 414     return start;
 415   }
 416 
 417   // Continuation point for runtime calls returning with a pending
 418   // exception.  The pending exception check happened in the runtime
 419   // or native call stub.  The pending exception in Thread is
 420   // converted into a Java-level exception.
 421   //
 422   // Read:
 423   //
 424   //   LR:     The pc the runtime library callee wants to return to.
 425   //           Since the exception occurred in the callee, the return pc
 426   //           from the point of view of Java is the exception pc.
 427   //   thread: Needed for method handles.
 428   //
 429   // Invalidate:
 430   //
 431   //   volatile registers (except below).
 432   //
 433   // Update:
 434   //
 435   //   R4_ARG2: exception
 436   //
 437   // (LR is unchanged and is live out).
 438   //
 439   address generate_forward_exception() {
 440     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 441     address start = __ pc();
 442 
 443     if (VerifyOops) {
 444       // Get pending exception oop.
 445       __ ld(R3_ARG1,
 446                 in_bytes(Thread::pending_exception_offset()),
 447                 R16_thread);
 448       // Make sure that this code is only executed if there is a pending exception.
 449       {
 450         Label L;
 451         __ cmpdi(CCR0, R3_ARG1, 0);
 452         __ bne(CCR0, L);
 453         __ stop("StubRoutines::forward exception: no pending exception (1)");
 454         __ bind(L);
 455       }
 456       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
 457     }
 458 
 459     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
 460     __ save_LR_CR(R4_ARG2);
 461     __ push_frame_reg_args(0, R0);
 462     // Find exception handler.
 463     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 464                      SharedRuntime::exception_handler_for_return_address),
 465                     R16_thread,
 466                     R4_ARG2);
 467     // Copy handler's address.
 468     __ mtctr(R3_RET);
 469     __ pop_frame();
 470     __ restore_LR_CR(R0);
 471 
 472     // Set up the arguments for the exception handler:
 473     //  - R3_ARG1: exception oop
 474     //  - R4_ARG2: exception pc.
 475 
 476     // Load pending exception oop.
 477     __ ld(R3_ARG1,
 478               in_bytes(Thread::pending_exception_offset()),
 479               R16_thread);
 480 
 481     // The exception pc is the return address in the caller.
 482     // Must load it into R4_ARG2.
 483     __ mflr(R4_ARG2);
 484 
 485 #ifdef ASSERT
 486     // Make sure exception is set.
 487     {
 488       Label L;
 489       __ cmpdi(CCR0, R3_ARG1, 0);
 490       __ bne(CCR0, L);
 491       __ stop("StubRoutines::forward exception: no pending exception (2)");
 492       __ bind(L);
 493     }
 494 #endif
 495 
 496     // Clear the pending exception.
 497     __ li(R0, 0);
 498     __ std(R0,
 499                in_bytes(Thread::pending_exception_offset()),
 500                R16_thread);
 501     // Jump to exception handler.
 502     __ bctr();
 503 
 504     return start;
 505   }
 506 
 507 #undef __
 508 #define __ masm->
 509   // Continuation point for throwing of implicit exceptions that are
 510   // not handled in the current activation. Fabricates an exception
 511   // oop and initiates normal exception dispatching in this
 512   // frame. Only callee-saved registers are preserved (through the
 513   // normal register window / RegisterMap handling).  If the compiler
 514   // needs all registers to be preserved between the fault point and
 515   // the exception handler then it must assume responsibility for that
 516   // in AbstractCompiler::continuation_for_implicit_null_exception or
 517   // continuation_for_implicit_division_by_zero_exception. All other
 518   // implicit exceptions (e.g., NullPointerException or
 519   // AbstractMethodError on entry) are either at call sites or
 520   // otherwise assume that stack unwinding will be initiated, so
 521   // caller saved registers were assumed volatile in the compiler.
 522   //
 523   // Note that we generate only this stub into a RuntimeStub, because
 524   // it needs to be properly traversed and ignored during GC, so we
 525   // change the meaning of the "__" macro within this method.
 526   //
 527   // Note: the routine set_pc_not_at_call_for_caller in
 528   // SharedRuntime.cpp requires that this code be generated into a
 529   // RuntimeStub.
 530   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
 531                                    Register arg1 = noreg, Register arg2 = noreg) {
 532     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
 533     MacroAssembler* masm = new MacroAssembler(&code);
 534 
 535     OopMapSet* oop_maps  = new OopMapSet();
 536     int frame_size_in_bytes = frame::abi_reg_args_size;
 537     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
 538 
 539     address start = __ pc();
 540 
 541     __ save_LR_CR(R11_scratch1);
 542 
 543     // Push a frame.
 544     __ push_frame_reg_args(0, R11_scratch1);
 545 
 546     address frame_complete_pc = __ pc();
 547 
 548     if (restore_saved_exception_pc) {
 549       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
 550     }
 551 
 552     // Note that we always have a runtime stub frame on the top of
 553     // stack by this point. Remember the offset of the instruction
 554     // whose address will be moved to R11_scratch1.
 555     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
 556 
 557     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
 558 
 559     __ mr(R3_ARG1, R16_thread);
 560     if (arg1 != noreg) {
 561       __ mr(R4_ARG2, arg1);
 562     }
 563     if (arg2 != noreg) {
 564       __ mr(R5_ARG3, arg2);
 565     }
 566 #if defined(ABI_ELFv2)
 567     __ call_c(runtime_entry, relocInfo::none);
 568 #else
 569     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
 570 #endif
 571 
 572     // Set an oopmap for the call site.
 573     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
 574 
 575     __ reset_last_Java_frame();
 576 
 577 #ifdef ASSERT
 578     // Make sure that this code is only executed if there is a pending
 579     // exception.
 580     {
 581       Label L;
 582       __ ld(R0,
 583                 in_bytes(Thread::pending_exception_offset()),
 584                 R16_thread);
 585       __ cmpdi(CCR0, R0, 0);
 586       __ bne(CCR0, L);
 587       __ stop("StubRoutines::throw_exception: no pending exception");
 588       __ bind(L);
 589     }
 590 #endif
 591 
 592     // Pop frame.
 593     __ pop_frame();
 594 
 595     __ restore_LR_CR(R11_scratch1);
 596 
 597     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
 598     __ mtctr(R11_scratch1);
 599     __ bctr();
 600 
 601     // Create runtime stub with OopMap.
 602     RuntimeStub* stub =
 603       RuntimeStub::new_runtime_stub(name, &code,
 604                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
 605                                     frame_size_in_bytes/wordSize,
 606                                     oop_maps,
 607                                     false);
 608     return stub->entry_point();
 609   }
 610 #undef __
 611 #define __ _masm->
 612 
 613 
 614   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
 615   //
 616   // Arguments:
  //   to:    R3_ARG1, start address, must be 8-byte aligned
  //   count: R4_ARG2, number of 8-byte heapwords (dwords) to clear
  //
  // Destroys:
  //   R3_ARG1-R7_ARG5, CTR, CCR0, CCR1
  //
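  // Rough C sketch of the clearing strategy used below (illustrative only;
  // cl_dwords is the cache line size in 8-byte words; very small counts skip
  // the dcbz path and go straight to the tail loops):
  //
  //   while (!cache_line_aligned(to)) { *to++ = 0; count--;                            } // align to line
  //   while (count >= cl_dwords)      { dcbz(to); to += cl_dwords; count -= cl_dwords; } // full lines
  //   while (count > 1)               { to[0] = 0; to[1] = 0; to += 2; count -= 2;     } // rest
  //   if (count & 1)                  { *to = 0;                                       } // last dword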
 622   address generate_zero_words_aligned8() {
 623     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
 624 
 625     // Implemented as in ClearArray.
 626     address start = __ function_entry();
 627 
 628     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
 629     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
 630     Register tmp1_reg       = R5_ARG3;
 631     Register tmp2_reg       = R6_ARG4;
 632     Register zero_reg       = R7_ARG5;
 633 
 634     // Procedure for large arrays (uses data cache block zero instruction).
 635     Label dwloop, fast, fastloop, restloop, lastdword, done;
 636     int cl_size = VM_Version::L1_data_cache_line_size();
 637     int cl_dwords = cl_size >> 3;
 638     int cl_dwordaddr_bits = exact_log2(cl_dwords);
 639     int min_dcbz = 2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
 640 
 641     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
 642     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
 643     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
 644     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
 645     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
 646 
 647     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
 648     __ beq(CCR0, lastdword);                    // size <= 1
 649     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
 650     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
 651     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
 652 
 653     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
 654     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
 655 
 656     __ beq(CCR0, fast);                         // already 128byte aligned
 657     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
 658     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
 659 
 660     // Clear in first cache line dword-by-dword if not already 128byte aligned.
 661     __ bind(dwloop);
 662       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 663       __ addi(base_ptr_reg, base_ptr_reg, 8);
 664     __ bdnz(dwloop);
 665 
 666     // clear 128byte blocks
 667     __ bind(fast);
 668     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
 669     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
 670 
 671     __ mtctr(tmp1_reg);                         // load counter
 672     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
 673     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
 674 
 675     __ bind(fastloop);
 676       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
 677       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
 678     __ bdnz(fastloop);
 679 
 680     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
 681     __ beq(CCR0, lastdword);                    // rest<=1
 682     __ mtctr(tmp1_reg);                         // load counter
 683 
 684     // Clear rest.
 685     __ bind(restloop);
 686       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 687       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
 688       __ addi(base_ptr_reg, base_ptr_reg, 16);
 689     __ bdnz(restloop);
 690 
 691     __ bind(lastdword);
 692     __ beq(CCR1, done);
 693     __ std(zero_reg, 0, base_ptr_reg);
 694     __ bind(done);
 695     __ blr();                                   // return
 696 
 697     return start;
 698   }
 699 
 700 #if !defined(PRODUCT)
 701   // Wrapper which calls oopDesc::is_oop_or_null()
 702   // Only called by MacroAssembler::verify_oop
 703   static void verify_oop_helper(const char* message, oop o) {
 704     if (!oopDesc::is_oop_or_null(o)) {
 705       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 706     }
 707     ++ StubRoutines::_verify_oop_count;
 708   }
 709 #endif
 710 
 711   // Return address of code to be called from code generated by
 712   // MacroAssembler::verify_oop.
 713   //
 714   // Don't generate, rather use C++ code.
 715   address generate_verify_oop() {
 716     // this is actually a `FunctionDescriptor*'.
 717     address start = 0;
 718 
 719 #if !defined(PRODUCT)
 720     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 721 #endif
 722 
 723     return start;
 724   }
 725 
 726   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 727   //
  // The code was implemented (ported from SPARC) because we believe it benefits JVM98; however,
  // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
  //
  // The source code of is_range_check_if() shows that OptimizeFill relaxes the condition
  // for turning on the loop predication optimization, and hence the behavior of "array range check"
  // and "loop invariant check" can be influenced, which potentially boosted JVM98.
 734   //
 735   // Generate stub for disjoint short fill. If "aligned" is true, the
 736   // "to" address is assumed to be heapword aligned.
 737   //
 738   // Arguments for generated stub:
 739   //   to:    R3_ARG1
 740   //   value: R4_ARG2
 741   //   count: R5_ARG3 treated as signed
 742   //
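  // Before the wide stores below, the fill value is replicated across a 64-bit
  // register with rldimi. For example (illustrative), for T_BYTE and value 0xAB:
  //
  //   rldimi  8, 48:  0x00000000000000AB -> 0x000000000000ABAB
  //   rldimi 16, 32:  0x000000000000ABAB -> 0x00000000ABABABAB
  //   rldimi 32,  0:  0x00000000ABABABAB -> 0xABABABABABABABAB
  //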
 743   address generate_fill(BasicType t, bool aligned, const char* name) {
 744     StubCodeMark mark(this, "StubRoutines", name);
 745     address start = __ function_entry();
 746 
    const Register to    = R3_ARG1;   // destination array address
 748     const Register value = R4_ARG2;   // fill value
 749     const Register count = R5_ARG3;   // elements count
 750     const Register temp  = R6_ARG4;   // temp register
 751 
 752     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 753 
 754     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 755     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
 756 
 757     int shift = -1;
 758     switch (t) {
 759        case T_BYTE:
 760         shift = 2;
 761         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 762         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
 763         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 764         __ blt(CCR0, L_fill_elements);
 765         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 766         break;
 767        case T_SHORT:
 768         shift = 1;
 769         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 770         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 771         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 772         __ blt(CCR0, L_fill_elements);
 773         break;
 774       case T_INT:
 775         shift = 0;
 776         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 777         __ blt(CCR0, L_fill_4_bytes);
 778         break;
 779       default: ShouldNotReachHere();
 780     }
 781 
 782     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
 783       // Align source address at 4 bytes address boundary.
 784       if (t == T_BYTE) {
 785         // One byte misalignment happens only for byte arrays.
 786         __ andi_(temp, to, 1);
 787         __ beq(CCR0, L_skip_align1);
 788         __ stb(value, 0, to);
 789         __ addi(to, to, 1);
 790         __ addi(count, count, -1);
 791         __ bind(L_skip_align1);
 792       }
 793       // Two bytes misalignment happens only for byte and short (char) arrays.
 794       __ andi_(temp, to, 2);
 795       __ beq(CCR0, L_skip_align2);
 796       __ sth(value, 0, to);
 797       __ addi(to, to, 2);
 798       __ addi(count, count, -(1 << (shift - 1)));
 799       __ bind(L_skip_align2);
 800     }
 801 
 802     if (!aligned) {
 803       // Align to 8 bytes, we know we are 4 byte aligned to start.
 804       __ andi_(temp, to, 7);
 805       __ beq(CCR0, L_fill_32_bytes);
 806       __ stw(value, 0, to);
 807       __ addi(to, to, 4);
 808       __ addi(count, count, -(1 << shift));
 809       __ bind(L_fill_32_bytes);
 810     }
 811 
 812     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
 813     // Clone bytes int->long as above.
 814     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
 815 
 816     Label L_check_fill_8_bytes;
 817     // Fill 32-byte chunks.
 818     __ subf_(count, temp, count);
 819     __ blt(CCR0, L_check_fill_8_bytes);
 820 
 821     Label L_fill_32_bytes_loop;
 822     __ align(32);
 823     __ bind(L_fill_32_bytes_loop);
 824 
 825     __ std(value, 0, to);
 826     __ std(value, 8, to);
 827     __ subf_(count, temp, count);           // Update count.
 828     __ std(value, 16, to);
 829     __ std(value, 24, to);
 830 
 831     __ addi(to, to, 32);
 832     __ bge(CCR0, L_fill_32_bytes_loop);
 833 
 834     __ bind(L_check_fill_8_bytes);
 835     __ add_(count, temp, count);
 836     __ beq(CCR0, L_exit);
 837     __ addic_(count, count, -(2 << shift));
 838     __ blt(CCR0, L_fill_4_bytes);
 839 
 840     //
 841     // Length is too short, just fill 8 bytes at a time.
 842     //
 843     Label L_fill_8_bytes_loop;
 844     __ bind(L_fill_8_bytes_loop);
 845     __ std(value, 0, to);
 846     __ addic_(count, count, -(2 << shift));
 847     __ addi(to, to, 8);
 848     __ bge(CCR0, L_fill_8_bytes_loop);
 849 
 850     // Fill trailing 4 bytes.
 851     __ bind(L_fill_4_bytes);
 852     __ andi_(temp, count, 1<<shift);
 853     __ beq(CCR0, L_fill_2_bytes);
 854 
 855     __ stw(value, 0, to);
 856     if (t == T_BYTE || t == T_SHORT) {
 857       __ addi(to, to, 4);
 858       // Fill trailing 2 bytes.
 859       __ bind(L_fill_2_bytes);
 860       __ andi_(temp, count, 1<<(shift-1));
 861       __ beq(CCR0, L_fill_byte);
 862       __ sth(value, 0, to);
 863       if (t == T_BYTE) {
 864         __ addi(to, to, 2);
 865         // Fill trailing byte.
 866         __ bind(L_fill_byte);
 867         __ andi_(count, count, 1);
 868         __ beq(CCR0, L_exit);
 869         __ stb(value, 0, to);
 870       } else {
 871         __ bind(L_fill_byte);
 872       }
 873     } else {
 874       __ bind(L_fill_2_bytes);
 875     }
 876     __ bind(L_exit);
 877     __ blr();
 878 
    // Handle fills of less than 8 bytes. Int is handled elsewhere.
 880     if (t == T_BYTE) {
 881       __ bind(L_fill_elements);
 882       Label L_fill_2, L_fill_4;
 883       __ andi_(temp, count, 1);
 884       __ beq(CCR0, L_fill_2);
 885       __ stb(value, 0, to);
 886       __ addi(to, to, 1);
 887       __ bind(L_fill_2);
 888       __ andi_(temp, count, 2);
 889       __ beq(CCR0, L_fill_4);
 890       __ stb(value, 0, to);
      __ stb(value, 1, to);
 892       __ addi(to, to, 2);
 893       __ bind(L_fill_4);
 894       __ andi_(temp, count, 4);
 895       __ beq(CCR0, L_exit);
 896       __ stb(value, 0, to);
 897       __ stb(value, 1, to);
 898       __ stb(value, 2, to);
 899       __ stb(value, 3, to);
 900       __ blr();
 901     }
 902 
 903     if (t == T_SHORT) {
 904       Label L_fill_2;
 905       __ bind(L_fill_elements);
 906       __ andi_(temp, count, 1);
 907       __ beq(CCR0, L_fill_2);
 908       __ sth(value, 0, to);
 909       __ addi(to, to, 2);
 910       __ bind(L_fill_2);
 911       __ andi_(temp, count, 2);
 912       __ beq(CCR0, L_exit);
 913       __ sth(value, 0, to);
 914       __ sth(value, 2, to);
 915       __ blr();
 916     }
 917     return start;
 918   }
 919 
 920   inline void assert_positive_int(Register count) {
 921 #ifdef ASSERT
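    // The logical shift leaves the upper 33 bits of 'count' in R0; they must all
    // be zero, i.e. 'count' must be a zero-extended, non-negative 32-bit int.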
 922     __ srdi_(R0, count, 31);
 923     __ asm_assert_eq("missing zero extend", 0xAFFE);
 924 #endif
 925   }
 926 
 927   // Generate overlap test for array copy stubs.
 928   //
 929   // Input:
 930   //   R3_ARG1    -  from
 931   //   R4_ARG2    -  to
 932   //   R5_ARG3    -  element count
 933   //
 934   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
 935     Register tmp1 = R6_ARG4;
 936     Register tmp2 = R7_ARG5;
 937 
 938     assert_positive_int(R5_ARG3);
 939 
 940     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
 941     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
 942     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
 943     __ cmpld(CCR1, tmp1, tmp2);
 944     __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
 945     // Overlaps if Src before dst and distance smaller than size.
 946     // Branch to forward copy routine otherwise (within range of 32kB).
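    // In C terms the test above is roughly (unsigned arithmetic, sketch only):
    //
    //   size_t dist = (size_t)(to - from);
    //   size_t size = (size_t)count << log2_elem_size;
    //   if (!(from < to && dist < size)) goto no_overlap_target; // disjoint: copy forward
    //   // else fall through and copy backwards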
 947     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
 948 
 949     // need to copy backwards
 950   }
 951 
  // This is the common error exit stub for UnsafeCopyMemory.
 953   address generate_unsafecopy_common_error_exit() {
 954     address start_pc = __ pc();
 955     Register tmp1 = R6_ARG4;
    // The copy stub may have changed the DSCR value; reset it.
 957     if (VM_Version::has_mfdscr()) {
 958       __ load_const_optimized(tmp1, VM_Version::_dscr_val);
 959       __ mtdscr(tmp1);
 960     }
 961     __ li(R3_RET, 0); // return 0
 962     __ blr();
 963     return start_pc;
 964   }
 965 
 966   // The guideline in the implementations of generate_disjoint_xxx_copy
 967   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
 968   // single instructions, but to avoid alignment interrupts (see subsequent
 969   // comment). Furthermore, we try to minimize misaligned access, even
 970   // though they cause no alignment interrupt.
 971   //
 972   // In Big-Endian mode, the PowerPC architecture requires implementations to
 973   // handle automatically misaligned integer halfword and word accesses,
 974   // word-aligned integer doubleword accesses, and word-aligned floating-point
 975   // accesses. Other accesses may or may not generate an Alignment interrupt
 976   // depending on the implementation.
 977   // Alignment interrupt handling may require on the order of hundreds of cycles,
 978   // so every effort should be made to avoid misaligned memory values.
 979   //
 980   //
 981   // Generate stub for disjoint byte copy.  If "aligned" is true, the
 982   // "from" and "to" addresses are assumed to be heapword aligned.
 983   //
 984   // Arguments for generated stub:
 985   //      from:  R3_ARG1
 986   //      to:    R4_ARG2
 987   //      count: R5_ARG3 treated as signed
 988   //
 989   address generate_disjoint_byte_copy(bool aligned, const char * name) {
 990     StubCodeMark mark(this, "StubRoutines", name);
 991     address start = __ function_entry();
 992     assert_positive_int(R5_ARG3);
 993 
 994     Register tmp1 = R6_ARG4;
 995     Register tmp2 = R7_ARG5;
 996     Register tmp3 = R8_ARG6;
 997     Register tmp4 = R9_ARG7;
 998 
 999     VectorSRegister tmp_vsr1  = VSR1;
1000     VectorSRegister tmp_vsr2  = VSR2;
1001 
1002     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1003     {
1004       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1005       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1006 
1007       // Don't try anything fancy if arrays don't have many elements.
1008       __ li(tmp3, 0);
1009       __ cmpwi(CCR0, R5_ARG3, 17);
1010       __ ble(CCR0, l_6); // copy 4 at a time
1011 
1012       if (!aligned) {
1013         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1014         __ andi_(tmp1, tmp1, 3);
1015         __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1016 
1017         // Copy elements if necessary to align to 4 bytes.
1018         __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1019         __ andi_(tmp1, tmp1, 3);
1020         __ beq(CCR0, l_2);
1021 
1022         __ subf(R5_ARG3, tmp1, R5_ARG3);
1023         __ bind(l_9);
1024         __ lbz(tmp2, 0, R3_ARG1);
1025         __ addic_(tmp1, tmp1, -1);
1026         __ stb(tmp2, 0, R4_ARG2);
1027         __ addi(R3_ARG1, R3_ARG1, 1);
1028         __ addi(R4_ARG2, R4_ARG2, 1);
1029         __ bne(CCR0, l_9);
1030 
1031         __ bind(l_2);
1032       }
1033 
1034       // copy 8 elements at a time
1035       __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1036       __ andi_(tmp1, tmp2, 7);
1037       __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1038 
1039       // copy a 2-element word if necessary to align to 8 bytes
1040       __ andi_(R0, R3_ARG1, 7);
1041       __ beq(CCR0, l_7);
1042 
1043       __ lwzx(tmp2, R3_ARG1, tmp3);
1044       __ addi(R5_ARG3, R5_ARG3, -4);
1045       __ stwx(tmp2, R4_ARG2, tmp3);
1046       { // FasterArrayCopy
1047         __ addi(R3_ARG1, R3_ARG1, 4);
1048         __ addi(R4_ARG2, R4_ARG2, 4);
1049       }
1050       __ bind(l_7);
1051 
1052       { // FasterArrayCopy
1053         __ cmpwi(CCR0, R5_ARG3, 31);
        __ ble(CCR0, l_6); // copy 4 at a time if less than 32 elements remain
1055 
1056         __ srdi(tmp1, R5_ARG3, 5);
1057         __ andi_(R5_ARG3, R5_ARG3, 31);
1058         __ mtctr(tmp1);
1059 
1060        if (!VM_Version::has_vsx()) {
1061 
1062         __ bind(l_8);
1063         // Use unrolled version for mass copying (copy 32 elements a time)
1064         // Load feeding store gets zero latency on Power6, however not on Power5.
1065         // Therefore, the following sequence is made for the good of both.
1066         __ ld(tmp1, 0, R3_ARG1);
1067         __ ld(tmp2, 8, R3_ARG1);
1068         __ ld(tmp3, 16, R3_ARG1);
1069         __ ld(tmp4, 24, R3_ARG1);
1070         __ std(tmp1, 0, R4_ARG2);
1071         __ std(tmp2, 8, R4_ARG2);
1072         __ std(tmp3, 16, R4_ARG2);
1073         __ std(tmp4, 24, R4_ARG2);
1074         __ addi(R3_ARG1, R3_ARG1, 32);
1075         __ addi(R4_ARG2, R4_ARG2, 32);
1076         __ bdnz(l_8);
1077 
1078       } else { // Processor supports VSX, so use it to mass copy.
1079 
1080         // Prefetch the data into the L2 cache.
1081         __ dcbt(R3_ARG1, 0);
1082 
1083         // If supported set DSCR pre-fetch to deepest.
1084         if (VM_Version::has_mfdscr()) {
1085           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1086           __ mtdscr(tmp2);
1087         }
1088 
1089         __ li(tmp1, 16);
1090 
        // Backbranch target aligned to 32 bytes, not just 16, because the
        // loop contains < 8 instructions and thus fits inside a single
        // i-cache sector.
1094         __ align(32);
1095 
1096         __ bind(l_10);
1097         // Use loop with VSX load/store instructions to
1098         // copy 32 elements a time.
1099         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1100         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1101         __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1102         __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1103         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1105         __ bdnz(l_10);                       // Dec CTR and loop if not zero.
1106 
1107         // Restore DSCR pre-fetch value.
1108         if (VM_Version::has_mfdscr()) {
1109           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1110           __ mtdscr(tmp2);
1111         }
1112 
1113       } // VSX
1114      } // FasterArrayCopy
1115 
1116       __ bind(l_6);
1117 
1118       // copy 4 elements at a time
1119       __ cmpwi(CCR0, R5_ARG3, 4);
1120       __ blt(CCR0, l_1);
1121       __ srdi(tmp1, R5_ARG3, 2);
1122       __ mtctr(tmp1); // is > 0
1123       __ andi_(R5_ARG3, R5_ARG3, 3);
1124 
1125       { // FasterArrayCopy
1126         __ addi(R3_ARG1, R3_ARG1, -4);
1127         __ addi(R4_ARG2, R4_ARG2, -4);
1128         __ bind(l_3);
1129         __ lwzu(tmp2, 4, R3_ARG1);
1130         __ stwu(tmp2, 4, R4_ARG2);
1131         __ bdnz(l_3);
1132         __ addi(R3_ARG1, R3_ARG1, 4);
1133         __ addi(R4_ARG2, R4_ARG2, 4);
1134       }
1135 
1136       // do single element copy
1137       __ bind(l_1);
1138       __ cmpwi(CCR0, R5_ARG3, 0);
1139       __ beq(CCR0, l_4);
1140 
1141       { // FasterArrayCopy
1142         __ mtctr(R5_ARG3);
1143         __ addi(R3_ARG1, R3_ARG1, -1);
1144         __ addi(R4_ARG2, R4_ARG2, -1);
1145 
1146         __ bind(l_5);
1147         __ lbzu(tmp2, 1, R3_ARG1);
1148         __ stbu(tmp2, 1, R4_ARG2);
1149         __ bdnz(l_5);
1150       }
1151     }
1152 
1153     __ bind(l_4);
1154     __ li(R3_RET, 0); // return 0
1155     __ blr();
1156 
1157     return start;
1158   }
1159 
1160   // Generate stub for conjoint byte copy.  If "aligned" is true, the
1161   // "from" and "to" addresses are assumed to be heapword aligned.
1162   //
1163   // Arguments for generated stub:
1164   //      from:  R3_ARG1
1165   //      to:    R4_ARG2
1166   //      count: R5_ARG3 treated as signed
1167   //
1168   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1169     StubCodeMark mark(this, "StubRoutines", name);
1170     address start = __ function_entry();
1171     assert_positive_int(R5_ARG3);
1172 
1173     Register tmp1 = R6_ARG4;
1174     Register tmp2 = R7_ARG5;
1175     Register tmp3 = R8_ARG6;
1176 
1177     address nooverlap_target = aligned ?
1178       STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy) :
1179       STUB_ENTRY(jbyte_disjoint_arraycopy);
1180 
1181     array_overlap_test(nooverlap_target, 0);
1182     // Do reverse copy. We assume the case of actual overlap is rare enough
1183     // that we don't have to optimize it.
1184     Label l_1, l_2;
1185     {
1186       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1187       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1188       __ b(l_2);
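      // Rotated loop: the initial branch to l_2 skips the store, so each pass
      // decrements the index, loads from[i] and, while i >= 0, stores to[i];
      // i.e. the elements are copied backwards one byte at a time.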
1189       __ bind(l_1);
1190       __ stbx(tmp1, R4_ARG2, R5_ARG3);
1191       __ bind(l_2);
1192       __ addic_(R5_ARG3, R5_ARG3, -1);
1193       __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1194       __ bge(CCR0, l_1);
1195     }
1196     __ li(R3_RET, 0); // return 0
1197     __ blr();
1198 
1199     return start;
1200   }
1201 
1202   // Generate stub for disjoint short copy.  If "aligned" is true, the
1203   // "from" and "to" addresses are assumed to be heapword aligned.
1204   //
1205   // Arguments for generated stub:
1206   //      from:  R3_ARG1
1207   //      to:    R4_ARG2
1208   //  elm.count: R5_ARG3 treated as signed
1209   //
1210   // Strategy for aligned==true:
1211   //
1212   //  If length <= 9:
1213   //     1. copy 2 elements at a time (l_6)
1214   //     2. copy last element if original element count was odd (l_1)
1215   //
1216   //  If length > 9:
1217   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
1218   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
1219   //     3. copy last element if one was left in step 2. (l_1)
1220   //
1221   //
1222   // Strategy for aligned==false:
1223   //
1224   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
1225   //                  can be unaligned (see comment below)
1226   //
1227   //  If length > 9:
1228   //     1. continue with step 6. if the alignment of from and to mod 4
1229   //        is different.
1230   //     2. align from and to to 4 bytes by copying 1 element if necessary
1231   //     3. at l_2 from and to are 4 byte aligned; continue with
1232   //        5. if they cannot be aligned to 8 bytes because they have
1233   //        got different alignment mod 8.
1234   //     4. at this point we know that both, from and to, have the same
1235   //        alignment mod 8, now copy one element if necessary to get
1236   //        8 byte alignment of from and to.
1237   //     5. copy 4 elements at a time until less than 4 elements are
1238   //        left; depending on step 3. all load/stores are aligned or
1239   //        either all loads or all stores are unaligned.
1240   //     6. copy 2 elements at a time until less than 2 elements are
1241   //        left (l_6); arriving here from step 1., there is a chance
1242   //        that all accesses are unaligned.
1243   //     7. copy last element if one was left in step 6. (l_1)
1244   //
1245   //  There are unaligned data accesses using integer load/store
1246   //  instructions in this stub. POWER allows such accesses.
1247   //
1248   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1249   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
1250   //  integer load/stores have good performance. Only unaligned
1251   //  floating point load/stores can have poor performance.
1252   //
1253   //  TODO:
1254   //
1255   //  1. check if aligning the backbranch target of loops is beneficial
1256   //
1257   address generate_disjoint_short_copy(bool aligned, const char * name) {
1258     StubCodeMark mark(this, "StubRoutines", name);
1259 
1260     Register tmp1 = R6_ARG4;
1261     Register tmp2 = R7_ARG5;
1262     Register tmp3 = R8_ARG6;
1263     Register tmp4 = R9_ARG7;
1264 
1265     VectorSRegister tmp_vsr1  = VSR1;
1266     VectorSRegister tmp_vsr2  = VSR2;
1267 
1268     address start = __ function_entry();
1269     assert_positive_int(R5_ARG3);
1270 
1271     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1272     {
1273       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1274       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1275       // don't try anything fancy if arrays don't have many elements
1276       __ li(tmp3, 0);
1277       __ cmpwi(CCR0, R5_ARG3, 9);
1278       __ ble(CCR0, l_6); // copy 2 at a time
1279 
1280       if (!aligned) {
1281         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1282         __ andi_(tmp1, tmp1, 3);
1283         __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1284 
1285         // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1286 
1287         // Copy 1 element if necessary to align to 4 bytes.
1288         __ andi_(tmp1, R3_ARG1, 3);
1289         __ beq(CCR0, l_2);
1290 
1291         __ lhz(tmp2, 0, R3_ARG1);
1292         __ addi(R3_ARG1, R3_ARG1, 2);
1293         __ sth(tmp2, 0, R4_ARG2);
1294         __ addi(R4_ARG2, R4_ARG2, 2);
1295         __ addi(R5_ARG3, R5_ARG3, -1);
1296         __ bind(l_2);
1297 
1298         // At this point the positions of both, from and to, are at least 4 byte aligned.
1299 
1300         // Copy 4 elements at a time.
1301         // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1302         __ xorr(tmp2, R3_ARG1, R4_ARG2);
1303         __ andi_(tmp1, tmp2, 7);
1304         __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1305 
1306         // Copy a 2-element word if necessary to align to 8 bytes.
1307         __ andi_(R0, R3_ARG1, 7);
1308         __ beq(CCR0, l_7);
1309 
1310         __ lwzx(tmp2, R3_ARG1, tmp3);
1311         __ addi(R5_ARG3, R5_ARG3, -2);
1312         __ stwx(tmp2, R4_ARG2, tmp3);
1313         { // FasterArrayCopy
1314           __ addi(R3_ARG1, R3_ARG1, 4);
1315           __ addi(R4_ARG2, R4_ARG2, 4);
1316         }
1317       }
1318 
1319       __ bind(l_7);
1320 
1321       // Copy 4 elements at a time; either the loads or the stores can
1322       // be unaligned if aligned == false.
1323 
1324       { // FasterArrayCopy
1325         __ cmpwi(CCR0, R5_ARG3, 15);
1326         __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
1327 
1328         __ srdi(tmp1, R5_ARG3, 4);
1329         __ andi_(R5_ARG3, R5_ARG3, 15);
1330         __ mtctr(tmp1);
1331 
1332         if (!VM_Version::has_vsx()) {
1333 
1334           __ bind(l_8);
1335           // Use unrolled version for mass copying (copy 16 elements a time).
1336           // Load feeding store gets zero latency on Power6, however not on Power5.
1337           // Therefore, the following sequence is made for the good of both.
1338           __ ld(tmp1, 0, R3_ARG1);
1339           __ ld(tmp2, 8, R3_ARG1);
1340           __ ld(tmp3, 16, R3_ARG1);
1341           __ ld(tmp4, 24, R3_ARG1);
1342           __ std(tmp1, 0, R4_ARG2);
1343           __ std(tmp2, 8, R4_ARG2);
1344           __ std(tmp3, 16, R4_ARG2);
1345           __ std(tmp4, 24, R4_ARG2);
1346           __ addi(R3_ARG1, R3_ARG1, 32);
1347           __ addi(R4_ARG2, R4_ARG2, 32);
1348           __ bdnz(l_8);
1349 
1350         } else { // Processor supports VSX, so use it to mass copy.
1351 
1352           // Prefetch src data into L2 cache.
1353           __ dcbt(R3_ARG1, 0);
1354 
1355           // If supported set DSCR pre-fetch to deepest.
1356           if (VM_Version::has_mfdscr()) {
1357             __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1358             __ mtdscr(tmp2);
1359           }
1360           __ li(tmp1, 16);
1361 
          // Backbranch target aligned to 32 bytes, not just 16, because the
          // loop contains < 8 instructions and thus fits inside a single
          // i-cache sector.
1365           __ align(32);
1366 
1367           __ bind(l_9);
1368           // Use loop with VSX load/store instructions to
1369           // copy 16 elements a time.
1370           __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
1371           __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
1372           __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
1373           __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1374           __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
          __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32.
1376           __ bdnz(l_9);                        // Dec CTR and loop if not zero.
1377 
1378           // Restore DSCR pre-fetch value.
1379           if (VM_Version::has_mfdscr()) {
1380             __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1381             __ mtdscr(tmp2);
1382           }
1383 
1384         }
1385       } // FasterArrayCopy
1386       __ bind(l_6);
1387 
1388       // copy 2 elements at a time
1389       { // FasterArrayCopy
1390         __ cmpwi(CCR0, R5_ARG3, 2);
1391         __ blt(CCR0, l_1);
1392         __ srdi(tmp1, R5_ARG3, 1);
1393         __ andi_(R5_ARG3, R5_ARG3, 1);
1394 
1395         __ addi(R3_ARG1, R3_ARG1, -4);
1396         __ addi(R4_ARG2, R4_ARG2, -4);
1397         __ mtctr(tmp1);
1398 
1399         __ bind(l_3);
1400         __ lwzu(tmp2, 4, R3_ARG1);
1401         __ stwu(tmp2, 4, R4_ARG2);
1402         __ bdnz(l_3);
1403 
1404         __ addi(R3_ARG1, R3_ARG1, 4);
1405         __ addi(R4_ARG2, R4_ARG2, 4);
1406       }
1407 
1408       // do single element copy
1409       __ bind(l_1);
1410       __ cmpwi(CCR0, R5_ARG3, 0);
1411       __ beq(CCR0, l_4);
1412 
1413       { // FasterArrayCopy
1414         __ mtctr(R5_ARG3);
1415         __ addi(R3_ARG1, R3_ARG1, -2);
1416         __ addi(R4_ARG2, R4_ARG2, -2);
1417 
1418         __ bind(l_5);
1419         __ lhzu(tmp2, 2, R3_ARG1);
1420         __ sthu(tmp2, 2, R4_ARG2);
1421         __ bdnz(l_5);
1422       }
1423     }
1424 
1425     __ bind(l_4);
1426     __ li(R3_RET, 0); // return 0
1427     __ blr();
1428 
1429     return start;
1430   }
1431 
1432   // Generate stub for conjoint short copy.  If "aligned" is true, the
1433   // "from" and "to" addresses are assumed to be heapword aligned.
1434   //
1435   // Arguments for generated stub:
1436   //      from:  R3_ARG1
1437   //      to:    R4_ARG2
1438   //      count: R5_ARG3 treated as signed
1439   //
1440   address generate_conjoint_short_copy(bool aligned, const char * name) {
1441     StubCodeMark mark(this, "StubRoutines", name);
1442     address start = __ function_entry();
1443     assert_positive_int(R5_ARG3);
1444 
1445     Register tmp1 = R6_ARG4;
1446     Register tmp2 = R7_ARG5;
1447     Register tmp3 = R8_ARG6;
1448 
1449     address nooverlap_target = aligned ?
1450       STUB_ENTRY(arrayof_jshort_disjoint_arraycopy) :
1451       STUB_ENTRY(jshort_disjoint_arraycopy);
1452 
1453     array_overlap_test(nooverlap_target, 1);
1454 
1455     Label l_1, l_2;
1456     {
1457       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1458       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1459       __ sldi(tmp1, R5_ARG3, 1);
1460       __ b(l_2);
1461       __ bind(l_1);
1462       __ sthx(tmp2, R4_ARG2, tmp1);
1463       __ bind(l_2);
1464       __ addic_(tmp1, tmp1, -2);
1465       __ lhzx(tmp2, R3_ARG1, tmp1);
1466       __ bge(CCR0, l_1);
1467     }
1468     __ li(R3_RET, 0); // return 0
1469     __ blr();
1470 
1471     return start;
1472   }
1473 
1474   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
1475   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1476   //
1477   // Arguments:
1478   //      from:  R3_ARG1
1479   //      to:    R4_ARG2
1480   //      count: R5_ARG3 treated as signed
1481   //
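       // In effect the core implements the following strategy (illustrative
       // C sketch only, not the emitted code): for longer arrays one element
       // may be copied first to get 8-byte alignment, then 8 elements
       // (32 bytes) are moved per iteration, either with four ld/std pairs
       // or with two 16-byte VSX load/store pairs, and a short tail loop
       // finishes the rest:
       //
       //   void disjoint_jint_copy(const jint* from, jint* to, ssize_t count) {
       //     while (count >= 8) {                  // bulk: 32 bytes per iteration
       //       for (int i = 0; i < 8; i++) to[i] = from[i];
       //       from += 8; to += 8; count -= 8;
       //     }
       //     while (count-- > 0) *to++ = *from++;  // tail, one element at a time
       //   }
       //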
1482   void generate_disjoint_int_copy_core(bool aligned) {
1483     Register tmp1 = R6_ARG4;
1484     Register tmp2 = R7_ARG5;
1485     Register tmp3 = R8_ARG6;
1486     Register tmp4 = R0;
1487 
1488     VectorSRegister tmp_vsr1  = VSR1;
1489     VectorSRegister tmp_vsr2  = VSR2;
1490 
1491     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1492 
1493     // for short arrays, just do single element copy
1494     __ li(tmp3, 0);
1495     __ cmpwi(CCR0, R5_ARG3, 5);
1496     __ ble(CCR0, l_2);
1497 
1498     if (!aligned) {
1499         // check if arrays have same alignment mod 8.
1500         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1501         __ andi_(R0, tmp1, 7);
1502         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1503         __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time
1504 
1505         // copy 1 element to align to and from on an 8 byte boundary
1506         __ andi_(R0, R3_ARG1, 7);
1507         __ beq(CCR0, l_4);
1508 
1509         __ lwzx(tmp2, R3_ARG1, tmp3);
1510         __ addi(R5_ARG3, R5_ARG3, -1);
1511         __ stwx(tmp2, R4_ARG2, tmp3);
1512         { // FasterArrayCopy
1513           __ addi(R3_ARG1, R3_ARG1, 4);
1514           __ addi(R4_ARG2, R4_ARG2, 4);
1515         }
1516         __ bind(l_4);
1517       }
1518 
1519     { // FasterArrayCopy
1520       __ cmpwi(CCR0, R5_ARG3, 7);
1521       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
1522 
1523       __ srdi(tmp1, R5_ARG3, 3);
1524       __ andi_(R5_ARG3, R5_ARG3, 7);
1525       __ mtctr(tmp1);
1526 
1527      if (!VM_Version::has_vsx()) {
1528 
1529       __ bind(l_6);
1530       // Use unrolled version for mass copying (copy 8 elements a time).
1531       // A load feeding a store has zero latency on Power6, but not on Power5.
1532       // Therefore, the following sequence works well on both.
1533       __ ld(tmp1, 0, R3_ARG1);
1534       __ ld(tmp2, 8, R3_ARG1);
1535       __ ld(tmp3, 16, R3_ARG1);
1536       __ ld(tmp4, 24, R3_ARG1);
1537       __ std(tmp1, 0, R4_ARG2);
1538       __ std(tmp2, 8, R4_ARG2);
1539       __ std(tmp3, 16, R4_ARG2);
1540       __ std(tmp4, 24, R4_ARG2);
1541       __ addi(R3_ARG1, R3_ARG1, 32);
1542       __ addi(R4_ARG2, R4_ARG2, 32);
1543       __ bdnz(l_6);
1544 
1545     } else { // Processor supports VSX, so use it to mass copy.
1546 
1547       // Prefetch the data into the L2 cache.
1548       __ dcbt(R3_ARG1, 0);
1549 
1550       // If supported set DSCR pre-fetch to deepest.
1551       if (VM_Version::has_mfdscr()) {
1552         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1553         __ mtdscr(tmp2);
1554       }
1555 
1556       __ li(tmp1, 16);
1557 
1558       // Backbranch target aligned to 32 bytes rather than 16 because the
1559       // loop contains fewer than 8 instructions and thus fits inside a
1560       // single i-cache sector.
1561       __ align(32);
1562 
1563       __ bind(l_7);
1564       // Use loop with VSX load/store instructions to
1565       // copy 8 elements a time.
1566       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1567       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1568       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1569       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1570       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1571       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1572       __ bdnz(l_7);                        // Dec CTR and loop if not zero.
1573 
1574       // Restore DSCR pre-fetch value.
1575       if (VM_Version::has_mfdscr()) {
1576         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1577         __ mtdscr(tmp2);
1578       }
1579 
1580     } // VSX
1581    } // FasterArrayCopy
1582 
1583     // copy 1 element at a time
1584     __ bind(l_2);
1585     __ cmpwi(CCR0, R5_ARG3, 0);
1586     __ beq(CCR0, l_1);
1587 
1588     { // FasterArrayCopy
1589       __ mtctr(R5_ARG3);
1590       __ addi(R3_ARG1, R3_ARG1, -4);
1591       __ addi(R4_ARG2, R4_ARG2, -4);
1592 
1593       __ bind(l_3);
1594       __ lwzu(tmp2, 4, R3_ARG1);
1595       __ stwu(tmp2, 4, R4_ARG2);
1596       __ bdnz(l_3);
1597     }
1598 
1599     __ bind(l_1);
1600     return;
1601   }
1602 
1603   // Generate stub for disjoint int copy.  If "aligned" is true, the
1604   // "from" and "to" addresses are assumed to be heapword aligned.
1605   //
1606   // Arguments for generated stub:
1607   //      from:  R3_ARG1
1608   //      to:    R4_ARG2
1609   //      count: R5_ARG3 treated as signed
1610   //
1611   address generate_disjoint_int_copy(bool aligned, const char * name) {
1612     StubCodeMark mark(this, "StubRoutines", name);
1613     address start = __ function_entry();
1614     assert_positive_int(R5_ARG3);
1615     {
1616       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1617       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1618       generate_disjoint_int_copy_core(aligned);
1619     }
1620     __ li(R3_RET, 0); // return 0
1621     __ blr();
1622     return start;
1623   }
1624 
1625   // Generate core code for conjoint int copy (and oop copy on
1626   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
1627   // are assumed to be heapword aligned.
1628   //
1629   // Arguments:
1630   //      from:  R3_ARG1
1631   //      to:    R4_ARG2
1632   //      count: R5_ARG3 treated as signed
1633   //
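       // Illustrative C sketch of the reverse copy performed here (not the
       // emitted code; the real loop moves 32 bytes per iteration with
       // ld/std or VSX):
       //
       //   void conjoint_jint_copy(const jint* from, jint* to, ssize_t count) {
       //     from += count; to += count;           // start past the end
       //     while (count-- > 0) *--to = *--from;  // copy high to low addresses
       //   }
       //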
1634   void generate_conjoint_int_copy_core(bool aligned) {
1635     // Do reverse copy.  We assume the case of actual overlap is rare enough
1636     // that we don't have to optimize it.
1637 
1638     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1639 
1640     Register tmp1 = R6_ARG4;
1641     Register tmp2 = R7_ARG5;
1642     Register tmp3 = R8_ARG6;
1643     Register tmp4 = R0;
1644 
1645     VectorSRegister tmp_vsr1  = VSR1;
1646     VectorSRegister tmp_vsr2  = VSR2;
1647 
1648     { // FasterArrayCopy
1649       __ cmpwi(CCR0, R5_ARG3, 0);
1650       __ beq(CCR0, l_6);
1651 
1652       __ sldi(R5_ARG3, R5_ARG3, 2);
1653       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1654       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1655       __ srdi(R5_ARG3, R5_ARG3, 2);
1656 
1657       if (!aligned) {
1658         // check if arrays have same alignment mod 8.
1659         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1660         __ andi_(R0, tmp1, 7);
1661         // Not the same alignment, but ld and std just need to be 4 byte aligned.
1662         __ bne(CCR0, l_7); // to OR from is 8 byte aligned -> copy 2 at a time
1663 
1664         // copy 1 element to align to and from on an 8 byte boundary
1665         __ andi_(R0, R3_ARG1, 7);
1666         __ beq(CCR0, l_7);
1667 
1668         __ addi(R3_ARG1, R3_ARG1, -4);
1669         __ addi(R4_ARG2, R4_ARG2, -4);
1670         __ addi(R5_ARG3, R5_ARG3, -1);
1671         __ lwzx(tmp2, R3_ARG1);
1672         __ stwx(tmp2, R4_ARG2);
1673         __ bind(l_7);
1674       }
1675 
1676       __ cmpwi(CCR0, R5_ARG3, 7);
1677       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
1678 
1679       __ srdi(tmp1, R5_ARG3, 3);
1680       __ andi(R5_ARG3, R5_ARG3, 7);
1681       __ mtctr(tmp1);
1682 
1683      if (!VM_Version::has_vsx()) {
1684       __ bind(l_4);
1685       // Use unrolled version for mass copying (copy 8 elements a time).
1686       // A load feeding a store has zero latency on Power6, but not on Power5.
1687       // Therefore, the following sequence works well on both.
1688       __ addi(R3_ARG1, R3_ARG1, -32);
1689       __ addi(R4_ARG2, R4_ARG2, -32);
1690       __ ld(tmp4, 24, R3_ARG1);
1691       __ ld(tmp3, 16, R3_ARG1);
1692       __ ld(tmp2, 8, R3_ARG1);
1693       __ ld(tmp1, 0, R3_ARG1);
1694       __ std(tmp4, 24, R4_ARG2);
1695       __ std(tmp3, 16, R4_ARG2);
1696       __ std(tmp2, 8, R4_ARG2);
1697       __ std(tmp1, 0, R4_ARG2);
1698       __ bdnz(l_4);
1699      } else {  // Processor supports VSX, so use it to mass copy.
1700       // Prefetch the data into the L2 cache.
1701       __ dcbt(R3_ARG1, 0);
1702 
1703       // If supported set DSCR pre-fetch to deepest.
1704       if (VM_Version::has_mfdscr()) {
1705         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1706         __ mtdscr(tmp2);
1707       }
1708 
1709       __ li(tmp1, 16);
1710 
1711       // Backbranch target aligned to 32 bytes rather than 16 because the
1712       // loop contains fewer than 8 instructions and thus fits inside a
1713       // single i-cache sector.
1714       __ align(32);
1715 
1716       __ bind(l_4);
1717       // Use loop with VSX load/store instructions to
1718       // copy 8 elements a time.
1719       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1720       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1721       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1722       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1723       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1724       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1725       __ bdnz(l_4);
1726 
1727       // Restore DSCR pre-fetch value.
1728       if (VM_Version::has_mfdscr()) {
1729         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1730         __ mtdscr(tmp2);
1731       }
1732      }
1733 
1734       __ cmpwi(CCR0, R5_ARG3, 0);
1735       __ beq(CCR0, l_6);
1736 
1737       __ bind(l_5);
1738       __ mtctr(R5_ARG3);
1739       __ bind(l_3);
1740       __ lwz(R0, -4, R3_ARG1);
1741       __ stw(R0, -4, R4_ARG2);
1742       __ addi(R3_ARG1, R3_ARG1, -4);
1743       __ addi(R4_ARG2, R4_ARG2, -4);
1744       __ bdnz(l_3);
1745 
1746       __ bind(l_6);
1747     }
1748   }
1749 
1750   // Generate stub for conjoint int copy.  If "aligned" is true, the
1751   // "from" and "to" addresses are assumed to be heapword aligned.
1752   //
1753   // Arguments for generated stub:
1754   //      from:  R3_ARG1
1755   //      to:    R4_ARG2
1756   //      count: R5_ARG3 treated as signed
1757   //
1758   address generate_conjoint_int_copy(bool aligned, const char * name) {
1759     StubCodeMark mark(this, "StubRoutines", name);
1760     address start = __ function_entry();
1761     assert_positive_int(R5_ARG3);
1762     address nooverlap_target = aligned ?
1763       STUB_ENTRY(arrayof_jint_disjoint_arraycopy) :
1764       STUB_ENTRY(jint_disjoint_arraycopy);
1765 
1766     array_overlap_test(nooverlap_target, 2);
1767     {
1768       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1769       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1770       generate_conjoint_int_copy_core(aligned);
1771     }
1772 
1773     __ li(R3_RET, 0); // return 0
1774     __ blr();
1775 
1776     return start;
1777   }
1778 
1779   // Generate core code for disjoint long copy (and oop copy on
1780   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1781   // are assumed to be heapword aligned.
1782   //
1783   // Arguments:
1784   //      from:  R3_ARG1
1785   //      to:    R4_ARG2
1786   //      count: R5_ARG3 treated as signed
1787   //
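       // Illustrative C sketch (not the emitted code): 4 jlongs (32 bytes)
       // are copied per iteration of the main loop, with a one-element tail
       // loop for the remaining 0..3 elements:
       //
       //   void disjoint_jlong_copy(const jlong* from, jlong* to, ssize_t count) {
       //     for (; count >= 4; count -= 4, from += 4, to += 4) {
       //       to[0] = from[0]; to[1] = from[1]; to[2] = from[2]; to[3] = from[3];
       //     }
       //     while (count-- > 0) *to++ = *from++;
       //   }
       //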
1788   void generate_disjoint_long_copy_core(bool aligned) {
1789     Register tmp1 = R6_ARG4;
1790     Register tmp2 = R7_ARG5;
1791     Register tmp3 = R8_ARG6;
1792     Register tmp4 = R0;
1793 
1794     Label l_1, l_2, l_3, l_4, l_5;
1795 
1796     VectorSRegister tmp_vsr1  = VSR1;
1797     VectorSRegister tmp_vsr2  = VSR2;
1798 
1799     { // FasterArrayCopy
1800       __ cmpwi(CCR0, R5_ARG3, 3);
1801       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
1802 
1803       __ srdi(tmp1, R5_ARG3, 2);
1804       __ andi_(R5_ARG3, R5_ARG3, 3);
1805       __ mtctr(tmp1);
1806 
1807     if (!VM_Version::has_vsx()) {
1808       __ bind(l_4);
1809       // Use unrolled version for mass copying (copy 4 elements a time).
1810       // A load feeding a store has zero latency on Power6, but not on Power5.
1811       // Therefore, the following sequence works well on both.
1812       __ ld(tmp1, 0, R3_ARG1);
1813       __ ld(tmp2, 8, R3_ARG1);
1814       __ ld(tmp3, 16, R3_ARG1);
1815       __ ld(tmp4, 24, R3_ARG1);
1816       __ std(tmp1, 0, R4_ARG2);
1817       __ std(tmp2, 8, R4_ARG2);
1818       __ std(tmp3, 16, R4_ARG2);
1819       __ std(tmp4, 24, R4_ARG2);
1820       __ addi(R3_ARG1, R3_ARG1, 32);
1821       __ addi(R4_ARG2, R4_ARG2, 32);
1822       __ bdnz(l_4);
1823 
1824     } else { // Processor supports VSX, so use it to mass copy.
1825 
1826       // Prefetch the data into the L2 cache.
1827       __ dcbt(R3_ARG1, 0);
1828 
1829       // If supported set DSCR pre-fetch to deepest.
1830       if (VM_Version::has_mfdscr()) {
1831         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1832         __ mtdscr(tmp2);
1833       }
1834 
1835       __ li(tmp1, 16);
1836 
1837       // Backbranch target aligned to 32 bytes rather than 16 because the
1838       // loop contains fewer than 8 instructions and thus fits inside a
1839       // single i-cache sector.
1840       __ align(32);
1841 
1842       __ bind(l_5);
1843       // Use loop with VSX load/store instructions to
1844       // copy 4 elements a time.
1845       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1846       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1847       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1848       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1849       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1850       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1851       __ bdnz(l_5);                        // Dec CTR and loop if not zero.
1852 
1853       // Restore DSCR pre-fetch value.
1854       if (VM_Version::has_mfdscr()) {
1855         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1856         __ mtdscr(tmp2);
1857       }
1858 
1859     } // VSX
1860    } // FasterArrayCopy
1861 
1862     // copy 1 element at a time
1863     __ bind(l_3);
1864     __ cmpwi(CCR0, R5_ARG3, 0);
1865     __ beq(CCR0, l_1);
1866 
1867     { // FasterArrayCopy
1868       __ mtctr(R5_ARG3);
1869       __ addi(R3_ARG1, R3_ARG1, -8);
1870       __ addi(R4_ARG2, R4_ARG2, -8);
1871 
1872       __ bind(l_2);
1873       __ ldu(R0, 8, R3_ARG1);
1874       __ stdu(R0, 8, R4_ARG2);
1875       __ bdnz(l_2);
1876 
1877     }
1878     __ bind(l_1);
1879   }
1880 
1881   // Generate stub for disjoint long copy.  If "aligned" is true, the
1882   // "from" and "to" addresses are assumed to be heapword aligned.
1883   //
1884   // Arguments for generated stub:
1885   //      from:  R3_ARG1
1886   //      to:    R4_ARG2
1887   //      count: R5_ARG3 treated as signed
1888   //
1889   address generate_disjoint_long_copy(bool aligned, const char * name) {
1890     StubCodeMark mark(this, "StubRoutines", name);
1891     address start = __ function_entry();
1892     assert_positive_int(R5_ARG3);
1893     {
1894       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1895       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1896       generate_disjoint_long_copy_core(aligned);
1897     }
1898     __ li(R3_RET, 0); // return 0
1899     __ blr();
1900 
1901     return start;
1902   }
1903 
1904   // Generate core code for conjoint long copy (and oop copy on
1905   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1906   // are assumed to be heapword aligned.
1907   //
1908   // Arguments:
1909   //      from:  R3_ARG1
1910   //      to:    R4_ARG2
1911   //      count: R5_ARG3 treated as signed
1912   //
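       // The reverse copy below first repositions both pointers past the end
       // of the copied region, in effect (illustrative sketch only):
       //
       //   from += count;   // done below in bytes via the sldi/add/srdi sequence
       //   to   += count;
       //
       // and then moves 4 jlongs (32 bytes) per iteration towards lower
       // addresses, with a one-element tail loop, so a destination that
       // overlaps at a higher address is handled correctly.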
1913   void generate_conjoint_long_copy_core(bool aligned) {
1914     Register tmp1 = R6_ARG4;
1915     Register tmp2 = R7_ARG5;
1916     Register tmp3 = R8_ARG6;
1917     Register tmp4 = R0;
1918 
1919     VectorSRegister tmp_vsr1  = VSR1;
1920     VectorSRegister tmp_vsr2  = VSR2;
1921 
1922     Label l_1, l_2, l_3, l_4, l_5;
1923 
1924     __ cmpwi(CCR0, R5_ARG3, 0);
1925     __ beq(CCR0, l_1);
1926 
1927     { // FasterArrayCopy
1928       __ sldi(R5_ARG3, R5_ARG3, 3);
1929       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1930       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1931       __ srdi(R5_ARG3, R5_ARG3, 3);
1932 
1933       __ cmpwi(CCR0, R5_ARG3, 3);
1934       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
1935 
1936       __ srdi(tmp1, R5_ARG3, 2);
1937       __ andi(R5_ARG3, R5_ARG3, 3);
1938       __ mtctr(tmp1);
1939 
1940      if (!VM_Version::has_vsx()) {
1941       __ bind(l_4);
1942       // Use unrolled version for mass copying (copy 4 elements a time).
1943       // A load feeding a store has zero latency on Power6, but not on Power5.
1944       // Therefore, the following sequence works well on both.
1945       __ addi(R3_ARG1, R3_ARG1, -32);
1946       __ addi(R4_ARG2, R4_ARG2, -32);
1947       __ ld(tmp4, 24, R3_ARG1);
1948       __ ld(tmp3, 16, R3_ARG1);
1949       __ ld(tmp2, 8, R3_ARG1);
1950       __ ld(tmp1, 0, R3_ARG1);
1951       __ std(tmp4, 24, R4_ARG2);
1952       __ std(tmp3, 16, R4_ARG2);
1953       __ std(tmp2, 8, R4_ARG2);
1954       __ std(tmp1, 0, R4_ARG2);
1955       __ bdnz(l_4);
1956      } else { // Processor supports VSX, so use it to mass copy.
1957       // Prefetch the data into the L2 cache.
1958       __ dcbt(R3_ARG1, 0);
1959 
1960       // If supported set DSCR pre-fetch to deepest.
1961       if (VM_Version::has_mfdscr()) {
1962         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1963         __ mtdscr(tmp2);
1964       }
1965 
1966       __ li(tmp1, 16);
1967 
1968       // Backbranch target aligned to 32 bytes rather than 16 because the
1969       // loop contains fewer than 8 instructions and thus fits inside a
1970       // single i-cache sector.
1971       __ align(32);
1972 
1973       __ bind(l_4);
1974       // Use loop with VSX load/store instructions to
1975       // copy 4 elements a time.
1976       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1977       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1978       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1979       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1980       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1981       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1982       __ bdnz(l_4);
1983 
1984       // Restore DSCR pre-fetch value.
1985       if (VM_Version::has_mfdscr()) {
1986         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1987         __ mtdscr(tmp2);
1988       }
1989      }
1990 
1991       __ cmpwi(CCR0, R5_ARG3, 0);
1992       __ beq(CCR0, l_1);
1993 
1994       __ bind(l_5);
1995       __ mtctr(R5_ARG3);
1996       __ bind(l_3);
1997       __ ld(R0, -8, R3_ARG1);
1998       __ std(R0, -8, R4_ARG2);
1999       __ addi(R3_ARG1, R3_ARG1, -8);
2000       __ addi(R4_ARG2, R4_ARG2, -8);
2001       __ bdnz(l_3);
2002 
2003     }
2004     __ bind(l_1);
2005   }
2006 
2007   // Generate stub for conjoint long copy.  If "aligned" is true, the
2008   // "from" and "to" addresses are assumed to be heapword aligned.
2009   //
2010   // Arguments for generated stub:
2011   //      from:  R3_ARG1
2012   //      to:    R4_ARG2
2013   //      count: R5_ARG3 treated as signed
2014   //
2015   address generate_conjoint_long_copy(bool aligned, const char * name) {
2016     StubCodeMark mark(this, "StubRoutines", name);
2017     address start = __ function_entry();
2018     assert_positive_int(R5_ARG3);
2019     address nooverlap_target = aligned ?
2020       STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
2021       STUB_ENTRY(jlong_disjoint_arraycopy);
2022 
2023     array_overlap_test(nooverlap_target, 3);
2024     {
2025       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
2026       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
2027       generate_conjoint_long_copy_core(aligned);
2028     }
2029     __ li(R3_RET, 0); // return 0
2030     __ blr();
2031 
2032     return start;
2033   }
2034 
2035   // Generate stub for conjoint oop copy.  If "aligned" is true, the
2036   // "from" and "to" addresses are assumed to be heapword aligned.
2037   //
2038   // Arguments for generated stub:
2039   //      from:  R3_ARG1
2040   //      to:    R4_ARG2
2041   //      count: R5_ARG3 treated as signed
2042   //      dest_uninitialized: G1 support
2043   //
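       // Conceptually (illustrative sketch only) the stub is the conjoint
       // int or long copy wrapped in the GC barrier hooks:
       //
       //   bs->arraycopy_prologue(...);    // e.g. G1 pre-barrier
       //   if (UseCompressedOops) {
       //     conjoint 32-bit element copy  // narrowOop, 4 bytes
       //   } else {
       //     conjoint 64-bit element copy  // oop, 8 bytes
       //   }
       //   bs->arraycopy_epilogue(...);    // e.g. card marking / post-barrier
       //   return 0;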
2044   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2045     StubCodeMark mark(this, "StubRoutines", name);
2046 
2047     address start = __ function_entry();
2048     assert_positive_int(R5_ARG3);
2049     address nooverlap_target = aligned ?
2050       STUB_ENTRY(arrayof_oop_disjoint_arraycopy) :
2051       STUB_ENTRY(oop_disjoint_arraycopy);
2052 
2053     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2054     if (dest_uninitialized) {
2055       decorators |= IS_DEST_UNINITIALIZED;
2056     }
2057     if (aligned) {
2058       decorators |= ARRAYCOPY_ALIGNED;
2059     }
2060 
2061     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2062     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2063 
2064     if (UseCompressedOops) {
2065       array_overlap_test(nooverlap_target, 2);
2066       generate_conjoint_int_copy_core(aligned);
2067     } else {
2068       array_overlap_test(nooverlap_target, 3);
2069       generate_conjoint_long_copy_core(aligned);
2070     }
2071 
2072     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2073     __ li(R3_RET, 0); // return 0
2074     __ blr();
2075     return start;
2076   }
2077 
2078   // Generate stub for disjoint oop copy.  If "aligned" is true, the
2079   // "from" and "to" addresses are assumed to be heapword aligned.
2080   //
2081   // Arguments for generated stub:
2082   //      from:  R3_ARG1
2083   //      to:    R4_ARG2
2084   //      count: R5_ARG3 treated as signed
2085   //      dest_uninitialized: G1 support
2086   //
2087   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2088     StubCodeMark mark(this, "StubRoutines", name);
2089     address start = __ function_entry();
2090     assert_positive_int(R5_ARG3);
2091 
2092     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2093     if (dest_uninitialized) {
2094       decorators |= IS_DEST_UNINITIALIZED;
2095     }
2096     if (aligned) {
2097       decorators |= ARRAYCOPY_ALIGNED;
2098     }
2099 
2100     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2101     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2102 
2103     if (UseCompressedOops) {
2104       generate_disjoint_int_copy_core(aligned);
2105     } else {
2106       generate_disjoint_long_copy_core(aligned);
2107     }
2108 
2109     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2110     __ li(R3_RET, 0); // return 0
2111     __ blr();
2112 
2113     return start;
2114   }
2115 
2116 
2117   // Helper for generating a dynamic type check.
2118   // Smashes only the given temp registers.
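       // In pseudocode (illustrative only) the emitted check is roughly:
       //
       //   if (fast_path_subtype_check(sub_klass, super_klass, super_check_offset))
       //     goto L_success;   // hit via the super check offset / cache
       //   if (slow_path_subtype_check(sub_klass, super_klass))
       //     goto L_success;   // found by scanning the secondary supers
       //   // otherwise fall through to L_miss (failure)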
2119   void generate_type_check(Register sub_klass,
2120                            Register super_check_offset,
2121                            Register super_klass,
2122                            Register temp,
2123                            Label& L_success) {
2124     assert_different_registers(sub_klass, super_check_offset, super_klass);
2125 
2126     BLOCK_COMMENT("type_check:");
2127 
2128     Label L_miss;
2129 
2130     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, NULL,
2131                                      super_check_offset);
2132     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success, NULL);
2133 
2134     // Fall through on failure!
2135     __ bind(L_miss);
2136   }
2137 
2138 
2139   //  Generate stub for checked oop copy.
2140   //
2141   // Arguments for generated stub:
2142   //      from:  R3
2143   //      to:    R4
2144   //      count: R5 treated as signed
2145   //      ckoff: R6 (super_check_offset)
2146   //      ckval: R7 (super_klass)
2147   //      ret:   R3 zero for success; (-1^K) where K is partial transfer count
2148   //
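       // Illustrative sketch (not the emitted code) of the checked copy loop
       // and its return value convention; 'subtype_of' below stands for the
       // generate_type_check() logic:
       //
       //   ssize_t remain = count;
       //   for (; remain > 0; remain--) {
       //     oop element = *from++;
       //     if (element != NULL && !subtype_of(element->klass(), ckval)) break;
       //     *to++ = element;                  // NULLs are stored without a check
       //   }
       //   size_t copied = count - remain;
       //   return (remain == 0) ? 0 : ~copied; // 0 on success, -1^K otherwise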
2149   address generate_checkcast_copy(const char *name, bool dest_uninitialized) {
2150 
2151     const Register R3_from   = R3_ARG1;      // source array address
2152     const Register R4_to     = R4_ARG2;      // destination array address
2153     const Register R5_count  = R5_ARG3;      // elements count
2154     const Register R6_ckoff  = R6_ARG4;      // super_check_offset
2155     const Register R7_ckval  = R7_ARG5;      // super_klass
2156 
2157     const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
2158     const Register R9_remain = R9_ARG7;      // loop var, with stride -1
2159     const Register R10_oop   = R10_ARG8;     // actual oop copied
2160     const Register R11_klass = R11_scratch1; // oop._klass
2161     const Register R12_tmp   = R12_scratch2;
2162 
2163     const Register R2_minus1 = R2;
2164 
2165     //__ align(CodeEntryAlignment);
2166     StubCodeMark mark(this, "StubRoutines", name);
2167     address start = __ function_entry();
2168 
2169     // Assert that int is 64 bit sign extended and arrays are not conjoint.
2170 #ifdef ASSERT
2171     {
2172     assert_positive_int(R5_ARG3);
2173     const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2174     Label no_overlap;
2175     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2176     __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2177     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2178     __ cmpld(CCR1, tmp1, tmp2);
2179     __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
2180     // Overlaps if Src before dst and distance smaller than size.
2181     // Branch to forward copy routine otherwise.
2182     __ blt(CCR0, no_overlap);
2183     __ stop("overlap in checkcast_copy", 0x9543);
2184     __ bind(no_overlap);
2185     }
2186 #endif
2187 
2188     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2189     if (dest_uninitialized) {
2190       decorators |= IS_DEST_UNINITIALIZED;
2191     }
2192 
2193     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2194     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
2195 
2196     //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2197 
2198     Label load_element, store_element, store_null, success, do_epilogue;
2199     __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2200     __ li(R8_offset, 0);                   // Offset from start of arrays.
2201     __ li(R2_minus1, -1);
2202     __ bne(CCR0, load_element);
2203 
2204     // Empty array: Nothing to do.
2205     __ li(R3_RET, 0);           // Return 0 on (trivial) success.
2206     __ blr();
2207 
2208     // ======== begin loop ========
2209     // (Entry is load_element.)
2210     __ align(OptoLoopAlignment);
2211     __ bind(store_element);
2212     if (UseCompressedOops) {
2213       __ encode_heap_oop_not_null(R10_oop);
2214       __ bind(store_null);
2215       __ stw(R10_oop, R8_offset, R4_to);
2216     } else {
2217       __ bind(store_null);
2218       __ std(R10_oop, R8_offset, R4_to);
2219     }
2220 
2221     __ addi(R8_offset, R8_offset, heapOopSize);   // Step to next offset.
2222     __ add_(R9_remain, R2_minus1, R9_remain);     // Decrement the count.
2223     __ beq(CCR0, success);
2224 
2225     // ======== loop entry is here ========
2226     __ bind(load_element);
2227     __ load_heap_oop(R10_oop, R8_offset, R3_from, R12_tmp, noreg, false, AS_RAW, &store_null);
2228 
2229     __ load_klass(R11_klass, R10_oop); // Query the object klass.
2230 
2231     generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp,
2232                         // Branch to this on success:
2233                         store_element);
2234     // ======== end loop ========
2235 
2236     // It was a real error; we must depend on the caller to finish the job.
2237     // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2238     // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2239     // and report their number to the caller.
2240     __ subf_(R5_count, R9_remain, R5_count);
2241     __ nand(R3_RET, R5_count, R5_count);   // report (-1^K) to caller
2242     __ bne(CCR0, do_epilogue);
2243     __ blr();
2244 
2245     __ bind(success);
2246     __ li(R3_RET, 0);
2247 
2248     __ bind(do_epilogue);
2249     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
2250 
2251     __ blr();
2252     return start;
2253   }
2254 
2255 
2256   //  Generate 'unsafe' array copy stub.
2257   //  Though just as safe as the other stubs, it takes an unscaled
2258   //  size_t argument instead of an element count.
2259   //
2260   // Arguments for generated stub:
2261   //      from:  R3
2262   //      to:    R4
2263   //      count: R5 byte count, treated as ssize_t, can be zero
2264   //
2265   // Examines the alignment of the operands and dispatches
2266   // to a long, int, short, or byte copy loop.
2267   //
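       // Dispatch sketch (illustrative only; the *_copy calls stand for the
       // entry points passed in below): the element size is inferred from
       // the common low alignment bits of both addresses and the byte count:
       //
       //   uintptr_t bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
       //   if      ((bits & (BytesPerLong  - 1)) == 0) jlong_copy (from, to, count >> LogBytesPerLong);
       //   else if ((bits & (BytesPerInt   - 1)) == 0) jint_copy  (from, to, count >> LogBytesPerInt);
       //   else if ((bits & (BytesPerShort - 1)) == 0) jshort_copy(from, to, count >> LogBytesPerShort);
       //   else                                        jbyte_copy (from, to, count);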
2268   address generate_unsafe_copy(const char* name,
2269                                address byte_copy_entry,
2270                                address short_copy_entry,
2271                                address int_copy_entry,
2272                                address long_copy_entry) {
2273 
2274     const Register R3_from   = R3_ARG1;      // source array address
2275     const Register R4_to     = R4_ARG2;      // destination array address
2276     const Register R5_count  = R5_ARG3;      // elements count (as long on PPC64)
2277 
2278     const Register R6_bits   = R6_ARG4;      // test copy of low bits
2279     const Register R7_tmp    = R7_ARG5;
2280 
2281     //__ align(CodeEntryAlignment);
2282     StubCodeMark mark(this, "StubRoutines", name);
2283     address start = __ function_entry();
2284 
2285     // Bump this on entry, not on exit:
2286     //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2287 
2288     Label short_copy, int_copy, long_copy;
2289 
2290     __ orr(R6_bits, R3_from, R4_to);
2291     __ orr(R6_bits, R6_bits, R5_count);
2292     __ andi_(R0, R6_bits, (BytesPerLong-1));
2293     __ beq(CCR0, long_copy);
2294 
2295     __ andi_(R0, R6_bits, (BytesPerInt-1));
2296     __ beq(CCR0, int_copy);
2297 
2298     __ andi_(R0, R6_bits, (BytesPerShort-1));
2299     __ beq(CCR0, short_copy);
2300 
2301     // byte_copy:
2302     __ b(byte_copy_entry);
2303 
2304     __ bind(short_copy);
2305     __ srwi(R5_count, R5_count, LogBytesPerShort);
2306     __ b(short_copy_entry);
2307 
2308     __ bind(int_copy);
2309     __ srwi(R5_count, R5_count, LogBytesPerInt);
2310     __ b(int_copy_entry);
2311 
2312     __ bind(long_copy);
2313     __ srwi(R5_count, R5_count, LogBytesPerLong);
2314     __ b(long_copy_entry);
2315 
2316     return start;
2317   }
2318 
2319 
2320   // Perform range checks on the proposed arraycopy.
2321   // Kills the two temps, but nothing else.
2322   // Also, clean the sign bits of src_pos and dst_pos.
2323   void arraycopy_range_checks(Register src,     // source array oop
2324                               Register src_pos, // source position
2325                               Register dst,     // destination array oop
2326                               Register dst_pos, // destination position
2327                               Register length,  // length of copy
2328                               Register temp1, Register temp2,
2329                               Label& L_failed) {
2330     BLOCK_COMMENT("arraycopy_range_checks:");
2331 
2332     const Register array_length = temp1;  // scratch
2333     const Register end_pos      = temp2;  // scratch
2334 
2335     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2336     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2337     __ add(end_pos, src_pos, length);  // src_pos + length
2338     __ cmpd(CCR0, end_pos, array_length);
2339     __ bgt(CCR0, L_failed);
2340 
2341     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2342     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
2343     __ add(end_pos, dst_pos, length);  // dst_pos + length
2344     __ cmpd(CCR0, end_pos, array_length);
2345     __ bgt(CCR0, L_failed);
2346 
2347     BLOCK_COMMENT("arraycopy_range_checks done");
2348   }
2349 
2350 
2351   //
2352   //  Generate generic array copy stubs
2353   //
2354   //  Input:
2355   //    R3    -  src oop
2356   //    R4    -  src_pos
2357   //    R5    -  dst oop
2358   //    R6    -  dst_pos
2359   //    R7    -  element count
2360   //
2361   //  Output:
2362   //    R3 ==  0  -  success
2363   //    R3 == -1  -  need to call System.arraycopy
2364   //
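       // Overall flow in pseudocode (illustrative only; the exact conditions
       // are listed again right before the argument checks below):
       //
       //   if (src == NULL || dst == NULL ||
       //       src_pos < 0 || dst_pos < 0 || length < 0)   return -1;
       //   if (src is an objArray) {
       //     if (dst is not an objArray)                    return -1;
       //     if (src_klass == dst_klass ||
       //         src_klass is a subtype of dst_klass)   -> oop copy stub (no element checks)
       //     else                                       -> checkcast copy stub (per-element check)
       //   } else if (src_klass == dst_klass && src is a typeArray) {
       //     range checks; scale positions by the element size from the
       //     layout helper; -> byte/short/int/long copy stub
       //   } else {
       //     return -1;
       //   }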
2365   address generate_generic_copy(const char *name,
2366                                 address entry_jbyte_arraycopy,
2367                                 address entry_jshort_arraycopy,
2368                                 address entry_jint_arraycopy,
2369                                 address entry_oop_arraycopy,
2370                                 address entry_disjoint_oop_arraycopy,
2371                                 address entry_jlong_arraycopy,
2372                                 address entry_checkcast_arraycopy) {
2373     Label L_failed, L_objArray;
2374 
2375     // Input registers
2376     const Register src       = R3_ARG1;  // source array oop
2377     const Register src_pos   = R4_ARG2;  // source position
2378     const Register dst       = R5_ARG3;  // destination array oop
2379     const Register dst_pos   = R6_ARG4;  // destination position
2380     const Register length    = R7_ARG5;  // elements count
2381 
2382     // registers used as temp
2383     const Register src_klass = R8_ARG6;  // source array klass
2384     const Register dst_klass = R9_ARG7;  // destination array klass
2385     const Register lh        = R10_ARG8; // layout helper
2386     const Register temp      = R2;
2387 
2388     //__ align(CodeEntryAlignment);
2389     StubCodeMark mark(this, "StubRoutines", name);
2390     address start = __ function_entry();
2391 
2392     // Bump this on entry, not on exit:
2393     //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2394 
2395     // In principle, the int arguments could be dirty.
2396 
2397     //-----------------------------------------------------------------------
2398     // Assembler stubs will be used for this call to arraycopy
2399     // if the following conditions are met:
2400     //
2401     // (1) src and dst must not be null.
2402     // (2) src_pos must not be negative.
2403     // (3) dst_pos must not be negative.
2404     // (4) length  must not be negative.
2405     // (5) src klass and dst klass should be the same and not NULL.
2406     // (6) src and dst should be arrays.
2407     // (7) src_pos + length must not exceed length of src.
2408     // (8) dst_pos + length must not exceed length of dst.
2409     BLOCK_COMMENT("arraycopy initial argument checks");
2410 
2411     __ cmpdi(CCR1, src, 0);      // if (src == NULL) return -1;
2412     __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2413     __ cmpdi(CCR5, dst, 0);      // if (dst == NULL) return -1;
2414     __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
2415     __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
2416     __ cror(CCR5, Assembler::equal, CCR0, Assembler::less);
2417     __ extsw_(length, length);   // if (length < 0) return -1;
2418     __ cror(CCR1, Assembler::equal, CCR5, Assembler::equal);
2419     __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
2420     __ beq(CCR1, L_failed);
2421 
2422     BLOCK_COMMENT("arraycopy argument klass checks");
2423     __ load_klass(src_klass, src);
2424     __ load_klass(dst_klass, dst);
2425 
2426     // Load layout helper
2427     //
2428     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2429     // 32        30    24            16              8     2                 0
2430     //
2431     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2432     //
2433 
2434     int lh_offset = in_bytes(Klass::layout_helper_offset());
2435 
2436     // Load the layout helper as a 32-bit signed value.
2437     __ lwz(lh, lh_offset, src_klass);
2438 
2439     // Handle objArrays completely differently...
2440     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2441     __ load_const_optimized(temp, objArray_lh, R0);
2442     __ cmpw(CCR0, lh, temp);
2443     __ beq(CCR0, L_objArray);
2444 
2445     __ cmpd(CCR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
2446     __ cmpwi(CCR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2447 
2448     __ crnand(CCR5, Assembler::equal, CCR6, Assembler::less);
2449     __ beq(CCR5, L_failed);
2450 
2451     // At this point, it is known to be a typeArray (array_tag 0x3).
2452 #ifdef ASSERT
2453     { Label L;
2454       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2455       __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2456       __ cmpw(CCR0, lh, temp);
2457       __ bge(CCR0, L);
2458       __ stop("must be a primitive array");
2459       __ bind(L);
2460     }
2461 #endif
2462 
2463     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2464                            temp, dst_klass, L_failed);
2465 
2466     // TypeArrayKlass
2467     //
2468     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2469     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2470     //
2471 
2472     const Register offset = dst_klass;    // array offset
2473     const Register elsize = src_klass;    // log2 element size
2474 
2475     __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2476     __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2477     __ add(src, offset, src);       // src array offset
2478     __ add(dst, offset, dst);       // dst array offset
2479 
2480     // The next registers must be set before the jump to the corresponding stub.
2481     const Register from     = R3_ARG1;  // source array address
2482     const Register to       = R4_ARG2;  // destination array address
2483     const Register count    = R5_ARG3;  // elements count
2484 
2485     // 'from', 'to', 'count' registers should be set in this order
2486     // since they are the same as 'src', 'src_pos', 'dst'.
2487 
2488     BLOCK_COMMENT("scale indexes to element size");
2489     __ sld(src_pos, src_pos, elsize);
2490     __ sld(dst_pos, dst_pos, elsize);
2491     __ add(from, src_pos, src);  // src_addr
2492     __ add(to, dst_pos, dst);    // dst_addr
2493     __ mr(count, length);        // length
2494 
2495     BLOCK_COMMENT("choose copy loop based on element size");
2496     // Using conditional branches with range 32kB.
2497     const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CCR0, Assembler::equal);
2498     __ cmpwi(CCR0, elsize, 0);
2499     __ bc(bo, bi, entry_jbyte_arraycopy);
2500     __ cmpwi(CCR0, elsize, LogBytesPerShort);
2501     __ bc(bo, bi, entry_jshort_arraycopy);
2502     __ cmpwi(CCR0, elsize, LogBytesPerInt);
2503     __ bc(bo, bi, entry_jint_arraycopy);
2504 #ifdef ASSERT
2505     { Label L;
2506       __ cmpwi(CCR0, elsize, LogBytesPerLong);
2507       __ beq(CCR0, L);
2508       __ stop("must be long copy, but elsize is wrong");
2509       __ bind(L);
2510     }
2511 #endif
2512     __ b(entry_jlong_arraycopy);
2513 
2514     // ObjArrayKlass
2515   __ bind(L_objArray);
2516     // live at this point:  src_klass, dst_klass, src[_pos], dst[_pos], length
2517 
2518     Label L_disjoint_plain_copy, L_checkcast_copy;
2519     //  test array classes for subtyping
2520     __ cmpd(CCR0, src_klass, dst_klass);         // usual case is exact equality
2521     __ bne(CCR0, L_checkcast_copy);
2522 
2523     // Identically typed arrays can be copied without element-wise checks.
2524     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2525                            temp, lh, L_failed);
2526 
2527     __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2528     __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2529     __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2530     __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2531     __ add(from, src_pos, src);  // src_addr
2532     __ add(to, dst_pos, dst);    // dst_addr
2533     __ mr(count, length);        // length
2534     __ b(entry_oop_arraycopy);
2535 
2536   __ bind(L_checkcast_copy);
2537     // live at this point:  src_klass, dst_klass
2538     {
2539       // Before looking at dst.length, make sure dst is also an objArray.
2540       __ lwz(temp, lh_offset, dst_klass);
2541       __ cmpw(CCR0, lh, temp);
2542       __ bne(CCR0, L_failed);
2543 
2544       // It is safe to examine both src.length and dst.length.
2545       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2546                              temp, lh, L_failed);
2547 
2548       // Marshal the base address arguments now, freeing registers.
2549       __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2550       __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2551       __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2552       __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2553       __ add(from, src_pos, src);  // src_addr
2554       __ add(to, dst_pos, dst);    // dst_addr
2555       __ mr(count, length);        // length
2556 
2557       Register sco_temp = R6_ARG4;             // This register is free now.
2558       assert_different_registers(from, to, count, sco_temp,
2559                                  dst_klass, src_klass);
2560 
2561       // Generate the type check.
2562       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2563       __ lwz(sco_temp, sco_offset, dst_klass);
2564       generate_type_check(src_klass, sco_temp, dst_klass,
2565                           temp, L_disjoint_plain_copy);
2566 
2567       // Fetch destination element klass from the ObjArrayKlass header.
2568       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2569 
2570       // The checkcast_copy loop needs two extra arguments:
2571       __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
2572       __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
2573       __ b(entry_checkcast_arraycopy);
2574     }
2575 
2576     __ bind(L_disjoint_plain_copy);
2577     __ b(entry_disjoint_oop_arraycopy);
2578 
2579   __ bind(L_failed);
2580     __ li(R3_RET, -1); // return -1
2581     __ blr();
2582     return start;
2583   }
2584 
2585   // Arguments for generated stub:
2586   //   R3_ARG1   - source byte array address
2587   //   R4_ARG2   - destination byte array address
2588   //   R5_ARG3   - round key array
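       // Round structure implemented below (illustrative sketch only). The
       // expanded key length read from the int[] (44, 52 or 60 words)
       // selects 10, 12 or 14 rounds, as for AES-128/192/256:
       //
       //   state = plaintext ^ round_key[0];
       //   for (int r = 1; r < rounds; r++)
       //     state = vcipher(state, round_key[r]);             // full AES round
       //   ciphertext = vcipherlast(state, round_key[rounds]); // final round, no MixColumns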
2589   address generate_aescrypt_encryptBlock() {
2590     assert(UseAES, "need AES instructions");
2591     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2592 
2593     address start = __ function_entry();
2594 
2595     Label L_doLast;
2596 
2597     Register from           = R3_ARG1;  // source array address
2598     Register to             = R4_ARG2;  // destination array address
2599     Register key            = R5_ARG3;  // round key array
2600 
2601     Register keylen         = R8;
2602     Register temp           = R9;
2603     Register keypos         = R10;
2604     Register fifteen        = R12;
2605 
2606     VectorRegister vRet     = VR0;
2607 
2608     VectorRegister vKey1    = VR1;
2609     VectorRegister vKey2    = VR2;
2610     VectorRegister vKey3    = VR3;
2611     VectorRegister vKey4    = VR4;
2612 
2613     VectorRegister fromPerm = VR5;
2614     VectorRegister keyPerm  = VR6;
2615     VectorRegister toPerm   = VR7;
2616     VectorRegister fSplt    = VR8;
2617 
2618     VectorRegister vTmp1    = VR9;
2619     VectorRegister vTmp2    = VR10;
2620     VectorRegister vTmp3    = VR11;
2621     VectorRegister vTmp4    = VR12;
2622 
2623     __ li              (fifteen, 15);
2624 
2625     // load unaligned from[0-15] to vRet
2626     __ lvx             (vRet, from);
2627     __ lvx             (vTmp1, fifteen, from);
2628     __ lvsl            (fromPerm, from);
2629 #ifdef VM_LITTLE_ENDIAN
2630     __ vspltisb        (fSplt, 0x0f);
2631     __ vxor            (fromPerm, fromPerm, fSplt);
2632 #endif
2633     __ vperm           (vRet, vRet, vTmp1, fromPerm);
2634 
2635     // load keylen (44 or 52 or 60)
2636     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2637 
2638     // Compute the permute vector used when loading the round keys.
2639     __ load_perm       (keyPerm, key);
2640 #ifdef VM_LITTLE_ENDIAN
2641     __ vspltisb        (vTmp2, -16);
2642     __ vrld            (keyPerm, keyPerm, vTmp2);
2643     __ vrld            (keyPerm, keyPerm, vTmp2);
2644     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2645 #endif
2646 
2647     // load the 1st round key to vTmp1
2648     __ lvx             (vTmp1, key);
2649     __ li              (keypos, 16);
2650     __ lvx             (vKey1, keypos, key);
2651     __ vec_perm        (vTmp1, vKey1, keyPerm);
2652 
2653     // 1st round
2654     __ vxor            (vRet, vRet, vTmp1);
2655 
2656     // load the 2nd round key to vKey1
2657     __ li              (keypos, 32);
2658     __ lvx             (vKey2, keypos, key);
2659     __ vec_perm        (vKey1, vKey2, keyPerm);
2660 
2661     // load the 3rd round key to vKey2
2662     __ li              (keypos, 48);
2663     __ lvx             (vKey3, keypos, key);
2664     __ vec_perm        (vKey2, vKey3, keyPerm);
2665 
2666     // load the 4th round key to vKey3
2667     __ li              (keypos, 64);
2668     __ lvx             (vKey4, keypos, key);
2669     __ vec_perm        (vKey3, vKey4, keyPerm);
2670 
2671     // load the 5th round key to vKey4
2672     __ li              (keypos, 80);
2673     __ lvx             (vTmp1, keypos, key);
2674     __ vec_perm        (vKey4, vTmp1, keyPerm);
2675 
2676     // 2nd - 5th rounds
2677     __ vcipher         (vRet, vRet, vKey1);
2678     __ vcipher         (vRet, vRet, vKey2);
2679     __ vcipher         (vRet, vRet, vKey3);
2680     __ vcipher         (vRet, vRet, vKey4);
2681 
2682     // load the 6th round key to vKey1
2683     __ li              (keypos, 96);
2684     __ lvx             (vKey2, keypos, key);
2685     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2686 
2687     // load the 7th round key to vKey2
2688     __ li              (keypos, 112);
2689     __ lvx             (vKey3, keypos, key);
2690     __ vec_perm        (vKey2, vKey3, keyPerm);
2691 
2692     // load the 8th round key to vKey3
2693     __ li              (keypos, 128);
2694     __ lvx             (vKey4, keypos, key);
2695     __ vec_perm        (vKey3, vKey4, keyPerm);
2696 
2697     // load the 9th round key to vKey4
2698     __ li              (keypos, 144);
2699     __ lvx             (vTmp1, keypos, key);
2700     __ vec_perm        (vKey4, vTmp1, keyPerm);
2701 
2702     // 6th - 9th rounds
2703     __ vcipher         (vRet, vRet, vKey1);
2704     __ vcipher         (vRet, vRet, vKey2);
2705     __ vcipher         (vRet, vRet, vKey3);
2706     __ vcipher         (vRet, vRet, vKey4);
2707 
2708     // load the 10th round key to vKey1
2709     __ li              (keypos, 160);
2710     __ lvx             (vKey2, keypos, key);
2711     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2712 
2713     // load the 11th round key to vKey2
2714     __ li              (keypos, 176);
2715     __ lvx             (vTmp1, keypos, key);
2716     __ vec_perm        (vKey2, vTmp1, keyPerm);
2717 
2718     // if all round keys are loaded, skip next 4 rounds
2719     __ cmpwi           (CCR0, keylen, 44);
2720     __ beq             (CCR0, L_doLast);
2721 
2722     // 10th - 11th rounds
2723     __ vcipher         (vRet, vRet, vKey1);
2724     __ vcipher         (vRet, vRet, vKey2);
2725 
2726     // load the 12th round key to vKey1
2727     __ li              (keypos, 192);
2728     __ lvx             (vKey2, keypos, key);
2729     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2730 
2731     // load the 13th round key to vKey2
2732     __ li              (keypos, 208);
2733     __ lvx             (vTmp1, keypos, key);
2734     __ vec_perm        (vKey2, vTmp1, keyPerm);
2735 
2736     // if all round keys are loaded, skip next 2 rounds
2737     __ cmpwi           (CCR0, keylen, 52);
2738     __ beq             (CCR0, L_doLast);
2739 
2740     // 12th - 13th rounds
2741     __ vcipher         (vRet, vRet, vKey1);
2742     __ vcipher         (vRet, vRet, vKey2);
2743 
2744     // load the 14th round key to vKey1
2745     __ li              (keypos, 224);
2746     __ lvx             (vKey2, keypos, key);
2747     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2748 
2749     // load the 15th round key to vKey2
2750     __ li              (keypos, 240);
2751     __ lvx             (vTmp1, keypos, key);
2752     __ vec_perm        (vKey2, vTmp1, keyPerm);
2753 
2754     __ bind(L_doLast);
2755 
2756     // last two rounds
2757     __ vcipher         (vRet, vRet, vKey1);
2758     __ vcipherlast     (vRet, vRet, vKey2);
2759 
2760     // store result (unaligned)
2761 #ifdef VM_LITTLE_ENDIAN
2762     __ lvsl            (toPerm, to);
2763 #else
2764     __ lvsr            (toPerm, to);
2765 #endif
2766     __ vspltisb        (vTmp3, -1);
2767     __ vspltisb        (vTmp4, 0);
2768     __ lvx             (vTmp1, to);
2769     __ lvx             (vTmp2, fifteen, to);
2770 #ifdef VM_LITTLE_ENDIAN
2771     __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
2772     __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
2773 #else
2774     __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
2775 #endif
2776     __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
2777     __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
2778     __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
2779     __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
2780     __ stvx            (vTmp1, to);
2781 
2782     __ blr();
2783     return start;
2784   }
2785 
2786   // Arguments for generated stub:
2787   //   R3_ARG1   - source byte array address
2788   //   R4_ARG2   - destination byte array address
2789   //   R5_ARG3   - K (key) in little endian int array
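       // Decryption flow below (illustrative sketch only): the round keys
       // are applied in reverse order, and the entry point depends on the
       // key length (44, 52 or 60 words -> 10, 12 or 14 rounds):
       //
       //   state = ciphertext ^ round_key[rounds];
       //   for (int r = rounds - 1; r > 0; r--)
       //     state = vncipher(state, round_key[r]);     // one inverse AES round
       //   plaintext = vncipherlast(state, round_key[0]);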
2790   address generate_aescrypt_decryptBlock() {
2791     assert(UseAES, "need AES instructions");
2792     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2793 
2794     address start = __ function_entry();
2795 
2796     Label L_doLast;
2797     Label L_do44;
2798     Label L_do52;
2799 
2800     Register from           = R3_ARG1;  // source array address
2801     Register to             = R4_ARG2;  // destination array address
2802     Register key            = R5_ARG3;  // round key array
2803 
2804     Register keylen         = R8;
2805     Register temp           = R9;
2806     Register keypos         = R10;
2807     Register fifteen        = R12;
2808 
2809     VectorRegister vRet     = VR0;
2810 
2811     VectorRegister vKey1    = VR1;
2812     VectorRegister vKey2    = VR2;
2813     VectorRegister vKey3    = VR3;
2814     VectorRegister vKey4    = VR4;
2815     VectorRegister vKey5    = VR5;
2816 
2817     VectorRegister fromPerm = VR6;
2818     VectorRegister keyPerm  = VR7;
2819     VectorRegister toPerm   = VR8;
2820     VectorRegister fSplt    = VR9;
2821 
2822     VectorRegister vTmp1    = VR10;
2823     VectorRegister vTmp2    = VR11;
2824     VectorRegister vTmp3    = VR12;
2825     VectorRegister vTmp4    = VR13;
2826 
2827     __ li              (fifteen, 15);
2828 
2829     // load unaligned from[0-15] to vRet
2830     __ lvx             (vRet, from);
2831     __ lvx             (vTmp1, fifteen, from);
2832     __ lvsl            (fromPerm, from);
2833 #ifdef VM_LITTLE_ENDIAN
2834     __ vspltisb        (fSplt, 0x0f);
2835     __ vxor            (fromPerm, fromPerm, fSplt);
2836 #endif
2837     __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
2838 
2839     // load keylen (44 or 52 or 60)
2840     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2841 
2842     // Compute the permute vector used when loading the round keys.
2843     __ load_perm       (keyPerm, key);
2844 #ifdef VM_LITTLE_ENDIAN
2845     __ vxor            (vTmp2, vTmp2, vTmp2);
2846     __ vspltisb        (vTmp2, -16);
2847     __ vrld            (keyPerm, keyPerm, vTmp2);
2848     __ vrld            (keyPerm, keyPerm, vTmp2);
2849     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2850 #endif
2851 
2852     __ cmpwi           (CCR0, keylen, 44);
2853     __ beq             (CCR0, L_do44);
2854 
2855     __ cmpwi           (CCR0, keylen, 52);
2856     __ beq             (CCR0, L_do52);
2857 
2858     // load the 15th round key to vKey1
2859     __ li              (keypos, 240);
2860     __ lvx             (vKey1, keypos, key);
2861     __ li              (keypos, 224);
2862     __ lvx             (vKey2, keypos, key);
2863     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
2864 
2865     // load the 14th round key to vKey2
2866     __ li              (keypos, 208);
2867     __ lvx             (vKey3, keypos, key);
2868     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2869 
2870     // load the 13th round key to vKey3
2871     __ li              (keypos, 192);
2872     __ lvx             (vKey4, keypos, key);
2873     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2874 
2875     // load the 12th round key to vKey4
2876     __ li              (keypos, 176);
2877     __ lvx             (vKey5, keypos, key);
2878     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
2879 
2880     // load the 11th round key to vKey5
2881     __ li              (keypos, 160);
2882     __ lvx             (vTmp1, keypos, key);
2883     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
2884 
2885     // 1st - 5th rounds
2886     __ vxor            (vRet, vRet, vKey1);
2887     __ vncipher        (vRet, vRet, vKey2);
2888     __ vncipher        (vRet, vRet, vKey3);
2889     __ vncipher        (vRet, vRet, vKey4);
2890     __ vncipher        (vRet, vRet, vKey5);
2891 
2892     __ b               (L_doLast);
2893 
2894     __ bind            (L_do52);
2895 
2896     // load the 13th round key to vKey1
2897     __ li              (keypos, 208);
2898     __ lvx             (vKey1, keypos, key);
2899     __ li              (keypos, 192);
2900     __ lvx             (vKey2, keypos, key);
2901     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
2902 
2903     // load the 12th round key to vKey2
2904     __ li              (keypos, 176);
2905     __ lvx             (vKey3, keypos, key);
2906     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2907 
2908     // load the 11th round key to vKey3
2909     __ li              (keypos, 160);
2910     __ lvx             (vTmp1, keypos, key);
2911     __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
2912 
2913     // 1st - 3rd rounds
2914     __ vxor            (vRet, vRet, vKey1);
2915     __ vncipher        (vRet, vRet, vKey2);
2916     __ vncipher        (vRet, vRet, vKey3);
2917 
2918     __ b               (L_doLast);
2919 
2920     __ bind            (L_do44);
2921 
2922     // load the 11th round key to vKey1
2923     __ li              (keypos, 176);
2924     __ lvx             (vKey1, keypos, key);
2925     __ li              (keypos, 160);
2926     __ lvx             (vTmp1, keypos, key);
2927     __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
2928 
2929     // 1st round
2930     __ vxor            (vRet, vRet, vKey1);
2931 
2932     __ bind            (L_doLast);
2933 
2934     // load the 10th round key to vKey1
2935     __ li              (keypos, 144);
2936     __ lvx             (vKey2, keypos, key);
2937     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
2938 
2939     // load the 9th round key to vKey2
2940     __ li              (keypos, 128);
2941     __ lvx             (vKey3, keypos, key);
2942     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2943 
2944     // load the 8th round key to vKey3
2945     __ li              (keypos, 112);
2946     __ lvx             (vKey4, keypos, key);
2947     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2948 
2949     // load the 7th round key to vKey4
2950     __ li              (keypos, 96);
2951     __ lvx             (vKey5, keypos, key);
2952     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
2953 
2954     // load the 6th round key to vKey5
2955     __ li              (keypos, 80);
2956     __ lvx             (vTmp1, keypos, key);
2957     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
2958 
2959     // last 10th - 6th rounds
2960     __ vncipher        (vRet, vRet, vKey1);
2961     __ vncipher        (vRet, vRet, vKey2);
2962     __ vncipher        (vRet, vRet, vKey3);
2963     __ vncipher        (vRet, vRet, vKey4);
2964     __ vncipher        (vRet, vRet, vKey5);
2965 
2966     // load the 5th round key to vKey1
2967     __ li              (keypos, 64);
2968     __ lvx             (vKey2, keypos, key);
2969     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
2970 
2971     // load the 4th round key to vKey2
2972     __ li              (keypos, 48);
2973     __ lvx             (vKey3, keypos, key);
2974     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2975 
2976     // load the 3rd round key to vKey3
2977     __ li              (keypos, 32);
2978     __ lvx             (vKey4, keypos, key);
2979     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2980 
2981     // load the 2nd round key to vKey4
2982     __ li              (keypos, 16);
2983     __ lvx             (vKey5, keypos, key);
2984     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
2985 
2986     // load the 1st round key to vKey5
2987     __ lvx             (vTmp1, key);
2988     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
2989 
2990     // last 5th - 1st rounds
2991     __ vncipher        (vRet, vRet, vKey1);
2992     __ vncipher        (vRet, vRet, vKey2);
2993     __ vncipher        (vRet, vRet, vKey3);
2994     __ vncipher        (vRet, vRet, vKey4);
2995     __ vncipherlast    (vRet, vRet, vKey5);
2996 
2997     // store result (unaligned)
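         // The unaligned store below is an aligned read-modify-write of the two
         // quadwords covering to[0..15]: a byte-select mask derived from the
         // alignment of 'to' decides which bytes of each quadword are replaced
         // by the (rotated) result, and both quadwords are written back.
         // Rough sketch of the intent (illustrative only; the endianness
         // handling is done by the lvsl/lvsr variants below):
         //
         //   lo = 16-byte aligned quadword containing to[0]
         //   hi = 16-byte aligned quadword containing to[15]
         //   replace the bytes of lo/hi that fall inside [to, to+16) with vRet,
         //   storing hi before lo because both may be the same quadword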
2998 #ifdef VM_LITTLE_ENDIAN
2999     __ lvsl            (toPerm, to);
3000 #else
3001     __ lvsr            (toPerm, to);
3002 #endif
3003     __ vspltisb        (vTmp3, -1);
3004     __ vspltisb        (vTmp4, 0);
3005     __ lvx             (vTmp1, to);
3006     __ lvx             (vTmp2, fifteen, to);
3007 #ifdef VM_LITTLE_ENDIAN
3008     __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
3009     __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
3010 #else
3011     __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
3012 #endif
3013     __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
3014     __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
3015     __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
3016     __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
3017     __ stvx            (vTmp1, to);
3018 
3019     __ blr();
3020     return start;
3021   }
3022 
3023   address generate_sha256_implCompress(bool multi_block, const char *name) {
3024     assert(UseSHA, "need SHA instructions");
3025     StubCodeMark mark(this, "StubRoutines", name);
3026     address start = __ function_entry();
3027 
3028     __ sha256 (multi_block);
3029 
3030     __ blr();
3031     return start;
3032   }
3033 
3034   address generate_sha512_implCompress(bool multi_block, const char *name) {
3035     assert(UseSHA, "need SHA instructions");
3036     StubCodeMark mark(this, "StubRoutines", name);
3037     address start = __ function_entry();
3038 
3039     __ sha512 (multi_block);
3040 
3041     __ blr();
3042     return start;
3043   }
3044 
3045   void generate_arraycopy_stubs() {
3046     // Note: the disjoint stubs must be generated first, as some of
3047     // the conjoint stubs use them.
3048 
3049     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
3050     UnsafeCopyMemory::set_common_exit_stub_pc(ucm_common_error_exit);
3051 
3052     // non-aligned disjoint versions
3053     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
3054     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
3055     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
3056     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
3057     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
3058     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
3059 
3060     // aligned disjoint versions
3061     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
3062     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
3063     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
3064     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
3065     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
3066     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
3067 
3068     // non-aligned conjoint versions
3069     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
3070     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
3071     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
3072     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
3073     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
3074     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
3075 
3076     // aligned conjoint versions
3077     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
3078     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
3079     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
3080     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
3081     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
3082     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
3083 
3084     // special/generic versions
3085     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", false);
3086     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", true);
3087 
3088     StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy("unsafe_arraycopy",
3089                                                             STUB_ENTRY(jbyte_arraycopy),
3090                                                             STUB_ENTRY(jshort_arraycopy),
3091                                                             STUB_ENTRY(jint_arraycopy),
3092                                                             STUB_ENTRY(jlong_arraycopy));
3093     StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3094                                                              STUB_ENTRY(jbyte_arraycopy),
3095                                                              STUB_ENTRY(jshort_arraycopy),
3096                                                              STUB_ENTRY(jint_arraycopy),
3097                                                              STUB_ENTRY(oop_arraycopy),
3098                                                              STUB_ENTRY(oop_disjoint_arraycopy),
3099                                                              STUB_ENTRY(jlong_arraycopy),
3100                                                              STUB_ENTRY(checkcast_arraycopy));
3101 
3102     // fill routines
3103     if (OptimizeFill) {
3104       StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
3105       StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
3106       StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
3107       StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
3108       StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3109       StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
3110     }
3111   }
3112 
3113   // Safefetch stubs.
3114   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
3115     // safefetch signatures:
3116     //   int      SafeFetch32(int*      adr, int      errValue);
3117     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3118     //
3119     // arguments:
3120     //   R3_ARG1 = adr
3121     //   R4_ARG2 = errValue
3122     //
3123     // result:
3124     //   R3_RET  = *adr or errValue
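         //
         // Usage note (hedged sketch of the observable behaviour, not the
         // implementation): the C++ wrappers SafeFetch32/SafeFetchN call this
         // stub like an ordinary function. If the load at fault_pc traps, the
         // VM's signal handler resumes execution at continuation_pc; since the
         // load targets R4_ARG2, which still holds errValue in that case, the
         // caller simply sees errValue instead of *adr.
         //
         //   int SafeFetch32(int* adr, int errValue) {
         //     int value = *adr;   // fault_pc: may trap
         //     return value;       // continuation_pc: errValue on the fault path
         //   }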
3125 
3126     StubCodeMark mark(this, "StubRoutines", name);
3127 
3128     // Entry point, pc or function descriptor.
3129     *entry = __ function_entry();
3130 
3131     // Load *adr into R4_ARG2, may fault.
3132     *fault_pc = __ pc();
3133     switch (size) {
3134       case 4:
3135         // int32_t, sign-extended
3136         __ lwa(R4_ARG2, 0, R3_ARG1);
3137         break;
3138       case 8:
3139         // int64_t
3140         __ ld(R4_ARG2, 0, R3_ARG1);
3141         break;
3142       default:
3143         ShouldNotReachHere();
3144     }
3145 
3146     // return errValue or *adr
3147     *continuation_pc = __ pc();
3148     __ mr(R3_RET, R4_ARG2);
3149     __ blr();
3150   }
3151 
3152   // Stub for BigInteger::multiplyToLen()
3153   //
3154   //  Arguments:
3155   //
3156   //  Input:
3157   //    R3 - x address
3158   //    R4 - x length
3159   //    R5 - y address
3160   //    R6 - y length
3161   //    R7 - z address
3162   //    R8 - z length
3163   //
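       // The stub computes the same product as java.math.BigInteger's
       // multiplyToLen: a schoolbook multiplication of two arrays of 32-bit
       // words into z (zlen = xlen + ylen). Hedged reference sketch, with the
       // big-endian word ordering of the Java arrays omitted and z assumed
       // zero-initialized:
       //
       //   for (int i = 0; i < xlen; i++) {
       //     uint64_t carry = 0;
       //     for (int j = 0; j < ylen; j++) {
       //       uint64_t p = (uint64_t)x[i] * y[j] + z[i + j] + carry;
       //       z[i + j]   = (uint32_t)p;
       //       carry      = p >> 32;
       //     }
       //     z[i + ylen] = (uint32_t)carry;
       //   }
       //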
3164   address generate_multiplyToLen() {
3165 
3166     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3167 
3168     address start = __ function_entry();
3169 
3170     const Register x     = R3;
3171     const Register xlen  = R4;
3172     const Register y     = R5;
3173     const Register ylen  = R6;
3174     const Register z     = R7;
3175     const Register zlen  = R8;
3176 
3177     const Register tmp1  = R2; // TOC not used.
3178     const Register tmp2  = R9;
3179     const Register tmp3  = R10;
3180     const Register tmp4  = R11;
3181     const Register tmp5  = R12;
3182 
3183     // non-volatile regs
3184     const Register tmp6  = R31;
3185     const Register tmp7  = R30;
3186     const Register tmp8  = R29;
3187     const Register tmp9  = R28;
3188     const Register tmp10 = R27;
3189     const Register tmp11 = R26;
3190     const Register tmp12 = R25;
3191     const Register tmp13 = R24;
3192 
3193     BLOCK_COMMENT("Entry:");
3194 
3195     // C2 does not respect int to long conversion for stub calls, so zero-extend the int length arguments explicitly.
3196     __ clrldi(xlen, xlen, 32);
3197     __ clrldi(ylen, ylen, 32);
3198     __ clrldi(zlen, zlen, 32);
3199 
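         // Frameless spill: the callee-saved registers clobbered below are
         // stored a few doublewords under R1_SP instead of building a frame.
         // This note is descriptive only; it relies on the stub making no
         // further calls and on the PPC64 ABI's protected area below the stack
         // pointer being usable by leaf code, as this port assumes elsewhere
         // (see the identical pattern in squareToLen below).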
3200     // Save non-volatile regs (frameless).
3201     int current_offs = 8;
3202     __ std(R24, -current_offs, R1_SP); current_offs += 8;
3203     __ std(R25, -current_offs, R1_SP); current_offs += 8;
3204     __ std(R26, -current_offs, R1_SP); current_offs += 8;
3205     __ std(R27, -current_offs, R1_SP); current_offs += 8;
3206     __ std(R28, -current_offs, R1_SP); current_offs += 8;
3207     __ std(R29, -current_offs, R1_SP); current_offs += 8;
3208     __ std(R30, -current_offs, R1_SP); current_offs += 8;
3209     __ std(R31, -current_offs, R1_SP);
3210 
3211     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5,
3212                        tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
3213 
3214     // Restore non-volatile regs.
3215     current_offs = 8;
3216     __ ld(R24, -current_offs, R1_SP); current_offs += 8;
3217     __ ld(R25, -current_offs, R1_SP); current_offs += 8;
3218     __ ld(R26, -current_offs, R1_SP); current_offs += 8;
3219     __ ld(R27, -current_offs, R1_SP); current_offs += 8;
3220     __ ld(R28, -current_offs, R1_SP); current_offs += 8;
3221     __ ld(R29, -current_offs, R1_SP); current_offs += 8;
3222     __ ld(R30, -current_offs, R1_SP); current_offs += 8;
3223     __ ld(R31, -current_offs, R1_SP);
3224 
3225     __ blr();  // Return to caller.
3226 
3227     return start;
3228   }
3229 
3230   /**
3231   *  Arguments:
3232   *
3233   *  Input:
3234   *   R3_ARG1    - out address
3235   *   R4_ARG2    - in address
3236   *   R5_ARG3    - offset
3237   *   R6_ARG4    - len
3238   *   R7_ARG5    - k
3239   *  Output:
3240   *   R3_RET     - carry
3241   */
3242   address generate_mulAdd() {
3243     __ align(CodeEntryAlignment);
3244     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3245 
3246     address start = __ function_entry();
3247 
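         // What this stub computes (hedged sketch; the indexing direction of
         // the big-endian word arrays is omitted): a len-word slice of 'out'
         // has in[0..len-1] * k added to it, and the final carry word is
         // returned in R3_RET. The per-word step is the usual 32x32->64
         // multiply-accumulate with carry:
         //
         //   uint64_t p = (uint64_t)in_word * k + out_word + carry;
         //   out_word   = (uint32_t)p;
         //   carry      = p >> 32;
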
3248     // C2 does not sign-extend int parameters to full 64-bit registers:
3249     __ rldic (R5_ARG3, R5_ARG3, 2, 32);  // convert the int element offset to a byte offset, clearing the upper bits (always positive)
3250     __ clrldi(R6_ARG4, R6_ARG4, 32);     // force zero bits on higher word
3251     __ clrldi(R7_ARG5, R7_ARG5, 32);     // force zero bits on higher word
3252 
3253     __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
3254 
3255     // Moves output carry to return register
3256     __ mr    (R3_RET,  R10);
3257 
3258     __ blr();
3259 
3260     return start;
3261   }
3262 
3263   /**
3264   *  Arguments:
3265   *
3266   *  Input:
3267   *   R3_ARG1    - in address
3268   *   R4_ARG2    - in length
3269   *   R5_ARG3    - out address
3270   *   R6_ARG4    - out length
3271   */
3272   address generate_squareToLen() {
3273     __ align(CodeEntryAlignment);
3274     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3275 
3276     address start = __ function_entry();
3277 
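         // High-level shape of the algorithm below (hedged summary, following
         // the usual BigInteger-style squaring):
         //   x*x = sum(x[i]^2 * B^(2i)) + 2 * sum_{i<j}(x[i]*x[j] * B^(i+j)), B = 2^32
         // 1. store the square of every input word, shifted right by one bit;
         // 2. accumulate the off-diagonal products x[i]*x[j] (i < j) via muladd;
         // 3. shift the whole result left by one bit, which restores the squares
         //    and supplies the factor of two on the off-diagonal terms;
         // 4. OR the low bit of the least significant input word back into the
         //    least significant output word (the bit lost in step 1).
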
3278     // args - clear the upper 32 bits of the int length arguments (int-to-long zero extension is not guaranteed by the caller)
3279     const Register in        = R3_ARG1;
3280     const Register in_len    = R4_ARG2;
3281     __ clrldi(in_len, in_len, 32);
3282     const Register out       = R5_ARG3;
3283     const Register out_len   = R6_ARG4;
3284     __ clrldi(out_len, out_len, 32);
3285 
3286     // output
3287     const Register ret       = R3_RET;
3288 
3289     // temporaries
3290     const Register lplw_s    = R7;
3291     const Register in_aux    = R8;
3292     const Register out_aux   = R9;
3293     const Register piece     = R10;
3294     const Register product   = R14;
3295     const Register lplw      = R15;
3296     const Register i_minus1  = R16;
3297     const Register carry     = R17;
3298     const Register offset    = R18;
3299     const Register off_aux   = R19;
3300     const Register t         = R20;
3301     const Register mlen      = R21;
3302     const Register len       = R22;
3303     const Register a         = R23;
3304     const Register b         = R24;
3305     const Register i         = R25;
3306     const Register c         = R26;
3307     const Register cs        = R27;
3308 
3309     // Labels
3310     Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
3311     Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
3312 
3313     // Save non-volatile regs (frameless).
3314     int current_offs = -8;
3315     __ std(R28, current_offs, R1_SP); current_offs -= 8;
3316     __ std(R27, current_offs, R1_SP); current_offs -= 8;
3317     __ std(R26, current_offs, R1_SP); current_offs -= 8;
3318     __ std(R25, current_offs, R1_SP); current_offs -= 8;
3319     __ std(R24, current_offs, R1_SP); current_offs -= 8;
3320     __ std(R23, current_offs, R1_SP); current_offs -= 8;
3321     __ std(R22, current_offs, R1_SP); current_offs -= 8;
3322     __ std(R21, current_offs, R1_SP); current_offs -= 8;
3323     __ std(R20, current_offs, R1_SP); current_offs -= 8;
3324     __ std(R19, current_offs, R1_SP); current_offs -= 8;
3325     __ std(R18, current_offs, R1_SP); current_offs -= 8;
3326     __ std(R17, current_offs, R1_SP); current_offs -= 8;
3327     __ std(R16, current_offs, R1_SP); current_offs -= 8;
3328     __ std(R15, current_offs, R1_SP); current_offs -= 8;
3329     __ std(R14, current_offs, R1_SP);
3330 
3331     // Store the squares, right shifted one bit (i.e., divided by 2)
3332     __ subi   (out_aux,   out,       8);
3333     __ subi   (in_aux,    in,        4);
3334     __ cmpwi  (CCR0,      in_len,    0);
3335     // Initialize lplw outside of the loop
3336     __ xorr   (lplw,      lplw,      lplw);
3337     __ ble    (CCR0,      SKIP_LOOP_SQUARE);    // in_len <= 0
3338     __ mtctr  (in_len);
3339 
3340     __ bind(LOOP_SQUARE);
3341     __ lwzu   (piece,     4,         in_aux);
3342     __ mulld  (product,   piece,     piece);
3343     // shift left 63 bits and only keep the MSB
3344     __ rldic  (lplw_s,    lplw,      63, 0);
3345     __ mr     (lplw,      product);
3346     // shift right 1 bit without sign extension
3347     __ srdi   (product,   product,   1);
3348     // join them to the same register and store it
3349     __ orr    (product,   lplw_s,    product);
3350 #ifdef VM_LITTLE_ENDIAN
3351     // Swap low and high words for little endian
3352     __ rldicl (product,   product,   32, 0);
3353 #endif
3354     __ stdu   (product,   8,         out_aux);
3355     __ bdnz   (LOOP_SQUARE);
3356 
3357     __ bind(SKIP_LOOP_SQUARE);
3358 
3359     // Add in off-diagonal sums
3360     __ cmpwi  (CCR0,      in_len,    0);
3361     __ ble    (CCR0,      SKIP_DIAGONAL_SUM);
3362     // Avoid using CTR here so that it stays available for the muladd below
3363     __ subi   (i_minus1,  in_len,    1);
3364     __ li     (offset,    4);
3365 
3366     __ bind(LOOP_DIAGONAL_SUM);
3367 
3368     __ sldi   (off_aux,   out_len,   2);
3369     __ sub    (off_aux,   off_aux,   offset);
3370 
3371     __ mr     (len,       i_minus1);
3372     __ sldi   (mlen,      i_minus1,  2);
3373     __ lwzx   (t,         in,        mlen);
3374 
3375     __ muladd (out, in, off_aux, len, t, a, b, carry);
3376 
3377     // begin<addOne>
3378     // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
3379     __ addi   (mlen,      mlen,      4);
3380     __ sldi   (a,         out_len,   2);
3381     __ subi   (a,         a,         4);
3382     __ sub    (a,         a,         mlen);
3383     __ subi   (off_aux,   offset,    4);
3384     __ sub    (off_aux,   a,         off_aux);
3385 
3386     __ lwzx   (b,         off_aux,   out);
3387     __ add    (b,         b,         carry);
3388     __ stwx   (b,         off_aux,   out);
3389 
3390     // if (((uint64_t)s >> 32) != 0) {
3391     __ srdi_  (a,         b,         32);
3392     __ beq    (CCR0,      SKIP_ADDONE);
3393 
3394     // while (--mlen >= 0) {
3395     __ bind(LOOP_ADDONE);
3396     __ subi   (mlen,      mlen,      4);
3397     __ cmpwi  (CCR0,      mlen,      0);
3398     __ beq    (CCR0,      SKIP_ADDONE);
3399 
3400     // if (--offset_aux < 0) { // Carry out of number
3401     __ subi   (off_aux,   off_aux,   4);
3402     __ cmpwi  (CCR0,      off_aux,   0);
3403     __ blt    (CCR0,      SKIP_ADDONE);
3404 
3405     // } else {
3406     __ lwzx   (b,         off_aux,   out);
3407     __ addi   (b,         b,         1);
3408     __ stwx   (b,         off_aux,   out);
3409     __ cmpwi  (CCR0,      b,         0);
3410     __ bne    (CCR0,      SKIP_ADDONE);
3411     __ b      (LOOP_ADDONE);
3412 
3413     __ bind(SKIP_ADDONE);
3414     // } } } end<addOne>
3415 
3416     __ addi   (offset,    offset,    8);
3417     __ subi   (i_minus1,  i_minus1,  1);
3418     __ cmpwi  (CCR0,      i_minus1,  0);
3419     __ bge    (CCR0,      LOOP_DIAGONAL_SUM);
3420 
3421     __ bind(SKIP_DIAGONAL_SUM);
3422 
3423     // Shift back up and set low bit
3424     // Shifts 1 bit left up to len positions. Assumes no leading zeros
3425     // begin<primitiveLeftShift>
3426     __ cmpwi  (CCR0,      out_len,   0);
3427     __ ble    (CCR0,      SKIP_LSHIFT);
3428     __ li     (i,         0);
3429     __ lwz    (c,         0,         out);
3430     __ subi   (b,         out_len,   1);
3431     __ mtctr  (b);
3432 
3433     __ bind(LOOP_LSHIFT);
3434     __ mr     (b,         c);
3435     __ addi   (cs,        i,         4);
3436     __ lwzx   (c,         out,       cs);
3437 
3438     __ sldi   (b,         b,         1);
3439     __ srwi   (cs,        c,         31);
3440     __ orr    (b,         b,         cs);
3441     __ stwx   (b,         i,         out);
3442 
3443     __ addi   (i,         i,         4);
3444     __ bdnz   (LOOP_LSHIFT);
3445 
3446     __ sldi   (c,         out_len,   2);
3447     __ subi   (c,         c,         4);
3448     __ lwzx   (b,         out,       c);
3449     __ sldi   (b,         b,         1);
3450     __ stwx   (b,         out,       c);
3451 
3452     __ bind(SKIP_LSHIFT);
3453     // end<primitiveLeftShift>
3454 
3455     // Set low bit
3456     __ sldi   (i,         in_len,    2);
3457     __ subi   (i,         i,         4);
3458     __ lwzx   (i,         in,        i);
3459     __ sldi   (c,         out_len,   2);
3460     __ subi   (c,         c,         4);
3461     __ lwzx   (b,         out,       c);
3462 
3463     __ andi   (i,         i,         1);
3464     __ orr    (i,         b,         i);
3465 
3466     __ stwx   (i,         out,       c);
3467 
3468     // Restore non-volatile regs.
3469     current_offs = -8;
3470     __ ld(R28, current_offs, R1_SP); current_offs -= 8;
3471     __ ld(R27, current_offs, R1_SP); current_offs -= 8;
3472     __ ld(R26, current_offs, R1_SP); current_offs -= 8;
3473     __ ld(R25, current_offs, R1_SP); current_offs -= 8;
3474     __ ld(R24, current_offs, R1_SP); current_offs -= 8;
3475     __ ld(R23, current_offs, R1_SP); current_offs -= 8;
3476     __ ld(R22, current_offs, R1_SP); current_offs -= 8;
3477     __ ld(R21, current_offs, R1_SP); current_offs -= 8;
3478     __ ld(R20, current_offs, R1_SP); current_offs -= 8;
3479     __ ld(R19, current_offs, R1_SP); current_offs -= 8;
3480     __ ld(R18, current_offs, R1_SP); current_offs -= 8;
3481     __ ld(R17, current_offs, R1_SP); current_offs -= 8;
3482     __ ld(R16, current_offs, R1_SP); current_offs -= 8;
3483     __ ld(R15, current_offs, R1_SP); current_offs -= 8;
3484     __ ld(R14, current_offs, R1_SP);
3485 
3486     __ mr(ret, out);
3487     __ blr();
3488 
3489     return start;
3490   }
3491 
3492   /**
3493    * Arguments:
3494    *
3495    * Inputs:
3496    *   R3_ARG1    - int   crc
3497    *   R4_ARG2    - byte* buf
3498    *   R5_ARG3    - int   length (of buffer)
3499    *
3500    * scratch:
3501    *   R2, R6-R12
3502    *
3503    * Output:
3504    *   R3_RET     - int   crc result
3505    */
3506   // Compute CRC32/CRC32C function.
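       // Both variants funnel into MacroAssembler::crc32; the is_crc32c flag
       // essentially selects which set of precomputed constants the kernel
       // uses (see the generate_crc_constants calls in generate_initial, fed
       // with REVERSE_CRC32_POLY or REVERSE_CRC32C_POLY). For orientation
       // only: the reflected polynomials of CRC-32 and CRC-32C are the
       // well-known 0xEDB88320 and 0x82F63B78.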
3507   address generate_CRC32_updateBytes(bool is_crc32c) {
3508     __ align(CodeEntryAlignment);
3509     StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
3510     address start = __ function_entry();  // Remember stub start address (is rtn value).
3511     __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3512     __ blr();
3513     return start;
3514   }
3515 
3516   // Initialization
3517   void generate_initial() {
3518     // Generates the initial stubs and initializes their entry points
3519 
3520     // Entry points that exist in all platforms.
3521     // Note: This is code that could be shared among different platforms - however the
3522     // benefit seems to be smaller than the disadvantage of having a
3523     // much more complicated generator structure. See also comment in
3524     // stubRoutines.hpp.
3525 
3526     StubRoutines::_forward_exception_entry          = generate_forward_exception();
3527     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
3528     StubRoutines::_catch_exception_entry            = generate_catch_exception();
3529 
3530     // Build this early so it's available for the interpreter.
3531     StubRoutines::_throw_StackOverflowError_entry   =
3532       generate_throw_exception("StackOverflowError throw_exception",
3533                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3534     StubRoutines::_throw_delayed_StackOverflowError_entry =
3535       generate_throw_exception("delayed StackOverflowError throw_exception",
3536                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
3537 
3538     // CRC32 Intrinsics.
3539     if (UseCRC32Intrinsics) {
3540       StubRoutines::_crc_table_adr = StubRoutines::generate_crc_constants(REVERSE_CRC32_POLY);
3541       StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(false);
3542     }
3543 
3544     // CRC32C Intrinsics.
3545     if (UseCRC32CIntrinsics) {
3546       StubRoutines::_crc32c_table_addr = StubRoutines::generate_crc_constants(REVERSE_CRC32C_POLY);
3547       StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(true);
3548     }
3549   }
3550 
3551   void generate_all() {
3552     // Generates the remaining stubs and initializes their entry points
3553 
3554     // These entry points require SharedInfo::stack0 to be set up in
3555     // non-core builds
3556     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
3557     // Handle IncompatibleClassChangeError in itable stubs.
3558     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
3559     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
3560 
3561     // support for verify_oop (must happen after universe_init)
3562     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
3563 
3564     // arraycopy stubs used by compilers
3565     generate_arraycopy_stubs();
3566 
3567     // Safefetch stubs.
3568     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
3569                                                        &StubRoutines::_safefetch32_fault_pc,
3570                                                        &StubRoutines::_safefetch32_continuation_pc);
3571     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
3572                                                        &StubRoutines::_safefetchN_fault_pc,
3573                                                        &StubRoutines::_safefetchN_continuation_pc);
3574 
3575 #ifdef COMPILER2
3576     if (UseMultiplyToLenIntrinsic) {
3577       StubRoutines::_multiplyToLen = generate_multiplyToLen();
3578     }
3579 #endif
3580 
3581     if (UseSquareToLenIntrinsic) {
3582       StubRoutines::_squareToLen = generate_squareToLen();
3583     }
3584     if (UseMulAddIntrinsic) {
3585       StubRoutines::_mulAdd = generate_mulAdd();
3586     }
3587     if (UseMontgomeryMultiplyIntrinsic) {
3588       StubRoutines::_montgomeryMultiply
3589         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
3590     }
3591     if (UseMontgomerySquareIntrinsic) {
3592       StubRoutines::_montgomerySquare
3593         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
3594     }
3595 
3596     if (UseAESIntrinsics) {
3597       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3598       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3599     }
3600 
3601     if (UseSHA256Intrinsics) {
3602       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
3603       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
3604     }
3605     if (UseSHA512Intrinsics) {
3606       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
3607       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
3608     }
3609   }
3610 
3611  public:
3612   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3613     // replace the standard masm with a special one:
3614     _masm = new MacroAssembler(code);
3615     if (all) {
3616       generate_all();
3617     } else {
3618       generate_initial();
3619     }
3620   }
3621 };
3622 
3623 #define UCM_TABLE_MAX_ENTRIES 8
3624 void StubGenerator_generate(CodeBuffer* code, bool all) {
3625   if (UnsafeCopyMemory::_table == NULL) {
3626     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
3627   }
3628   StubGenerator g(code, all);
3629 }