1 /*
   2  * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "nativeInst_x86.hpp"
  30 #include "oops/instanceOop.hpp"
  31 #include "oops/method.hpp"
  32 #include "oops/objArrayKlass.hpp"
  33 #include "oops/oop.inline.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/handles.inline.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/stubCodeGenerator.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "runtime/thread.inline.hpp"
  41 #ifdef COMPILER2
  42 #include "opto/runtime.hpp"
  43 #endif
  44 
  45 // Declaration and definition of StubGenerator (no .hpp file).
  46 // For a more detailed description of the stub routine structure
  47 // see the comment in stubRoutines.hpp
  48 
  49 #define __ _masm->
  50 #define TIMES_OOP (UseCompressedOops ? Address::times_4 : Address::times_8)
  51 #define a__ ((Assembler*)_masm)->
  52 
  53 #ifdef PRODUCT
  54 #define BLOCK_COMMENT(str) /* nothing */
  55 #else
  56 #define BLOCK_COMMENT(str) __ block_comment(str)
  57 #endif
  58 
  59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  60 const int MXCSR_MASK = 0xFFC0;  // Mask out any pending exceptions
  61 
  62 // Stub Code definitions
  63 
  64 class StubGenerator: public StubCodeGenerator {
  65  private:
  66 
  67 #ifdef PRODUCT
  68 #define inc_counter_np(counter) ((void)0)
  69 #else
  70   void inc_counter_np_(int& counter) {
  71     // This can destroy rscratch1 if counter is far from the code cache
  72     __ incrementl(ExternalAddress((address)&counter));
  73   }
  74 #define inc_counter_np(counter) \
  75   BLOCK_COMMENT("inc_counter " #counter); \
  76   inc_counter_np_(counter);
  77 #endif
  78 
  79   // Call stubs are used to call Java from C
  80   //
  81   // Linux Arguments:
  82   //    c_rarg0:   call wrapper address                   address
  83   //    c_rarg1:   result                                 address
  84   //    c_rarg2:   result type                            BasicType
  85   //    c_rarg3:   method                                 Method*
  86   //    c_rarg4:   (interpreter) entry point              address
  87   //    c_rarg5:   parameters                             intptr_t*
  88   //    16(rbp): parameter size (in words)              int
  89   //    24(rbp): thread                                 Thread*
  90   //
  91   //     [ return_from_Java     ] <--- rsp
  92   //     [ argument word n      ]
  93   //      ...
  94   // -12 [ argument word 1      ]
  95   // -11 [ saved r15            ] <--- rsp_after_call
  96   // -10 [ saved r14            ]
  97   //  -9 [ saved r13            ]
  98   //  -8 [ saved r12            ]
  99   //  -7 [ saved rbx            ]
 100   //  -6 [ call wrapper         ]
 101   //  -5 [ result               ]
 102   //  -4 [ result type          ]
 103   //  -3 [ method               ]
 104   //  -2 [ entry point          ]
 105   //  -1 [ parameters           ]
 106   //   0 [ saved rbp            ] <--- rbp
 107   //   1 [ return address       ]
 108   //   2 [ parameter size       ]
 109   //   3 [ thread               ]
 110   //
 111   // Windows Arguments:
 112   //    c_rarg0:   call wrapper address                   address
 113   //    c_rarg1:   result                                 address
 114   //    c_rarg2:   result type                            BasicType
 115   //    c_rarg3:   method                                 Method*
 116   //    48(rbp): (interpreter) entry point              address
 117   //    56(rbp): parameters                             intptr_t*
 118   //    64(rbp): parameter size (in words)              int
 119   //    72(rbp): thread                                 Thread*
 120   //
 121   //     [ return_from_Java     ] <--- rsp
 122   //     [ argument word n      ]
 123   //      ...
 124   // -60 [ argument word 1      ]
 125   // -59 [ saved xmm31          ] <--- rsp_after_call
 126   //     [ saved xmm16-xmm30    ] (EVEX enabled, else the space is blank)
 127   // -27 [ saved xmm15          ]
 128   //     [ saved xmm7-xmm14     ]
 129   //  -9 [ saved xmm6           ] (each xmm register takes 2 slots)
 130   //  -7 [ saved r15            ]
 131   //  -6 [ saved r14            ]
 132   //  -5 [ saved r13            ]
 133   //  -4 [ saved r12            ]
 134   //  -3 [ saved rdi            ]
 135   //  -2 [ saved rsi            ]
 136   //  -1 [ saved rbx            ]
 137   //   0 [ saved rbp            ] <--- rbp
 138   //   1 [ return address       ]
 139   //   2 [ call wrapper         ]
 140   //   3 [ result               ]
 141   //   4 [ result type          ]
 142   //   5 [ method               ]
 143   //   6 [ entry point          ]
 144   //   7 [ parameters           ]
 145   //   8 [ parameter size       ]
 146   //   9 [ thread               ]
 147   //
 148   //    Windows reserves the caller's stack space for arguments 1-4.
 149   //    We spill c_rarg0-c_rarg3 to this space.
 150 
 151   // Call stub stack layout word offsets from rbp
 152   enum call_stub_layout {
 153 #ifdef _WIN64
 154     xmm_save_first     = 6,  // save from xmm6
 155     xmm_save_last      = 31, // to xmm31
 156     xmm_save_base      = -9,
 157     rsp_after_call_off = xmm_save_base - 2 * (xmm_save_last - xmm_save_first), // -59
 158     r15_off            = -7,
 159     r14_off            = -6,
 160     r13_off            = -5,
 161     r12_off            = -4,
 162     rdi_off            = -3,
 163     rsi_off            = -2,
 164     rbx_off            = -1,
 165     rbp_off            =  0,
 166     retaddr_off        =  1,
 167     call_wrapper_off   =  2,
 168     result_off         =  3,
 169     result_type_off    =  4,
 170     method_off         =  5,
 171     entry_point_off    =  6,
 172     parameters_off     =  7,
 173     parameter_size_off =  8,
 174     thread_off         =  9
 175 #else
 176     rsp_after_call_off = -12,
 177     mxcsr_off          = rsp_after_call_off,
 178     r15_off            = -11,
 179     r14_off            = -10,
 180     r13_off            = -9,
 181     r12_off            = -8,
 182     rbx_off            = -7,
 183     call_wrapper_off   = -6,
 184     result_off         = -5,
 185     result_type_off    = -4,
 186     method_off         = -3,
 187     entry_point_off    = -2,
 188     parameters_off     = -1,
 189     rbp_off            =  0,
 190     retaddr_off        =  1,
 191     parameter_size_off =  2,
 192     thread_off         =  3
 193 #endif
 194   };
 195 
 196 #ifdef _WIN64
 197   Address xmm_save(int reg) {
 198     assert(reg >= xmm_save_first && reg <= xmm_save_last, "XMM register number out of range");
 199     return Address(rbp, (xmm_save_base - (reg - xmm_save_first) * 2) * wordSize);
 200   }
 201 #endif
 202 
 203   address generate_call_stub(address& return_address) {
 204     assert((int)frame::entry_frame_after_call_words == -(int)rsp_after_call_off + 1 &&
 205            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 206            "adjust this code");
 207     StubCodeMark mark(this, "StubRoutines", "call_stub");
 208     address start = __ pc();
 209 
 210     // same as in generate_catch_exception()!
 211     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 212 
 213     const Address call_wrapper  (rbp, call_wrapper_off   * wordSize);
 214     const Address result        (rbp, result_off         * wordSize);
 215     const Address result_type   (rbp, result_type_off    * wordSize);
 216     const Address method        (rbp, method_off         * wordSize);
 217     const Address entry_point   (rbp, entry_point_off    * wordSize);
 218     const Address parameters    (rbp, parameters_off     * wordSize);
 219     const Address parameter_size(rbp, parameter_size_off * wordSize);
 220 
 221     // same as in generate_catch_exception()!
 222     const Address thread        (rbp, thread_off         * wordSize);
 223 
 224     const Address r15_save(rbp, r15_off * wordSize);
 225     const Address r14_save(rbp, r14_off * wordSize);
 226     const Address r13_save(rbp, r13_off * wordSize);
 227     const Address r12_save(rbp, r12_off * wordSize);
 228     const Address rbx_save(rbp, rbx_off * wordSize);
 229 
 230     // stub code
 231     __ enter();
 232     __ subptr(rsp, -rsp_after_call_off * wordSize);
 233 
 234     // save register parameters
 235 #ifndef _WIN64
 236     __ movptr(parameters,   c_rarg5); // parameters
 237     __ movptr(entry_point,  c_rarg4); // entry_point
 238 #endif
 239 
 240     __ movptr(method,       c_rarg3); // method
 241     __ movl(result_type,  c_rarg2);   // result type
 242     __ movptr(result,       c_rarg1); // result
 243     __ movptr(call_wrapper, c_rarg0); // call wrapper
 244 
 245     // save regs belonging to calling function
 246     __ movptr(rbx_save, rbx);
 247     __ movptr(r12_save, r12);
 248     __ movptr(r13_save, r13);
 249     __ movptr(r14_save, r14);
 250     __ movptr(r15_save, r15);
 251     if (UseAVX > 2) {
 252       __ movl(rbx, 0xffff);
 253       __ kmovwl(k1, rbx);
 254     }
 255 #ifdef _WIN64
 256     int last_reg = 15;
 257     if (UseAVX > 2) {
 258       last_reg = 31;
 259     }
 260     if (VM_Version::supports_evex()) {
 261       for (int i = xmm_save_first; i <= last_reg; i++) {
 262         __ vextractf32x4(xmm_save(i), as_XMMRegister(i), 0);
 263       }
 264     } else {
 265       for (int i = xmm_save_first; i <= last_reg; i++) {
 266         __ movdqu(xmm_save(i), as_XMMRegister(i));
 267       }
 268     }
 269 
 270     const Address rdi_save(rbp, rdi_off * wordSize);
 271     const Address rsi_save(rbp, rsi_off * wordSize);
 272 
 273     __ movptr(rsi_save, rsi);
 274     __ movptr(rdi_save, rdi);
 275 #else
 276     const Address mxcsr_save(rbp, mxcsr_off * wordSize);
 277     {
 278       Label skip_ldmx;
 279       __ stmxcsr(mxcsr_save);
 280       __ movl(rax, mxcsr_save);
 281       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 282       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 283       __ cmp32(rax, mxcsr_std);
 284       __ jcc(Assembler::equal, skip_ldmx);
 285       __ ldmxcsr(mxcsr_std);
 286       __ bind(skip_ldmx);
 287     }
 288 #endif
 289 
 290     // Load up thread register
 291     __ movptr(r15_thread, thread);
 292     __ reinit_heapbase();
 293 
 294 #ifdef ASSERT
 295     // make sure we have no pending exceptions
 296     {
 297       Label L;
 298       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 299       __ jcc(Assembler::equal, L);
 300       __ stop("StubRoutines::call_stub: entered with pending exception");
 301       __ bind(L);
 302     }
 303 #endif
 304 
 305     // pass parameters if any
 306     BLOCK_COMMENT("pass parameters if any");
 307     Label parameters_done;
 308     __ movl(c_rarg3, parameter_size);
 309     __ testl(c_rarg3, c_rarg3);
 310     __ jcc(Assembler::zero, parameters_done);
 311 
 312     Label loop;
 313     __ movptr(c_rarg2, parameters);       // parameter pointer
 314     __ movl(c_rarg1, c_rarg3);            // parameter counter is in c_rarg1
 315     __ BIND(loop);
 316     __ movptr(rax, Address(c_rarg2, 0));// get parameter
 317     __ addptr(c_rarg2, wordSize);       // advance to next parameter
 318     __ decrementl(c_rarg1);             // decrement counter
 319     __ push(rax);                       // pass parameter
 320     __ jcc(Assembler::notZero, loop);
 321 
 322     // call Java function
 323     __ BIND(parameters_done);
 324     __ movptr(rbx, method);             // get Method*
 325     __ movptr(c_rarg1, entry_point);    // get entry_point
 326     __ mov(r13, rsp);                   // set sender sp
 327     BLOCK_COMMENT("call Java function");
 328     __ call(c_rarg1);
 329 
 330     BLOCK_COMMENT("call_stub_return_address:");
 331     return_address = __ pc();
 332 
 333     // store result depending on type (everything that is not
 334     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 335     __ movptr(c_rarg0, result);
 336     Label is_long, is_float, is_double, exit;
 337     __ movl(c_rarg1, result_type);
 338     __ cmpl(c_rarg1, T_OBJECT);
 339     __ jcc(Assembler::equal, is_long);
 340     __ cmpl(c_rarg1, T_LONG);
 341     __ jcc(Assembler::equal, is_long);
 342     __ cmpl(c_rarg1, T_FLOAT);
 343     __ jcc(Assembler::equal, is_float);
 344     __ cmpl(c_rarg1, T_DOUBLE);
 345     __ jcc(Assembler::equal, is_double);
 346 
 347     // handle T_INT case
 348     __ movl(Address(c_rarg0, 0), rax);
 349 
 350     __ BIND(exit);
 351 
 352     // pop parameters
 353     __ lea(rsp, rsp_after_call);
 354 
 355 #ifdef ASSERT
 356     // verify that threads correspond
 357     {
 358       Label L1, L2, L3;
 359       __ cmpptr(r15_thread, thread);
 360       __ jcc(Assembler::equal, L1);
 361       __ stop("StubRoutines::call_stub: r15_thread is corrupted");
 362       __ bind(L1);
 363       __ get_thread(rbx);
 364       __ cmpptr(r15_thread, thread);
 365       __ jcc(Assembler::equal, L2);
 366       __ stop("StubRoutines::call_stub: r15_thread is modified by call");
 367       __ bind(L2);
 368       __ cmpptr(r15_thread, rbx);
 369       __ jcc(Assembler::equal, L3);
 370       __ stop("StubRoutines::call_stub: threads must correspond");
 371       __ bind(L3);
 372     }
 373 #endif
 374 
 375     // restore regs belonging to calling function
 376 #ifdef _WIN64
 377     // emit the restores for xmm regs
 378     if (VM_Version::supports_evex()) {
 379       for (int i = xmm_save_first; i <= last_reg; i++) {
 380         __ vinsertf32x4(as_XMMRegister(i), as_XMMRegister(i), xmm_save(i), 0);
 381       }
 382     } else {
 383       for (int i = xmm_save_first; i <= last_reg; i++) {
 384         __ movdqu(as_XMMRegister(i), xmm_save(i));
 385       }
 386     }
 387 #endif
 388     __ movptr(r15, r15_save);
 389     __ movptr(r14, r14_save);
 390     __ movptr(r13, r13_save);
 391     __ movptr(r12, r12_save);
 392     __ movptr(rbx, rbx_save);
 393 
 394 #ifdef _WIN64
 395     __ movptr(rdi, rdi_save);
 396     __ movptr(rsi, rsi_save);
 397 #else
 398     __ ldmxcsr(mxcsr_save);
 399 #endif
 400 
 401     // restore rsp
 402     __ addptr(rsp, -rsp_after_call_off * wordSize);
 403 
 404     // return
 405     __ pop(rbp);
 406     __ ret(0);
 407 
 408     // handle return types different from T_INT
 409     __ BIND(is_long);
 410     __ movq(Address(c_rarg0, 0), rax);
 411     __ jmp(exit);
 412 
 413     __ BIND(is_float);
 414     __ movflt(Address(c_rarg0, 0), xmm0);
 415     __ jmp(exit);
 416 
 417     __ BIND(is_double);
 418     __ movdbl(Address(c_rarg0, 0), xmm0);
 419     __ jmp(exit);
 420 
 421     return start;
 422   }
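
  // A note on how the stub above is reached: the generated code is published as
  // StubRoutines::call_stub() and invoked from JavaCalls::call_helper() through
  // the CallStub function pointer type declared in stubRoutines.hpp.  The
  // signature is reproduced here for orientation only (see stubRoutines.hpp for
  // the authoritative declaration):
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);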
 423 
 424   // Return point for a Java call if there's an exception thrown in
 425   // Java code.  The exception is caught and transformed into a
 426   // pending exception stored in JavaThread that can be tested from
 427   // within the VM.
 428   //
 429   // Note: Usually the parameters are removed by the callee. In case
 430   // of an exception crossing an activation frame boundary, that is
 431   // not the case if the callee is compiled code => need to setup the
 432   // rsp.
 433   //
 434   // rax: exception oop
 435 
 436   address generate_catch_exception() {
 437     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 438     address start = __ pc();
 439 
 440     // same as in generate_call_stub():
 441     const Address rsp_after_call(rbp, rsp_after_call_off * wordSize);
 442     const Address thread        (rbp, thread_off         * wordSize);
 443 
 444 #ifdef ASSERT
 445     // verify that threads correspond
 446     {
 447       Label L1, L2, L3;
 448       __ cmpptr(r15_thread, thread);
 449       __ jcc(Assembler::equal, L1);
 450       __ stop("StubRoutines::catch_exception: r15_thread is corrupted");
 451       __ bind(L1);
 452       __ get_thread(rbx);
 453       __ cmpptr(r15_thread, thread);
 454       __ jcc(Assembler::equal, L2);
 455       __ stop("StubRoutines::catch_exception: r15_thread is modified by call");
 456       __ bind(L2);
 457       __ cmpptr(r15_thread, rbx);
 458       __ jcc(Assembler::equal, L3);
 459       __ stop("StubRoutines::catch_exception: threads must correspond");
 460       __ bind(L3);
 461     }
 462 #endif
 463 
 464     // set pending exception
 465     __ verify_oop(rax);
 466 
 467     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), rax);
 468     __ lea(rscratch1, ExternalAddress((address)__FILE__));
 469     __ movptr(Address(r15_thread, Thread::exception_file_offset()), rscratch1);
 470     __ movl(Address(r15_thread, Thread::exception_line_offset()), (int)  __LINE__);
 471 
 472     // complete return to VM
 473     assert(StubRoutines::_call_stub_return_address != NULL,
 474            "_call_stub_return_address must have been generated before");
 475     __ jump(RuntimeAddress(StubRoutines::_call_stub_return_address));
 476 
 477     return start;
 478   }
 479 
 480   // Continuation point for runtime calls returning with a pending
 481   // exception.  The pending exception check happened in the runtime
 482   // or native call stub.  The pending exception in Thread is
 483   // converted into a Java-level exception.
 484   //
 485   // Contract with Java-level exception handlers:
 486   // rax: exception
 487   // rdx: throwing pc
 488   //
 489   // NOTE: At entry of this stub, exception-pc must be on stack !!
 490 
 491   address generate_forward_exception() {
 492     StubCodeMark mark(this, "StubRoutines", "forward exception");
 493     address start = __ pc();
 494 
 495     // Upon entry, the sp points to the return address returning into
 496     // Java (interpreted or compiled) code; i.e., the return address
 497     // becomes the throwing pc.
 498     //
 499     // Arguments pushed before the runtime call are still on the stack
 500     // but the exception handler will reset the stack pointer ->
 501     // ignore them.  A potential result in registers can be ignored as
 502     // well.
 503 
 504 #ifdef ASSERT
 505     // make sure this code is only executed if there is a pending exception
 506     {
 507       Label L;
 508       __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 509       __ jcc(Assembler::notEqual, L);
 510       __ stop("StubRoutines::forward exception: no pending exception (1)");
 511       __ bind(L);
 512     }
 513 #endif
 514 
 515     // compute exception handler into rbx
 516     __ movptr(c_rarg0, Address(rsp, 0));
 517     BLOCK_COMMENT("call exception_handler_for_return_address");
 518     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 519                          SharedRuntime::exception_handler_for_return_address),
 520                     r15_thread, c_rarg0);
 521     __ mov(rbx, rax);
 522 
 523     // setup rax & rdx, remove return address & clear pending exception
 524     __ pop(rdx);
 525     __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
 526     __ movptr(Address(r15_thread, Thread::pending_exception_offset()), (int32_t)NULL_WORD);
 527 
 528 #ifdef ASSERT
 529     // make sure exception is set
 530     {
 531       Label L;
 532       __ testptr(rax, rax);
 533       __ jcc(Assembler::notEqual, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler (return address removed)
 540     // rax: exception
 541     // rbx: exception handler
 542     // rdx: throwing pc
 543     __ verify_oop(rax);
 544     __ jmp(rbx);
 545 
 546     return start;
 547   }
 548 
 549   // Support for jint atomic::xchg(jint exchange_value, volatile jint* dest)
 550   //
 551   // Arguments :
 552   //    c_rarg0: exchange_value
 553   //    c_rarg1: dest
 554   //
 555   // Result:
 556   //    *dest <- exchange_value, return (original *dest)
 557   address generate_atomic_xchg() {
 558     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 559     address start = __ pc();
 560 
 561     __ movl(rax, c_rarg0); // Copy to eax; we need a return value anyhow
 562     __ xchgl(rax, Address(c_rarg1, 0)); // automatic LOCK
 563     __ ret(0);
 564 
 565     return start;
 566   }
 567 
 568   // Support for intptr_t atomic::xchg_ptr(intptr_t exchange_value, volatile intptr_t* dest)
 569   //
 570   // Arguments :
 571   //    c_rarg0: exchange_value
 572   //    c_rarg1: dest
 573   //
 574   // Result:
 575   //    *dest <- ex, return (orig *dest)
 576   address generate_atomic_xchg_ptr() {
 577     StubCodeMark mark(this, "StubRoutines", "atomic_xchg_ptr");
 578     address start = __ pc();
 579 
 580     __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
 581     __ xchgptr(rax, Address(c_rarg1, 0)); // automatic LOCK
 582     __ ret(0);
 583 
 584     return start;
 585   }
 586 
 587   // Support for jint atomic::atomic_cmpxchg(jint exchange_value, volatile jint* dest,
 588   //                                         jint compare_value)
 589   //
 590   // Arguments :
 591   //    c_rarg0: exchange_value
 592   //    c_rarg1: dest
 593   //    c_rarg2: compare_value
 594   //
 595   // Result:
 596   //    if (compare_value == *dest) {
 597   //       *dest = exchange_value;
 598   //       return compare_value;
 599   //    } else
 600   //       return *dest;
 601   address generate_atomic_cmpxchg() {
 602     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 603     address start = __ pc();
 604 
 605     __ movl(rax, c_rarg2);
 606     if (os::is_MP()) __ lock();
 607     __ cmpxchgl(c_rarg0, Address(c_rarg1, 0));
 608     __ ret(0);
 609 
 610     return start;
 611   }
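
  // For reference only: the LOCK CMPXCHG sequence above implements, atomically,
  // the following C-level semantics (illustrative sketch, not VM code; the name
  // cmpxchg_sketch is made up for this comment):
  //
  //   jint cmpxchg_sketch(jint exchange_value, volatile jint* dest, jint compare_value) {
  //     jint old = *dest;                 // CMPXCHG leaves the prior value in rax
  //     if (old == compare_value) {
  //       *dest = exchange_value;
  //     }
  //     return old;
  //   }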
 612 
 613   // Support for jbyte atomic::atomic_cmpxchg(jbyte exchange_value, volatile jbyte* dest,
 614   //                                          jbyte compare_value)
 615   //
 616   // Arguments :
 617   //    c_rarg0: exchange_value
 618   //    c_rarg1: dest
 619   //    c_rarg2: compare_value
 620   //
 621   // Result:
 622   //    if (compare_value == *dest) {
 623   //       *dest = exchange_value;
 624   //       return compare_value;
 625   //    } else
 626   //       return *dest;
 627   address generate_atomic_cmpxchg_byte() {
 628     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_byte");
 629     address start = __ pc();
 630 
 631     __ movsbq(rax, c_rarg2);
 632     if (os::is_MP()) __ lock();
 633     __ cmpxchgb(c_rarg0, Address(c_rarg1, 0));
 634     __ ret(0);
 635 
 636     return start;
 637   }
 638 
 639   // Support for jlong atomic::atomic_cmpxchg(jlong exchange_value,
 640   //                                          volatile jlong* dest,
 641   //                                          jlong compare_value)
 642   // Arguments :
 643   //    c_rarg0: exchange_value
 644   //    c_rarg1: dest
 645   //    c_rarg2: compare_value
 646   //
 647   // Result:
 648   //    if (compare_value == *dest) {
 649   //       *dest = exchange_value;
 650   //       return compare_value;
 651   //    } else
 652   //       return *dest;
 653   address generate_atomic_cmpxchg_long() {
 654     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 655     address start = __ pc();
 656 
 657     __ movq(rax, c_rarg2);
 658     if (os::is_MP()) __ lock();
 659     __ cmpxchgq(c_rarg0, Address(c_rarg1, 0));
 660     __ ret(0);
 661 
 662     return start;
 663   }
 664 
 665   // Support for jint atomic::add(jint add_value, volatile jint* dest)
 666   //
 667   // Arguments :
 668   //    c_rarg0: add_value
 669   //    c_rarg1: dest
 670   //
 671   // Result:
 672   //    *dest += add_value
 673   //    return *dest;
 674   address generate_atomic_add() {
 675     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 676     address start = __ pc();
 677 
 678     __ movl(rax, c_rarg0);
 679     if (os::is_MP()) __ lock();
 680     __ xaddl(Address(c_rarg1, 0), c_rarg0);
 681     __ addl(rax, c_rarg0);
 682     __ ret(0);
 683 
 684     return start;
 685   }
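
  // For reference only: LOCK XADD leaves the *old* value of *dest in c_rarg0,
  // so the stub adds add_value back into rax to return the *new* value.  An
  // illustrative C-level sketch (names made up for this comment; not VM code):
  //
  //   jint add_sketch(jint add_value, volatile jint* dest) {
  //     jint old = *dest;                 // fetched atomically by LOCK XADD
  //     *dest = old + add_value;
  //     return old + add_value;
  //   }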
 686 
 687   // Support for intptr_t atomic::add_ptr(intptr_t add_value, volatile intptr_t* dest)
 688   //
 689   // Arguments :
 690   //    c_rarg0: add_value
 691   //    c_rarg1: dest
 692   //
 693   // Result:
 694   //    *dest += add_value
 695   //    return *dest;
 696   address generate_atomic_add_ptr() {
 697     StubCodeMark mark(this, "StubRoutines", "atomic_add_ptr");
 698     address start = __ pc();
 699 
 700     __ movptr(rax, c_rarg0); // Copy to rax; we need a return value anyhow
 701     if (os::is_MP()) __ lock();
 702     __ xaddptr(Address(c_rarg1, 0), c_rarg0);
 703     __ addptr(rax, c_rarg0);
 704     __ ret(0);
 705 
 706     return start;
 707   }
 708 
 709   // Support for intptr_t OrderAccess::fence()
 710   //
 711   // Arguments :
 712   //
 713   // Result:
 714   address generate_orderaccess_fence() {
 715     StubCodeMark mark(this, "StubRoutines", "orderaccess_fence");
 716     address start = __ pc();
 717     __ membar(Assembler::StoreLoad);
 718     __ ret(0);
 719 
 720     return start;
 721   }
 722 
 723   // Support for intptr_t get_previous_fp()
 724   //
 725   // This routine is used to find the previous frame pointer for the
 726   // caller (current_frame_guess). It is used as part of debugging
 727   // when ps() is seemingly lost trying to find frames.
 728   // This code assumes that the caller (current_frame_guess) has a frame.
 729   address generate_get_previous_fp() {
 730     StubCodeMark mark(this, "StubRoutines", "get_previous_fp");
 731     const Address old_fp(rbp, 0);
 732     const Address older_fp(rax, 0);
 733     address start = __ pc();
 734 
 735     __ enter();
 736     __ movptr(rax, old_fp); // caller's fp
 737     __ movptr(rax, older_fp); // the frame for ps()
 738     __ pop(rbp);
 739     __ ret(0);
 740 
 741     return start;
 742   }
 743 
 744   // Support for intptr_t get_previous_sp()
 745   //
 746   // This routine is used to find the previous stack pointer for the
 747   // caller.
 748   address generate_get_previous_sp() {
 749     StubCodeMark mark(this, "StubRoutines", "get_previous_sp");
 750     address start = __ pc();
 751 
 752     __ movptr(rax, rsp);
 753     __ addptr(rax, 8); // return address is at the top of the stack.
 754     __ ret(0);
 755 
 756     return start;
 757   }
 758 
 759   //----------------------------------------------------------------------------------------------------
 760   // Support for void verify_mxcsr()
 761   //
 762   // This routine is used with -Xcheck:jni to verify that native
 763   // JNI code does not return to Java code without restoring the
 764   // MXCSR register to our expected state.
 765 
 766   address generate_verify_mxcsr() {
 767     StubCodeMark mark(this, "StubRoutines", "verify_mxcsr");
 768     address start = __ pc();
 769 
 770     const Address mxcsr_save(rsp, 0);
 771 
 772     if (CheckJNICalls) {
 773       Label ok_ret;
 774       ExternalAddress mxcsr_std(StubRoutines::addr_mxcsr_std());
 775       __ push(rax);
 776       __ subptr(rsp, wordSize);      // allocate a temp location
 777       __ stmxcsr(mxcsr_save);
 778       __ movl(rax, mxcsr_save);
 779       __ andl(rax, MXCSR_MASK);    // Only check control and mask bits
 780       __ cmp32(rax, mxcsr_std);
 781       __ jcc(Assembler::equal, ok_ret);
 782 
 783       __ warn("MXCSR changed by native JNI code, use -XX:+RestoreMXCSROnJNICall");
 784 
 785       __ ldmxcsr(mxcsr_std);
 786 
 787       __ bind(ok_ret);
 788       __ addptr(rsp, wordSize);
 789       __ pop(rax);
 790     }
 791 
 792     __ ret(0);
 793 
 794     return start;
 795   }
 796 
 797   address generate_f2i_fixup() {
 798     StubCodeMark mark(this, "StubRoutines", "f2i_fixup");
 799     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 800 
 801     address start = __ pc();
 802 
 803     Label L;
 804 
 805     __ push(rax);
 806     __ push(c_rarg3);
 807     __ push(c_rarg2);
 808     __ push(c_rarg1);
 809 
 810     __ movl(rax, 0x7f800000);
 811     __ xorl(c_rarg3, c_rarg3);
 812     __ movl(c_rarg2, inout);
 813     __ movl(c_rarg1, c_rarg2);
 814     __ andl(c_rarg1, 0x7fffffff);
 815     __ cmpl(rax, c_rarg1); // NaN? -> 0
 816     __ jcc(Assembler::negative, L);
 817     __ testl(c_rarg2, c_rarg2); // signed ? min_jint : max_jint
 818     __ movl(c_rarg3, 0x80000000);
 819     __ movl(rax, 0x7fffffff);
 820     __ cmovl(Assembler::positive, c_rarg3, rax);
 821 
 822     __ bind(L);
 823     __ movptr(inout, c_rarg3);
 824 
 825     __ pop(c_rarg1);
 826     __ pop(c_rarg2);
 827     __ pop(c_rarg3);
 828     __ pop(rax);
 829 
 830     __ ret(0);
 831 
 832     return start;
 833   }
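
  // Background for the fixup stubs: the x86 cvttss2si/cvttsd2si instructions
  // return the "integer indefinite" value (min_jint/min_jlong) for NaN and for
  // out-of-range inputs, so compiled code calls these stubs only in that case to
  // produce the Java-mandated result.  An illustrative C-level sketch of the
  // f2i case (name made up for this comment; not VM code):
  //
  //   jint f2i_fixup_sketch(jfloat x) {  // only reached when cvttss2si gave 0x80000000
  //     if (x != x)     return 0;             // NaN -> 0
  //     if (x > 0.0f)   return 0x7fffffff;    // positive overflow -> max_jint
  //     return (jint)0x80000000;              // negative overflow -> min_jint
  //   }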
 834 
 835   address generate_f2l_fixup() {
 836     StubCodeMark mark(this, "StubRoutines", "f2l_fixup");
 837     Address inout(rsp, 5 * wordSize); // return address + 4 saves
 838     address start = __ pc();
 839 
 840     Label L;
 841 
 842     __ push(rax);
 843     __ push(c_rarg3);
 844     __ push(c_rarg2);
 845     __ push(c_rarg1);
 846 
 847     __ movl(rax, 0x7f800000);
 848     __ xorl(c_rarg3, c_rarg3);
 849     __ movl(c_rarg2, inout);
 850     __ movl(c_rarg1, c_rarg2);
 851     __ andl(c_rarg1, 0x7fffffff);
 852     __ cmpl(rax, c_rarg1); // NaN? -> 0
 853     __ jcc(Assembler::negative, L);
 854     __ testl(c_rarg2, c_rarg2); // signed ? min_jlong : max_jlong
 855     __ mov64(c_rarg3, 0x8000000000000000);
 856     __ mov64(rax, 0x7fffffffffffffff);
 857     __ cmov(Assembler::positive, c_rarg3, rax);
 858 
 859     __ bind(L);
 860     __ movptr(inout, c_rarg3);
 861 
 862     __ pop(c_rarg1);
 863     __ pop(c_rarg2);
 864     __ pop(c_rarg3);
 865     __ pop(rax);
 866 
 867     __ ret(0);
 868 
 869     return start;
 870   }
 871 
 872   address generate_d2i_fixup() {
 873     StubCodeMark mark(this, "StubRoutines", "d2i_fixup");
 874     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 875 
 876     address start = __ pc();
 877 
 878     Label L;
 879 
 880     __ push(rax);
 881     __ push(c_rarg3);
 882     __ push(c_rarg2);
 883     __ push(c_rarg1);
 884     __ push(c_rarg0);
 885 
 886     __ movl(rax, 0x7ff00000);
 887     __ movq(c_rarg2, inout);
 888     __ movl(c_rarg3, c_rarg2);
 889     __ mov(c_rarg1, c_rarg2);
 890     __ mov(c_rarg0, c_rarg2);
 891     __ negl(c_rarg3);
 892     __ shrptr(c_rarg1, 0x20);
 893     __ orl(c_rarg3, c_rarg2);
 894     __ andl(c_rarg1, 0x7fffffff);
 895     __ xorl(c_rarg2, c_rarg2);
 896     __ shrl(c_rarg3, 0x1f);
 897     __ orl(c_rarg1, c_rarg3);
 898     __ cmpl(rax, c_rarg1);
 899     __ jcc(Assembler::negative, L); // NaN -> 0
 900     __ testptr(c_rarg0, c_rarg0); // signed ? min_jint : max_jint
 901     __ movl(c_rarg2, 0x80000000);
 902     __ movl(rax, 0x7fffffff);
 903     __ cmov(Assembler::positive, c_rarg2, rax);
 904 
 905     __ bind(L);
 906     __ movptr(inout, c_rarg2);
 907 
 908     __ pop(c_rarg0);
 909     __ pop(c_rarg1);
 910     __ pop(c_rarg2);
 911     __ pop(c_rarg3);
 912     __ pop(rax);
 913 
 914     __ ret(0);
 915 
 916     return start;
 917   }
 918 
 919   address generate_d2l_fixup() {
 920     StubCodeMark mark(this, "StubRoutines", "d2l_fixup");
 921     Address inout(rsp, 6 * wordSize); // return address + 5 saves
 922 
 923     address start = __ pc();
 924 
 925     Label L;
 926 
 927     __ push(rax);
 928     __ push(c_rarg3);
 929     __ push(c_rarg2);
 930     __ push(c_rarg1);
 931     __ push(c_rarg0);
 932 
 933     __ movl(rax, 0x7ff00000);
 934     __ movq(c_rarg2, inout);
 935     __ movl(c_rarg3, c_rarg2);
 936     __ mov(c_rarg1, c_rarg2);
 937     __ mov(c_rarg0, c_rarg2);
 938     __ negl(c_rarg3);
 939     __ shrptr(c_rarg1, 0x20);
 940     __ orl(c_rarg3, c_rarg2);
 941     __ andl(c_rarg1, 0x7fffffff);
 942     __ xorl(c_rarg2, c_rarg2);
 943     __ shrl(c_rarg3, 0x1f);
 944     __ orl(c_rarg1, c_rarg3);
 945     __ cmpl(rax, c_rarg1);
 946     __ jcc(Assembler::negative, L); // NaN -> 0
 947     __ testq(c_rarg0, c_rarg0); // signed ? min_jlong : max_jlong
 948     __ mov64(c_rarg2, 0x8000000000000000);
 949     __ mov64(rax, 0x7fffffffffffffff);
 950     __ cmovq(Assembler::positive, c_rarg2, rax);
 951 
 952     __ bind(L);
 953     __ movq(inout, c_rarg2);
 954 
 955     __ pop(c_rarg0);
 956     __ pop(c_rarg1);
 957     __ pop(c_rarg2);
 958     __ pop(c_rarg3);
 959     __ pop(rax);
 960 
 961     __ ret(0);
 962 
 963     return start;
 964   }
 965 
 966   address generate_fp_mask(const char *stub_name, int64_t mask) {
 967     __ align(CodeEntryAlignment);
 968     StubCodeMark mark(this, "StubRoutines", stub_name);
 969     address start = __ pc();
 970 
 971     __ emit_data64( mask, relocInfo::none );
 972     __ emit_data64( mask, relocInfo::none );
 973 
 974     return start;
 975   }
 976 
 977   // Non-destructive plausibility checks for oops
 978   //
 979   // Arguments:
 980   //    all args on stack!
 981   //
 982   // Stack after saving c_rarg3:
 983   //    [tos + 0]: saved c_rarg3
 984   //    [tos + 1]: saved c_rarg2
 985   //    [tos + 2]: saved r12 (several TemplateTable methods use it)
 986   //    [tos + 3]: saved flags
 987   //    [tos + 4]: return address
 988   //  * [tos + 5]: error message (char*)
 989   //  * [tos + 6]: object to verify (oop)
 990   //  * [tos + 7]: saved rax - saved by caller and bashed
 991   //  * [tos + 8]: saved r10 (rscratch1) - saved by caller
 992   //  * = popped on exit
 993   address generate_verify_oop() {
 994     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 995     address start = __ pc();
 996 
 997     Label exit, error;
 998 
 999     __ pushf();
1000     __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
1001 
1002     __ push(r12);
1003 
1004     // save c_rarg2 and c_rarg3
1005     __ push(c_rarg2);
1006     __ push(c_rarg3);
1007 
1008     enum {
1009            // After previous pushes.
1010            oop_to_verify = 6 * wordSize,
1011            saved_rax     = 7 * wordSize,
1012            saved_r10     = 8 * wordSize,
1013 
1014            // Before the call to MacroAssembler::debug(), see below.
1015            return_addr   = 16 * wordSize,
1016            error_msg     = 17 * wordSize
1017     };
1018 
1019     // get object
1020     __ movptr(rax, Address(rsp, oop_to_verify));
1021 
1022     // make sure object is 'reasonable'
1023     __ testptr(rax, rax);
1024     __ jcc(Assembler::zero, exit); // if obj is NULL it is OK
1025     // Check if the oop is in the right area of memory
1026     __ movptr(c_rarg2, rax);
1027     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_mask());
1028     __ andptr(c_rarg2, c_rarg3);
1029     __ movptr(c_rarg3, (intptr_t) Universe::verify_oop_bits());
1030     __ cmpptr(c_rarg2, c_rarg3);
1031     __ jcc(Assembler::notZero, error);
1032 
1033     // set r12 to heapbase for load_klass()
1034     __ reinit_heapbase();
1035 
1036     // make sure klass is 'reasonable', which is not zero.
1037     __ load_klass(rax, rax);  // get klass
1038     __ testptr(rax, rax);
1039     __ jcc(Assembler::zero, error); // if klass is NULL it is broken
1040 
1041     // return if everything seems ok
1042     __ bind(exit);
1043     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1044     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1045     __ pop(c_rarg3);                             // restore c_rarg3
1046     __ pop(c_rarg2);                             // restore c_rarg2
1047     __ pop(r12);                                 // restore r12
1048     __ popf();                                   // restore flags
1049     __ ret(4 * wordSize);                        // pop caller saved stuff
1050 
1051     // handle errors
1052     __ bind(error);
1053     __ movptr(rax, Address(rsp, saved_rax));     // get saved rax back
1054     __ movptr(rscratch1, Address(rsp, saved_r10)); // get saved r10 back
1055     __ pop(c_rarg3);                             // get saved c_rarg3 back
1056     __ pop(c_rarg2);                             // get saved c_rarg2 back
1057     __ pop(r12);                                 // get saved r12 back
1058     __ popf();                                   // get saved flags off stack --
1059                                                  // will be ignored
1060 
1061     __ pusha();                                  // push registers
1062                                                  // (rip is already
1063                                                  // pushed)
1064     // debug(char* msg, int64_t pc, int64_t regs[])
1065     // We've popped the registers we'd saved (c_rarg3, c_rarg2 and flags), and
1066     // pushed all the registers, so now the stack looks like:
1067     //     [tos +  0] 16 saved registers
1068     //     [tos + 16] return address
1069     //   * [tos + 17] error message (char*)
1070     //   * [tos + 18] object to verify (oop)
1071     //   * [tos + 19] saved rax - saved by caller and bashed
1072     //   * [tos + 20] saved r10 (rscratch1) - saved by caller
1073     //   * = popped on exit
1074 
1075     __ movptr(c_rarg0, Address(rsp, error_msg));    // pass address of error message
1076     __ movptr(c_rarg1, Address(rsp, return_addr));  // pass return address
1077     __ movq(c_rarg2, rsp);                          // pass address of regs on stack
1078     __ mov(r12, rsp);                               // remember rsp
1079     __ subptr(rsp, frame::arg_reg_save_area_bytes); // windows
1080     __ andptr(rsp, -16);                            // align stack as required by ABI
1081     BLOCK_COMMENT("call MacroAssembler::debug");
1082     __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
1083     __ mov(rsp, r12);                               // restore rsp
1084     __ popa();                                      // pop registers (includes r12)
1085     __ ret(4 * wordSize);                           // pop caller saved stuff
1086 
1087     return start;
1088   }
1089 
1090   //
1091   // Verify that a register contains a clean 32-bit positive value
1092   // (high 32 bits are 0) so it can be used in 64-bit shifts.
1093   //
1094   //  Input:
1095   //    Rint  -  32-bit value
1096   //    Rtmp  -  scratch
1097   //
1098   void assert_clean_int(Register Rint, Register Rtmp) {
1099 #ifdef ASSERT
1100     Label L;
1101     assert_different_registers(Rtmp, Rint);
1102     __ movslq(Rtmp, Rint);
1103     __ cmpq(Rtmp, Rint);
1104     __ jcc(Assembler::equal, L);
1105     __ stop("high 32-bits of int value are not 0");
1106     __ bind(L);
1107 #endif
1108   }
1109 
1110   //  Generate overlap test for array copy stubs
1111   //
1112   //  Input:
1113   //     c_rarg0 - from
1114   //     c_rarg1 - to
1115   //     c_rarg2 - element count
1116   //
1117   //  Output:
1118   //     rax   - &from[element count] (first byte after the last element)
1119   //
1120   void array_overlap_test(address no_overlap_target, Address::ScaleFactor sf) {
1121     assert(no_overlap_target != NULL, "must be generated");
1122     array_overlap_test(no_overlap_target, NULL, sf);
1123   }
1124   void array_overlap_test(Label& L_no_overlap, Address::ScaleFactor sf) {
1125     array_overlap_test(NULL, &L_no_overlap, sf);
1126   }
1127   void array_overlap_test(address no_overlap_target, Label* NOLp, Address::ScaleFactor sf) {
1128     const Register from     = c_rarg0;
1129     const Register to       = c_rarg1;
1130     const Register count    = c_rarg2;
1131     const Register end_from = rax;
1132 
1133     __ cmpptr(to, from);
1134     __ lea(end_from, Address(from, count, sf, 0));
1135     if (NOLp == NULL) {
1136       ExternalAddress no_overlap(no_overlap_target);
1137       __ jump_cc(Assembler::belowEqual, no_overlap);
1138       __ cmpptr(to, end_from);
1139       __ jump_cc(Assembler::aboveEqual, no_overlap);
1140     } else {
1141       __ jcc(Assembler::belowEqual, (*NOLp));
1142       __ cmpptr(to, end_from);
1143       __ jcc(Assembler::aboveEqual, (*NOLp));
1144     }
1145   }
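
  // The overlap test above branches to the no-overlap (forward copy) path
  // whenever a forward copy is safe, i.e. roughly the following C-level
  // condition (illustrative only; sf scales the element count to bytes):
  //
  //   bool forward_copy_is_safe(char* from, char* to, size_t byte_count) {
  //     return (to <= from) || (to >= from + byte_count);
  //   }
  //
  // Otherwise execution falls through to the conjoint (backward) copy code.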
1146 
1147   // Shuffle first three arg regs on Windows into Linux/Solaris locations.
1148   //
1149   // Outputs:
1150   //    rdi - rcx
1151   //    rsi - rdx
1152   //    rdx - r8
1153   //    rcx - r9
1154   //
1155   // On Windows, r9 and r10 are used to save rdi and rsi, which are non-volatile
1156   // there.  r9 and r10 should therefore not be used by the caller.
1157   //
1158   void setup_arg_regs(int nargs = 3) {
1159     const Register saved_rdi = r9;
1160     const Register saved_rsi = r10;
1161     assert(nargs == 3 || nargs == 4, "else fix");
1162 #ifdef _WIN64
1163     assert(c_rarg0 == rcx && c_rarg1 == rdx && c_rarg2 == r8 && c_rarg3 == r9,
1164            "unexpected argument registers");
1165     if (nargs >= 4)
1166       __ mov(rax, r9);  // r9 is also saved_rdi
1167     __ movptr(saved_rdi, rdi);
1168     __ movptr(saved_rsi, rsi);
1169     __ mov(rdi, rcx); // c_rarg0
1170     __ mov(rsi, rdx); // c_rarg1
1171     __ mov(rdx, r8);  // c_rarg2
1172     if (nargs >= 4)
1173       __ mov(rcx, rax); // c_rarg3 (via rax)
1174 #else
1175     assert(c_rarg0 == rdi && c_rarg1 == rsi && c_rarg2 == rdx && c_rarg3 == rcx,
1176            "unexpected argument registers");
1177 #endif
1178   }
1179 
1180   void restore_arg_regs() {
1181     const Register saved_rdi = r9;
1182     const Register saved_rsi = r10;
1183 #ifdef _WIN64
1184     __ movptr(rdi, saved_rdi);
1185     __ movptr(rsi, saved_rsi);
1186 #endif
1187   }
1188 
1189   // Generate code for an array write pre barrier
1190   //
1191   //     addr    -  starting address
1192   //     count   -  element count
1193   //     tmp     - scratch register
1194   //
1195   //     Destroy no registers!
1196   //
1197   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
1198     BarrierSet* bs = Universe::heap()->barrier_set();
1199     switch (bs->kind()) {
1200       case BarrierSet::G1SATBCTLogging:
1201         // With G1, don't generate the call if we statically know that the target is uninitialized
1202         if (!dest_uninitialized) {
1203            __ pusha();                      // push registers
1204            if (count == c_rarg0) {
1205              if (addr == c_rarg1) {
1206                // exactly backwards!!
1207                __ xchgptr(c_rarg1, c_rarg0);
1208              } else {
1209                __ movptr(c_rarg1, count);
1210                __ movptr(c_rarg0, addr);
1211              }
1212            } else {
1213              __ movptr(c_rarg0, addr);
1214              __ movptr(c_rarg1, count);
1215            }
1216            __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
1217            __ popa();
1218         }
1219          break;
1220       case BarrierSet::CardTableForRS:
1221       case BarrierSet::CardTableExtension:
1222       case BarrierSet::ModRef:
1223       case BarrierSet::Epsilon:
1224         break;
1225       default:
1226         ShouldNotReachHere();
1227 
1228     }
1229   }
1230 
1231   //
1232   // Generate code for an array write post barrier
1233   //
1234   //  Input:
1235   //     start    - register containing starting address of destination array
1236   //     count    - elements count
1237   //     scratch  - scratch register
1238   //
1239   //  The input registers are overwritten.
1240   //
1241   void  gen_write_ref_array_post_barrier(Register start, Register count, Register scratch) {
1242     assert_different_registers(start, count, scratch);
1243     BarrierSet* bs = Universe::heap()->barrier_set();
1244     switch (bs->kind()) {
1245       case BarrierSet::G1SATBCTLogging:
1246         {
1247           __ pusha();             // push registers (overkill)
1248           if (c_rarg0 == count) { // On win64 c_rarg0 == rcx
1249             assert_different_registers(c_rarg1, start);
1250             __ mov(c_rarg1, count);
1251             __ mov(c_rarg0, start);
1252           } else {
1253             assert_different_registers(c_rarg0, count);
1254             __ mov(c_rarg0, start);
1255             __ mov(c_rarg1, count);
1256           }
1257           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
1258           __ popa();
1259         }
1260         break;
1261       case BarrierSet::CardTableForRS:
1262       case BarrierSet::CardTableExtension:
1263         {
1264           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
1265           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1266 
1267           Label L_loop;
1268           const Register end = count;
1269 
1270           __ leaq(end, Address(start, count, TIMES_OOP, 0));  // end == start+count*oop_size
1271           __ subptr(end, BytesPerHeapOop); // end - 1 to make inclusive
1272           __ shrptr(start, CardTableModRefBS::card_shift);
1273           __ shrptr(end,   CardTableModRefBS::card_shift);
1274           __ subptr(end, start); // end --> cards count
1275 
1276           int64_t disp = (int64_t) ct->byte_map_base;
1277           __ mov64(scratch, disp);
1278           __ addptr(start, scratch);
1279         __ BIND(L_loop);
1280           __ movb(Address(start, count, Address::times_1), 0);
1281           __ decrement(count);
1282           __ jcc(Assembler::greaterEqual, L_loop);
1283         }
1284         break;
1285       case BarrierSet::Epsilon:
1286         // TODO: why are we here at all?
1287         break;
1288       default:
1289         ShouldNotReachHere();
1290 
1291     }
1292   }
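
  // For the card-table cases above: the generated loop dirties every card
  // spanned by the destination range, which corresponds roughly to the
  // following C-level sketch (illustrative only; 0 is the dirty card value;
  // the name dirty_cards_sketch is made up for this comment):
  //
  //   void dirty_cards_sketch(jbyte* byte_map_base, uintptr_t first, uintptr_t last_incl) {
  //     for (uintptr_t c = first >> CardTableModRefBS::card_shift;
  //          c <= (last_incl >> CardTableModRefBS::card_shift); c++) {
  //       byte_map_base[c] = 0;
  //     }
  //   }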
1293 
1294 
1295   // Copy big chunks forward
1296   //
1297   // Inputs:
1298   //   end_from     - source array's end address
1299   //   end_to       - destination array's end address
1300   //   qword_count  - 64-bit element count, negative
1301   //   to           - scratch
1302   //   L_copy_bytes - entry label
1303   //   L_copy_8_bytes  - exit  label
1304   //
1305   void copy_bytes_forward(Register end_from, Register end_to,
1306                              Register qword_count, Register to,
1307                              Label& L_copy_bytes, Label& L_copy_8_bytes) {
1308     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1309     Label L_loop;
1310     __ align(OptoLoopAlignment);
1311     if (UseUnalignedLoadStores) {
1312       Label L_end;
1313       if (UseAVX > 2) {
1314         __ movl(to, 0xffff);
1315         __ kmovwl(k1, to);
1316       }
1317       // Copy 64-bytes per iteration
1318       __ BIND(L_loop);
1319       if (UseAVX > 2) {
1320         __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit);
1321         __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit);
1322       } else if (UseAVX == 2) {
1323         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1324         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1325         __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
1326         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
1327       } else {
1328         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
1329         __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
1330         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
1331         __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
1332         __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
1333         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
1334         __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
1335         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
1336       }
1337       __ BIND(L_copy_bytes);
1338       __ addptr(qword_count, 8);
1339       __ jcc(Assembler::lessEqual, L_loop);
1340       __ subptr(qword_count, 4);  // sub(8) and add(4)
1341       __ jccb(Assembler::greater, L_end);
1342       // Copy trailing 32 bytes
1343       if (UseAVX >= 2) {
1344         __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1345         __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1346       } else {
1347         __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
1348         __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
1349         __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
1350         __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
1351       }
1352       __ addptr(qword_count, 4);
1353       __ BIND(L_end);
1354       if (UseAVX >= 2) {
1355         // clean upper bits of YMM registers
1356         __ vpxor(xmm0, xmm0);
1357         __ vpxor(xmm1, xmm1);
1358       }
1359     } else {
1360       // Copy 32-bytes per iteration
1361       __ BIND(L_loop);
1362       __ movq(to, Address(end_from, qword_count, Address::times_8, -24));
1363       __ movq(Address(end_to, qword_count, Address::times_8, -24), to);
1364       __ movq(to, Address(end_from, qword_count, Address::times_8, -16));
1365       __ movq(Address(end_to, qword_count, Address::times_8, -16), to);
1366       __ movq(to, Address(end_from, qword_count, Address::times_8, - 8));
1367       __ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
1368       __ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
1369       __ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
1370 
1371       __ BIND(L_copy_bytes);
1372       __ addptr(qword_count, 4);
1373       __ jcc(Assembler::lessEqual, L_loop);
1374     }
1375     __ subptr(qword_count, 4);
1376     __ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
1377   }
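
  // Note on the loop structure above (mirrored in copy_bytes_backward below):
  // the caller passes a negative qword_count and end pointers that already
  // address the last qword, so the copy runs forward as the count climbs toward
  // zero.  Ignoring the unrolled SSE/AVX paths, the combined effect of this loop
  // plus the caller's trailing-qword loop is roughly (illustrative sketch only;
  // names made up for this comment):
  //
  //   // end_from/end_to point at the last qword; qwords == -(number of qwords)
  //   void copy_forward_sketch(jlong* end_from, jlong* end_to, intptr_t qwords) {
  //     for (intptr_t i = qwords + 1; i <= 0; i++) {
  //       end_to[i] = end_from[i];
  //     }
  //   }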
1378 
1379   // Copy big chunks backward
1380   //
1381   // Inputs:
1382   //   from         - source array address
1383   //   dest         - destination array address
1384   //   qword_count  - 64-bit element count
1385   //   to           - scratch
1386   //   L_copy_bytes - entry label
1387   //   L_copy_8_bytes  - exit  label
1388   //
1389   void copy_bytes_backward(Register from, Register dest,
1390                               Register qword_count, Register to,
1391                               Label& L_copy_bytes, Label& L_copy_8_bytes) {
1392     DEBUG_ONLY(__ stop("enter at entry label, not here"));
1393     Label L_loop;
1394     __ align(OptoLoopAlignment);
1395     if (UseUnalignedLoadStores) {
1396       Label L_end;
1397       if (UseAVX > 2) {
1398         __ movl(to, 0xffff);
1399         __ kmovwl(k1, to);
1400       }
1401       // Copy 64-bytes per iteration
1402       __ BIND(L_loop);
1403       if (UseAVX > 2) {
1404         __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit);
1405         __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit);
1406       } else if (UseAVX == 2) {
1407         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
1408         __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
1409         __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1410         __ vmovdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1411       } else {
1412         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
1413         __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
1414         __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
1415         __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
1416         __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
1417         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
1418         __ movdqu(xmm3, Address(from, qword_count, Address::times_8,  0));
1419         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm3);
1420       }
1421       __ BIND(L_copy_bytes);
1422       __ subptr(qword_count, 8);
1423       __ jcc(Assembler::greaterEqual, L_loop);
1424 
1425       __ addptr(qword_count, 4);  // add(8) and sub(4)
1426       __ jccb(Assembler::less, L_end);
1427       // Copy trailing 32 bytes
1428       if (UseAVX >= 2) {
1429         __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
1430         __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
1431       } else {
1432         __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
1433         __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
1434         __ movdqu(xmm1, Address(from, qword_count, Address::times_8,  0));
1435         __ movdqu(Address(dest, qword_count, Address::times_8,  0), xmm1);
1436       }
1437       __ subptr(qword_count, 4);
1438       __ BIND(L_end);
1439       if (UseAVX >= 2) {
1440         // clean upper bits of YMM registers
1441         __ vpxor(xmm0, xmm0);
1442         __ vpxor(xmm1, xmm1);
1443       }
1444     } else {
1445       // Copy 32-bytes per iteration
1446       __ BIND(L_loop);
1447       __ movq(to, Address(from, qword_count, Address::times_8, 24));
1448       __ movq(Address(dest, qword_count, Address::times_8, 24), to);
1449       __ movq(to, Address(from, qword_count, Address::times_8, 16));
1450       __ movq(Address(dest, qword_count, Address::times_8, 16), to);
1451       __ movq(to, Address(from, qword_count, Address::times_8,  8));
1452       __ movq(Address(dest, qword_count, Address::times_8,  8), to);
1453       __ movq(to, Address(from, qword_count, Address::times_8,  0));
1454       __ movq(Address(dest, qword_count, Address::times_8,  0), to);
1455 
1456       __ BIND(L_copy_bytes);
1457       __ subptr(qword_count, 4);
1458       __ jcc(Assembler::greaterEqual, L_loop);
1459     }
1460     __ addptr(qword_count, 4);
1461     __ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
1462   }
1463 
1464 
1465   // Arguments:
1466   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1467   //             ignored
1468   //   name    - stub name string
1469   //
1470   // Inputs:
1471   //   c_rarg0   - source array address
1472   //   c_rarg1   - destination array address
1473   //   c_rarg2   - element count, treated as ssize_t, can be zero
1474   //
1475   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1476   // we let the hardware handle it.  The one to eight bytes within words,
1477   // dwords or qwords that span cache line boundaries will still be loaded
1478   // and stored atomically.
1479   //
1480   // Side Effects:
1481   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1482   //   used by generate_conjoint_byte_copy().
1483   //
1484   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1485     __ align(CodeEntryAlignment);
1486     StubCodeMark mark(this, "StubRoutines", name);
1487     address start = __ pc();
1488 
1489     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1490     Label L_copy_byte, L_exit;
1491     const Register from        = rdi;  // source array address
1492     const Register to          = rsi;  // destination array address
1493     const Register count       = rdx;  // elements count
1494     const Register byte_count  = rcx;
1495     const Register qword_count = count;
1496     const Register end_from    = from; // source array end address
1497     const Register end_to      = to;   // destination array end address
1498     // End pointers are inclusive, and if count is not zero they point
1499     // to the last unit copied:  end_to[0] := end_from[0]
1500 
1501     __ enter(); // required for proper stackwalking of RuntimeStub frame
1502     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1503 
1504     if (entry != NULL) {
1505       *entry = __ pc();
1506        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1507       BLOCK_COMMENT("Entry:");
1508     }
1509 
1510     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1511                       // r9 and r10 may be used to save non-volatile registers
1512 
1513     // 'from', 'to' and 'count' are now valid
1514     __ movptr(byte_count, count);
1515     __ shrptr(count, 3); // count => qword_count
1516 
1517     // Copy from low to high addresses.  Use 'to' as scratch.
1518     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1519     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1520     __ negptr(qword_count); // make the count negative
1521     __ jmp(L_copy_bytes);
1522 
1523     // Copy trailing qwords
1524   __ BIND(L_copy_8_bytes);
1525     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1526     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1527     __ increment(qword_count);
1528     __ jcc(Assembler::notZero, L_copy_8_bytes);
1529 
1530     // Check for and copy trailing dword
1531   __ BIND(L_copy_4_bytes);
1532     __ testl(byte_count, 4);
1533     __ jccb(Assembler::zero, L_copy_2_bytes);
1534     __ movl(rax, Address(end_from, 8));
1535     __ movl(Address(end_to, 8), rax);
1536 
1537     __ addptr(end_from, 4);
1538     __ addptr(end_to, 4);
1539 
1540     // Check for and copy trailing word
1541   __ BIND(L_copy_2_bytes);
1542     __ testl(byte_count, 2);
1543     __ jccb(Assembler::zero, L_copy_byte);
1544     __ movw(rax, Address(end_from, 8));
1545     __ movw(Address(end_to, 8), rax);
1546 
1547     __ addptr(end_from, 2);
1548     __ addptr(end_to, 2);
1549 
1550     // Check for and copy trailing byte
1551   __ BIND(L_copy_byte);
1552     __ testl(byte_count, 1);
1553     __ jccb(Assembler::zero, L_exit);
1554     __ movb(rax, Address(end_from, 8));
1555     __ movb(Address(end_to, 8), rax);
1556 
1557   __ BIND(L_exit);
1558     restore_arg_regs();
1559     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1560     __ xorptr(rax, rax); // return 0
1561     __ leave(); // required for proper stackwalking of RuntimeStub frame
1562     __ ret(0);
1563 
1564     // Copy in multi-byte chunks
1565     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1566     __ jmp(L_copy_4_bytes);
1567 
1568     return start;
1569   }
1570 
1571   // Arguments:
1572   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1573   //             (currently ignored by this stub)
1574   //   name    - stub name string
1575   //
1576   // Inputs:
1577   //   c_rarg0   - source array address
1578   //   c_rarg1   - destination array address
1579   //   c_rarg2   - element count, treated as ssize_t, can be zero
1580   //
1581   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1582   // we let the hardware handle it.  The one to eight bytes within words,
1583   // dwords or qwords that span cache line boundaries will still be loaded
1584   // and stored atomically.
1585   //
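       // A rough C sketch of the backward copy (illustrative only; the qword
       // part goes through copy_bytes_backward()):
       //
       //   size_t bc = count;
       //   if (bc & 1) { to[bc - 1] = from[bc - 1]; bc--; }                    // trailing byte
       //   if (bc & 2) { *(jushort*)(to + bc - 2) = *(jushort*)(from + bc - 2); }
       //   size_t q8 = (count >> 3) * 8;
       //   if (bc & 4) { *(juint*)(to + q8) = *(juint*)(from + q8); }          // trailing dword
       //   for (size_t q = count >> 3; q > 0; q--)                             // qwords, high to low
       //     ((julong*)to)[q - 1] = ((julong*)from)[q - 1];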
1586   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1587                                       address* entry, const char *name) {
1588     __ align(CodeEntryAlignment);
1589     StubCodeMark mark(this, "StubRoutines", name);
1590     address start = __ pc();
1591 
1592     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
1593     const Register from        = rdi;  // source array address
1594     const Register to          = rsi;  // destination array address
1595     const Register count       = rdx;  // elements count
1596     const Register byte_count  = rcx;
1597     const Register qword_count = count;
1598 
1599     __ enter(); // required for proper stackwalking of RuntimeStub frame
1600     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1601 
1602     if (entry != NULL) {
1603       *entry = __ pc();
1604       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1605       BLOCK_COMMENT("Entry:");
1606     }
1607 
1608     array_overlap_test(nooverlap_target, Address::times_1);
1609     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1610                       // r9 and r10 may be used to save non-volatile registers
1611 
1612     // 'from', 'to' and 'count' are now valid
1613     __ movptr(byte_count, count);
1614     __ shrptr(count, 3);   // count => qword_count
1615 
1616     // Copy from high to low addresses.
1617 
1618     // Check for and copy trailing byte
1619     __ testl(byte_count, 1);
1620     __ jcc(Assembler::zero, L_copy_2_bytes);
1621     __ movb(rax, Address(from, byte_count, Address::times_1, -1));
1622     __ movb(Address(to, byte_count, Address::times_1, -1), rax);
1623     __ decrement(byte_count); // Adjust for possible trailing word
1624 
1625     // Check for and copy trailing word
1626   __ BIND(L_copy_2_bytes);
1627     __ testl(byte_count, 2);
1628     __ jcc(Assembler::zero, L_copy_4_bytes);
1629     __ movw(rax, Address(from, byte_count, Address::times_1, -2));
1630     __ movw(Address(to, byte_count, Address::times_1, -2), rax);
1631 
1632     // Check for and copy trailing dword
1633   __ BIND(L_copy_4_bytes);
1634     __ testl(byte_count, 4);
1635     __ jcc(Assembler::zero, L_copy_bytes);
1636     __ movl(rax, Address(from, qword_count, Address::times_8));
1637     __ movl(Address(to, qword_count, Address::times_8), rax);
1638     __ jmp(L_copy_bytes);
1639 
1640     // Copy trailing qwords
1641   __ BIND(L_copy_8_bytes);
1642     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1643     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1644     __ decrement(qword_count);
1645     __ jcc(Assembler::notZero, L_copy_8_bytes);
1646 
1647     restore_arg_regs();
1648     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1649     __ xorptr(rax, rax); // return 0
1650     __ leave(); // required for proper stackwalking of RuntimeStub frame
1651     __ ret(0);
1652 
1653     // Copy in multi-byte chunks
1654     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1655 
1656     restore_arg_regs();
1657     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
1658     __ xorptr(rax, rax); // return 0
1659     __ leave(); // required for proper stackwalking of RuntimeStub frame
1660     __ ret(0);
1661 
1662     return start;
1663   }
1664 
1665   // Arguments:
1666   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1667   //             (currently ignored by this stub)
1668   //   name    - stub name string
1669   //
1670   // Inputs:
1671   //   c_rarg0   - source array address
1672   //   c_rarg1   - destination array address
1673   //   c_rarg2   - element count, treated as ssize_t, can be zero
1674   //
1675   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1676   // let the hardware handle it.  The two or four words within dwords
1677   // or qwords that span cache line boundaries will still be loaded
1678   // and stored atomically.
1679   //
1680   // Side Effects:
1681   //   disjoint_short_copy_entry is set to the no-overlap entry point
1682   //   used by generate_conjoint_short_copy().
1683   //
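       // A rough sketch of the copy (illustrative only; the qword bulk copy
       // goes through copy_bytes_forward()):
       //
       //   size_t qwords = count >> 2;                 // four jshorts per qword
       //   copy 'qwords' qwords from low to high;
       //   size_t done = qwords << 2;                  // elements copied so far
       //   if (count & 2) { copy elements done, done+1 as one 4-byte move; done += 2; }
       //   if (count & 1) { copy element done; }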
1684   address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
1685     __ align(CodeEntryAlignment);
1686     StubCodeMark mark(this, "StubRoutines", name);
1687     address start = __ pc();
1688 
1689     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes, L_exit;
1690     const Register from        = rdi;  // source array address
1691     const Register to          = rsi;  // destination array address
1692     const Register count       = rdx;  // elements count
1693     const Register word_count  = rcx;
1694     const Register qword_count = count;
1695     const Register end_from    = from; // source array end address
1696     const Register end_to      = to;   // destination array end address
1697     // End pointers are inclusive, and if count is not zero they point
1698     // to the last unit copied:  end_to[0] := end_from[0]
1699 
1700     __ enter(); // required for proper stackwalking of RuntimeStub frame
1701     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1702 
1703     if (entry != NULL) {
1704       *entry = __ pc();
1705       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1706       BLOCK_COMMENT("Entry:");
1707     }
1708 
1709     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1710                       // r9 and r10 may be used to save non-volatile registers
1711 
1712     // 'from', 'to' and 'count' are now valid
1713     __ movptr(word_count, count);
1714     __ shrptr(count, 2); // count => qword_count
1715 
1716     // Copy from low to high addresses.  Use 'to' as scratch.
1717     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1718     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1719     __ negptr(qword_count);
1720     __ jmp(L_copy_bytes);
1721 
1722     // Copy trailing qwords
1723   __ BIND(L_copy_8_bytes);
1724     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1725     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1726     __ increment(qword_count);
1727     __ jcc(Assembler::notZero, L_copy_8_bytes);
1728 
1729     // The original 'to' has been clobbered (the register now holds 'end_to'),
1730     // so we can't use it as a base register for a possible trailing word copy
1731 
1732     // Check for and copy trailing dword
1733   __ BIND(L_copy_4_bytes);
1734     __ testl(word_count, 2);
1735     __ jccb(Assembler::zero, L_copy_2_bytes);
1736     __ movl(rax, Address(end_from, 8));
1737     __ movl(Address(end_to, 8), rax);
1738 
1739     __ addptr(end_from, 4);
1740     __ addptr(end_to, 4);
1741 
1742     // Check for and copy trailing word
1743   __ BIND(L_copy_2_bytes);
1744     __ testl(word_count, 1);
1745     __ jccb(Assembler::zero, L_exit);
1746     __ movw(rax, Address(end_from, 8));
1747     __ movw(Address(end_to, 8), rax);
1748 
1749   __ BIND(L_exit);
1750     restore_arg_regs();
1751     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1752     __ xorptr(rax, rax); // return 0
1753     __ leave(); // required for proper stackwalking of RuntimeStub frame
1754     __ ret(0);
1755 
1756     // Copy in multi-byte chunks
1757     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1758     __ jmp(L_copy_4_bytes);
1759 
1760     return start;
1761   }
1762 
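       // Fill an array with a given value.
       //
       // Inputs:
       //   c_rarg0   - destination array address
       //   c_rarg1   - value
       //   c_rarg2   - element count
       //
       // The element type and alignment handling are determined by 't' and
       // 'aligned'; the fill loop itself is emitted by
       // MacroAssembler::generate_fill().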
1763   address generate_fill(BasicType t, bool aligned, const char *name) {
1764     __ align(CodeEntryAlignment);
1765     StubCodeMark mark(this, "StubRoutines", name);
1766     address start = __ pc();
1767 
1768     BLOCK_COMMENT("Entry:");
1769 
1770     const Register to       = c_rarg0;  // destination array address
1771     const Register value    = c_rarg1;  // value
1772     const Register count    = c_rarg2;  // elements count
1773 
1774     __ enter(); // required for proper stackwalking of RuntimeStub frame
1775 
1776     __ generate_fill(t, aligned, to, value, count, rax, xmm0);
1777 
1778     __ leave(); // required for proper stackwalking of RuntimeStub frame
1779     __ ret(0);
1780     return start;
1781   }
1782 
1783   // Arguments:
1784   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1785   //             (currently ignored by this stub)
1786   //   name    - stub name string
1787   //
1788   // Inputs:
1789   //   c_rarg0   - source array address
1790   //   c_rarg1   - destination array address
1791   //   c_rarg2   - element count, treated as ssize_t, can be zero
1792   //
1793   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1794   // let the hardware handle it.  The two or four words within dwords
1795   // or qwords that span cache line boundaries will still be loaded
1796   // and stored atomically.
1797   //
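       // A rough sketch of the backward copy (illustrative only):
       //
       //   if (count & 1) copy element count - 1;                        // trailing jshort
       //   if (count & 2) copy 4 bytes at byte offset (count >> 2) * 8;
       //   copy (count >> 2) qwords from high to low (via copy_bytes_backward());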
1798   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1799                                        address *entry, const char *name) {
1800     __ align(CodeEntryAlignment);
1801     StubCodeMark mark(this, "StubRoutines", name);
1802     address start = __ pc();
1803 
1804     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
1805     const Register from        = rdi;  // source array address
1806     const Register to          = rsi;  // destination array address
1807     const Register count       = rdx;  // elements count
1808     const Register word_count  = rcx;
1809     const Register qword_count = count;
1810 
1811     __ enter(); // required for proper stackwalking of RuntimeStub frame
1812     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1813 
1814     if (entry != NULL) {
1815       *entry = __ pc();
1816       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1817       BLOCK_COMMENT("Entry:");
1818     }
1819 
1820     array_overlap_test(nooverlap_target, Address::times_2);
1821     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1822                       // r9 and r10 may be used to save non-volatile registers
1823 
1824     // 'from', 'to' and 'count' are now valid
1825     __ movptr(word_count, count);
1826     __ shrptr(count, 2); // count => qword_count
1827 
1828     // Copy from high to low addresses.  Use 'to' as scratch.
1829 
1830     // Check for and copy trailing word
1831     __ testl(word_count, 1);
1832     __ jccb(Assembler::zero, L_copy_4_bytes);
1833     __ movw(rax, Address(from, word_count, Address::times_2, -2));
1834     __ movw(Address(to, word_count, Address::times_2, -2), rax);
1835 
1836     // Check for and copy trailing dword
1837   __ BIND(L_copy_4_bytes);
1838     __ testl(word_count, 2);
1839     __ jcc(Assembler::zero, L_copy_bytes);
1840     __ movl(rax, Address(from, qword_count, Address::times_8));
1841     __ movl(Address(to, qword_count, Address::times_8), rax);
1842     __ jmp(L_copy_bytes);
1843 
1844     // Copy trailing qwords
1845   __ BIND(L_copy_8_bytes);
1846     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
1847     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
1848     __ decrement(qword_count);
1849     __ jcc(Assembler::notZero, L_copy_8_bytes);
1850 
1851     restore_arg_regs();
1852     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1853     __ xorptr(rax, rax); // return 0
1854     __ leave(); // required for proper stackwalking of RuntimeStub frame
1855     __ ret(0);
1856 
1857     // Copy in multi-byte chunks
1858     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1859 
1860     restore_arg_regs();
1861     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
1862     __ xorptr(rax, rax); // return 0
1863     __ leave(); // required for proper stackwalking of RuntimeStub frame
1864     __ ret(0);
1865 
1866     return start;
1867   }
1868 
1869   // Arguments:
1870   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1871   //             (currently ignored by this stub)
1872   //   is_oop  - true => oop array, so generate store check code
1873   //   name    - stub name string
1874   //
1875   // Inputs:
1876   //   c_rarg0   - source array address
1877   //   c_rarg1   - destination array address
1878   //   c_rarg2   - element count, treated as ssize_t, can be zero
1879   //
1880   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1881   // the hardware handle it.  The two dwords within qwords that span
1882   // cache line boundaries will still be loaded and stored atomically.
1883   //
1884   // Side Effects:
1885   //   disjoint_int_copy_entry is set to the no-overlap entry point
1886   //   used by generate_conjoint_int_oop_copy().
1887   //
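       // A rough sketch of the copy (illustrative only).  For the oop case a
       // pre-barrier is issued over the destination before the copy and a
       // post-barrier afterwards; the data movement itself is:
       //
       //   size_t qwords = count >> 1;                 // two 32-bit elements per qword
       //   copy 'qwords' qwords from low to high (via copy_bytes_forward());
       //   if (count & 1) copy the last 32-bit element;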
1888   address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
1889                                          const char *name, bool dest_uninitialized = false) {
1890     __ align(CodeEntryAlignment);
1891     StubCodeMark mark(this, "StubRoutines", name);
1892     address start = __ pc();
1893 
1894     Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
1895     const Register from        = rdi;  // source array address
1896     const Register to          = rsi;  // destination array address
1897     const Register count       = rdx;  // elements count
1898     const Register dword_count = rcx;
1899     const Register qword_count = count;
1900     const Register end_from    = from; // source array end address
1901     const Register end_to      = to;   // destination array end address
1902     const Register saved_to    = r11;  // saved destination array address
1903     // End pointers are inclusive, and if count is not zero they point
1904     // to the last unit copied:  end_to[0] := end_from[0]
1905 
1906     __ enter(); // required for proper stackwalking of RuntimeStub frame
1907     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1908 
1909     if (entry != NULL) {
1910       *entry = __ pc();
1911       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1912       BLOCK_COMMENT("Entry:");
1913     }
1914 
1915     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
1916                       // r9 and r10 may be used to save non-volatile registers
1917     if (is_oop) {
1918       __ movq(saved_to, to);
1919       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1920     }
1921 
1922     // 'from', 'to' and 'count' are now valid
1923     __ movptr(dword_count, count);
1924     __ shrptr(count, 1); // count => qword_count
1925 
1926     // Copy from low to high addresses.  Use 'to' as scratch.
1927     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
1928     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
1929     __ negptr(qword_count);
1930     __ jmp(L_copy_bytes);
1931 
1932     // Copy trailing qwords
1933   __ BIND(L_copy_8_bytes);
1934     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
1935     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
1936     __ increment(qword_count);
1937     __ jcc(Assembler::notZero, L_copy_8_bytes);
1938 
1939     // Check for and copy trailing dword
1940   __ BIND(L_copy_4_bytes);
1941     __ testl(dword_count, 1); // Only byte test since the value is 0 or 1
1942     __ jccb(Assembler::zero, L_exit);
1943     __ movl(rax, Address(end_from, 8));
1944     __ movl(Address(end_to, 8), rax);
1945 
1946   __ BIND(L_exit);
1947     if (is_oop) {
1948       gen_write_ref_array_post_barrier(saved_to, dword_count, rax);
1949     }
1950     restore_arg_regs();
1951     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
1952     __ xorptr(rax, rax); // return 0
1953     __ leave(); // required for proper stackwalking of RuntimeStub frame
1954     __ ret(0);
1955 
1956     // Copy in multi-byte chunks
1957     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
1958     __ jmp(L_copy_4_bytes);
1959 
1960     return start;
1961   }
1962 
1963   // Arguments:
1964   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1965   //             (currently ignored by this stub)
1966   //   is_oop  - true => oop array, so generate store check code
1967   //   name    - stub name string
1968   //
1969   // Inputs:
1970   //   c_rarg0   - source array address
1971   //   c_rarg1   - destination array address
1972   //   c_rarg2   - element count, treated as ssize_t, can be zero
1973   //
1974   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1975   // the hardware handle it.  The two dwords within qwords that span
1976   // cache line boundaries will still be loaded and stored atomically.
1977   //
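       // A rough sketch of the backward copy (illustrative only; the oop case
       // adds the same pre/post barriers as the disjoint variant):
       //
       //   if (count & 1) copy element count - 1;      // trailing 32-bit element
       //   copy (count >> 1) qwords from high to low (via copy_bytes_backward());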
1978   address generate_conjoint_int_oop_copy(bool aligned, bool is_oop, address nooverlap_target,
1979                                          address *entry, const char *name,
1980                                          bool dest_uninitialized = false) {
1981     __ align(CodeEntryAlignment);
1982     StubCodeMark mark(this, "StubRoutines", name);
1983     address start = __ pc();
1984 
1985     Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
1986     const Register from        = rdi;  // source array address
1987     const Register to          = rsi;  // destination array address
1988     const Register count       = rdx;  // elements count
1989     const Register dword_count = rcx;
1990     const Register qword_count = count;
1991 
1992     __ enter(); // required for proper stackwalking of RuntimeStub frame
1993     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
1994 
1995     if (entry != NULL) {
1996       *entry = __ pc();
1997        // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1998       BLOCK_COMMENT("Entry:");
1999     }
2000 
2001     array_overlap_test(nooverlap_target, Address::times_4);
2002     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2003                       // r9 and r10 may be used to save non-volatile registers
2004 
2005     if (is_oop) {
2006       // no registers are destroyed by this call
2007       gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2008     }
2009 
2010     assert_clean_int(count, rax); // Make sure 'count' is clean int.
2011     // 'from', 'to' and 'count' are now valid
2012     __ movptr(dword_count, count);
2013     __ shrptr(count, 1); // count => qword_count
2014 
2015     // Copy from high to low addresses.  Use 'to' as scratch.
2016 
2017     // Check for and copy trailing dword
2018     __ testl(dword_count, 1);
2019     __ jcc(Assembler::zero, L_copy_bytes);
2020     __ movl(rax, Address(from, dword_count, Address::times_4, -4));
2021     __ movl(Address(to, dword_count, Address::times_4, -4), rax);
2022     __ jmp(L_copy_bytes);
2023 
2024     // Copy trailing qwords
2025   __ BIND(L_copy_8_bytes);
2026     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2027     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2028     __ decrement(qword_count);
2029     __ jcc(Assembler::notZero, L_copy_8_bytes);
2030 
2031     if (is_oop) {
2032       __ jmp(L_exit);
2033     }
2034     restore_arg_regs();
2035     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2036     __ xorptr(rax, rax); // return 0
2037     __ leave(); // required for proper stackwalking of RuntimeStub frame
2038     __ ret(0);
2039 
2040     // Copy in multi-byte chunks
2041     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2042 
2043   __ BIND(L_exit);
2044     if (is_oop) {
2045       gen_write_ref_array_post_barrier(to, dword_count, rax);
2046     }
2047     restore_arg_regs();
2048     inc_counter_np(SharedRuntime::_jint_array_copy_ctr); // Update counter after rscratch1 is free
2049     __ xorptr(rax, rax); // return 0
2050     __ leave(); // required for proper stackwalking of RuntimeStub frame
2051     __ ret(0);
2052 
2053     return start;
2054   }
2055 
2056   // Arguments:
2057   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2058   //             (currently ignored by this stub)
2059   //   is_oop  - true => oop array, so generate store check code
2060   //   name    - stub name string
2061   //
2062   // Inputs:
2063   //   c_rarg0   - source array address
2064   //   c_rarg1   - destination array address
2065   //   c_rarg2   - element count, treated as ssize_t, can be zero
2066   //
2067   // Side Effects:
2068   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
2069   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
2070   //
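       // Note: elements are already qword sized, so there is no trailing
       // element fix-up; the stub simply copies 'count' qwords from low to
       // high (via copy_bytes_forward() plus a trailing-qword loop), with the
       // pre/post barriers wrapped around the copy in the oop case.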
2071   address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
2072                                           const char *name, bool dest_uninitialized = false) {
2073     __ align(CodeEntryAlignment);
2074     StubCodeMark mark(this, "StubRoutines", name);
2075     address start = __ pc();
2076 
2077     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2078     const Register from        = rdi;  // source array address
2079     const Register to          = rsi;  // destination array address
2080     const Register qword_count = rdx;  // elements count
2081     const Register end_from    = from; // source array end address
2082     const Register end_to      = rcx;  // destination array end address
2083     const Register saved_to    = to;
2084     const Register saved_count = r11;
2085     // End pointers are inclusive, and if count is not zero they point
2086     // to the last unit copied:  end_to[0] := end_from[0]
2087 
2088     __ enter(); // required for proper stackwalking of RuntimeStub frame
2089     // Save no-overlap entry point for generate_conjoint_long_oop_copy()
2090     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2091 
2092     if (entry != NULL) {
2093       *entry = __ pc();
2094       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2095       BLOCK_COMMENT("Entry:");
2096     }
2097 
2098     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2099                       // r9 and r10 may be used to save non-volatile registers
2100     // 'from', 'to' and 'qword_count' are now valid
2101     if (is_oop) {
2102       // Save to and count for store barrier
2103       __ movptr(saved_count, qword_count);
2104       // no registers are destroyed by this call
2105       gen_write_ref_array_pre_barrier(to, qword_count, dest_uninitialized);
2106     }
2107 
2108     // Copy from low to high addresses.  Use 'to' as scratch.
2109     __ lea(end_from, Address(from, qword_count, Address::times_8, -8));
2110     __ lea(end_to,   Address(to,   qword_count, Address::times_8, -8));
2111     __ negptr(qword_count);
2112     __ jmp(L_copy_bytes);
2113 
2114     // Copy trailing qwords
2115   __ BIND(L_copy_8_bytes);
2116     __ movq(rax, Address(end_from, qword_count, Address::times_8, 8));
2117     __ movq(Address(end_to, qword_count, Address::times_8, 8), rax);
2118     __ increment(qword_count);
2119     __ jcc(Assembler::notZero, L_copy_8_bytes);
2120 
2121     if (is_oop) {
2122       __ jmp(L_exit);
2123     } else {
2124       restore_arg_regs();
2125       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2126       __ xorptr(rax, rax); // return 0
2127       __ leave(); // required for proper stackwalking of RuntimeStub frame
2128       __ ret(0);
2129     }
2130 
2131     // Copy in multi-byte chunks
2132     copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2133 
2134     if (is_oop) {
2135     __ BIND(L_exit);
2136       gen_write_ref_array_post_barrier(saved_to, saved_count, rax);
2137     }
2138     restore_arg_regs();
2139     if (is_oop) {
2140       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2141     } else {
2142       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2143     }
2144     __ xorptr(rax, rax); // return 0
2145     __ leave(); // required for proper stackwalking of RuntimeStub frame
2146     __ ret(0);
2147 
2148     return start;
2149   }
2150 
2151   // Arguments:
2152   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
2153   //             (currently ignored by this stub)
2154   //   is_oop  - true => oop array, so generate store check code
2155   //   name    - stub name string
2156   //
2157   // Inputs:
2158   //   c_rarg0   - source array address
2159   //   c_rarg1   - destination array address
2160   //   c_rarg2   - element count, treated as ssize_t, can be zero
2161   //
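       // Note: copies 'count' qwords from high to low (via
       // copy_bytes_backward() plus a trailing-qword loop); the oop case
       // wraps the copy in the usual pre/post barriers over 'to'.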
2162   address generate_conjoint_long_oop_copy(bool aligned, bool is_oop,
2163                                           address nooverlap_target, address *entry,
2164                                           const char *name, bool dest_uninitialized = false) {
2165     __ align(CodeEntryAlignment);
2166     StubCodeMark mark(this, "StubRoutines", name);
2167     address start = __ pc();
2168 
2169     Label L_copy_bytes, L_copy_8_bytes, L_exit;
2170     const Register from        = rdi;  // source array address
2171     const Register to          = rsi;  // destination array address
2172     const Register qword_count = rdx;  // elements count
2173     const Register saved_count = rcx;
2174 
2175     __ enter(); // required for proper stackwalking of RuntimeStub frame
2176     assert_clean_int(c_rarg2, rax);    // Make sure 'count' is clean int.
2177 
2178     if (entry != NULL) {
2179       *entry = __ pc();
2180       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2181       BLOCK_COMMENT("Entry:");
2182     }
2183 
2184     array_overlap_test(nooverlap_target, Address::times_8);
2185     setup_arg_regs(); // from => rdi, to => rsi, count => rdx
2186                       // r9 and r10 may be used to save non-volatile registers
2187     // 'from', 'to' and 'qword_count' are now valid
2188     if (is_oop) {
2189       // Save to and count for store barrier
2190       __ movptr(saved_count, qword_count);
2191       // No registers are destroyed by this call
2192       gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
2193     }
2194 
2195     __ jmp(L_copy_bytes);
2196 
2197     // Copy trailing qwords
2198   __ BIND(L_copy_8_bytes);
2199     __ movq(rax, Address(from, qword_count, Address::times_8, -8));
2200     __ movq(Address(to, qword_count, Address::times_8, -8), rax);
2201     __ decrement(qword_count);
2202     __ jcc(Assembler::notZero, L_copy_8_bytes);
2203 
2204     if (is_oop) {
2205       __ jmp(L_exit);
2206     } else {
2207       restore_arg_regs();
2208       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2209       __ xorptr(rax, rax); // return 0
2210       __ leave(); // required for proper stackwalking of RuntimeStub frame
2211       __ ret(0);
2212     }
2213 
2214     // Copy in multi-byte chunks
2215     copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
2216 
2217     if (is_oop) {
2218     __ BIND(L_exit);
2219       gen_write_ref_array_post_barrier(to, saved_count, rax);
2220     }
2221     restore_arg_regs();
2222     if (is_oop) {
2223       inc_counter_np(SharedRuntime::_oop_array_copy_ctr); // Update counter after rscratch1 is free
2224     } else {
2225       inc_counter_np(SharedRuntime::_jlong_array_copy_ctr); // Update counter after rscratch1 is free
2226     }
2227     __ xorptr(rax, rax); // return 0
2228     __ leave(); // required for proper stackwalking of RuntimeStub frame
2229     __ ret(0);
2230 
2231     return start;
2232   }
2233 
2234 
2235   // Helper for generating a dynamic type check.
2236   // Smashes no registers.
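       //
       // Roughly (a sketch of the usual HotSpot subtype check, see
       // MacroAssembler::check_klass_subtype_fast_path/_slow_path):
       //
       //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
       //     goto L_success;                      // fast path: cached/primary hit
       //   if (super_check_offset is the secondary-supers cache slot &&
       //       a linear scan of sub_klass->secondary_supers() finds super_klass)
       //     goto L_success;                      // slow path
       //   // otherwise fall through to L_miss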
2237   void generate_type_check(Register sub_klass,
2238                            Register super_check_offset,
2239                            Register super_klass,
2240                            Label& L_success) {
2241     assert_different_registers(sub_klass, super_check_offset, super_klass);
2242 
2243     BLOCK_COMMENT("type_check:");
2244 
2245     Label L_miss;
2246 
2247     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
2248                                      super_check_offset);
2249     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
2250 
2251     // Fall through on failure!
2252     __ BIND(L_miss);
2253   }
2254 
2255   //
2256   //  Generate checkcasting array copy stub
2257   //
2258   //  Input:
2259   //    c_rarg0   - source array address
2260   //    c_rarg1   - destination array address
2261   //    c_rarg2   - element count, treated as ssize_t, can be zero
2262   //    c_rarg3   - size_t ckoff (super_check_offset)
2263   // not Win64
2264   //    c_rarg4   - oop ckval (super_klass)
2265   // Win64
2266   //    rsp+40    - oop ckval (super_klass)
2267   //
2268   //  Output:
2269   //    rax ==  0  -  success
2270   //    rax == -1^K - failure, where K is partial transfer count
2271   //
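       //  In rough C terms the element loop below behaves like this sketch
       //  (illustrative only; 'is_subtype' stands for the generate_type_check()
       //  test, and barrier/register details are omitted):
       //
       //    size_t i = 0;
       //    for (; i < length; i++) {
       //      oop element = from[i];
       //      if (element != NULL && !is_subtype(element->klass(), ckval)) break;
       //      to[i] = element;                 // NULLs are stored unchecked
       //    }
       //    post-barrier over the 'i' elements actually stored;
       //    return (i == length) ? 0 : ~i;     // ~i == -1^K, K = elements copied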
2272   address generate_checkcast_copy(const char *name, address *entry,
2273                                   bool dest_uninitialized = false) {
2274 
2275     Label L_load_element, L_store_element, L_do_card_marks, L_done;
2276 
2277     // Input registers (after setup_arg_regs)
2278     const Register from        = rdi;   // source array address
2279     const Register to          = rsi;   // destination array address
2280     const Register length      = rdx;   // elements count
2281     const Register ckoff       = rcx;   // super_check_offset
2282     const Register ckval       = r8;    // super_klass
2283 
2284     // Registers used as temps (r13, r14 are save-on-entry)
2285     const Register end_from    = from;  // source array end address
2286     const Register end_to      = r13;   // destination array end address
2287     const Register count       = rdx;   // -(count_remaining)
2288     const Register r14_length  = r14;   // saved copy of length
2289     // End pointers are inclusive, and if length is not zero they point
2290     // to the last unit copied:  end_to[0] := end_from[0]
2291 
2292     const Register rax_oop    = rax;    // actual oop copied
2293     const Register r11_klass  = r11;    // oop._klass
2294 
2295     //---------------------------------------------------------------
2296     // Assembler stub will be used for this call to arraycopy
2297     // if the two arrays are subtypes of Object[] but the
2298     // destination array type is not equal to or a supertype
2299     // of the source type.  Each element must be separately
2300     // checked.
2301 
2302     __ align(CodeEntryAlignment);
2303     StubCodeMark mark(this, "StubRoutines", name);
2304     address start = __ pc();
2305 
2306     __ enter(); // required for proper stackwalking of RuntimeStub frame
2307 
2308 #ifdef ASSERT
2309     // caller guarantees that the arrays really are different
2310     // otherwise, we would have to make conjoint checks
2311     { Label L;
2312       array_overlap_test(L, TIMES_OOP);
2313       __ stop("checkcast_copy within a single array");
2314       __ bind(L);
2315     }
2316 #endif //ASSERT
2317 
2318     setup_arg_regs(4); // from => rdi, to => rsi, length => rdx
2319                        // ckoff => rcx, ckval => r8
2320                        // r9 and r10 may be used to save non-volatile registers
2321 #ifdef _WIN64
2322     // last argument (#4) is on stack on Win64
2323     __ movptr(ckval, Address(rsp, 6 * wordSize));
2324 #endif
2325 
2326     // Caller of this entry point must set up the argument registers.
2327     if (entry != NULL) {
2328       *entry = __ pc();
2329       BLOCK_COMMENT("Entry:");
2330     }
2331 
2332     // allocate spill slots for r13, r14
2333     enum {
2334       saved_r13_offset,
2335       saved_r14_offset,
2336       saved_rbp_offset
2337     };
2338     __ subptr(rsp, saved_rbp_offset * wordSize);
2339     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
2340     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
2341 
2342     // check that int operands are properly extended to size_t
2343     assert_clean_int(length, rax);
2344     assert_clean_int(ckoff, rax);
2345 
2346 #ifdef ASSERT
2347     BLOCK_COMMENT("assert consistent ckoff/ckval");
2348     // The ckoff and ckval must be mutually consistent,
2349     // even though caller generates both.
2350     { Label L;
2351       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2352       __ cmpl(ckoff, Address(ckval, sco_offset));
2353       __ jcc(Assembler::equal, L);
2354       __ stop("super_check_offset inconsistent");
2355       __ bind(L);
2356     }
2357 #endif //ASSERT
2358 
2359     // Loop-invariant addresses.  They are exclusive end pointers.
2360     Address end_from_addr(from, length, TIMES_OOP, 0);
2361     Address   end_to_addr(to,   length, TIMES_OOP, 0);
2362     // Loop-variant addresses.  They assume post-incremented count < 0.
2363     Address from_element_addr(end_from, count, TIMES_OOP, 0);
2364     Address   to_element_addr(end_to,   count, TIMES_OOP, 0);
2365 
2366     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2367 
2368     // Copy from low to high addresses, indexed from the end of each array.
2369     __ lea(end_from, end_from_addr);
2370     __ lea(end_to,   end_to_addr);
2371     __ movptr(r14_length, length);        // save a copy of the length
2372     assert(length == count, "");          // else fix next line:
2373     __ negptr(count);                     // negate and test the length
2374     __ jcc(Assembler::notZero, L_load_element);
2375 
2376     // Empty array:  Nothing to do.
2377     __ xorptr(rax, rax);                  // return 0 on (trivial) success
2378     __ jmp(L_done);
2379 
2380     // ======== begin loop ========
2381     // (Loop is rotated; its entry is L_load_element.)
2382     // Loop control:
2383     //   for (count = -count; count != 0; count++)
2384     // Base pointers src, dst are biased by 8*(count-1), to last element.
2385     __ align(OptoLoopAlignment);
2386 
2387     __ BIND(L_store_element);
2388     __ store_heap_oop(to_element_addr, rax_oop);  // store the oop
2389     __ increment(count);               // increment the count toward zero
2390     __ jcc(Assembler::zero, L_do_card_marks);
2391 
2392     // ======== loop entry is here ========
2393     __ BIND(L_load_element);
2394     __ load_heap_oop(rax_oop, from_element_addr); // load the oop
2395     __ testptr(rax_oop, rax_oop);
2396     __ jcc(Assembler::zero, L_store_element);
2397 
2398     __ load_klass(r11_klass, rax_oop); // query the object klass
2399     generate_type_check(r11_klass, ckoff, ckval, L_store_element);
2400     // ======== end loop ========
2401 
2402     // It was a real error; we must depend on the caller to finish the job.
2403     // Register rdx = -1 * number of *remaining* oops, r14 = *total* oops.
2404     // Emit GC store barriers for the oops we have copied (r14 + rdx),
2405     // and report their number to the caller.
2406     assert_different_registers(rax, r14_length, count, to, end_to, rcx, rscratch1);
2407     Label L_post_barrier;
2408     __ addptr(r14_length, count);     // K = (original - remaining) oops
2409     __ movptr(rax, r14_length);       // save the value
2410     __ notptr(rax);                   // report (-1^K) to caller (does not affect flags)
2411     __ jccb(Assembler::notZero, L_post_barrier);
2412     __ jmp(L_done); // K == 0, nothing was copied, skip post barrier
2413 
2414     // Come here on success only.
2415     __ BIND(L_do_card_marks);
2416     __ xorptr(rax, rax);              // return 0 on success
2417 
2418     __ BIND(L_post_barrier);
2419     gen_write_ref_array_post_barrier(to, r14_length, rscratch1);
2420 
2421     // Common exit point (success or failure).
2422     __ BIND(L_done);
2423     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
2424     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
2425     restore_arg_regs();
2426     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); // Update counter after rscratch1 is free
2427     __ leave(); // required for proper stackwalking of RuntimeStub frame
2428     __ ret(0);
2429 
2430     return start;
2431   }
2432 
2433   //
2434   //  Generate 'unsafe' array copy stub
2435   //  Though just as safe as the other stubs, it takes an unscaled
2436   //  size_t argument instead of an element count.
2437   //
2438   //  Input:
2439   //    c_rarg0   - source array address
2440   //    c_rarg1   - destination array address
2441   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2442   //
2443   // Examines the alignment of the operands and dispatches
2444   // to a long, int, short, or byte copy loop.
2445   //
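       //  Dispatch sketch (what the code below computes, in C-ish form):
       //
       //    uintptr_t bits = (uintptr_t)from | (uintptr_t)to | size;
       //    if      ((bits & (BytesPerLong  - 1)) == 0) long_copy (from, to, size >> LogBytesPerLong);
       //    else if ((bits & (BytesPerInt   - 1)) == 0) int_copy  (from, to, size >> LogBytesPerInt);
       //    else if ((bits & (BytesPerShort - 1)) == 0) short_copy(from, to, size >> LogBytesPerShort);
       //    else                                        byte_copy (from, to, size);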
2446   address generate_unsafe_copy(const char *name,
2447                                address byte_copy_entry, address short_copy_entry,
2448                                address int_copy_entry, address long_copy_entry) {
2449 
2450     Label L_long_aligned, L_int_aligned, L_short_aligned;
2451 
2452     // Input registers (before setup_arg_regs)
2453     const Register from        = c_rarg0;  // source array address
2454     const Register to          = c_rarg1;  // destination array address
2455     const Register size        = c_rarg2;  // byte count (size_t)
2456 
2457     // Register used as a temp
2458     const Register bits        = rax;      // test copy of low bits
2459 
2460     __ align(CodeEntryAlignment);
2461     StubCodeMark mark(this, "StubRoutines", name);
2462     address start = __ pc();
2463 
2464     __ enter(); // required for proper stackwalking of RuntimeStub frame
2465 
2466     // bump this on entry, not on exit:
2467     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2468 
2469     __ mov(bits, from);
2470     __ orptr(bits, to);
2471     __ orptr(bits, size);
2472 
2473     __ testb(bits, BytesPerLong-1);
2474     __ jccb(Assembler::zero, L_long_aligned);
2475 
2476     __ testb(bits, BytesPerInt-1);
2477     __ jccb(Assembler::zero, L_int_aligned);
2478 
2479     __ testb(bits, BytesPerShort-1);
2480     __ jump_cc(Assembler::notZero, RuntimeAddress(byte_copy_entry));
2481 
2482     __ BIND(L_short_aligned);
2483     __ shrptr(size, LogBytesPerShort); // size => short_count
2484     __ jump(RuntimeAddress(short_copy_entry));
2485 
2486     __ BIND(L_int_aligned);
2487     __ shrptr(size, LogBytesPerInt); // size => int_count
2488     __ jump(RuntimeAddress(int_copy_entry));
2489 
2490     __ BIND(L_long_aligned);
2491     __ shrptr(size, LogBytesPerLong); // size => qword_count
2492     __ jump(RuntimeAddress(long_copy_entry));
2493 
2494     return start;
2495   }
2496 
2497   // Perform range checks on the proposed arraycopy.
2498   // Kills temp, but nothing else.
2499   // Also, clean the sign bits of src_pos and dst_pos.
2500   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2501                               Register src_pos, // source position (c_rarg1)
2502                               Register dst,     // destination array oop (c_rarg2)
2503                               Register dst_pos, // destination position (c_rarg3)
2504                               Register length,
2505                               Register temp,
2506                               Label& L_failed) {
2507     BLOCK_COMMENT("arraycopy_range_checks:");
2508 
2509     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2510     __ movl(temp, length);
2511     __ addl(temp, src_pos);             // src_pos + length
2512     __ cmpl(temp, Address(src, arrayOopDesc::length_offset_in_bytes()));
2513     __ jcc(Assembler::above, L_failed);
2514 
2515     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2516     __ movl(temp, length);
2517     __ addl(temp, dst_pos);             // dst_pos + length
2518     __ cmpl(temp, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2519     __ jcc(Assembler::above, L_failed);
2520 
2521     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2522     // Move with sign extension can be used since they are positive.
2523     __ movslq(src_pos, src_pos);
2524     __ movslq(dst_pos, dst_pos);
2525 
2526     BLOCK_COMMENT("arraycopy_range_checks done");
2527   }
2528 
2529   //
2530   //  Generate generic array copy stubs
2531   //
2532   //  Input:
2533   //    c_rarg0    -  src oop
2534   //    c_rarg1    -  src_pos (32-bits)
2535   //    c_rarg2    -  dst oop
2536   //    c_rarg3    -  dst_pos (32-bits)
2537   // not Win64
2538   //    c_rarg4    -  element count (32-bits)
2539   // Win64
2540   //    rsp+40     -  element count (32-bits)
2541   //
2542   //  Output:
2543   //    rax ==  0  -  success
2544   //    rax == -1^K - failure, where K is partial transfer count
2545   //
2546   address generate_generic_copy(const char *name,
2547                                 address byte_copy_entry, address short_copy_entry,
2548                                 address int_copy_entry, address oop_copy_entry,
2549                                 address long_copy_entry, address checkcast_copy_entry) {
2550 
2551     Label L_failed, L_failed_0, L_objArray;
2552     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2553 
2554     // Input registers
2555     const Register src        = c_rarg0;  // source array oop
2556     const Register src_pos    = c_rarg1;  // source position
2557     const Register dst        = c_rarg2;  // destination array oop
2558     const Register dst_pos    = c_rarg3;  // destination position
2559 #ifndef _WIN64
2560     const Register length     = c_rarg4;
2561 #else
2562     const Address  length(rsp, 6 * wordSize);  // elements count is on stack on Win64
2563 #endif
2564 
2565     { int modulus = CodeEntryAlignment;
2566       int target  = modulus - 5; // 5 = sizeof jmp(L_failed)
2567       int advance = target - (__ offset() % modulus);
2568       if (advance < 0)  advance += modulus;
2569       if (advance > 0)  __ nop(advance);
2570     }
2571     StubCodeMark mark(this, "StubRoutines", name);
2572 
2573     // Short-hop target to L_failed.  Makes for denser prologue code.
2574     __ BIND(L_failed_0);
2575     __ jmp(L_failed);
2576     assert(__ offset() % CodeEntryAlignment == 0, "no further alignment needed");
2577 
2578     __ align(CodeEntryAlignment);
2579     address start = __ pc();
2580 
2581     __ enter(); // required for proper stackwalking of RuntimeStub frame
2582 
2583     // bump this on entry, not on exit:
2584     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2585 
2586     //-----------------------------------------------------------------------
2587     // Assembler stub will be used for this call to arraycopy
2588     // if the following conditions are met:
2589     //
2590     // (1) src and dst must not be null.
2591     // (2) src_pos must not be negative.
2592     // (3) dst_pos must not be negative.
2593     // (4) length  must not be negative.
2594     // (5) src klass and dst klass should be the same and not NULL.
2595     // (6) src and dst should be arrays.
2596     // (7) src_pos + length must not exceed length of src.
2597     // (8) dst_pos + length must not exceed length of dst.
2598     //
2599 
2600     //  if (src == NULL) return -1;
2601     __ testptr(src, src);         // src oop
2602     size_t j1off = __ offset();
2603     __ jccb(Assembler::zero, L_failed_0);
2604 
2605     //  if (src_pos < 0) return -1;
2606     __ testl(src_pos, src_pos); // src_pos (32-bits)
2607     __ jccb(Assembler::negative, L_failed_0);
2608 
2609     //  if (dst == NULL) return -1;
2610     __ testptr(dst, dst);         // dst oop
2611     __ jccb(Assembler::zero, L_failed_0);
2612 
2613     //  if (dst_pos < 0) return -1;
2614     __ testl(dst_pos, dst_pos); // dst_pos (32-bits)
2615     size_t j4off = __ offset();
2616     __ jccb(Assembler::negative, L_failed_0);
2617 
2618     // The first four tests are very dense code,
2619     // but not quite dense enough to put four
2620     // jumps in a 16-byte instruction fetch buffer.
2621     // That's good, because some branch predictors
2622     // do not like jumps so close together.
2623     // Make sure of this.
2624     guarantee(((j1off ^ j4off) & ~15) != 0, "I$ line of 1st & 4th jumps");
2625 
2626     // registers used as temp
2627     const Register r11_length    = r11; // elements count to copy
2628     const Register r10_src_klass = r10; // array klass
2629 
2630     //  if (length < 0) return -1;
2631     __ movl(r11_length, length);        // length (elements count, 32-bits value)
2632     __ testl(r11_length, r11_length);
2633     __ jccb(Assembler::negative, L_failed_0);
2634 
2635     __ load_klass(r10_src_klass, src);
2636 #ifdef ASSERT
2637     //  assert(src->klass() != NULL);
2638     {
2639       BLOCK_COMMENT("assert klasses not null {");
2640       Label L1, L2;
2641       __ testptr(r10_src_klass, r10_src_klass);
2642       __ jcc(Assembler::notZero, L2);   // it is broken if klass is NULL
2643       __ bind(L1);
2644       __ stop("broken null klass");
2645       __ bind(L2);
2646       __ load_klass(rax, dst);
2647       __ cmpq(rax, 0);
2648       __ jcc(Assembler::equal, L1);     // this would be broken also
2649       BLOCK_COMMENT("} assert klasses not null done");
2650     }
2651 #endif
2652 
2653     // Load layout helper (32-bits)
2654     //
2655     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2656     // 32        30    24            16              8     2                 0
2657     //
2658     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2659     //
2660 
2661     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2662 
2663     // Handle objArrays completely differently...
2664     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2665     __ cmpl(Address(r10_src_klass, lh_offset), objArray_lh);
2666     __ jcc(Assembler::equal, L_objArray);
2667 
2668     //  if (src->klass() != dst->klass()) return -1;
2669     __ load_klass(rax, dst);
2670     __ cmpq(r10_src_klass, rax);
2671     __ jcc(Assembler::notEqual, L_failed);
2672 
2673     const Register rax_lh = rax;  // layout helper
2674     __ movl(rax_lh, Address(r10_src_klass, lh_offset));
2675 
2676     //  if (!src->is_Array()) return -1;
2677     __ cmpl(rax_lh, Klass::_lh_neutral_value);
2678     __ jcc(Assembler::greaterEqual, L_failed);
2679 
2680     // At this point, it is known to be a typeArray (array_tag 0x3).
2681 #ifdef ASSERT
2682     {
2683       BLOCK_COMMENT("assert primitive array {");
2684       Label L;
2685       __ cmpl(rax_lh, (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
2686       __ jcc(Assembler::greaterEqual, L);
2687       __ stop("must be a primitive array");
2688       __ bind(L);
2689       BLOCK_COMMENT("} assert primitive array done");
2690     }
2691 #endif
2692 
2693     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2694                            r10, L_failed);
2695 
2696     // TypeArrayKlass
2697     //
2698     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2699     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2700     //
2701 
2702     const Register r10_offset = r10;    // array offset
2703     const Register rax_elsize = rax_lh; // element size
2704 
2705     __ movl(r10_offset, rax_lh);
2706     __ shrl(r10_offset, Klass::_lh_header_size_shift);
2707     __ andptr(r10_offset, Klass::_lh_header_size_mask);   // array_offset
2708     __ addptr(src, r10_offset);           // src array offset
2709     __ addptr(dst, r10_offset);           // dst array offset
2710     BLOCK_COMMENT("choose copy loop based on element size");
2711     __ andl(rax_lh, Klass::_lh_log2_element_size_mask); // rax_lh -> rax_elsize
2712 
2713     // next registers should be set before the jump to corresponding stub
2714     const Register from     = c_rarg0;  // source array address
2715     const Register to       = c_rarg1;  // destination array address
2716     const Register count    = c_rarg2;  // elements count
2717 
2718     // 'from', 'to' and 'count' must be set in this order, since they
2719     // alias 'src', 'src_pos' and 'dst' respectively.
2720 
2721   __ BIND(L_copy_bytes);
2722     __ cmpl(rax_elsize, 0);
2723     __ jccb(Assembler::notEqual, L_copy_shorts);
2724     __ lea(from, Address(src, src_pos, Address::times_1, 0));// src_addr
2725     __ lea(to,   Address(dst, dst_pos, Address::times_1, 0));// dst_addr
2726     __ movl2ptr(count, r11_length); // length
2727     __ jump(RuntimeAddress(byte_copy_entry));
2728 
2729   __ BIND(L_copy_shorts);
2730     __ cmpl(rax_elsize, LogBytesPerShort);
2731     __ jccb(Assembler::notEqual, L_copy_ints);
2732     __ lea(from, Address(src, src_pos, Address::times_2, 0));// src_addr
2733     __ lea(to,   Address(dst, dst_pos, Address::times_2, 0));// dst_addr
2734     __ movl2ptr(count, r11_length); // length
2735     __ jump(RuntimeAddress(short_copy_entry));
2736 
2737   __ BIND(L_copy_ints);
2738     __ cmpl(rax_elsize, LogBytesPerInt);
2739     __ jccb(Assembler::notEqual, L_copy_longs);
2740     __ lea(from, Address(src, src_pos, Address::times_4, 0));// src_addr
2741     __ lea(to,   Address(dst, dst_pos, Address::times_4, 0));// dst_addr
2742     __ movl2ptr(count, r11_length); // length
2743     __ jump(RuntimeAddress(int_copy_entry));
2744 
2745   __ BIND(L_copy_longs);
2746 #ifdef ASSERT
2747     {
2748       BLOCK_COMMENT("assert long copy {");
2749       Label L;
2750       __ cmpl(rax_elsize, LogBytesPerLong);
2751       __ jcc(Assembler::equal, L);
2752       __ stop("must be long copy, but elsize is wrong");
2753       __ bind(L);
2754       BLOCK_COMMENT("} assert long copy done");
2755     }
2756 #endif
2757     __ lea(from, Address(src, src_pos, Address::times_8, 0));// src_addr
2758     __ lea(to,   Address(dst, dst_pos, Address::times_8, 0));// dst_addr
2759     __ movl2ptr(count, r11_length); // length
2760     __ jump(RuntimeAddress(long_copy_entry));
2761 
2762     // ObjArrayKlass
2763   __ BIND(L_objArray);
2764     // live at this point:  r10_src_klass, r11_length, src[_pos], dst[_pos]
2765 
2766     Label L_plain_copy, L_checkcast_copy;
2767     //  test array classes for subtyping
2768     __ load_klass(rax, dst);
2769     __ cmpq(r10_src_klass, rax); // usual case is exact equality
2770     __ jcc(Assembler::notEqual, L_checkcast_copy);
2771 
2772     // Identically typed arrays can be copied without element-wise checks.
2773     arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2774                            r10, L_failed);
2775 
2776     __ lea(from, Address(src, src_pos, TIMES_OOP,
2777                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // src_addr
2778     __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2779                  arrayOopDesc::base_offset_in_bytes(T_OBJECT))); // dst_addr
2780     __ movl2ptr(count, r11_length); // length
2781   __ BIND(L_plain_copy);
2782     __ jump(RuntimeAddress(oop_copy_entry));
2783 
2784   __ BIND(L_checkcast_copy);
2785     // live at this point:  r10_src_klass, r11_length, rax (dst_klass)
2786     {
2787       // Before looking at dst.length, make sure dst is also an objArray.
2788       __ cmpl(Address(rax, lh_offset), objArray_lh);
2789       __ jcc(Assembler::notEqual, L_failed);
2790 
2791       // It is safe to examine both src.length and dst.length.
2792       arraycopy_range_checks(src, src_pos, dst, dst_pos, r11_length,
2793                              rax, L_failed);
2794 
2795       const Register r11_dst_klass = r11;
2796       __ load_klass(r11_dst_klass, dst); // reload
2797 
2798       // Marshal the base address arguments now, freeing registers.
2799       __ lea(from, Address(src, src_pos, TIMES_OOP,
2800                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2801       __ lea(to,   Address(dst, dst_pos, TIMES_OOP,
2802                    arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
2803       __ movl(count, length);           // length (reloaded)
2804       Register sco_temp = c_rarg3;      // this register is free now
2805       assert_different_registers(from, to, count, sco_temp,
2806                                  r11_dst_klass, r10_src_klass);
2807       assert_clean_int(count, sco_temp);
2808 
2809       // Generate the type check.
2810       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2811       __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
2812       assert_clean_int(sco_temp, rax);
2813       generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);
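      // (generate_type_check branches to L_plain_copy when the source
      //  element klass is already a subtype of the destination element
      //  klass, so no per-element store check is needed in that case;
      //  otherwise we continue below and use the element-checking stub.)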
2814 
2815       // Fetch destination element klass from the ObjArrayKlass header.
2816       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2817       __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
2818       __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
2819       assert_clean_int(sco_temp, rax);
2820 
2821       // the checkcast_copy loop needs two extra arguments:
2822       assert(c_rarg3 == sco_temp, "#3 already in place");
2823       // Set up arguments for checkcast_copy_entry.
2824       setup_arg_regs(4);
2825       __ movptr(r8, r11_dst_klass);  // dst.klass.element_klass, r8 is c_rarg4 on Linux/Solaris
2826       __ jump(RuntimeAddress(checkcast_copy_entry));
2827     }
2828 
2829   __ BIND(L_failed);
2830     __ xorptr(rax, rax);
2831     __ notptr(rax); // return -1
2832     __ leave();   // required for proper stackwalking of RuntimeStub frame
2833     __ ret(0);
2834 
2835     return start;
2836   }
2837 
2838   void generate_arraycopy_stubs() {
2839     address entry;
2840     address entry_jbyte_arraycopy;
2841     address entry_jshort_arraycopy;
2842     address entry_jint_arraycopy;
2843     address entry_oop_arraycopy;
2844     address entry_jlong_arraycopy;
2845     address entry_checkcast_arraycopy;
2846 
2847     StubRoutines::_jbyte_disjoint_arraycopy  = generate_disjoint_byte_copy(false, &entry,
2848                                                                            "jbyte_disjoint_arraycopy");
2849     StubRoutines::_jbyte_arraycopy           = generate_conjoint_byte_copy(false, entry, &entry_jbyte_arraycopy,
2850                                                                            "jbyte_arraycopy");
2851 
2852     StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2853                                                                             "jshort_disjoint_arraycopy");
2854     StubRoutines::_jshort_arraycopy          = generate_conjoint_short_copy(false, entry, &entry_jshort_arraycopy,
2855                                                                             "jshort_arraycopy");
2856 
2857     StubRoutines::_jint_disjoint_arraycopy   = generate_disjoint_int_oop_copy(false, false, &entry,
2858                                                                               "jint_disjoint_arraycopy");
2859     StubRoutines::_jint_arraycopy            = generate_conjoint_int_oop_copy(false, false, entry,
2860                                                                               &entry_jint_arraycopy, "jint_arraycopy");
2861 
2862     StubRoutines::_jlong_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, false, &entry,
2863                                                                                "jlong_disjoint_arraycopy");
2864     StubRoutines::_jlong_arraycopy           = generate_conjoint_long_oop_copy(false, false, entry,
2865                                                                                &entry_jlong_arraycopy, "jlong_arraycopy");
2866 
2867 
2868     if (UseCompressedOops) {
2869       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_int_oop_copy(false, true, &entry,
2870                                                                               "oop_disjoint_arraycopy");
2871       StubRoutines::_oop_arraycopy           = generate_conjoint_int_oop_copy(false, true, entry,
2872                                                                               &entry_oop_arraycopy, "oop_arraycopy");
2873       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_int_oop_copy(false, true, &entry,
2874                                                                                      "oop_disjoint_arraycopy_uninit",
2875                                                                                      /*dest_uninitialized*/true);
2876       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_int_oop_copy(false, true, entry,
2877                                                                                      NULL, "oop_arraycopy_uninit",
2878                                                                                      /*dest_uninitialized*/true);
2879     } else {
2880       StubRoutines::_oop_disjoint_arraycopy  = generate_disjoint_long_oop_copy(false, true, &entry,
2881                                                                                "oop_disjoint_arraycopy");
2882       StubRoutines::_oop_arraycopy           = generate_conjoint_long_oop_copy(false, true, entry,
2883                                                                                &entry_oop_arraycopy, "oop_arraycopy");
2884       StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_long_oop_copy(false, true, &entry,
2885                                                                                       "oop_disjoint_arraycopy_uninit",
2886                                                                                       /*dest_uninitialized*/true);
2887       StubRoutines::_oop_arraycopy_uninit           = generate_conjoint_long_oop_copy(false, true, entry,
2888                                                                                       NULL, "oop_arraycopy_uninit",
2889                                                                                       /*dest_uninitialized*/true);
2890     }
2891 
2892     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2893     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2894                                                                         /*dest_uninitialized*/true);
2895 
2896     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2897                                                               entry_jbyte_arraycopy,
2898                                                               entry_jshort_arraycopy,
2899                                                               entry_jint_arraycopy,
2900                                                               entry_jlong_arraycopy);
2901     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2902                                                                entry_jbyte_arraycopy,
2903                                                                entry_jshort_arraycopy,
2904                                                                entry_jint_arraycopy,
2905                                                                entry_oop_arraycopy,
2906                                                                entry_jlong_arraycopy,
2907                                                                entry_checkcast_arraycopy);
2908 
2909     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2910     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2911     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2912     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2913     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2914     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2915 
2916     // We don't generate specialized code for HeapWord-aligned source
2917     // arrays, so just use the code we've already generated
2918     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = StubRoutines::_jbyte_disjoint_arraycopy;
2919     StubRoutines::_arrayof_jbyte_arraycopy           = StubRoutines::_jbyte_arraycopy;
2920 
2921     StubRoutines::_arrayof_jshort_disjoint_arraycopy = StubRoutines::_jshort_disjoint_arraycopy;
2922     StubRoutines::_arrayof_jshort_arraycopy          = StubRoutines::_jshort_arraycopy;
2923 
2924     StubRoutines::_arrayof_jint_disjoint_arraycopy   = StubRoutines::_jint_disjoint_arraycopy;
2925     StubRoutines::_arrayof_jint_arraycopy            = StubRoutines::_jint_arraycopy;
2926 
2927     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = StubRoutines::_jlong_disjoint_arraycopy;
2928     StubRoutines::_arrayof_jlong_arraycopy           = StubRoutines::_jlong_arraycopy;
2929 
2930     StubRoutines::_arrayof_oop_disjoint_arraycopy    = StubRoutines::_oop_disjoint_arraycopy;
2931     StubRoutines::_arrayof_oop_arraycopy             = StubRoutines::_oop_arraycopy;
2932 
2933     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit    = StubRoutines::_oop_disjoint_arraycopy_uninit;
2934     StubRoutines::_arrayof_oop_arraycopy_uninit             = StubRoutines::_oop_arraycopy_uninit;
2935   }
2936 
2937   // AES intrinsic stubs
2938   enum {AESBlockSize = 16};
2939 
2940   address generate_key_shuffle_mask() {
2941     __ align(16);
2942     StubCodeMark mark(this, "StubRoutines", "key_shuffle_mask");
2943     address start = __ pc();
2944     __ emit_data64( 0x0405060700010203, relocInfo::none );
2945     __ emit_data64( 0x0c0d0e0f08090a0b, relocInfo::none );
2946     return start;
2947   }
2948 
2949   address generate_counter_shuffle_mask() {
2950     __ align(16);
2951     StubCodeMark mark(this, "StubRoutines", "counter_shuffle_mask");
2952     address start = __ pc();
2953     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
2954     __ emit_data64(0x0001020304050607, relocInfo::none);
2955     return start;
2956   }
2957 
  // Utility routine for loading a 128-bit key word in little-endian format.
  // The caller can optionally specify that the shuffle mask is already in an xmm register.
2960   void load_key(XMMRegister xmmdst, Register key, int offset, XMMRegister xmm_shuf_mask=NULL) {
2961     __ movdqu(xmmdst, Address(key, offset));
2962     if (xmm_shuf_mask != NULL) {
2963       __ pshufb(xmmdst, xmm_shuf_mask);
2964     } else {
2965       __ pshufb(xmmdst, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
2966     }
2967   }
2968 
  // Utility routine for increasing the 128-bit counter (the iv in CTR mode)
2970   void inc_counter(Register reg, XMMRegister xmmdst, int inc_delta, Label& next_block) {
2971     __ pextrq(reg, xmmdst, 0x0);
2972     __ addq(reg, inc_delta);
2973     __ pinsrq(xmmdst, reg, 0x0);
2974     __ jcc(Assembler::carryClear, next_block); // jump if no carry
2975     __ pextrq(reg, xmmdst, 0x01); // Carry
2976     __ addq(reg, 0x01);
2977     __ pinsrq(xmmdst, reg, 0x01); //Carry end
2978     __ BIND(next_block);          // next instruction
2979   }
2980 
2981   // Arguments:
2982   //
2983   // Inputs:
2984   //   c_rarg0   - source byte array address
2985   //   c_rarg1   - destination byte array address
2986   //   c_rarg2   - K (key) in little endian int array
2987   //
2988   address generate_aescrypt_encryptBlock() {
2989     assert(UseAES, "need AES instructions and misaligned SSE support");
2990     __ align(CodeEntryAlignment);
2991     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2992     Label L_doLast;
2993     address start = __ pc();
2994 
2995     const Register from        = c_rarg0;  // source array address
2996     const Register to          = c_rarg1;  // destination array address
2997     const Register key         = c_rarg2;  // key array address
2998     const Register keylen      = rax;
2999 
3000     const XMMRegister xmm_result = xmm0;
3001     const XMMRegister xmm_key_shuf_mask = xmm1;
3002     // On win64 xmm6-xmm15 must be preserved so don't use them.
3003     const XMMRegister xmm_temp1  = xmm2;
3004     const XMMRegister xmm_temp2  = xmm3;
3005     const XMMRegister xmm_temp3  = xmm4;
3006     const XMMRegister xmm_temp4  = xmm5;
3007 
3008     __ enter(); // required for proper stackwalking of RuntimeStub frame
3009 
3010     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3011     // context for the registers used, where all instructions below are using 128-bit mode
3012     // On EVEX without VL and BW, these instructions will all be AVX.
3013     if (VM_Version::supports_avx512vlbw()) {
3014       __ movl(rax, 0xffff);
3015       __ kmovql(k1, rax);
3016     }
3017 
3018     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3019     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
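
    // (44, 52 and 60 ints correspond to AES-128, AES-192 and AES-256 keys,
    //  i.e. 10, 12 and 14 rounds; the keylen comparisons below decide how
    //  many of the trailing round keys are applied before aesenclast.)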
3020 
3021     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3022     __ movdqu(xmm_result, Address(from, 0));  // get 16 bytes of input
3023 
    // For encryption, the Java expanded key ordering is just what we need.
    // We don't know whether the key is aligned, hence we do not use the load-execute form.
3026 
3027     load_key(xmm_temp1, key, 0x00, xmm_key_shuf_mask);
3028     __ pxor(xmm_result, xmm_temp1);
3029 
3030     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3031     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3032     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3033     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3034 
3035     __ aesenc(xmm_result, xmm_temp1);
3036     __ aesenc(xmm_result, xmm_temp2);
3037     __ aesenc(xmm_result, xmm_temp3);
3038     __ aesenc(xmm_result, xmm_temp4);
3039 
3040     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3041     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3042     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3043     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3044 
3045     __ aesenc(xmm_result, xmm_temp1);
3046     __ aesenc(xmm_result, xmm_temp2);
3047     __ aesenc(xmm_result, xmm_temp3);
3048     __ aesenc(xmm_result, xmm_temp4);
3049 
3050     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3051     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3052 
3053     __ cmpl(keylen, 44);
3054     __ jccb(Assembler::equal, L_doLast);
3055 
3056     __ aesenc(xmm_result, xmm_temp1);
3057     __ aesenc(xmm_result, xmm_temp2);
3058 
3059     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3060     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3061 
3062     __ cmpl(keylen, 52);
3063     __ jccb(Assembler::equal, L_doLast);
3064 
3065     __ aesenc(xmm_result, xmm_temp1);
3066     __ aesenc(xmm_result, xmm_temp2);
3067 
3068     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3069     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3070 
3071     __ BIND(L_doLast);
3072     __ aesenc(xmm_result, xmm_temp1);
3073     __ aesenclast(xmm_result, xmm_temp2);
3074     __ movdqu(Address(to, 0), xmm_result);        // store the result
3075     __ xorptr(rax, rax); // return 0
3076     __ leave(); // required for proper stackwalking of RuntimeStub frame
3077     __ ret(0);
3078 
3079     return start;
3080   }
3081 
3082 
3083   // Arguments:
3084   //
3085   // Inputs:
3086   //   c_rarg0   - source byte array address
3087   //   c_rarg1   - destination byte array address
3088   //   c_rarg2   - K (key) in little endian int array
3089   //
3090   address generate_aescrypt_decryptBlock() {
3091     assert(UseAES, "need AES instructions and misaligned SSE support");
3092     __ align(CodeEntryAlignment);
3093     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3094     Label L_doLast;
3095     address start = __ pc();
3096 
3097     const Register from        = c_rarg0;  // source array address
3098     const Register to          = c_rarg1;  // destination array address
3099     const Register key         = c_rarg2;  // key array address
3100     const Register keylen      = rax;
3101 
3102     const XMMRegister xmm_result = xmm0;
3103     const XMMRegister xmm_key_shuf_mask = xmm1;
3104     // On win64 xmm6-xmm15 must be preserved so don't use them.
3105     const XMMRegister xmm_temp1  = xmm2;
3106     const XMMRegister xmm_temp2  = xmm3;
3107     const XMMRegister xmm_temp3  = xmm4;
3108     const XMMRegister xmm_temp4  = xmm5;
3109 
3110     __ enter(); // required for proper stackwalking of RuntimeStub frame
3111 
3112     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3113     // context for the registers used, where all instructions below are using 128-bit mode
3114     // On EVEX without VL and BW, these instructions will all be AVX.
3115     if (VM_Version::supports_avx512vlbw()) {
3116       __ movl(rax, 0xffff);
3117       __ kmovql(k1, rax);
3118     }
3119 
3120     // keylen could be only {11, 13, 15} * 4 = {44, 52, 60}
3121     __ movl(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3122 
3123     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3124     __ movdqu(xmm_result, Address(from, 0));
3125 
    // For decryption, the Java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last.
    // We don't know whether the key is aligned, hence we do not use the load-execute form.
3129     load_key(xmm_temp1, key, 0x10, xmm_key_shuf_mask);
3130     load_key(xmm_temp2, key, 0x20, xmm_key_shuf_mask);
3131     load_key(xmm_temp3, key, 0x30, xmm_key_shuf_mask);
3132     load_key(xmm_temp4, key, 0x40, xmm_key_shuf_mask);
3133 
3134     __ pxor  (xmm_result, xmm_temp1);
3135     __ aesdec(xmm_result, xmm_temp2);
3136     __ aesdec(xmm_result, xmm_temp3);
3137     __ aesdec(xmm_result, xmm_temp4);
3138 
3139     load_key(xmm_temp1, key, 0x50, xmm_key_shuf_mask);
3140     load_key(xmm_temp2, key, 0x60, xmm_key_shuf_mask);
3141     load_key(xmm_temp3, key, 0x70, xmm_key_shuf_mask);
3142     load_key(xmm_temp4, key, 0x80, xmm_key_shuf_mask);
3143 
3144     __ aesdec(xmm_result, xmm_temp1);
3145     __ aesdec(xmm_result, xmm_temp2);
3146     __ aesdec(xmm_result, xmm_temp3);
3147     __ aesdec(xmm_result, xmm_temp4);
3148 
3149     load_key(xmm_temp1, key, 0x90, xmm_key_shuf_mask);
3150     load_key(xmm_temp2, key, 0xa0, xmm_key_shuf_mask);
3151     load_key(xmm_temp3, key, 0x00, xmm_key_shuf_mask);
3152 
3153     __ cmpl(keylen, 44);
3154     __ jccb(Assembler::equal, L_doLast);
3155 
3156     __ aesdec(xmm_result, xmm_temp1);
3157     __ aesdec(xmm_result, xmm_temp2);
3158 
3159     load_key(xmm_temp1, key, 0xb0, xmm_key_shuf_mask);
3160     load_key(xmm_temp2, key, 0xc0, xmm_key_shuf_mask);
3161 
3162     __ cmpl(keylen, 52);
3163     __ jccb(Assembler::equal, L_doLast);
3164 
3165     __ aesdec(xmm_result, xmm_temp1);
3166     __ aesdec(xmm_result, xmm_temp2);
3167 
3168     load_key(xmm_temp1, key, 0xd0, xmm_key_shuf_mask);
3169     load_key(xmm_temp2, key, 0xe0, xmm_key_shuf_mask);
3170 
3171     __ BIND(L_doLast);
3172     __ aesdec(xmm_result, xmm_temp1);
3173     __ aesdec(xmm_result, xmm_temp2);
3174 
3175     // for decryption the aesdeclast operation is always on key+0x00
3176     __ aesdeclast(xmm_result, xmm_temp3);
3177     __ movdqu(Address(to, 0), xmm_result);  // store the result
3178     __ xorptr(rax, rax); // return 0
3179     __ leave(); // required for proper stackwalking of RuntimeStub frame
3180     __ ret(0);
3181 
3182     return start;
3183   }
3184 
3185 
3186   // Arguments:
3187   //
3188   // Inputs:
3189   //   c_rarg0   - source byte array address
3190   //   c_rarg1   - destination byte array address
3191   //   c_rarg2   - K (key) in little endian int array
3192   //   c_rarg3   - r vector byte array address
3193   //   c_rarg4   - input length
3194   //
3195   // Output:
3196   //   rax       - input length
3197   //
3198   address generate_cipherBlockChaining_encryptAESCrypt() {
3199     assert(UseAES, "need AES instructions and misaligned SSE support");
3200     __ align(CodeEntryAlignment);
3201     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3202     address start = __ pc();
3203 
3204     Label L_exit, L_key_192_256, L_key_256, L_loopTop_128, L_loopTop_192, L_loopTop_256;
3205     const Register from        = c_rarg0;  // source array address
3206     const Register to          = c_rarg1;  // destination array address
3207     const Register key         = c_rarg2;  // key array address
3208     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
3209                                            // and left with the results of the last encryption block
3210 #ifndef _WIN64
3211     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3212 #else
3213     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile Windows register
3215 #endif
3216     const Register pos         = rax;
3217 
3218     // xmm register assignments for the loops below
3219     const XMMRegister xmm_result = xmm0;
3220     const XMMRegister xmm_temp   = xmm1;
3221     // keys 0-10 preloaded into xmm2-xmm12
3222     const int XMM_REG_NUM_KEY_FIRST = 2;
3223     const int XMM_REG_NUM_KEY_LAST  = 15;
3224     const XMMRegister xmm_key0   = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3225     const XMMRegister xmm_key10  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+10);
3226     const XMMRegister xmm_key11  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+11);
3227     const XMMRegister xmm_key12  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+12);
3228     const XMMRegister xmm_key13  = as_XMMRegister(XMM_REG_NUM_KEY_FIRST+13);
3229 
3230     __ enter(); // required for proper stackwalking of RuntimeStub frame
3231 
3232     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3233     // context for the registers used, where all instructions below are using 128-bit mode
3234     // On EVEX without VL and BW, these instructions will all be AVX.
3235     if (VM_Version::supports_avx512vlbw()) {
3236       __ movl(rax, 0xffff);
3237       __ kmovql(k1, rax);
3238     }
3239 
3240 #ifdef _WIN64
3241     // on win64, fill len_reg from stack position
3242     __ movl(len_reg, len_mem);
3243 #else
3244     __ push(len_reg); // Save
3245 #endif
3246 
3247     const XMMRegister xmm_key_shuf_mask = xmm_temp;  // used temporarily to swap key bytes up front
3248     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3249     // load up xmm regs xmm2 thru xmm12 with key 0x00 - 0xa0
3250     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x00; rnum <= XMM_REG_NUM_KEY_FIRST+10; rnum++) {
3251       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3252       offset += 0x10;
3253     }
3254     __ movdqu(xmm_result, Address(rvec, 0x00));   // initialize xmm_result with r vec
3255 
    // now split into different paths depending on the key length (len in ints of the AESCrypt.KLE array: 44 = 128-bit, 52 = 192-bit, 60 = 256-bit)
3257     __ movl(rax, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3258     __ cmpl(rax, 44);
3259     __ jcc(Assembler::notEqual, L_key_192_256);
3260 
3261     // 128 bit code follows here
3262     __ movptr(pos, 0);
3263     __ align(OptoLoopAlignment);
3264 
3265     __ BIND(L_loopTop_128);
3266     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3267     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3268     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3269     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum <= XMM_REG_NUM_KEY_FIRST + 9; rnum++) {
3270       __ aesenc(xmm_result, as_XMMRegister(rnum));
3271     }
3272     __ aesenclast(xmm_result, xmm_key10);
3273     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3274     // no need to store r to memory until we exit
3275     __ addptr(pos, AESBlockSize);
3276     __ subptr(len_reg, AESBlockSize);
3277     __ jcc(Assembler::notEqual, L_loopTop_128);
3278 
3279     __ BIND(L_exit);
3280     __ movdqu(Address(rvec, 0), xmm_result);     // final value of r stored in rvec of CipherBlockChaining object
3281 
3282 #ifdef _WIN64
3283     __ movl(rax, len_mem);
3284 #else
3285     __ pop(rax); // return length
3286 #endif
3287     __ leave(); // required for proper stackwalking of RuntimeStub frame
3288     __ ret(0);
3289 
3290     __ BIND(L_key_192_256);
3291     // here rax = len in ints of AESCrypt.KLE array (52=192, or 60=256)
3292     load_key(xmm_key11, key, 0xb0, xmm_key_shuf_mask);
3293     load_key(xmm_key12, key, 0xc0, xmm_key_shuf_mask);
3294     __ cmpl(rax, 52);
3295     __ jcc(Assembler::notEqual, L_key_256);
3296 
3297     // 192-bit code follows here (could be changed to use more xmm registers)
3298     __ movptr(pos, 0);
3299     __ align(OptoLoopAlignment);
3300 
3301     __ BIND(L_loopTop_192);
3302     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3303     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3304     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3305     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 11; rnum++) {
3306       __ aesenc(xmm_result, as_XMMRegister(rnum));
3307     }
3308     __ aesenclast(xmm_result, xmm_key12);
3309     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3310     // no need to store r to memory until we exit
3311     __ addptr(pos, AESBlockSize);
3312     __ subptr(len_reg, AESBlockSize);
3313     __ jcc(Assembler::notEqual, L_loopTop_192);
3314     __ jmp(L_exit);
3315 
3316     __ BIND(L_key_256);
3317     // 256-bit code follows here (could be changed to use more xmm registers)
3318     load_key(xmm_key13, key, 0xd0, xmm_key_shuf_mask);
3319     __ movptr(pos, 0);
3320     __ align(OptoLoopAlignment);
3321 
3322     __ BIND(L_loopTop_256);
3323     __ movdqu(xmm_temp, Address(from, pos, Address::times_1, 0));   // get next 16 bytes of input
3324     __ pxor  (xmm_result, xmm_temp);               // xor with the current r vector
3325     __ pxor  (xmm_result, xmm_key0);               // do the aes rounds
3326     for (int rnum = XMM_REG_NUM_KEY_FIRST + 1; rnum  <= XMM_REG_NUM_KEY_FIRST + 13; rnum++) {
3327       __ aesenc(xmm_result, as_XMMRegister(rnum));
3328     }
3329     load_key(xmm_temp, key, 0xe0);
3330     __ aesenclast(xmm_result, xmm_temp);
3331     __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result);     // store into the next 16 bytes of output
3332     // no need to store r to memory until we exit
3333     __ addptr(pos, AESBlockSize);
3334     __ subptr(len_reg, AESBlockSize);
3335     __ jcc(Assembler::notEqual, L_loopTop_256);
3336     __ jmp(L_exit);
3337 
3338     return start;
3339   }
3340 
3341   // Safefetch stubs.
3342   void generate_safefetch(const char* name, int size, address* entry,
3343                           address* fault_pc, address* continuation_pc) {
3344     // safefetch signatures:
3345     //   int      SafeFetch32(int*      adr, int      errValue);
3346     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3347     //
3348     // arguments:
3349     //   c_rarg0 = adr
3350     //   c_rarg1 = errValue
3351     //
3352     // result:
    //   rax       = *adr or errValue
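    //
    // If the load at fault_pc faults, the VM's signal handling is expected
    // to resume execution at continuation_pc, so errValue (still in c_rarg1)
    // is returned instead of *adr.  Hypothetical caller-side usage, for
    // illustration only:
    //
    //   int v = SafeFetch32((int*)maybe_unmapped, -1);  // yields -1 if the page is unmapped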
3354 
3355     StubCodeMark mark(this, "StubRoutines", name);
3356 
3357     // Entry point, pc or function descriptor.
3358     *entry = __ pc();
3359 
3360     // Load *adr into c_rarg1, may fault.
3361     *fault_pc = __ pc();
3362     switch (size) {
3363       case 4:
3364         // int32_t
3365         __ movl(c_rarg1, Address(c_rarg0, 0));
3366         break;
3367       case 8:
3368         // int64_t
3369         __ movq(c_rarg1, Address(c_rarg0, 0));
3370         break;
3371       default:
3372         ShouldNotReachHere();
3373     }
3374 
3375     // return errValue or *adr
3376     *continuation_pc = __ pc();
3377     __ movq(rax, c_rarg1);
3378     __ ret(0);
3379   }
3380 
3381   // This is a version of CBC/AES Decrypt which does 4 blocks in a loop at a time
3382   // to hide instruction latency
3383   //
3384   // Arguments:
3385   //
3386   // Inputs:
3387   //   c_rarg0   - source byte array address
3388   //   c_rarg1   - destination byte array address
3389   //   c_rarg2   - K (key) in little endian int array
3390   //   c_rarg3   - r vector byte array address
3391   //   c_rarg4   - input length
3392   //
3393   // Output:
3394   //   rax       - input length
3395   //
3396   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3397     assert(UseAES, "need AES instructions and misaligned SSE support");
3398     __ align(CodeEntryAlignment);
3399     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3400     address start = __ pc();
3401 
3402     const Register from        = c_rarg0;  // source array address
3403     const Register to          = c_rarg1;  // destination array address
3404     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the cipher text of the last input block
3407 #ifndef _WIN64
3408     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3409 #else
3410     const Address  len_mem(rbp, 6 * wordSize);  // length is on stack on Win64
    const Register len_reg     = r10;      // pick the first volatile Windows register
3412 #endif
3413     const Register pos         = rax;
3414 
3415     const int PARALLEL_FACTOR = 4;
3416     const int ROUNDS[3] = { 10, 12, 14 }; // aes rounds for key128, key192, key256
3417 
3418     Label L_exit;
3419     Label L_singleBlock_loopTopHead[3]; // 128, 192, 256
3420     Label L_singleBlock_loopTopHead2[3]; // 128, 192, 256
3421     Label L_singleBlock_loopTop[3]; // 128, 192, 256
3422     Label L_multiBlock_loopTopHead[3]; // 128, 192, 256
3423     Label L_multiBlock_loopTop[3]; // 128, 192, 256
3424 
3425     // keys 0-10 preloaded into xmm5-xmm15
3426     const int XMM_REG_NUM_KEY_FIRST = 5;
3427     const int XMM_REG_NUM_KEY_LAST  = 15;
3428     const XMMRegister xmm_key_first = as_XMMRegister(XMM_REG_NUM_KEY_FIRST);
3429     const XMMRegister xmm_key_last  = as_XMMRegister(XMM_REG_NUM_KEY_LAST);
3430 
3431     __ enter(); // required for proper stackwalking of RuntimeStub frame
3432 
3433     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3434     // context for the registers used, where all instructions below are using 128-bit mode
3435     // On EVEX without VL and BW, these instructions will all be AVX.
3436     if (VM_Version::supports_avx512vlbw()) {
3437       __ movl(rax, 0xffff);
3438       __ kmovql(k1, rax);
3439     }
3440 
3441 #ifdef _WIN64
3442     // on win64, fill len_reg from stack position
3443     __ movl(len_reg, len_mem);
3444 #else
3445     __ push(len_reg); // Save
3446 #endif
3447     __ push(rbx);
    // The Java expanded key ordering is rotated one position from what we want,
    // so we start from 0x10 here and hit 0x00 last.
3450     const XMMRegister xmm_key_shuf_mask = xmm1;  // used temporarily to swap key bytes up front
3451     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3452     // load up xmm regs 5 thru 15 with key 0x10 - 0xa0 - 0x00
3453     for (int rnum = XMM_REG_NUM_KEY_FIRST, offset = 0x10; rnum < XMM_REG_NUM_KEY_LAST; rnum++) {
3454       load_key(as_XMMRegister(rnum), key, offset, xmm_key_shuf_mask);
3455       offset += 0x10;
3456     }
3457     load_key(xmm_key_last, key, 0x00, xmm_key_shuf_mask);
3458 
3459     const XMMRegister xmm_prev_block_cipher = xmm1;  // holds cipher of previous block
3460 
3461     // registers holding the four results in the parallelized loop
3462     const XMMRegister xmm_result0 = xmm0;
3463     const XMMRegister xmm_result1 = xmm2;
3464     const XMMRegister xmm_result2 = xmm3;
3465     const XMMRegister xmm_result3 = xmm4;
3466 
3467     __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));   // initialize with initial rvec
3468 
3469     __ xorptr(pos, pos);
3470 
    // now split into different paths depending on the key length (len in ints of the AESCrypt.KLE array: 44 = 128-bit, 52 = 192-bit, 60 = 256-bit)
3472     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3473     __ cmpl(rbx, 52);
3474     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[1]);
3475     __ cmpl(rbx, 60);
3476     __ jcc(Assembler::equal, L_multiBlock_loopTopHead[2]);
3477 
3478 #define DoFour(opc, src_reg)           \
3479   __ opc(xmm_result0, src_reg);         \
3480   __ opc(xmm_result1, src_reg);         \
3481   __ opc(xmm_result2, src_reg);         \
  __ opc(xmm_result3, src_reg);
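
    // The hand-unrolled loop below emits three complete copies of the
    // decryption code, one per key length (k = 0/1/2 for 128/192/256-bit
    // keys, ROUNDS[k] = 10/12/14); the keylen checks above jump into the
    // matching copy.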
3483 
3484     for (int k = 0; k < 3; ++k) {
3485       __ BIND(L_multiBlock_loopTopHead[k]);
3486       if (k != 0) {
3487         __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3488         __ jcc(Assembler::less, L_singleBlock_loopTopHead2[k]);
3489       }
3490       if (k == 1) {
3491         __ subptr(rsp, 6 * wordSize);
3492         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
3493         load_key(xmm15, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
3494         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3495         load_key(xmm1, key, 0xc0);  // 0xc0;
3496         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3497       } else if (k == 2) {
3498         __ subptr(rsp, 10 * wordSize);
3499         __ movdqu(Address(rsp, 0), xmm15); //save last_key from xmm15
        load_key(xmm15, key, 0xd0); // 0xd0; 256-bit key goes up to 0xe0
3501         __ movdqu(Address(rsp, 6 * wordSize), xmm15);
3502         load_key(xmm1, key, 0xe0);  // 0xe0;
3503         __ movdqu(Address(rsp, 8 * wordSize), xmm1);
3504         load_key(xmm15, key, 0xb0); // 0xb0;
3505         __ movdqu(Address(rsp, 2 * wordSize), xmm15);
3506         load_key(xmm1, key, 0xc0);  // 0xc0;
3507         __ movdqu(Address(rsp, 4 * wordSize), xmm1);
3508       }
3509       __ align(OptoLoopAlignment);
3510       __ BIND(L_multiBlock_loopTop[k]);
3511       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least 4 blocks left
3512       __ jcc(Assembler::less, L_singleBlock_loopTopHead[k]);
3513 
3514       if  (k != 0) {
3515         __ movdqu(xmm15, Address(rsp, 2 * wordSize));
3516         __ movdqu(xmm1, Address(rsp, 4 * wordSize));
3517       }
3518 
      __ movdqu(xmm_result0, Address(from, pos, Address::times_1, 0 * AESBlockSize)); // get next 4 blocks into xmm_result registers
3520       __ movdqu(xmm_result1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3521       __ movdqu(xmm_result2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3522       __ movdqu(xmm_result3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
3523 
3524       DoFour(pxor, xmm_key_first);
3525       if (k == 0) {
3526         for (int rnum = 1; rnum < ROUNDS[k]; rnum++) {
3527           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3528         }
3529         DoFour(aesdeclast, xmm_key_last);
3530       } else if (k == 1) {
3531         for (int rnum = 1; rnum <= ROUNDS[k]-2; rnum++) {
3532           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3533         }
3534         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3535         DoFour(aesdec, xmm1);  // key : 0xc0
3536         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3537         DoFour(aesdeclast, xmm_key_last);
3538       } else if (k == 2) {
3539         for (int rnum = 1; rnum <= ROUNDS[k] - 4; rnum++) {
3540           DoFour(aesdec, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3541         }
3542         DoFour(aesdec, xmm1);  // key : 0xc0
3543         __ movdqu(xmm15, Address(rsp, 6 * wordSize));
3544         __ movdqu(xmm1, Address(rsp, 8 * wordSize));
3545         DoFour(aesdec, xmm15);  // key : 0xd0
3546         __ movdqu(xmm_key_last, Address(rsp, 0)); // xmm15 needs to be loaded again.
3547         DoFour(aesdec, xmm1);  // key : 0xe0
3548         __ movdqu(xmm_prev_block_cipher, Address(rvec, 0x00));  // xmm1 needs to be loaded again
3549         DoFour(aesdeclast, xmm_key_last);
3550       }
3551 
3552       // for each result, xor with the r vector of previous cipher block
3553       __ pxor(xmm_result0, xmm_prev_block_cipher);
3554       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 0 * AESBlockSize));
3555       __ pxor(xmm_result1, xmm_prev_block_cipher);
3556       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 1 * AESBlockSize));
3557       __ pxor(xmm_result2, xmm_prev_block_cipher);
3558       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 2 * AESBlockSize));
3559       __ pxor(xmm_result3, xmm_prev_block_cipher);
3560       __ movdqu(xmm_prev_block_cipher, Address(from, pos, Address::times_1, 3 * AESBlockSize));   // this will carry over to next set of blocks
3561       if (k != 0) {
3562         __ movdqu(Address(rvec, 0x00), xmm_prev_block_cipher);
3563       }
3564 
3565       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);     // store 4 results into the next 64 bytes of output
3566       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
3567       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
3568       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
3569 
3570       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize);
3571       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize);
3572       __ jmp(L_multiBlock_loopTop[k]);
3573 
3574       // registers used in the non-parallelized loops
3575       // xmm register assignments for the loops below
3576       const XMMRegister xmm_result = xmm0;
3577       const XMMRegister xmm_prev_block_cipher_save = xmm2;
3578       const XMMRegister xmm_key11 = xmm3;
3579       const XMMRegister xmm_key12 = xmm4;
3580       const XMMRegister key_tmp = xmm4;
3581 
3582       __ BIND(L_singleBlock_loopTopHead[k]);
3583       if (k == 1) {
3584         __ addptr(rsp, 6 * wordSize);
3585       } else if (k == 2) {
3586         __ addptr(rsp, 10 * wordSize);
3587       }
3588       __ cmpptr(len_reg, 0); // any blocks left??
3589       __ jcc(Assembler::equal, L_exit);
3590       __ BIND(L_singleBlock_loopTopHead2[k]);
3591       if (k == 1) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 192-bit key goes up to 0xc0
        load_key(xmm_key12, key, 0xc0); // 0xc0; 192-bit key goes up to 0xc0
3594       }
3595       if (k == 2) {
        load_key(xmm_key11, key, 0xb0); // 0xb0; 256-bit key goes up to 0xe0
3597       }
3598       __ align(OptoLoopAlignment);
3599       __ BIND(L_singleBlock_loopTop[k]);
3600       __ movdqu(xmm_result, Address(from, pos, Address::times_1, 0)); // get next 16 bytes of cipher input
3601       __ movdqa(xmm_prev_block_cipher_save, xmm_result); // save for next r vector
3602       __ pxor(xmm_result, xmm_key_first); // do the aes dec rounds
3603       for (int rnum = 1; rnum <= 9 ; rnum++) {
3604           __ aesdec(xmm_result, as_XMMRegister(rnum + XMM_REG_NUM_KEY_FIRST));
3605       }
3606       if (k == 1) {
3607         __ aesdec(xmm_result, xmm_key11);
3608         __ aesdec(xmm_result, xmm_key12);
3609       }
3610       if (k == 2) {
3611         __ aesdec(xmm_result, xmm_key11);
3612         load_key(key_tmp, key, 0xc0);
3613         __ aesdec(xmm_result, key_tmp);
3614         load_key(key_tmp, key, 0xd0);
3615         __ aesdec(xmm_result, key_tmp);
3616         load_key(key_tmp, key, 0xe0);
3617         __ aesdec(xmm_result, key_tmp);
3618       }
3619 
3620       __ aesdeclast(xmm_result, xmm_key_last); // xmm15 always came from key+0
3621       __ pxor(xmm_result, xmm_prev_block_cipher); // xor with the current r vector
3622       __ movdqu(Address(to, pos, Address::times_1, 0), xmm_result); // store into the next 16 bytes of output
3623       // no need to store r to memory until we exit
3624       __ movdqa(xmm_prev_block_cipher, xmm_prev_block_cipher_save); // set up next r vector with cipher input from this block
3625       __ addptr(pos, AESBlockSize);
3626       __ subptr(len_reg, AESBlockSize);
3627       __ jcc(Assembler::notEqual, L_singleBlock_loopTop[k]);
3628       if (k != 2) {
3629         __ jmp(L_exit);
3630       }
3631     } //for 128/192/256
3632 
3633     __ BIND(L_exit);
3634     __ movdqu(Address(rvec, 0), xmm_prev_block_cipher);     // final value of r stored in rvec of CipherBlockChaining object
3635     __ pop(rbx);
3636 #ifdef _WIN64
3637     __ movl(rax, len_mem);
3638 #else
3639     __ pop(rax); // return length
3640 #endif
3641     __ leave(); // required for proper stackwalking of RuntimeStub frame
3642     __ ret(0);
3643     return start;
3644 }
3645 
3646   address generate_upper_word_mask() {
3647     __ align(64);
3648     StubCodeMark mark(this, "StubRoutines", "upper_word_mask");
3649     address start = __ pc();
3650     __ emit_data64(0x0000000000000000, relocInfo::none);
3651     __ emit_data64(0xFFFFFFFF00000000, relocInfo::none);
3652     return start;
3653   }
3654 
3655   address generate_shuffle_byte_flip_mask() {
3656     __ align(64);
3657     StubCodeMark mark(this, "StubRoutines", "shuffle_byte_flip_mask");
3658     address start = __ pc();
3659     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3660     __ emit_data64(0x0001020304050607, relocInfo::none);
3661     return start;
3662   }
3663 
  // ofs and limit are used for the multi-block byte array.
3665   // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
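  // When multi_block is true the stub is expected to loop over successive
  // 64-byte blocks of b in [ofs, limit) and return the updated offset in
  // rax (the implCompressMultiBlock contract); when false it compresses a
  // single block.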
3666   address generate_sha1_implCompress(bool multi_block, const char *name) {
3667     __ align(CodeEntryAlignment);
3668     StubCodeMark mark(this, "StubRoutines", name);
3669     address start = __ pc();
3670 
3671     Register buf = c_rarg0;
3672     Register state = c_rarg1;
3673     Register ofs = c_rarg2;
3674     Register limit = c_rarg3;
3675 
3676     const XMMRegister abcd = xmm0;
3677     const XMMRegister e0 = xmm1;
3678     const XMMRegister e1 = xmm2;
3679     const XMMRegister msg0 = xmm3;
3680 
3681     const XMMRegister msg1 = xmm4;
3682     const XMMRegister msg2 = xmm5;
3683     const XMMRegister msg3 = xmm6;
3684     const XMMRegister shuf_mask = xmm7;
3685 
3686     __ enter();
3687 
3688     __ subptr(rsp, 4 * wordSize);
3689 
3690     __ fast_sha1(abcd, e0, e1, msg0, msg1, msg2, msg3, shuf_mask,
3691       buf, state, ofs, limit, rsp, multi_block);
3692 
3693     __ addptr(rsp, 4 * wordSize);
3694 
3695     __ leave();
3696     __ ret(0);
3697     return start;
3698   }
3699 
3700   address generate_pshuffle_byte_flip_mask() {
3701     __ align(64);
3702     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask");
3703     address start = __ pc();
3704     __ emit_data64(0x0405060700010203, relocInfo::none);
3705     __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3706 
3707     if (VM_Version::supports_avx2()) {
3708       __ emit_data64(0x0405060700010203, relocInfo::none); // second copy
3709       __ emit_data64(0x0c0d0e0f08090a0b, relocInfo::none);
3710       // _SHUF_00BA
3711       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3712       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3713       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3714       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3715       // _SHUF_DC00
3716       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3717       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3718       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3719       __ emit_data64(0x0b0a090803020100, relocInfo::none);
3720     }
3721 
3722     return start;
3723   }
3724 
  // Mask for byte-swapping a couple of qwords in an XMM register using (v)pshufb.
3726   address generate_pshuffle_byte_flip_mask_sha512() {
3727     __ align(32);
3728     StubCodeMark mark(this, "StubRoutines", "pshuffle_byte_flip_mask_sha512");
3729     address start = __ pc();
3730     if (VM_Version::supports_avx2()) {
3731       __ emit_data64(0x0001020304050607, relocInfo::none); // PSHUFFLE_BYTE_FLIP_MASK
3732       __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none);
3733       __ emit_data64(0x1011121314151617, relocInfo::none);
3734       __ emit_data64(0x18191a1b1c1d1e1f, relocInfo::none);
3735       __ emit_data64(0x0000000000000000, relocInfo::none); //MASK_YMM_LO
3736       __ emit_data64(0x0000000000000000, relocInfo::none);
3737       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3738       __ emit_data64(0xFFFFFFFFFFFFFFFF, relocInfo::none);
3739     }
3740 
3741     return start;
3742   }
3743 
  // ofs and limit are used for the multi-block byte array.
  // int com.sun.security.provider.DigestBase.implCompressMultiBlock(byte[] b, int ofs, int limit)
3746   address generate_sha256_implCompress(bool multi_block, const char *name) {
3747     assert(VM_Version::supports_sha() || VM_Version::supports_avx2(), "");
3748     __ align(CodeEntryAlignment);
3749     StubCodeMark mark(this, "StubRoutines", name);
3750     address start = __ pc();
3751 
3752     Register buf = c_rarg0;
3753     Register state = c_rarg1;
3754     Register ofs = c_rarg2;
3755     Register limit = c_rarg3;
3756 
3757     const XMMRegister msg = xmm0;
3758     const XMMRegister state0 = xmm1;
3759     const XMMRegister state1 = xmm2;
3760     const XMMRegister msgtmp0 = xmm3;
3761 
3762     const XMMRegister msgtmp1 = xmm4;
3763     const XMMRegister msgtmp2 = xmm5;
3764     const XMMRegister msgtmp3 = xmm6;
3765     const XMMRegister msgtmp4 = xmm7;
3766 
3767     const XMMRegister shuf_mask = xmm8;
3768 
3769     __ enter();
3770 
3771     __ subptr(rsp, 4 * wordSize);
3772 
3773     if (VM_Version::supports_sha()) {
3774       __ fast_sha256(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3775         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3776     } else if (VM_Version::supports_avx2()) {
3777       __ sha256_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3778         buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3779     }
3780     __ addptr(rsp, 4 * wordSize);
3781 
3782     __ leave();
3783     __ ret(0);
3784     return start;
3785   }
3786 
3787   address generate_sha512_implCompress(bool multi_block, const char *name) {
3788     assert(VM_Version::supports_avx2(), "");
3789     assert(VM_Version::supports_bmi2(), "");
3790     __ align(CodeEntryAlignment);
3791     StubCodeMark mark(this, "StubRoutines", name);
3792     address start = __ pc();
3793 
3794     Register buf = c_rarg0;
3795     Register state = c_rarg1;
3796     Register ofs = c_rarg2;
3797     Register limit = c_rarg3;
3798 
3799     const XMMRegister msg = xmm0;
3800     const XMMRegister state0 = xmm1;
3801     const XMMRegister state1 = xmm2;
3802     const XMMRegister msgtmp0 = xmm3;
3803     const XMMRegister msgtmp1 = xmm4;
3804     const XMMRegister msgtmp2 = xmm5;
3805     const XMMRegister msgtmp3 = xmm6;
3806     const XMMRegister msgtmp4 = xmm7;
3807 
3808     const XMMRegister shuf_mask = xmm8;
3809 
3810     __ enter();
3811 
3812     __ sha512_AVX2(msg, state0, state1, msgtmp0, msgtmp1, msgtmp2, msgtmp3, msgtmp4,
3813     buf, state, ofs, limit, rsp, multi_block, shuf_mask);
3814 
3815     __ leave();
3816     __ ret(0);
3817     return start;
3818   }
3819 
3820   // This is a version of CTR/AES crypt which does 6 blocks in a loop at a time
3821   // to hide instruction latency
3822   //
3823   // Arguments:
3824   //
3825   // Inputs:
3826   //   c_rarg0   - source byte array address
3827   //   c_rarg1   - destination byte array address
3828   //   c_rarg2   - K (key) in little endian int array
3829   //   c_rarg3   - counter vector byte array address
3830   //   Linux
3831   //     c_rarg4   -          input length
3832   //     c_rarg5   -          saved encryptedCounter start
3833   //     rbp + 6 * wordSize - saved used length
3834   //   Windows
3835   //     rbp + 6 * wordSize - input length
3836   //     rbp + 7 * wordSize - saved encryptedCounter start
3837   //     rbp + 8 * wordSize - saved used length
3838   //
3839   // Output:
3840   //   rax       - input length
3841   //
3842   address generate_counterMode_AESCrypt_Parallel() {
3843     assert(UseAES, "need AES instructions and misaligned SSE support");
3844     __ align(CodeEntryAlignment);
3845     StubCodeMark mark(this, "StubRoutines", "counterMode_AESCrypt");
3846     address start = __ pc();
3847     const Register from = c_rarg0; // source array address
3848     const Register to = c_rarg1; // destination array address
3849     const Register key = c_rarg2; // key array address
3850     const Register counter = c_rarg3; // counter byte array initialized from counter array address
3851                                       // and updated with the incremented counter in the end
3852 #ifndef _WIN64
3853     const Register len_reg = c_rarg4;
3854     const Register saved_encCounter_start = c_rarg5;
3855     const Register used_addr = r10;
3856     const Address  used_mem(rbp, 2 * wordSize);
3857     const Register used = r11;
3858 #else
    const Address len_mem(rbp, 6 * wordSize); // length is on stack on Win64
    const Address saved_encCounter_mem(rbp, 7 * wordSize); // saved encryptedCounter start is on stack on Win64
    const Address used_mem(rbp, 8 * wordSize); // saved used length is on stack on Win64
    const Register len_reg = r10; // pick the first volatile Windows register
3863     const Register saved_encCounter_start = r11;
3864     const Register used_addr = r13;
3865     const Register used = r14;
3866 #endif
3867     const Register pos = rax;
3868 
3869     const int PARALLEL_FACTOR = 6;
3870     const XMMRegister xmm_counter_shuf_mask = xmm0;
3871     const XMMRegister xmm_key_shuf_mask = xmm1; // used temporarily to swap key bytes up front
3872     const XMMRegister xmm_curr_counter = xmm2;
3873 
3874     const XMMRegister xmm_key_tmp0 = xmm3;
3875     const XMMRegister xmm_key_tmp1 = xmm4;
3876 
3877     // registers holding the four results in the parallelized loop
3878     const XMMRegister xmm_result0 = xmm5;
3879     const XMMRegister xmm_result1 = xmm6;
3880     const XMMRegister xmm_result2 = xmm7;
3881     const XMMRegister xmm_result3 = xmm8;
3882     const XMMRegister xmm_result4 = xmm9;
3883     const XMMRegister xmm_result5 = xmm10;
3884 
3885     const XMMRegister xmm_from0 = xmm11;
3886     const XMMRegister xmm_from1 = xmm12;
3887     const XMMRegister xmm_from2 = xmm13;
    const XMMRegister xmm_from3 = xmm14; // the last one is xmm14; we have to preserve it on WIN64
    const XMMRegister xmm_from4 = xmm3;  // reuse xmm3-xmm4; xmm_key_tmp0/1 are not needed while loading the input text
    const XMMRegister xmm_from5 = xmm4;
3891 
3892     //for key_128, key_192, key_256
3893     const int rounds[3] = {10, 12, 14};
3894     Label L_exit_preLoop, L_preLoop_start;
3895     Label L_multiBlock_loopTop[3];
3896     Label L_singleBlockLoopTop[3];
3897     Label L__incCounter[3][6]; //for 6 blocks
3898     Label L__incCounter_single[3]; //for single block, key128, key192, key256
3899     Label L_processTail_insr[3], L_processTail_4_insr[3], L_processTail_2_insr[3], L_processTail_1_insr[3], L_processTail_exit_insr[3];
3900     Label L_processTail_extr[3], L_processTail_4_extr[3], L_processTail_2_extr[3], L_processTail_1_extr[3], L_processTail_exit_extr[3];
3901 
3902     Label L_exit;
3903 
3904     __ enter(); // required for proper stackwalking of RuntimeStub frame
3905 
3906     // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
3907     // context for the registers used, where all instructions below are using 128-bit mode
3908     // On EVEX without VL and BW, these instructions will all be AVX.
3909     if (VM_Version::supports_avx512vlbw()) {
3910         __ movl(rax, 0xffff);
3911         __ kmovql(k1, rax);
3912     }
3913 
3914 #ifdef _WIN64
3915     // allocate spill slots for r13, r14
3916     enum {
3917         saved_r13_offset,
3918         saved_r14_offset
3919     };
3920     __ subptr(rsp, 2 * wordSize);
3921     __ movptr(Address(rsp, saved_r13_offset * wordSize), r13);
3922     __ movptr(Address(rsp, saved_r14_offset * wordSize), r14);
3923 
3924     // on win64, fill len_reg from stack position
3925     __ movl(len_reg, len_mem);
3926     __ movptr(saved_encCounter_start, saved_encCounter_mem);
3927     __ movptr(used_addr, used_mem);
3928     __ movl(used, Address(used_addr, 0));
3929 #else
3930     __ push(len_reg); // Save
3931     __ movptr(used_addr, used_mem);
3932     __ movl(used, Address(used_addr, 0));
3933 #endif
3934 
3935     __ push(rbx); // Save RBX
3936     __ movdqu(xmm_curr_counter, Address(counter, 0x00)); // initialize counter with initial counter
3937     __ movdqu(xmm_counter_shuf_mask, ExternalAddress(StubRoutines::x86::counter_shuffle_mask_addr()));
3938     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled
3939     __ movptr(pos, 0);
3940 
    // Use the partially used encrypted counter from the last invocation
3942     __ BIND(L_preLoop_start);
3943     __ cmpptr(used, 16);
3944     __ jcc(Assembler::aboveEqual, L_exit_preLoop);
3945       __ cmpptr(len_reg, 0);
3946       __ jcc(Assembler::lessEqual, L_exit_preLoop);
3947       __ movb(rbx, Address(saved_encCounter_start, used));
3948       __ xorb(rbx, Address(from, pos));
3949       __ movb(Address(to, pos), rbx);
3950       __ addptr(pos, 1);
3951       __ addptr(used, 1);
3952       __ subptr(len_reg, 1);
3953 
3954     __ jmp(L_preLoop_start);
3955 
3956     __ BIND(L_exit_preLoop);
3957     __ movl(Address(used_addr, 0), used);
3958 
3959     // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
3960     __ movdqu(xmm_key_shuf_mask, ExternalAddress(StubRoutines::x86::key_shuffle_mask_addr()));
3961     __ movl(rbx, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3962     __ cmpl(rbx, 52);
3963     __ jcc(Assembler::equal, L_multiBlock_loopTop[1]);
3964     __ cmpl(rbx, 60);
3965     __ jcc(Assembler::equal, L_multiBlock_loopTop[2]);
3966 
3967 #define CTR_DoSix(opc, src_reg)                \
3968     __ opc(xmm_result0, src_reg);              \
3969     __ opc(xmm_result1, src_reg);              \
3970     __ opc(xmm_result2, src_reg);              \
3971     __ opc(xmm_result3, src_reg);              \
3972     __ opc(xmm_result4, src_reg);              \
3973     __ opc(xmm_result5, src_reg);
3974 
3975     // k == 0 :  generate code for key_128
3976     // k == 1 :  generate code for key_192
3977     // k == 2 :  generate code for key_256
3978     for (int k = 0; k < 3; ++k) {
      // the multi-block loop starts here
3980       __ align(OptoLoopAlignment);
3981       __ BIND(L_multiBlock_loopTop[k]);
3982       __ cmpptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // see if at least PARALLEL_FACTOR blocks left
3983       __ jcc(Assembler::less, L_singleBlockLoopTop[k]);
3984       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
3985 
      // load, then increase counters
3987       CTR_DoSix(movdqa, xmm_curr_counter);
3988       inc_counter(rbx, xmm_result1, 0x01, L__incCounter[k][0]);
3989       inc_counter(rbx, xmm_result2, 0x02, L__incCounter[k][1]);
3990       inc_counter(rbx, xmm_result3, 0x03, L__incCounter[k][2]);
3991       inc_counter(rbx, xmm_result4, 0x04, L__incCounter[k][3]);
3992       inc_counter(rbx, xmm_result5,  0x05, L__incCounter[k][4]);
3993       inc_counter(rbx, xmm_curr_counter, 0x06, L__incCounter[k][5]);
3994       CTR_DoSix(pshufb, xmm_counter_shuf_mask); // after increased, shuffled counters back for PXOR
3995       CTR_DoSix(pxor, xmm_key_tmp0);   //PXOR with Round 0 key
3996 
3997       //load two ROUND_KEYs at a time
3998       for (int i = 1; i < rounds[k]; ) {
3999         load_key(xmm_key_tmp1, key, (0x10 * i), xmm_key_shuf_mask);
4000         load_key(xmm_key_tmp0, key, (0x10 * (i+1)), xmm_key_shuf_mask);
4001         CTR_DoSix(aesenc, xmm_key_tmp1);
4002         i++;
4003         if (i != rounds[k]) {
4004           CTR_DoSix(aesenc, xmm_key_tmp0);
4005         } else {
4006           CTR_DoSix(aesenclast, xmm_key_tmp0);
4007         }
4008         i++;
4009       }
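           // For example, assuming rounds[k] holds the AES round count (10/12/14), the AES-128
           // case issues aesenc with round keys 1..9 and finishes with aesenclast on round
           // key 10; round key 0 was already folded in by CTR_DoSix(pxor, xmm_key_tmp0) above.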
4010 
4011       // get next PARALLEL_FACTOR blocks into xmm_result registers
4012       __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4013       __ movdqu(xmm_from1, Address(from, pos, Address::times_1, 1 * AESBlockSize));
4014       __ movdqu(xmm_from2, Address(from, pos, Address::times_1, 2 * AESBlockSize));
4015       __ movdqu(xmm_from3, Address(from, pos, Address::times_1, 3 * AESBlockSize));
4016       __ movdqu(xmm_from4, Address(from, pos, Address::times_1, 4 * AESBlockSize));
4017       __ movdqu(xmm_from5, Address(from, pos, Address::times_1, 5 * AESBlockSize));
4018 
4019       __ pxor(xmm_result0, xmm_from0);
4020       __ pxor(xmm_result1, xmm_from1);
4021       __ pxor(xmm_result2, xmm_from2);
4022       __ pxor(xmm_result3, xmm_from3);
4023       __ pxor(xmm_result4, xmm_from4);
4024       __ pxor(xmm_result5, xmm_from5);
4025 
4026       // store 6 results into the next 64 bytes of output
4027       __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4028       __ movdqu(Address(to, pos, Address::times_1, 1 * AESBlockSize), xmm_result1);
4029       __ movdqu(Address(to, pos, Address::times_1, 2 * AESBlockSize), xmm_result2);
4030       __ movdqu(Address(to, pos, Address::times_1, 3 * AESBlockSize), xmm_result3);
4031       __ movdqu(Address(to, pos, Address::times_1, 4 * AESBlockSize), xmm_result4);
4032       __ movdqu(Address(to, pos, Address::times_1, 5 * AESBlockSize), xmm_result5);
4033 
4034       __ addptr(pos, PARALLEL_FACTOR * AESBlockSize); // advance past the blocks just processed
4035       __ subptr(len_reg, PARALLEL_FACTOR * AESBlockSize); // decrease the remaining length
4036       __ jmp(L_multiBlock_loopTop[k]);
4037 
4038       // singleBlock starts here
4039       __ align(OptoLoopAlignment);
4040       __ BIND(L_singleBlockLoopTop[k]);
4041       __ cmpptr(len_reg, 0);
4042       __ jcc(Assembler::lessEqual, L_exit);
4043       load_key(xmm_key_tmp0, key, 0x00, xmm_key_shuf_mask);
4044       __ movdqa(xmm_result0, xmm_curr_counter);
4045       inc_counter(rbx, xmm_curr_counter, 0x01, L__incCounter_single[k]);
4046       __ pshufb(xmm_result0, xmm_counter_shuf_mask);
4047       __ pxor(xmm_result0, xmm_key_tmp0);
4048       for (int i = 1; i < rounds[k]; i++) {
4049         load_key(xmm_key_tmp0, key, (0x10 * i), xmm_key_shuf_mask);
4050         __ aesenc(xmm_result0, xmm_key_tmp0);
4051       }
4052       load_key(xmm_key_tmp0, key, (rounds[k] * 0x10), xmm_key_shuf_mask);
4053       __ aesenclast(xmm_result0, xmm_key_tmp0);
4054       __ cmpptr(len_reg, AESBlockSize);
4055       __ jcc(Assembler::less, L_processTail_insr[k]);
4056         __ movdqu(xmm_from0, Address(from, pos, Address::times_1, 0 * AESBlockSize));
4057         __ pxor(xmm_result0, xmm_from0);
4058         __ movdqu(Address(to, pos, Address::times_1, 0 * AESBlockSize), xmm_result0);
4059         __ addptr(pos, AESBlockSize);
4060         __ subptr(len_reg, AESBlockSize);
4061         __ jmp(L_singleBlockLoopTop[k]);
4062       __ BIND(L_processTail_insr[k]);                               // Process the tail part of the input array
4063         __ addptr(pos, len_reg);                                    // 1. Insert bytes from src array into xmm_from0 register
4064         __ testptr(len_reg, 8);
4065         __ jcc(Assembler::zero, L_processTail_4_insr[k]);
4066           __ subptr(pos,8);
4067           __ pinsrq(xmm_from0, Address(from, pos), 0);
4068         __ BIND(L_processTail_4_insr[k]);
4069         __ testptr(len_reg, 4);
4070         __ jcc(Assembler::zero, L_processTail_2_insr[k]);
4071           __ subptr(pos,4);
4072           __ pslldq(xmm_from0, 4);
4073           __ pinsrd(xmm_from0, Address(from, pos), 0);
4074         __ BIND(L_processTail_2_insr[k]);
4075         __ testptr(len_reg, 2);
4076         __ jcc(Assembler::zero, L_processTail_1_insr[k]);
4077           __ subptr(pos, 2);
4078           __ pslldq(xmm_from0, 2);
4079           __ pinsrw(xmm_from0, Address(from, pos), 0);
4080         __ BIND(L_processTail_1_insr[k]);
4081         __ testptr(len_reg, 1);
4082         __ jcc(Assembler::zero, L_processTail_exit_insr[k]);
4083           __ subptr(pos, 1);
4084           __ pslldq(xmm_from0, 1);
4085           __ pinsrb(xmm_from0, Address(from, pos), 0);
4086         __ BIND(L_processTail_exit_insr[k]);
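             // Sketch: for a 13-byte tail (8 + 4 + 1), the sequence above inserts the last 8
             // tail bytes into the low qword, shifts them up and inserts the next 4 bytes
             // below them, then shifts again and inserts the first byte, leaving the 13 tail
             // bytes packed in order in the low 13 bytes of xmm_from0.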
4087 
4088         __ movdqu(Address(saved_encCounter_start, 0), xmm_result0);  // 2. Save the encrypted counter for the next invocation,
4089         __ pxor(xmm_result0, xmm_from0);                             //    then XOR it with the plaintext bytes.
4090 
4091         __ testptr(len_reg, 8);
4092         __ jcc(Assembler::zero, L_processTail_4_extr[k]);            // 3. Extract bytes from xmm_result0 into the dest. array
4093           __ pextrq(Address(to, pos), xmm_result0, 0);
4094           __ psrldq(xmm_result0, 8);
4095           __ addptr(pos, 8);
4096         __ BIND(L_processTail_4_extr[k]);
4097         __ testptr(len_reg, 4);
4098         __ jcc(Assembler::zero, L_processTail_2_extr[k]);
4099           __ pextrd(Address(to, pos), xmm_result0, 0);
4100           __ psrldq(xmm_result0, 4);
4101           __ addptr(pos, 4);
4102         __ BIND(L_processTail_2_extr[k]);
4103         __ testptr(len_reg, 2);
4104         __ jcc(Assembler::zero, L_processTail_1_extr[k]);
4105           __ pextrw(Address(to, pos), xmm_result0, 0);
4106           __ psrldq(xmm_result0, 2);
4107           __ addptr(pos, 2);
4108         __ BIND(L_processTail_1_extr[k]);
4109         __ testptr(len_reg, 1);
4110         __ jcc(Assembler::zero, L_processTail_exit_extr[k]);
4111           __ pextrb(Address(to, pos), xmm_result0, 0);
4112 
4113         __ BIND(L_processTail_exit_extr[k]);
4114         __ movl(Address(used_addr, 0), len_reg);
4115         __ jmp(L_exit);
4116 
4117     }
4118 
4119     __ BIND(L_exit);
4120     __ pshufb(xmm_curr_counter, xmm_counter_shuf_mask); //counter is shuffled back.
4121     __ movdqu(Address(counter, 0), xmm_curr_counter); //save counter back
4122     __ pop(rbx); // pop the saved RBX.
4123 #ifdef _WIN64
4124     __ movl(rax, len_mem);
4125     __ movptr(r13, Address(rsp, saved_r13_offset * wordSize));
4126     __ movptr(r14, Address(rsp, saved_r14_offset * wordSize));
4127     __ addptr(rsp, 2 * wordSize);
4128 #else
4129     __ pop(rax); // return 'len'
4130 #endif
4131     __ leave(); // required for proper stackwalking of RuntimeStub frame
4132     __ ret(0);
4133     return start;
4134   }
4135 
4136   // pshufb mask that swaps the two 64-bit halves (longs) of a 128-bit value
4137   address generate_ghash_long_swap_mask() {
4138     __ align(CodeEntryAlignment);
4139     StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask");
4140     address start = __ pc();
4141     __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none );
4142     __ emit_data64(0x0706050403020100, relocInfo::none );
4143     return start;
4144   }
4145 
4146   // pshufb mask that reverses all 16 bytes of a 128-bit value
4147   address generate_ghash_byte_swap_mask() {
4148     __ align(CodeEntryAlignment);
4149     StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask");
4150     address start = __ pc();
4151     __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none );
4152     __ emit_data64(0x0001020304050607, relocInfo::none );
4153     return start;
4154   }
4155 
4156   /* Single and multi-block ghash operations */
4157   address generate_ghash_processBlocks() {
4158     __ align(CodeEntryAlignment);
4159     Label L_ghash_loop, L_exit;
4160     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4161     address start = __ pc();
4162 
4163     const Register state        = c_rarg0;
4164     const Register subkeyH      = c_rarg1;
4165     const Register data         = c_rarg2;
4166     const Register blocks       = c_rarg3;
4167 
4168     const XMMRegister xmm_temp0 = xmm0;
4169     const XMMRegister xmm_temp1 = xmm1;
4170     const XMMRegister xmm_temp2 = xmm2;
4171     const XMMRegister xmm_temp3 = xmm3;
4172     const XMMRegister xmm_temp4 = xmm4;
4173     const XMMRegister xmm_temp5 = xmm5;
4174     const XMMRegister xmm_temp6 = xmm6;
4175     const XMMRegister xmm_temp7 = xmm7;
4176     const XMMRegister xmm_temp8 = xmm8;
4177     const XMMRegister xmm_temp9 = xmm9;
4178     const XMMRegister xmm_temp10 = xmm10;
4179 
4180     __ enter();
4181 
4182     // For EVEX with VL and BW, provide a standard mask; VL = 128 will guide the merge
4183     // context for the registers used, where all instructions below use 128-bit mode.
4184     // On EVEX without VL and BW, these instructions will all be AVX.
4185     if (VM_Version::supports_avx512vlbw()) {
4186       __ movl(rax, 0xffff);
4187       __ kmovql(k1, rax);
4188     }
4189 
4190     __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr()));
4191 
4192     __ movdqu(xmm_temp0, Address(state, 0));
4193     __ pshufb(xmm_temp0, xmm_temp10);
4194 
4195 
4196     __ BIND(L_ghash_loop);
4197     __ movdqu(xmm_temp2, Address(data, 0));
4198     __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr()));
4199 
4200     __ movdqu(xmm_temp1, Address(subkeyH, 0));
4201     __ pshufb(xmm_temp1, xmm_temp10);
4202 
4203     __ pxor(xmm_temp0, xmm_temp2);
4204 
4205     //
4206     // Multiply with the hash key
4207     //
4208     __ movdqu(xmm_temp3, xmm_temp0);
4209     __ pclmulqdq(xmm_temp3, xmm_temp1, 0);      // xmm3 holds a0*b0
4210     __ movdqu(xmm_temp4, xmm_temp0);
4211     __ pclmulqdq(xmm_temp4, xmm_temp1, 16);     // xmm4 holds a0*b1
4212 
4213     __ movdqu(xmm_temp5, xmm_temp0);
4214     __ pclmulqdq(xmm_temp5, xmm_temp1, 1);      // xmm5 holds a1*b0
4215     __ movdqu(xmm_temp6, xmm_temp0);
4216     __ pclmulqdq(xmm_temp6, xmm_temp1, 17);     // xmm6 holds a1*b1
4217 
4218     __ pxor(xmm_temp4, xmm_temp5);      // xmm4 holds a0*b1 + a1*b0
4219 
4220     __ movdqu(xmm_temp5, xmm_temp4);    // move the contents of xmm4 to xmm5
4221     __ psrldq(xmm_temp4, 8);    // shift xmm4 right by 64 bits
4222     __ pslldq(xmm_temp5, 8);    // shift xmm5 left by 64 bits
4223     __ pxor(xmm_temp3, xmm_temp5);
4224     __ pxor(xmm_temp6, xmm_temp4);      // Register pair <xmm6:xmm3> holds the result
4225                                         // of the carry-less multiplication of
4226                                         // xmm0 by xmm1.
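         // In carry-less (GF(2)[x]) arithmetic the product of a = a1:a0 and b = b1:b0 is
         //
         //   a * b = (a1*b1) << 128  ^  (a0*b1 ^ a1*b0) << 64  ^  (a0*b0)
         //
         // which is what the four PCLMULQDQ results and the shift/XOR sequence above assemble
         // into the 256-bit intermediate held in <xmm6:xmm3>.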
4227 
4228     // We shift the result of the multiplication by one bit position
4229     // to the left to compensate for the fact that the bits are reversed.
4230     __ movdqu(xmm_temp7, xmm_temp3);
4231     __ movdqu(xmm_temp8, xmm_temp6);
4232     __ pslld(xmm_temp3, 1);
4233     __ pslld(xmm_temp6, 1);
4234     __ psrld(xmm_temp7, 31);
4235     __ psrld(xmm_temp8, 31);
4236     __ movdqu(xmm_temp9, xmm_temp7);
4237     __ pslldq(xmm_temp8, 4);
4238     __ pslldq(xmm_temp7, 4);
4239     __ psrldq(xmm_temp9, 12);
4240     __ por(xmm_temp3, xmm_temp7);
4241     __ por(xmm_temp6, xmm_temp8);
4242     __ por(xmm_temp6, xmm_temp9);
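         // There is no single instruction for a 1-bit left shift of the 256-bit value in
         // <xmm6:xmm3>, so it is composed above from 32-bit lane shifts: each dword is shifted
         // left by one (pslld 1) and the bit shifted out of each lane (recovered with psrld 31)
         // is OR-ed into the next higher lane; psrldq 12 carries the top bit of xmm3 over
         // into xmm6.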
4243 
4244     //
4245     // First phase of the reduction
4246     //
4247     // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
4248     // independently.
4249     __ movdqu(xmm_temp7, xmm_temp3);
4250     __ movdqu(xmm_temp8, xmm_temp3);
4251     __ movdqu(xmm_temp9, xmm_temp3);
4252     __ pslld(xmm_temp7, 31);    // packed left shift << 31
4253     __ pslld(xmm_temp8, 30);    // packed left shift << 30
4254     __ pslld(xmm_temp9, 25);    // packed left shift << 25
4255     __ pxor(xmm_temp7, xmm_temp8);      // xor the shifted versions
4256     __ pxor(xmm_temp7, xmm_temp9);
4257     __ movdqu(xmm_temp8, xmm_temp7);
4258     __ pslldq(xmm_temp7, 12);
4259     __ psrldq(xmm_temp8, 4);
4260     __ pxor(xmm_temp3, xmm_temp7);      // first phase of the reduction complete
4261 
4262     //
4263     // Second phase of the reduction
4264     //
4265     // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
4266     // shift operations.
4267     __ movdqu(xmm_temp2, xmm_temp3);
4268     __ movdqu(xmm_temp4, xmm_temp3);
4269     __ movdqu(xmm_temp5, xmm_temp3);
4270     __ psrld(xmm_temp2, 1);     // packed right shift >> 1
4271     __ psrld(xmm_temp4, 2);     // packed right shift >> 2
4272     __ psrld(xmm_temp5, 7);     // packed right shift >> 7
4273     __ pxor(xmm_temp2, xmm_temp4);      // xor the shifted versions
4274     __ pxor(xmm_temp2, xmm_temp5);
4275     __ pxor(xmm_temp2, xmm_temp8);
4276     __ pxor(xmm_temp3, xmm_temp2);
4277     __ pxor(xmm_temp6, xmm_temp3);      // the result is in xmm6
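         // The two phases above fold the high 128 bits of the product back into the low
         // 128 bits, i.e. they reduce modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1
         // (in the bit-reflected representation used here); the shift amounts 31/30/25 and
         // 1/2/7 correspond to the x, x^2 and x^7 terms.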
4278 
4279     __ decrement(blocks);
4280     __ jcc(Assembler::zero, L_exit);
4281     __ movdqu(xmm_temp0, xmm_temp6);
4282     __ addptr(data, 16);
4283     __ jmp(L_ghash_loop);
4284 
4285     __ BIND(L_exit);
4286     __ pshufb(xmm_temp6, xmm_temp10);          // Byte swap 16-byte result
4287     __ movdqu(Address(state, 0), xmm_temp6);   // store the result
4288 
4289     __ leave();
4290     __ ret(0);
4291     return start;
4292   }
4293 
4294   /**
4295    *  Arguments:
4296    *
4297    * Inputs:
4298    *   c_rarg0   - int crc
4299    *   c_rarg1   - byte* buf
4300    *   c_rarg2   - int length
4301    *
4302    * Output:
4303    *       rax   - int crc result
4304    */
4305   address generate_updateBytesCRC32() {
4306     assert(UseCRC32Intrinsics, "need AVX and CLMUL instructions");
4307 
4308     __ align(CodeEntryAlignment);
4309     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4310 
4311     address start = __ pc();
4312     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4313     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4314     // rscratch1: r10
4315     const Register crc   = c_rarg0;  // crc
4316     const Register buf   = c_rarg1;  // source java byte array address
4317     const Register len   = c_rarg2;  // length
4318     const Register table = c_rarg3;  // crc_table address (reuse register)
4319     const Register tmp   = r11;
4320     assert_different_registers(crc, buf, len, table, tmp, rax);
4321 
4322     BLOCK_COMMENT("Entry:");
4323     __ enter(); // required for proper stackwalking of RuntimeStub frame
4324 
4325     __ kernel_crc32(crc, buf, len, table, tmp);
4326 
4327     __ movl(rax, crc);
4328     __ leave(); // required for proper stackwalking of RuntimeStub frame
4329     __ ret(0);
4330 
4331     return start;
4332   }
4333 
4334   /**
4335   *  Arguments:
4336   *
4337   * Inputs:
4338   *   c_rarg0   - int crc
4339   *   c_rarg1   - byte* buf
4340   *   c_rarg2   - long length
4341   *   c_rarg3   - table_start - optional (present only when doing a library_call,
4342   *              not used by x86 algorithm)
4343   *
4344   * Output:
4345   *       rax   - int crc result
4346   */
4347   address generate_updateBytesCRC32C(bool is_pclmulqdq_supported) {
4348       assert(UseCRC32CIntrinsics, "need SSE4_2");
4349       __ align(CodeEntryAlignment);
4350       StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4351       address start = __ pc();
4352       //reg.arg        int#0        int#1        int#2        int#3        int#4        int#5        float regs
4353       //Windows        RCX          RDX          R8           R9           none         none         XMM0..XMM3
4354       //Lin / Sol      RDI          RSI          RDX          RCX          R8           R9           XMM0..XMM7
4355       const Register crc = c_rarg0;  // crc
4356       const Register buf = c_rarg1;  // source java byte array address
4357       const Register len = c_rarg2;  // length
4358       const Register a = rax;
4359       const Register j = r9;
4360       const Register k = r10;
4361       const Register l = r11;
4362 #ifdef _WIN64
4363       const Register y = rdi;
4364       const Register z = rsi;
4365 #else
4366       const Register y = rcx;
4367       const Register z = r8;
4368 #endif
4369       assert_different_registers(crc, buf, len, a, j, k, l, y, z);
4370 
4371       BLOCK_COMMENT("Entry:");
4372       __ enter(); // required for proper stackwalking of RuntimeStub frame
4373 #ifdef _WIN64
4374       __ push(y);
4375       __ push(z);
4376 #endif
4377       __ crc32c_ipl_alg2_alt2(crc, buf, len,
4378                               a, j, k,
4379                               l, y, z,
4380                               c_farg0, c_farg1, c_farg2,
4381                               is_pclmulqdq_supported);
4382       __ movl(rax, crc);
4383 #ifdef _WIN64
4384       __ pop(z);
4385       __ pop(y);
4386 #endif
4387       __ leave(); // required for proper stackwalking of RuntimeStub frame
4388       __ ret(0);
4389 
4390       return start;
4391   }
4392 
4393   /**
4394    *  Arguments:
4395    *
4396    *  Input:
4397    *    c_rarg0   - x address
4398    *    c_rarg1   - x length
4399    *    c_rarg2   - y address
4400    *    c_rarg3   - y length
4401    * not Win64
4402    *    c_rarg4   - z address
4403    *    c_rarg5   - z length
4404    * Win64
4405    *    rsp+40    - z address
4406    *    rsp+48    - z length
4407    */
4408   address generate_multiplyToLen() {
4409     __ align(CodeEntryAlignment);
4410     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4411 
4412     address start = __ pc();
4413     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4414     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4415     const Register x     = rdi;
4416     const Register xlen  = rax;
4417     const Register y     = rsi;
4418     const Register ylen  = rcx;
4419     const Register z     = r8;
4420     const Register zlen  = r11;
4421 
4422     // Next registers will be saved on stack in multiply_to_len().
4423     const Register tmp1  = r12;
4424     const Register tmp2  = r13;
4425     const Register tmp3  = r14;
4426     const Register tmp4  = r15;
4427     const Register tmp5  = rbx;
4428 
4429     BLOCK_COMMENT("Entry:");
4430     __ enter(); // required for proper stackwalking of RuntimeStub frame
4431 
4432 #ifndef _WIN64
4433     __ movptr(zlen, r9); // Save r9 in r11 - zlen
4434 #endif
4435     setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
4436                        // ylen => rcx, z => r8, zlen => r11
4437                        // r9 and r10 may be used to save non-volatile registers
4438 #ifdef _WIN64
4439     // last 2 arguments (#4, #5) are on stack on Win64
4440     __ movptr(z, Address(rsp, 6 * wordSize));
4441     __ movptr(zlen, Address(rsp, 7 * wordSize));
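         // (rsp+40 and rsp+48 in the incoming frame become 6*wordSize and 7*wordSize here
         //  because enter() has pushed rbp on top of the return address)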
4442 #endif
4443 
4444     __ movptr(xlen, rsi);
4445     __ movptr(y,    rdx);
4446     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
4447 
4448     restore_arg_regs();
4449 
4450     __ leave(); // required for proper stackwalking of RuntimeStub frame
4451     __ ret(0);
4452 
4453     return start;
4454   }
4455 
4456   /**
4457   *  Arguments:
4458   *
4459   *  Input:
4460   *    c_rarg0   - obja     address
4461   *    c_rarg1   - objb     address
4462   *    c_rarg2   - length   length
4463   *    c_rarg3   - scale    log2_array_indxscale
4464   *
4465   *  Output:
4466   *        rax   - int; >= 0: index of the first mismatch, < 0: bitwise complement of the tail
4467   */
4468   address generate_vectorizedMismatch() {
4469     __ align(CodeEntryAlignment);
4470     StubCodeMark mark(this, "StubRoutines", "vectorizedMismatch");
4471     address start = __ pc();
4472 
4473     BLOCK_COMMENT("Entry:");
4474     __ enter();
4475 
4476 #ifdef _WIN64  // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4477     const Register scale = c_rarg0;  //rcx, will exchange with r9
4478     const Register objb = c_rarg1;   //rdx
4479     const Register length = c_rarg2; //r8
4480     const Register obja = c_rarg3;   //r9
4481     __ xchgq(obja, scale);  //now obja and scale contains the correct contents
4482 
4483     const Register tmp1 = r10;
4484     const Register tmp2 = r11;
4485 #endif
4486 #ifndef _WIN64 // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4487     const Register obja = c_rarg0;   //U:rdi
4488     const Register objb = c_rarg1;   //U:rsi
4489     const Register length = c_rarg2; //U:rdx
4490     const Register scale = c_rarg3;  //U:rcx
4491     const Register tmp1 = r8;
4492     const Register tmp2 = r9;
4493 #endif
4494     const Register result = rax; //return value
4495     const XMMRegister vec0 = xmm0;
4496     const XMMRegister vec1 = xmm1;
4497     const XMMRegister vec2 = xmm2;
4498 
4499     __ vectorized_mismatch(obja, objb, length, scale, result, tmp1, tmp2, vec0, vec1, vec2);
4500 
4501     __ leave();
4502     __ ret(0);
4503 
4504     return start;
4505   }
4506 
4507   /**
4508    *  Arguments:
4509    *
4510    *  Input:
4511    *    c_rarg0   - x address
4512    *    c_rarg1   - x length
4513    *    c_rarg2   - z address
4514    *    c_rarg3   - z length
4515    *
4516    */
4517   address generate_squareToLen() {
4518 
4519     __ align(CodeEntryAlignment);
4520     StubCodeMark mark(this, "StubRoutines", "squareToLen");
4521 
4522     address start = __ pc();
4523     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4524     // Unix:  rdi, rsi, rdx, rcx (c_rarg0, c_rarg1, ...)
4525     const Register x      = rdi;
4526     const Register len    = rsi;
4527     const Register z      = r8;
4528     const Register zlen   = rcx;
4529 
4530     const Register tmp1   = r12;
4531     const Register tmp2   = r13;
4532     const Register tmp3   = r14;
4533     const Register tmp4   = r15;
4534     const Register tmp5   = rbx;
4535 
4536     BLOCK_COMMENT("Entry:");
4537     __ enter(); // required for proper stackwalking of RuntimeStub frame
4538 
4539     setup_arg_regs(4); // x => rdi, len => rsi, z => rdx
4540                        // zlen => rcx
4541                        // r9 and r10 may be used to save non-volatile registers
4542     __ movptr(r8, rdx);
4543     __ square_to_len(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4544 
4545     restore_arg_regs();
4546 
4547     __ leave(); // required for proper stackwalking of RuntimeStub frame
4548     __ ret(0);
4549 
4550     return start;
4551   }
4552 
4553    /**
4554    *  Arguments:
4555    *
4556    *  Input:
4557    *    c_rarg0   - out address
4558    *    c_rarg1   - in address
4559    *    c_rarg2   - offset
4560    *    c_rarg3   - len
4561    * not Win64
4562    *    c_rarg4   - k
4563    * Win64
4564    *    rsp+40    - k
4565    */
4566   address generate_mulAdd() {
4567     __ align(CodeEntryAlignment);
4568     StubCodeMark mark(this, "StubRoutines", "mulAdd");
4569 
4570     address start = __ pc();
4571     // Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
4572     // Unix:  rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
4573     const Register out     = rdi;
4574     const Register in      = rsi;
4575     const Register offset  = r11;
4576     const Register len     = rcx;
4577     const Register k       = r8;
4578 
4579     // Next registers will be saved on stack in mul_add().
4580     const Register tmp1  = r12;
4581     const Register tmp2  = r13;
4582     const Register tmp3  = r14;
4583     const Register tmp4  = r15;
4584     const Register tmp5  = rbx;
4585 
4586     BLOCK_COMMENT("Entry:");
4587     __ enter(); // required for proper stackwalking of RuntimeStub frame
4588 
4589     setup_arg_regs(4); // out => rdi, in => rsi, offset => rdx
4590                        // len => rcx, k => r8
4591                        // r9 and r10 may be used to save non-volatile registers
4592 #ifdef _WIN64
4593     // last argument is on stack on Win64
4594     __ movl(k, Address(rsp, 6 * wordSize));
4595 #endif
4596     __ movptr(r11, rdx);  // move offset in rdx to offset(r11)
4597     __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3, tmp4, tmp5, rdx, rax);
4598 
4599     restore_arg_regs();
4600 
4601     __ leave(); // required for proper stackwalking of RuntimeStub frame
4602     __ ret(0);
4603 
4604     return start;
4605   }
4606 
4607   address generate_libmExp() {
4608     address start = __ pc();
4609 
4610     const XMMRegister x0  = xmm0;
4611     const XMMRegister x1  = xmm1;
4612     const XMMRegister x2  = xmm2;
4613     const XMMRegister x3  = xmm3;
4614 
4615     const XMMRegister x4  = xmm4;
4616     const XMMRegister x5  = xmm5;
4617     const XMMRegister x6  = xmm6;
4618     const XMMRegister x7  = xmm7;
4619 
4620     const Register tmp   = r11;
4621 
4622     BLOCK_COMMENT("Entry:");
4623     __ enter(); // required for proper stackwalking of RuntimeStub frame
4624 
4625       __ fast_exp(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4626 
4627     __ leave(); // required for proper stackwalking of RuntimeStub frame
4628     __ ret(0);
4629 
4630     return start;
4631 
4632   }
4633 
4634   address generate_libmLog() {
4635     address start = __ pc();
4636 
4637     const XMMRegister x0 = xmm0;
4638     const XMMRegister x1 = xmm1;
4639     const XMMRegister x2 = xmm2;
4640     const XMMRegister x3 = xmm3;
4641 
4642     const XMMRegister x4 = xmm4;
4643     const XMMRegister x5 = xmm5;
4644     const XMMRegister x6 = xmm6;
4645     const XMMRegister x7 = xmm7;
4646 
4647     const Register tmp1 = r11;
4648     const Register tmp2 = r8;
4649 
4650     BLOCK_COMMENT("Entry:");
4651     __ enter(); // required for proper stackwalking of RuntimeStub frame
4652 
4653     __ fast_log(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2);
4654 
4655     __ leave(); // required for proper stackwalking of RuntimeStub frame
4656     __ ret(0);
4657 
4658     return start;
4659 
4660   }
4661 
4662   address generate_libmLog10() {
4663     address start = __ pc();
4664 
4665     const XMMRegister x0 = xmm0;
4666     const XMMRegister x1 = xmm1;
4667     const XMMRegister x2 = xmm2;
4668     const XMMRegister x3 = xmm3;
4669 
4670     const XMMRegister x4 = xmm4;
4671     const XMMRegister x5 = xmm5;
4672     const XMMRegister x6 = xmm6;
4673     const XMMRegister x7 = xmm7;
4674 
4675     const Register tmp = r11;
4676 
4677     BLOCK_COMMENT("Entry:");
4678     __ enter(); // required for proper stackwalking of RuntimeStub frame
4679 
4680     __ fast_log10(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp);
4681 
4682     __ leave(); // required for proper stackwalking of RuntimeStub frame
4683     __ ret(0);
4684 
4685     return start;
4686 
4687   }
4688 
4689   address generate_libmPow() {
4690     address start = __ pc();
4691 
4692     const XMMRegister x0 = xmm0;
4693     const XMMRegister x1 = xmm1;
4694     const XMMRegister x2 = xmm2;
4695     const XMMRegister x3 = xmm3;
4696 
4697     const XMMRegister x4 = xmm4;
4698     const XMMRegister x5 = xmm5;
4699     const XMMRegister x6 = xmm6;
4700     const XMMRegister x7 = xmm7;
4701 
4702     const Register tmp1 = r8;
4703     const Register tmp2 = r9;
4704     const Register tmp3 = r10;
4705     const Register tmp4 = r11;
4706 
4707     BLOCK_COMMENT("Entry:");
4708     __ enter(); // required for proper stackwalking of RuntimeStub frame
4709 
4710     __ fast_pow(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4711 
4712     __ leave(); // required for proper stackwalking of RuntimeStub frame
4713     __ ret(0);
4714 
4715     return start;
4716 
4717   }
4718 
4719   address generate_libmSin() {
4720     address start = __ pc();
4721 
4722     const XMMRegister x0 = xmm0;
4723     const XMMRegister x1 = xmm1;
4724     const XMMRegister x2 = xmm2;
4725     const XMMRegister x3 = xmm3;
4726 
4727     const XMMRegister x4 = xmm4;
4728     const XMMRegister x5 = xmm5;
4729     const XMMRegister x6 = xmm6;
4730     const XMMRegister x7 = xmm7;
4731 
4732     const Register tmp1 = r8;
4733     const Register tmp2 = r9;
4734     const Register tmp3 = r10;
4735     const Register tmp4 = r11;
4736 
4737     BLOCK_COMMENT("Entry:");
4738     __ enter(); // required for proper stackwalking of RuntimeStub frame
4739 
4740 #ifdef _WIN64
4741     __ push(rsi);
4742     __ push(rdi);
4743 #endif
4744     __ fast_sin(x0, x1, x2, x3, x4, x5, x6, x7, rax, rbx, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4745 
4746 #ifdef _WIN64
4747     __ pop(rdi);
4748     __ pop(rsi);
4749 #endif
4750 
4751     __ leave(); // required for proper stackwalking of RuntimeStub frame
4752     __ ret(0);
4753 
4754     return start;
4755 
4756   }
4757 
4758   address generate_libmCos() {
4759     address start = __ pc();
4760 
4761     const XMMRegister x0 = xmm0;
4762     const XMMRegister x1 = xmm1;
4763     const XMMRegister x2 = xmm2;
4764     const XMMRegister x3 = xmm3;
4765 
4766     const XMMRegister x4 = xmm4;
4767     const XMMRegister x5 = xmm5;
4768     const XMMRegister x6 = xmm6;
4769     const XMMRegister x7 = xmm7;
4770 
4771     const Register tmp1 = r8;
4772     const Register tmp2 = r9;
4773     const Register tmp3 = r10;
4774     const Register tmp4 = r11;
4775 
4776     BLOCK_COMMENT("Entry:");
4777     __ enter(); // required for proper stackwalking of RuntimeStub frame
4778 
4779 #ifdef _WIN64
4780     __ push(rsi);
4781     __ push(rdi);
4782 #endif
4783     __ fast_cos(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4784 
4785 #ifdef _WIN64
4786     __ pop(rdi);
4787     __ pop(rsi);
4788 #endif
4789 
4790     __ leave(); // required for proper stackwalking of RuntimeStub frame
4791     __ ret(0);
4792 
4793     return start;
4794 
4795   }
4796 
4797   address generate_libmTan() {
4798     address start = __ pc();
4799 
4800     const XMMRegister x0 = xmm0;
4801     const XMMRegister x1 = xmm1;
4802     const XMMRegister x2 = xmm2;
4803     const XMMRegister x3 = xmm3;
4804 
4805     const XMMRegister x4 = xmm4;
4806     const XMMRegister x5 = xmm5;
4807     const XMMRegister x6 = xmm6;
4808     const XMMRegister x7 = xmm7;
4809 
4810     const Register tmp1 = r8;
4811     const Register tmp2 = r9;
4812     const Register tmp3 = r10;
4813     const Register tmp4 = r11;
4814 
4815     BLOCK_COMMENT("Entry:");
4816     __ enter(); // required for proper stackwalking of RuntimeStub frame
4817 
4818 #ifdef _WIN64
4819     __ push(rsi);
4820     __ push(rdi);
4821 #endif
4822     __ fast_tan(x0, x1, x2, x3, x4, x5, x6, x7, rax, rcx, rdx, tmp1, tmp2, tmp3, tmp4);
4823 
4824 #ifdef _WIN64
4825     __ pop(rdi);
4826     __ pop(rsi);
4827 #endif
4828 
4829     __ leave(); // required for proper stackwalking of RuntimeStub frame
4830     __ ret(0);
4831 
4832     return start;
4833 
4834   }
4835 
4836 #undef __
4837 #define __ masm->
4838 
4839   // Continuation point for throwing of implicit exceptions that are
4840   // not handled in the current activation. Fabricates an exception
4841   // oop and initiates normal exception dispatching in this
4842   // frame. Since we need to preserve callee-saved values (currently
4843   // only for C2, but done for C1 as well) we need a callee-saved oop
4844   // map and therefore have to make these stubs into RuntimeStubs
4845   // rather than BufferBlobs.  If the compiler needs all registers to
4846   // be preserved between the fault point and the exception handler
4847   // then it must assume responsibility for that in
4848   // AbstractCompiler::continuation_for_implicit_null_exception or
4849   // continuation_for_implicit_division_by_zero_exception. All other
4850   // implicit exceptions (e.g., NullPointerException or
4851   // AbstractMethodError on entry) are either at call sites or
4852   // otherwise assume that stack unwinding will be initiated, so
4853   // caller saved registers were assumed volatile in the compiler.
4854   address generate_throw_exception(const char* name,
4855                                    address runtime_entry,
4856                                    Register arg1 = noreg,
4857                                    Register arg2 = noreg) {
4858     // Information about frame layout at time of blocking runtime call.
4859     // Note that we only have to preserve callee-saved registers since
4860     // the compilers are responsible for supplying a continuation point
4861     // if they expect all registers to be preserved.
4862     enum layout {
4863       rbp_off = frame::arg_reg_save_area_bytes/BytesPerInt,
4864       rbp_off2,
4865       return_off,
4866       return_off2,
4867       framesize // inclusive of return address
4868     };
4869 
4870     int insts_size = 512;
4871     int locs_size  = 64;
4872 
4873     CodeBuffer code(name, insts_size, locs_size);
4874     OopMapSet* oop_maps  = new OopMapSet();
4875     MacroAssembler* masm = new MacroAssembler(&code);
4876 
4877     address start = __ pc();
4878 
4879     // This is an inlined and slightly modified version of call_VM
4880     // which has the ability to fetch the return PC out of
4881     // thread-local storage and also sets up last_Java_sp slightly
4882     // differently than the real call_VM
4883 
4884     __ enter(); // required for proper stackwalking of RuntimeStub frame
4885 
4886     assert(is_even(framesize/2), "sp not 16-byte aligned");
4887 
4888     // return address and rbp are already in place
4889     __ subptr(rsp, (framesize-4) << LogBytesPerInt); // prolog
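         // (framesize is counted in 32-bit slots; the return address and saved rbp already
         //  occupy 4 of them, so only the remaining slots are allocated here)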
4890 
4891     int frame_complete = __ pc() - start;
4892 
4893     // Set up last_Java_sp and last_Java_fp
4894     address the_pc = __ pc();
4895     __ set_last_Java_frame(rsp, rbp, the_pc);
4896     __ andptr(rsp, -(StackAlignmentInBytes));    // Align stack
4897 
4898     // Call runtime
4899     if (arg1 != noreg) {
4900       assert(arg2 != c_rarg1, "clobbered");
4901       __ movptr(c_rarg1, arg1);
4902     }
4903     if (arg2 != noreg) {
4904       __ movptr(c_rarg2, arg2);
4905     }
4906     __ movptr(c_rarg0, r15_thread);
4907     BLOCK_COMMENT("call runtime_entry");
4908     __ call(RuntimeAddress(runtime_entry));
4909 
4910     // Generate oop map
4911     OopMap* map = new OopMap(framesize, 0);
4912 
4913     oop_maps->add_gc_map(the_pc - start, map);
4914 
4915     __ reset_last_Java_frame(true);
4916 
4917     __ leave(); // required for proper stackwalking of RuntimeStub frame
4918 
4919     // check for pending exceptions
4920 #ifdef ASSERT
4921     Label L;
4922     __ cmpptr(Address(r15_thread, Thread::pending_exception_offset()),
4923             (int32_t) NULL_WORD);
4924     __ jcc(Assembler::notEqual, L);
4925     __ should_not_reach_here();
4926     __ bind(L);
4927 #endif // ASSERT
4928     __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4929 
4930 
4931     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4932     RuntimeStub* stub =
4933       RuntimeStub::new_runtime_stub(name,
4934                                     &code,
4935                                     frame_complete,
4936                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4937                                     oop_maps, false);
4938     return stub->entry_point();
4939   }
4940 
4941   void create_control_words() {
4942     // Round to nearest, 53-bit mode, exceptions masked
4943     StubRoutines::_fpu_cntrl_wrd_std   = 0x027F;
4944     // Round to zero, 53-bit mode, exceptions masked
4945     StubRoutines::_fpu_cntrl_wrd_trunc = 0x0D7F;
4946     // Round to nearest, 24-bit mode, exceptions masked
4947     StubRoutines::_fpu_cntrl_wrd_24    = 0x007F;
4948     // Round to nearest, 64-bit mode, exceptions masked
4949     StubRoutines::_fpu_cntrl_wrd_64    = 0x037F;
4950     // Round to nearest, all exceptions masked (MXCSR)
4951     StubRoutines::_mxcsr_std           = 0x1F80;
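         // (For reference: in the x87 control word, bits 0-5 are the exception masks, bits 8-9
         //  the precision control (00 = 24-bit, 10 = 53-bit, 11 = 64-bit) and bits 10-11 the
         //  rounding control (00 = round to nearest).  In MXCSR, 0x1F80 sets the six exception
         //  mask bits (bits 7-12) with round-to-nearest rounding and FZ/DAZ clear.)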
4952     // Note: the following two constants are 80-bit values;
4953     //       their layout is critical for correct loading by the FPU.
4954     // Bias for strict fp multiply/divide
4955     StubRoutines::_fpu_subnormal_bias1[0]= 0x00000000; // 2^(-15360) == 0x03ff 8000 0000 0000 0000
4956     StubRoutines::_fpu_subnormal_bias1[1]= 0x80000000;
4957     StubRoutines::_fpu_subnormal_bias1[2]= 0x03ff;
4958     // Un-Bias for strict fp multiply/divide
4959     StubRoutines::_fpu_subnormal_bias2[0]= 0x00000000; // 2^(+15360) == 0x7bff 8000 0000 0000 0000
4960     StubRoutines::_fpu_subnormal_bias2[1]= 0x80000000;
4961     StubRoutines::_fpu_subnormal_bias2[2]= 0x7bff;
4962   }
4963 
4964   // Initialization
4965   void generate_initial() {
4966     // Generates all stubs and initializes the entry points
4967 
4968     // These platform-specific settings are needed by generate_call_stub()
4969     create_control_words();
4970 
4971     // Entry points that exist on all platforms.  Note: This is code
4972     // that could be shared among different platforms - however the
4973     // benefit seems to be smaller than the disadvantage of having a
4974     // much more complicated generator structure. See also comment in
4975     // stubRoutines.hpp.
4976 
4977     StubRoutines::_forward_exception_entry = generate_forward_exception();
4978 
4979     StubRoutines::_call_stub_entry =
4980       generate_call_stub(StubRoutines::_call_stub_return_address);
4981 
4982     // is referenced by megamorphic call
4983     StubRoutines::_catch_exception_entry = generate_catch_exception();
4984 
4985     // atomic calls
4986     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
4987     StubRoutines::_atomic_xchg_ptr_entry     = generate_atomic_xchg_ptr();
4988     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
4989     StubRoutines::_atomic_cmpxchg_byte_entry = generate_atomic_cmpxchg_byte();
4990     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
4991     StubRoutines::_atomic_add_entry          = generate_atomic_add();
4992     StubRoutines::_atomic_add_ptr_entry      = generate_atomic_add_ptr();
4993     StubRoutines::_fence_entry               = generate_orderaccess_fence();
4994 
4995     // platform dependent
4996     StubRoutines::x86::_get_previous_fp_entry = generate_get_previous_fp();
4997     StubRoutines::x86::_get_previous_sp_entry = generate_get_previous_sp();
4998 
4999     StubRoutines::x86::_verify_mxcsr_entry    = generate_verify_mxcsr();
5000 
5001     // Build this early so it's available for the interpreter.
5002     StubRoutines::_throw_StackOverflowError_entry =
5003       generate_throw_exception("StackOverflowError throw_exception",
5004                                CAST_FROM_FN_PTR(address,
5005                                                 SharedRuntime::
5006                                                 throw_StackOverflowError));
5007     StubRoutines::_throw_delayed_StackOverflowError_entry =
5008       generate_throw_exception("delayed StackOverflowError throw_exception",
5009                                CAST_FROM_FN_PTR(address,
5010                                                 SharedRuntime::
5011                                                 throw_delayed_StackOverflowError));
5012     if (UseCRC32Intrinsics) {
5013       // set table address before generating the stub which uses it
5014       StubRoutines::_crc_table_adr = (address)StubRoutines::x86::_crc_table;
5015       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5016     }
5017 
5018     if (UseCRC32CIntrinsics) {
5019       bool supports_clmul = VM_Version::supports_clmul();
5020       StubRoutines::x86::generate_CRC32C_table(supports_clmul);
5021       StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
5022       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
5023     }
5024     if (VM_Version::supports_sse2() && UseLibmIntrinsic && InlineIntrinsics) {
5025       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
5026           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
5027           vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5028         StubRoutines::x86::_ONEHALF_adr = (address)StubRoutines::x86::_ONEHALF;
5029         StubRoutines::x86::_P_2_adr = (address)StubRoutines::x86::_P_2;
5030         StubRoutines::x86::_SC_4_adr = (address)StubRoutines::x86::_SC_4;
5031         StubRoutines::x86::_Ctable_adr = (address)StubRoutines::x86::_Ctable;
5032         StubRoutines::x86::_SC_2_adr = (address)StubRoutines::x86::_SC_2;
5033         StubRoutines::x86::_SC_3_adr = (address)StubRoutines::x86::_SC_3;
5034         StubRoutines::x86::_SC_1_adr = (address)StubRoutines::x86::_SC_1;
5035         StubRoutines::x86::_PI_INV_TABLE_adr = (address)StubRoutines::x86::_PI_INV_TABLE;
5036         StubRoutines::x86::_PI_4_adr = (address)StubRoutines::x86::_PI_4;
5037         StubRoutines::x86::_PI32INV_adr = (address)StubRoutines::x86::_PI32INV;
5038         StubRoutines::x86::_SIGN_MASK_adr = (address)StubRoutines::x86::_SIGN_MASK;
5039         StubRoutines::x86::_P_1_adr = (address)StubRoutines::x86::_P_1;
5040         StubRoutines::x86::_P_3_adr = (address)StubRoutines::x86::_P_3;
5041         StubRoutines::x86::_NEG_ZERO_adr = (address)StubRoutines::x86::_NEG_ZERO;
5042       }
5043       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dexp)) {
5044         StubRoutines::_dexp = generate_libmExp();
5045       }
5046       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5047         StubRoutines::_dlog = generate_libmLog();
5048       }
5049       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog10)) {
5050         StubRoutines::_dlog10 = generate_libmLog10();
5051       }
5052       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dpow)) {
5053         StubRoutines::_dpow = generate_libmPow();
5054       }
5055       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5056         StubRoutines::_dsin = generate_libmSin();
5057       }
5058       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5059         StubRoutines::_dcos = generate_libmCos();
5060       }
5061       if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dtan)) {
5062         StubRoutines::_dtan = generate_libmTan();
5063       }
5064     }
5065   }
5066 
5067   void generate_all() {
5068     // Generates all stubs and initializes the entry points
5069 
5070     // These entry points require SharedInfo::stack0 to be set up in
5071     // non-core builds and need to be relocatable, so they each
5072     // fabricate a RuntimeStub internally.
5073     StubRoutines::_throw_AbstractMethodError_entry =
5074       generate_throw_exception("AbstractMethodError throw_exception",
5075                                CAST_FROM_FN_PTR(address,
5076                                                 SharedRuntime::
5077                                                 throw_AbstractMethodError));
5078 
5079     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5080       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5081                                CAST_FROM_FN_PTR(address,
5082                                                 SharedRuntime::
5083                                                 throw_IncompatibleClassChangeError));
5084 
5085     StubRoutines::_throw_NullPointerException_at_call_entry =
5086       generate_throw_exception("NullPointerException at call throw_exception",
5087                                CAST_FROM_FN_PTR(address,
5088                                                 SharedRuntime::
5089                                                 throw_NullPointerException_at_call));
5090 
5091     // entry points that are platform specific
5092     StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
5093     StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
5094     StubRoutines::x86::_d2i_fixup = generate_d2i_fixup();
5095     StubRoutines::x86::_d2l_fixup = generate_d2l_fixup();
5096 
5097     StubRoutines::x86::_float_sign_mask  = generate_fp_mask("float_sign_mask",  0x7FFFFFFF7FFFFFFF);
5098     StubRoutines::x86::_float_sign_flip  = generate_fp_mask("float_sign_flip",  0x8000000080000000);
5099     StubRoutines::x86::_double_sign_mask = generate_fp_mask("double_sign_mask", 0x7FFFFFFFFFFFFFFF);
5100     StubRoutines::x86::_double_sign_flip = generate_fp_mask("double_sign_flip", 0x8000000000000000);
5101 
5102     // support for verify_oop (must happen after universe_init)
5103     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5104 
5105     // arraycopy stubs used by compilers
5106     generate_arraycopy_stubs();
5107 
5108     // don't bother generating these AES intrinsic stubs unless global flag is set
5109     if (UseAESIntrinsics) {
5110       StubRoutines::x86::_key_shuffle_mask_addr = generate_key_shuffle_mask();  // needed by the others
5111       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5112       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5113       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5114       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5115     }
5116     if (UseAESCTRIntrinsics) {
5117       StubRoutines::x86::_counter_shuffle_mask_addr = generate_counter_shuffle_mask();
5118       StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt_Parallel();
5119     }
5120 
5121     if (UseSHA1Intrinsics) {
5122       StubRoutines::x86::_upper_word_mask_addr = generate_upper_word_mask();
5123       StubRoutines::x86::_shuffle_byte_flip_mask_addr = generate_shuffle_byte_flip_mask();
5124       StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5125       StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5126     }
5127     if (UseSHA256Intrinsics) {
5128       StubRoutines::x86::_k256_adr = (address)StubRoutines::x86::_k256;
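           // Build _k256_W, a double-width copy of the SHA-256 round constants: each group of
           // four 32-bit K values is duplicated into both 128-bit halves of a 256-bit entry
           // (presumably so the AVX2 compression code can use 256-bit loads of the constants).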
5129       char* dst = (char*)StubRoutines::x86::_k256_W;
5130       char* src = (char*)StubRoutines::x86::_k256;
5131       for (int ii = 0; ii < 16; ++ii) {
5132         memcpy(dst + 32 * ii,      src + 16 * ii, 16);
5133         memcpy(dst + 32 * ii + 16, src + 16 * ii, 16);
5134       }
5135       StubRoutines::x86::_k256_W_adr = (address)StubRoutines::x86::_k256_W;
5136       StubRoutines::x86::_pshuffle_byte_flip_mask_addr = generate_pshuffle_byte_flip_mask();
5137       StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
5138       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
5139     }
5140     if (UseSHA512Intrinsics) {
5141       StubRoutines::x86::_k512_W_addr = (address)StubRoutines::x86::_k512_W;
5142       StubRoutines::x86::_pshuffle_byte_flip_mask_addr_sha512 = generate_pshuffle_byte_flip_mask_sha512();
5143       StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
5144       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
5145     }
5146 
5147     // Generate GHASH intrinsics code
5148     if (UseGHASHIntrinsics) {
5149       StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask();
5150       StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask();
5151       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5152     }
5153 
5154     // Safefetch stubs.
5155     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5156                                                        &StubRoutines::_safefetch32_fault_pc,
5157                                                        &StubRoutines::_safefetch32_continuation_pc);
5158     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5159                                                        &StubRoutines::_safefetchN_fault_pc,
5160                                                        &StubRoutines::_safefetchN_continuation_pc);
5161 #ifdef COMPILER2
5162     if (UseMultiplyToLenIntrinsic) {
5163       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5164     }
5165     if (UseSquareToLenIntrinsic) {
5166       StubRoutines::_squareToLen = generate_squareToLen();
5167     }
5168     if (UseMulAddIntrinsic) {
5169       StubRoutines::_mulAdd = generate_mulAdd();
5170     }
5171 #ifndef _WINDOWS
5172     if (UseMontgomeryMultiplyIntrinsic) {
5173       StubRoutines::_montgomeryMultiply
5174         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
5175     }
5176     if (UseMontgomerySquareIntrinsic) {
5177       StubRoutines::_montgomerySquare
5178         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
5179     }
5180 #endif // _WINDOWS
5181 #endif // COMPILER2
5182 
5183     if (UseVectorizedMismatchIntrinsic) {
5184       StubRoutines::_vectorizedMismatch = generate_vectorizedMismatch();
5185     }
5186   }
5187 
5188  public:
5189   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5190     if (all) {
5191       generate_all();
5192     } else {
5193       generate_initial();
5194     }
5195   }
5196 }; // end class declaration
5197 
5198 void StubGenerator_generate(CodeBuffer* code, bool all) {
5199   StubGenerator g(code, all);
5200 }