/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
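// (With compressed oops an oop slot is 4 bytes, so the scaled index is
// shifted left by exact_log2(4) == 2; otherwise slots are 8 bytes and the
// shift is exact_log2(8) == 3.)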

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

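  // For reference, the C-side view of this stub (the CallStub function
  // pointer type declared in stubRoutines.hpp) takes the same eight
  // arguments in the same order, roughly:
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // so c_rarg0..c_rarg7 above are simply those parameters as laid out by
  // the AArch64 C calling convention.
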
  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
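
  // (Offsets are in words relative to the saved fp; with wordSize == 8,
  // e.g. thread_off == -1 names the 8-byte slot immediately below the
  // saved fp, i.e. Address(rfp, -8), and sp_after_call_off == -26 is the
  // lowest slot of the register save area.)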

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_VALUETYPE);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,  d15_save);
    __ ldpd(v13, v12,  d13_save);
    __ ldpd(v11, v10,  d11_save);
    __ ldpd(v9,  v8,   d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.
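  //
  // Illustrative only -- a C-level sketch of the contract this stub
  // implements (names here are for exposition, not actual VM code):
  //
  //   // Zero 'count' HeapWords starting at 'base', except for a short tail
  //   // of fewer than zero_words_block_size words which is left for the
  //   // caller; base/count are updated to describe that tail.
  //   static inline void zero_blocks_sketch(HeapWord*& base, size_t& count) {
  //     while (count >= MacroAssembler::zero_words_block_size) {
  //       memset(base, 0, MacroAssembler::zero_words_block_size * HeapWordSize);
  //       base  += MacroAssembler::zero_words_block_size;
  //       count -= MacroAssembler::zero_words_block_size;
  //     }
  //   }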

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
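  // For example, with count == 27 the bulk loop copies three 8-word
  // blocks (24 words); bit 2 of the residue (3) is clear so no 4-word
  // subblock is copied, bit 1 is set so one further word pair is copied,
  // and the set low bit of count tells the caller that a single word
  // remains.
  //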
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

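      // A concrete forwards example: s is moved back by 2 longs and d by
      // 1 long below, so the loads at long offsets {2, 4, 6, 8} read the
      // original source longs 0..7 and the stores at long offsets
      // {1, 2, 4, 6, 8} write the original destination longs 0..7; since
      // the original d was 8-byte but not 16-byte aligned, the three
      // intervening stp's now land on 16-byte aligned addresses.
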
      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
       // allowing for the offset of -8 the store instructions place
       // registers into the target 64 bit block at the following
       // offsets
       //
       // t0 at offset 0
       // t1 at offset 8,  t2 at offset 16
       // t3 at offset 24, t4 at offset 32
       // t5 at offset 40, t6 at offset 48
       // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
       // d was not offset when we started so the registers are
       // written into the 64 bit block preceding d with the following
       // offsets
       //
       // t1 at offset -8
       // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
       // t7 at offset -56, t4 at offset -48
       //                   t6 at offset -64
       //
       // note that this matches the offsets previously noted for the
       // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
       // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
       // but note that the offsets and registers still follow the
       // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
       // this is the same as above but copying only 2 longs hence
       // there is no intervening stp between the str instructions
       // but note that the offset and register patterns are still
       // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written

       if (direction == copy_forwards) {
         __ add(s, s, 16);
         __ add(d, d, 8);
       }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.
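  //
  // Illustrative only -- a byte-granularity (step == 1), forwards-direction
  // C sketch of the tail copy generated below; the real code also handles
  // halfword/word/doubleword element sizes and backwards copies:
  //
  //   static inline void copy_tail_sketch(const u_char*& s, u_char*& d, int count) {
  //     if (count & 8) { memcpy(d, s, 8); s += 8; d += 8; }  // one doubleword
  //     if (count & 4) { memcpy(d, s, 4); s += 4; d += 4; }  // one word
  //     if (count & 2) { memcpy(d, s, 2); s += 2; d += 2; }  // one halfword
  //     if (count & 1) { *d++ = *s++; }                      // one byte
  //   }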

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
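  // Roughly, the strategy implemented below (sketch only, in terms of byte
  // counts rather than element counts):
  //
  //   if (byte_count <= (UseSIMDForMemoryOps ? 96 : 80)) {
  //     // copy everything with a handful of possibly-overlapping loads
  //     // and stores chosen purely by size bucket; all loads happen
  //     // before any store, so the copy direction does not matter
  //   } else {
  //     // align s to a 2-word boundary (copying the odd prefix),
  //     // bulk-copy 8-word blocks via copy_f/copy_b, then finish the
  //     // remaining tail with copy_memory_small()
  //   }
  //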

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This means that in the 1 byte case we load/store the same
          // byte 3 times.
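          // (e.g. count == 3: bytes 0 and 2 are copied via the first/last
          // loads and byte 1 via s + count/2 == s + 1; count == 2 copies
          // byte 0, then byte 1 twice; count == 1 copies byte 0 three
          // times.)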
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);
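    // (unsigned comparison, so a destination below the source also takes
    // the forward copy; e.g. for an int copy, size == 4, d - s == 32 and
    // count == 8 give d - s == count * size, so the regions only touch
    // end-to-start and the disjoint forward copy is safe)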
1444 
1445     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1446     if (dest_uninitialized) {
1447       decorators |= IS_DEST_UNINITIALIZED;
1448     }
1449     if (aligned) {
1450       decorators |= ARRAYCOPY_ALIGNED;
1451     }
1452 
1453     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1454     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1455 
1456     if (is_oop) {
1457       // save regs before copy_memory
1458       __ push(RegSet::of(d, count), sp);
1459     }
1460     copy_memory(aligned, s, d, count, rscratch1, -size);
1461     if (is_oop) {
1462       __ pop(RegSet::of(d, count), sp);
1463       if (VerifyOops)
1464         verify_oop_array(size, d, count, r16);
1465     }
1466     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1467     __ leave();
1468     __ mov(r0, zr); // return 0
1469     __ ret(lr);
1470 #ifdef BUILTIN_SIM
1471     {
1472       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1473       sim->notifyCompile(const_cast<char*>(name), start);
1474     }
1475 #endif
1476     return start;
1477   }
1478 
1479   // Arguments:
1480   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1481   //             ignored
1482   //   name    - stub name string
1483   //
1484   // Inputs:
1485   //   c_rarg0   - source array address
1486   //   c_rarg1   - destination array address
1487   //   c_rarg2   - element count, treated as ssize_t, can be zero
1488   //
1489   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1490   // we let the hardware handle it.  The one to eight bytes within words,
1491   // dwords or qwords that span cache line boundaries will still be loaded
1492   // and stored atomically.
1493   //
1494   // Side Effects:
1502   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1503   //   used by generate_conjoint_byte_copy().
1504   //
1505   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1506     const bool not_oop = false;
1507     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1508   }
1509 
1510   // Arguments:
1511   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1512   //             ignored
1513   //   name    - stub name string
1514   //
1515   // Inputs:
1516   //   c_rarg0   - source array address
1517   //   c_rarg1   - destination array address
1518   //   c_rarg2   - element count, treated as ssize_t, can be zero
1519   //
1520   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1521   // we let the hardware handle it.  The one to eight bytes within words,
1522   // dwords or qwords that span cache line boundaries will still be loaded
1523   // and stored atomically.
1524   //
1525   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1526                                       address* entry, const char *name) {
1527     const bool not_oop = false;
1528     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1529   }
1530 
1531   // Arguments:
1532   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1533   //             ignored
1534   //   name    - stub name string
1535   //
1536   // Inputs:
1537   //   c_rarg0   - source array address
1538   //   c_rarg1   - destination array address
1539   //   c_rarg2   - element count, treated as ssize_t, can be zero
1540   //
1541   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1542   // let the hardware handle it.  The two or four words within dwords
1543   // or qwords that span cache line boundaries will still be loaded
1544   // and stored atomically.
1545   //
1546   // Side Effects:
1547   //   disjoint_short_copy_entry is set to the no-overlap entry point
1548   //   used by generate_conjoint_short_copy().
1549   //
1550   address generate_disjoint_short_copy(bool aligned,
1551                                        address* entry, const char *name) {
1552     const bool not_oop = false;
1553     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1554   }
1555 
1556   // Arguments:
1557   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1558   //             ignored
1559   //   name    - stub name string
1560   //
1561   // Inputs:
1562   //   c_rarg0   - source array address
1563   //   c_rarg1   - destination array address
1564   //   c_rarg2   - element count, treated as ssize_t, can be zero
1565   //
1566   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1567   // let the hardware handle it.  The two or four words within dwords
1568   // or qwords that span cache line boundaries will still be loaded
1569   // and stored atomically.
1570   //
1571   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1572                                        address *entry, const char *name) {
1573     const bool not_oop = false;
1574     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1575   }
1576 
1577   // Arguments:
1578   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1579   //             ignored
1580   //   name    - stub name string
1581   //
1582   // Inputs:
1583   //   c_rarg0   - source array address
1584   //   c_rarg1   - destination array address
1585   //   c_rarg2   - element count, treated as ssize_t, can be zero
1586   //
1587   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1588   // the hardware handle it.  The two dwords within qwords that span
1589   // cache line boundaries will still be loaded and stored atomically.
1590   //
1591   // Side Effects:
1592   //   disjoint_int_copy_entry is set to the no-overlap entry point
1593   //   used by generate_conjoint_int_copy().
1594   //
1595   address generate_disjoint_int_copy(bool aligned, address *entry,
1596                                          const char *name, bool dest_uninitialized = false) {
1597     const bool not_oop = false;
1598     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1599   }
1600 
1601   // Arguments:
1602   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1603   //             ignored
1604   //   name    - stub name string
1605   //
1606   // Inputs:
1607   //   c_rarg0   - source array address
1608   //   c_rarg1   - destination array address
1609   //   c_rarg2   - element count, treated as ssize_t, can be zero
1610   //
1611   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1612   // the hardware handle it.  The two dwords within qwords that span
1613   // cache line boundaries will still be loaded and stored atomically.
1614   //
1615   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1616                                      address *entry, const char *name,
1617                                      bool dest_uninitialized = false) {
1618     const bool not_oop = false;
1619     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1620   }
1621 
1622 
1623   // Arguments:
1624   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1625   //             ignored
1626   //   name    - stub name string
1627   //
1628   // Inputs:
1629   //   c_rarg0   - source array address
1630   //   c_rarg1   - destination array address
1631   //   c_rarg2   - element count, treated as size_t, can be zero
1632   //
1633   // Side Effects:
1634   //   disjoint_long_copy_entry is set to the no-overlap entry point
1635   //   used by generate_conjoint_long_copy().
1636   //
1637   address generate_disjoint_long_copy(bool aligned, address *entry,
1638                                           const char *name, bool dest_uninitialized = false) {
1639     const bool not_oop = false;
1640     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1641   }
1642 
1643   // Arguments:
1644   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1645   //             ignored
1646   //   name    - stub name string
1647   //
1648   // Inputs:
1649   //   c_rarg0   - source array address
1650   //   c_rarg1   - destination array address
1651   //   c_rarg2   - element count, treated as size_t, can be zero
1652   //
1653   address generate_conjoint_long_copy(bool aligned,
1654                                       address nooverlap_target, address *entry,
1655                                       const char *name, bool dest_uninitialized = false) {
1656     const bool not_oop = false;
1657     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1658   }
1659 
1660   // Arguments:
1661   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1662   //             ignored
1663   //   name    - stub name string
1664   //
1665   // Inputs:
1666   //   c_rarg0   - source array address
1667   //   c_rarg1   - destination array address
1668   //   c_rarg2   - element count, treated as size_t, can be zero
1669   //
1670   // Side Effects:
1671   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1672   //   used by generate_conjoint_oop_copy().
1673   //
1674   address generate_disjoint_oop_copy(bool aligned, address *entry,
1675                                      const char *name, bool dest_uninitialized) {
1676     const bool is_oop = true;
1677     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1678     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1679   }
1680 
1681   // Arguments:
1682   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1683   //             ignored
1684   //   name    - stub name string
1685   //
1686   // Inputs:
1687   //   c_rarg0   - source array address
1688   //   c_rarg1   - destination array address
1689   //   c_rarg2   - element count, treated as size_t, can be zero
1690   //
1691   address generate_conjoint_oop_copy(bool aligned,
1692                                      address nooverlap_target, address *entry,
1693                                      const char *name, bool dest_uninitialized) {
1694     const bool is_oop = true;
1695     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1696     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1697                                   name, dest_uninitialized);
1698   }
1699 
1700 
1701   // Helper for generating a dynamic type check.
1702   // Smashes rscratch1, rscratch2.
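       // Rough control flow (a sketch, not the exact instruction sequence):
       //   if (fast_subtype_check(sub_klass, super_klass))  goto L_success;
       //   if (slow_subtype_check(sub_klass, super_klass))  goto L_success;
       //   /* otherwise fall through past L_miss (failure) */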
1703   void generate_type_check(Register sub_klass,
1704                            Register super_check_offset,
1705                            Register super_klass,
1706                            Label& L_success) {
1707     assert_different_registers(sub_klass, super_check_offset, super_klass);
1708 
1709     BLOCK_COMMENT("type_check:");
1710 
1711     Label L_miss;
1712 
1713     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1714                                      super_check_offset);
1715     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1716 
1717     // Fall through on failure!
1718     __ BIND(L_miss);
1719   }
1720 
1721   //
1722   //  Generate checkcasting array copy stub
1723   //
1724   //  Input:
1725   //    c_rarg0   - source array address
1726   //    c_rarg1   - destination array address
1727   //    c_rarg2   - element count, treated as ssize_t, can be zero
1728   //    c_rarg3   - size_t ckoff (super_check_offset)
1729   //    c_rarg4   - oop ckval (super_klass)
1730   //
1731   //  Output:
1732   //    r0 ==  0  -  success
1733   //    r0 == -1^K - failure, where K is partial transfer count
1734   //
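       //  (For example, if K == 3 elements were copied before the failing one,
       //   r0 == ~3 == -4; the caller can recover K as ~r0.)
       //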
1735   address generate_checkcast_copy(const char *name, address *entry,
1736                                   bool dest_uninitialized = false) {
1737 
1738     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1739 
1740     // Input registers (after setup_arg_regs)
1741     const Register from        = c_rarg0;   // source array address
1742     const Register to          = c_rarg1;   // destination array address
1743     const Register count       = c_rarg2;   // elements count
1744     const Register ckoff       = c_rarg3;   // super_check_offset
1745     const Register ckval       = c_rarg4;   // super_klass
1746 
1747     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1748     RegSet wb_post_saved_regs = RegSet::of(count);
1749 
1750     // Registers used as temps (r18, r19, r20 are save-on-entry)
1751     const Register count_save  = r21;       // orig elements count
1752     const Register start_to    = r20;       // destination array start address
1753     const Register copied_oop  = r18;       // actual oop copied
1754     const Register r19_klass   = r19;       // oop._klass
1755 
1756     //---------------------------------------------------------------
1757     // Assembler stub will be used for this call to arraycopy
1758     // if the two arrays are subtypes of Object[] but the
1759     // destination array type is not equal to or a supertype
1760     // of the source type.  Each element must be separately
1761     // checked.
1762 
1763     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1764                                copied_oop, r19_klass, count_save);
1765 
1766     __ align(CodeEntryAlignment);
1767     StubCodeMark mark(this, "StubRoutines", name);
1768     address start = __ pc();
1769 
1770     __ enter(); // required for proper stackwalking of RuntimeStub frame
1771 
1772 #ifdef ASSERT
1773     // caller guarantees that the arrays really are different
1774     // otherwise, we would have to make conjoint checks
1775     { Label L;
1776       array_overlap_test(L, TIMES_OOP);
1777       __ stop("checkcast_copy within a single array");
1778       __ bind(L);
1779     }
1780 #endif //ASSERT
1781 
1782     // Caller of this entry point must set up the argument registers.
1783     if (entry != NULL) {
1784       *entry = __ pc();
1785       BLOCK_COMMENT("Entry:");
1786     }
1787 
1788      // Empty array:  Nothing to do.
1789     __ cbz(count, L_done);
1790 
1791     __ push(RegSet::of(r18, r19, r20, r21), sp);
1792 
1793 #ifdef ASSERT
1794     BLOCK_COMMENT("assert consistent ckoff/ckval");
1795     // The ckoff and ckval must be mutually consistent,
1796     // even though caller generates both.
1797     { Label L;
1798       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1799       __ ldrw(start_to, Address(ckval, sco_offset));
1800       __ cmpw(ckoff, start_to);
1801       __ br(Assembler::EQ, L);
1802       __ stop("super_check_offset inconsistent");
1803       __ bind(L);
1804     }
1805 #endif //ASSERT
1806 
1807     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1808     bool is_oop = true;
1809     if (dest_uninitialized) {
1810       decorators |= IS_DEST_UNINITIALIZED;
1811     }
1812 
1813     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1814     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1815 
1816     // save the original count
1817     __ mov(count_save, count);
1818 
1819     // Copy from low to high addresses
1820     __ mov(start_to, to);              // Save destination array start address
1821     __ b(L_load_element);
1822 
1823     // ======== begin loop ========
1824     // (Loop is rotated; its entry is L_load_element.)
1825     // Loop control:
1826     //   for (; count != 0; count--) {
1827     //     copied_oop = load_heap_oop(from++);
1828     //     ... generate_type_check ...;
1829     //     store_heap_oop(to++, copied_oop);
1830     //   }
1831     __ align(OptoLoopAlignment);
1832 
1833     __ BIND(L_store_element);
1834     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW);  // store the oop 
1835     __ sub(count, count, 1);
1836     __ cbz(count, L_do_card_marks);
1837 
1838     // ======== loop entry is here ========
1839     __ BIND(L_load_element);
1840     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1841     __ cbz(copied_oop, L_store_element);
1842 
1843     __ load_klass(r19_klass, copied_oop); // query the object klass
1844     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1845     // ======== end loop ========
1846 
1847     // It was a real error; we must depend on the caller to finish the job.
1848     // Register count = remaining oops, count_orig = total oops.
1849     // Emit GC store barriers for the oops we have copied and report
1850     // their number to the caller.
1851 
1852     __ subs(count, count_save, count);     // K = partially copied oop count
1853     __ eon(count, count, zr);                   // report (-1^K) to caller
1854     __ br(Assembler::EQ, L_done_pop);
1855 
1856     __ BIND(L_do_card_marks);
1857     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1858 
1859     __ bind(L_done_pop);
1860     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1861     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1862 
1863     __ bind(L_done);
1864     __ mov(r0, count);
1865     __ leave();
1866     __ ret(lr);
1867 
1868     return start;
1869   }
1870 
1871   // Perform range checks on the proposed arraycopy.
1872   // Kills temp, but nothing else.
1873   // Also, clean the sign bits of src_pos and dst_pos.
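       // In C terms the generated checks are roughly (a sketch, using the same
       // 32-bit unsigned arithmetic as the addw/cmpw/HI sequence below):
       //
       //   if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length())  goto L_failed;
       //   if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length())  goto L_failed;
       //   src_pos = (uint32_t)src_pos;  dst_pos = (uint32_t)dst_pos;  // movw zero-extends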
1874   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1875                               Register src_pos, // source position (c_rarg1)
1876                               Register dst,     // destination array oop (c_rarg2)
1877                               Register dst_pos, // destination position (c_rarg3)
1878                               Register length,
1879                               Register temp,
1880                               Label& L_failed) {
1881     BLOCK_COMMENT("arraycopy_range_checks:");
1882 
1883     assert_different_registers(rscratch1, temp);
1884 
1885     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1886     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1887     __ addw(temp, length, src_pos);
1888     __ cmpw(temp, rscratch1);
1889     __ br(Assembler::HI, L_failed);
1890 
1891     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1892     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1893     __ addw(temp, length, dst_pos);
1894     __ cmpw(temp, rscratch1);
1895     __ br(Assembler::HI, L_failed);
1896 
1897     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1898     __ movw(src_pos, src_pos);
1899     __ movw(dst_pos, dst_pos);
1900 
1901     BLOCK_COMMENT("arraycopy_range_checks done");
1902   }
1903 
1904   // These stubs are currently only reached from a simple test routine;
1905   // they will be implemented properly once they are called from code
1906   // that does real work.
1907   static void fake_arraycopy_stub(address src, address dst, int count) {
1908     assert(count == 0, "huh?");
1909   }
1910 
1911 
1912   //
1913   //  Generate 'unsafe' array copy stub
1914   //  Though just as safe as the other stubs, it takes an unscaled
1915   //  size_t argument instead of an element count.
1916   //
1917   //  Input:
1918   //    c_rarg0   - source array address
1919   //    c_rarg1   - destination array address
1920   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1921   //
1922   // Examines the alignment of the operands and dispatches
1923   // to a long, int, short, or byte copy loop.
1924   //
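       // Dispatch logic, roughly (a sketch; 's', 'd' and 'count' are the
       // c_rarg0..c_rarg2 inputs, with 'count' a byte count on entry):
       //
       //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
       //   if ((bits & 7) == 0)       long_copy (s, d, count >> 3);
       //   else if ((bits & 3) == 0)  int_copy  (s, d, count >> 2);
       //   else if ((bits & 1) == 0)  short_copy(s, d, count >> 1);
       //   else                       byte_copy (s, d, count);
       //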
1925   address generate_unsafe_copy(const char *name,
1926                                address byte_copy_entry,
1927                                address short_copy_entry,
1928                                address int_copy_entry,
1929                                address long_copy_entry) {
1930     Label L_long_aligned, L_int_aligned, L_short_aligned;
1931     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1932 
1933     __ align(CodeEntryAlignment);
1934     StubCodeMark mark(this, "StubRoutines", name);
1935     address start = __ pc();
1936     __ enter(); // required for proper stackwalking of RuntimeStub frame
1937 
1938     // bump this on entry, not on exit:
1939     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1940 
1941     __ orr(rscratch1, s, d);
1942     __ orr(rscratch1, rscratch1, count);
1943 
1944     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1945     __ cbz(rscratch1, L_long_aligned);
1946     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1947     __ cbz(rscratch1, L_int_aligned);
1948     __ tbz(rscratch1, 0, L_short_aligned);
1949     __ b(RuntimeAddress(byte_copy_entry));
1950 
1951     __ BIND(L_short_aligned);
1952     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1953     __ b(RuntimeAddress(short_copy_entry));
1954     __ BIND(L_int_aligned);
1955     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1956     __ b(RuntimeAddress(int_copy_entry));
1957     __ BIND(L_long_aligned);
1958     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1959     __ b(RuntimeAddress(long_copy_entry));
1960 
1961     return start;
1962   }
1963 
1964   //
1965   //  Generate generic array copy stubs
1966   //
1967   //  Input:
1968   //    c_rarg0    -  src oop
1969   //    c_rarg1    -  src_pos (32-bits)
1970   //    c_rarg2    -  dst oop
1971   //    c_rarg3    -  dst_pos (32-bits)
1972   //    c_rarg4    -  element count (32-bits)
1973   //
1974   //  Output:
1975   //    r0 ==  0  -  success
1976   //    r0 == -1^K - failure, where K is partial transfer count
1977   //
1978   address generate_generic_copy(const char *name,
1979                                 address byte_copy_entry, address short_copy_entry,
1980                                 address int_copy_entry, address oop_copy_entry,
1981                                 address long_copy_entry, address checkcast_copy_entry) {
1982 
1983     Label L_failed, L_objArray;
1984     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1985 
1986     // Input registers
1987     const Register src        = c_rarg0;  // source array oop
1988     const Register src_pos    = c_rarg1;  // source position
1989     const Register dst        = c_rarg2;  // destination array oop
1990     const Register dst_pos    = c_rarg3;  // destination position
1991     const Register length     = c_rarg4;
1992 
1993 
1994     // Registers used as temps
1995     const Register dst_klass  = c_rarg5;
1996 
1997     __ align(CodeEntryAlignment);
1998 
1999     StubCodeMark mark(this, "StubRoutines", name);
2000 
2001     address start = __ pc();
2002 
2003     __ enter(); // required for proper stackwalking of RuntimeStub frame
2004 
2005     // bump this on entry, not on exit:
2006     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2007 
2008     //-----------------------------------------------------------------------
2009     // Assembler stub will be used for this call to arraycopy
2010     // if the following conditions are met:
2011     //
2012     // (1) src and dst must not be null.
2013     // (2) src_pos must not be negative.
2014     // (3) dst_pos must not be negative.
2015     // (4) length  must not be negative.
2016     // (5) src klass and dst klass should be the same and not NULL.
2017     // (6) src and dst should be arrays.
2018     // (7) src_pos + length must not exceed length of src.
2019     // (8) dst_pos + length must not exceed length of dst.
2020     //
2021 
2022     //  if (src == NULL) return -1;
2023     __ cbz(src, L_failed);
2024 
2025     //  if (src_pos < 0) return -1;
2026     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2027 
2028     //  if (dst == NULL) return -1;
2029     __ cbz(dst, L_failed);
2030 
2031     //  if (dst_pos < 0) return -1;
2032     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2033 
2034     // registers used as temp
2035     const Register scratch_length    = r16; // elements count to copy
2036     const Register scratch_src_klass = r17; // array klass
2037     const Register lh                = r18; // layout helper
2038 
2039     //  if (length < 0) return -1;
2040     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2041     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2042 
2043     __ load_klass(scratch_src_klass, src);
2044 #ifdef ASSERT
2045     //  assert(src->klass() != NULL);
2046     {
2047       BLOCK_COMMENT("assert klasses not null {");
2048       Label L1, L2;
2049       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2050       __ bind(L1);
2051       __ stop("broken null klass");
2052       __ bind(L2);
2053       __ load_klass(rscratch1, dst);
2054       __ cbz(rscratch1, L1);     // this would be broken also
2055       BLOCK_COMMENT("} assert klasses not null done");
2056     }
2057 #endif
2058 
2059     // Load layout helper (32-bits)
2060     //
2061     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2062     // 32        30    24            16              8     2                 0
2063     //
2064     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2065     //
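         //  For example (an illustration of the fields used here, not of the
         //  full encoding): a jint[] klass has array_tag == 0x3, element_type
         //  == T_INT and log2_element_size == 2, so both the objArray tag test
         //  and the element-size dispatch below read this one packed word.
         //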
2066 
2067     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2068 
2069     // Handle objArrays completely differently...
2070     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2071     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2072     __ movw(rscratch1, objArray_lh);
2073     __ eorw(rscratch2, lh, rscratch1);
2074     __ cbzw(rscratch2, L_objArray);
2075 
2076     //  if (src->klass() != dst->klass()) return -1;
2077     __ load_klass(rscratch2, dst);
2078     __ eor(rscratch2, rscratch2, scratch_src_klass);
2079     __ cbnz(rscratch2, L_failed);
2080 
2081     //  if (!src->is_Array()) return -1;
2082     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2083 
2084     // At this point, it is known to be a typeArray (array_tag 0x3).
2085 #ifdef ASSERT
2086     {
2087       BLOCK_COMMENT("assert primitive array {");
2088       Label L;
2089       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2090       __ cmpw(lh, rscratch2);
2091       __ br(Assembler::GE, L);
2092       __ stop("must be a primitive array");
2093       __ bind(L);
2094       BLOCK_COMMENT("} assert primitive array done");
2095     }
2096 #endif
2097 
2098     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2099                            rscratch2, L_failed);
2100 
2101     // TypeArrayKlass
2102     //
2103     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2104     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2105     //
2106 
2107     const Register rscratch1_offset = rscratch1;    // array offset
2108     const Register r18_elsize = lh; // element size
2109 
2110     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2111            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2112     __ add(src, src, rscratch1_offset);           // src array offset
2113     __ add(dst, dst, rscratch1_offset);           // dst array offset
2114     BLOCK_COMMENT("choose copy loop based on element size");
2115 
2116     // The following registers must be set before the jump to the corresponding stub
2117     const Register from     = c_rarg0;  // source array address
2118     const Register to       = c_rarg1;  // destination array address
2119     const Register count    = c_rarg2;  // elements count
2120 
2121     // 'from', 'to' and 'count' must be set in this order, since they
2122     // occupy the same registers as 'src', 'src_pos' and 'dst'.
2123 
2124     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2125 
2126     // The possible values of elsize are 0-3, i.e. exact_log2(element
2127     // size in bytes).  We do a simple bitwise binary search.
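         //   elsize == 0b00 -> bytes,  0b01 -> shorts,  0b10 -> ints,  0b11 -> longs
         //   (bit 1 picks int/long vs. byte/short, bit 0 picks within each pair;
         //    the tbnz tests below follow exactly this two-level dispatch)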
2128   __ BIND(L_copy_bytes);
2129     __ tbnz(r18_elsize, 1, L_copy_ints);
2130     __ tbnz(r18_elsize, 0, L_copy_shorts);
2131     __ lea(from, Address(src, src_pos));// src_addr
2132     __ lea(to,   Address(dst, dst_pos));// dst_addr
2133     __ movw(count, scratch_length); // length
2134     __ b(RuntimeAddress(byte_copy_entry));
2135 
2136   __ BIND(L_copy_shorts);
2137     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2138     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2139     __ movw(count, scratch_length); // length
2140     __ b(RuntimeAddress(short_copy_entry));
2141 
2142   __ BIND(L_copy_ints);
2143     __ tbnz(r18_elsize, 0, L_copy_longs);
2144     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2145     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2146     __ movw(count, scratch_length); // length
2147     __ b(RuntimeAddress(int_copy_entry));
2148 
2149   __ BIND(L_copy_longs);
2150 #ifdef ASSERT
2151     {
2152       BLOCK_COMMENT("assert long copy {");
2153       Label L;
2154       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2155       __ cmpw(r18_elsize, LogBytesPerLong);
2156       __ br(Assembler::EQ, L);
2157       __ stop("must be long copy, but elsize is wrong");
2158       __ bind(L);
2159       BLOCK_COMMENT("} assert long copy done");
2160     }
2161 #endif
2162     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2163     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2164     __ movw(count, scratch_length); // length
2165     __ b(RuntimeAddress(long_copy_entry));
2166 
2167     // ObjArrayKlass
2168   __ BIND(L_objArray);
2169     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2170 
2171     Label L_plain_copy, L_checkcast_copy;
2172     //  test array classes for subtyping
2173     __ load_klass(r18, dst);
2174     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2175     __ br(Assembler::NE, L_checkcast_copy);
2176 
2177     // Identically typed arrays can be copied without element-wise checks.
2178     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2179                            rscratch2, L_failed);
2180 
2181     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2182     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2183     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2184     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2185     __ movw(count, scratch_length); // length
2186   __ BIND(L_plain_copy);
2187     __ b(RuntimeAddress(oop_copy_entry));
2188 
2189   __ BIND(L_checkcast_copy);
2190     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2191     {
2192       // Before looking at dst.length, make sure dst is also an objArray.
2193       __ ldrw(rscratch1, Address(r18, lh_offset));
2194       __ movw(rscratch2, objArray_lh);
2195       __ eorw(rscratch1, rscratch1, rscratch2);
2196       __ cbnzw(rscratch1, L_failed);
2197 
2198       // It is safe to examine both src.length and dst.length.
2199       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2200                              r18, L_failed);
2201 
2202       __ load_klass(dst_klass, dst); // reload
2203 
2204       // Marshal the base address arguments now, freeing registers.
2205       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2206       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2207       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2208       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2209       __ movw(count, length);           // length (reloaded)
2210       Register sco_temp = c_rarg3;      // this register is free now
2211       assert_different_registers(from, to, count, sco_temp,
2212                                  dst_klass, scratch_src_klass);
2213       // assert_clean_int(count, sco_temp);
2214 
2215       // Generate the type check.
2216       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2217       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2218 
2219       // Smashes rscratch1, rscratch2
2220       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2221 
2222       // Fetch destination element klass from the ObjArrayKlass header.
2223       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2224       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2225       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2226 
2227       // the checkcast_copy loop needs two extra arguments:
2228       assert(c_rarg3 == sco_temp, "#3 already in place");
2229       // Set up arguments for checkcast_copy_entry.
2230       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2231       __ b(RuntimeAddress(checkcast_copy_entry));
2232     }
2233 
2234   __ BIND(L_failed);
2235     __ mov(r0, -1);
2236     __ leave();   // required for proper stackwalking of RuntimeStub frame
2237     __ ret(lr);
2238 
2239     return start;
2240   }
2241 
2242   //
2243   // Generate stub for array fill. If "aligned" is true, the
2244   // "to" address is assumed to be heapword aligned.
2245   //
2246   // Arguments for generated stub:
2247   //   to:    c_rarg0
2248   //   value: c_rarg1
2249   //   count: c_rarg2 treated as signed
2250   //
2251   address generate_fill(BasicType t, bool aligned, const char *name) {
2252     __ align(CodeEntryAlignment);
2253     StubCodeMark mark(this, "StubRoutines", name);
2254     address start = __ pc();
2255 
2256     BLOCK_COMMENT("Entry:");
2257 
2258     const Register to        = c_rarg0;  // destination array address
2259     const Register value     = c_rarg1;  // value
2260     const Register count     = c_rarg2;  // elements count
2261 
2262     const Register bz_base = r10;        // base for block_zero routine
2263     const Register cnt_words = r11;      // temp register
2264 
2265     __ enter();
2266 
2267     Label L_fill_elements, L_exit1;
2268 
2269     int shift = -1;
2270     switch (t) {
2271       case T_BYTE:
2272         shift = 0;
2273         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2274         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2275         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2276         __ br(Assembler::LO, L_fill_elements);
2277         break;
2278       case T_SHORT:
2279         shift = 1;
2280         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2281         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2282         __ br(Assembler::LO, L_fill_elements);
2283         break;
2284       case T_INT:
2285         shift = 2;
2286         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2287         __ br(Assembler::LO, L_fill_elements);
2288         break;
2289       default: ShouldNotReachHere();
2290     }
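         // Value replication example (illustrative): for a T_BYTE fill with
         // value == 0xAB, the bfi's above widen it to 0xABABABAB, and the
         // bfi(value, value, 32, 32) further down widens that to
         // 0xABABABABABABABAB, so each 64-bit store fills eight elements.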
2291 
2292     // Align the destination address to an 8-byte boundary.
2293     Label L_skip_align1, L_skip_align2, L_skip_align4;
2294     if (!aligned) {
2295       switch (t) {
2296         case T_BYTE:
2297           // One byte misalignment happens only for byte arrays.
2298           __ tbz(to, 0, L_skip_align1);
2299           __ strb(value, Address(__ post(to, 1)));
2300           __ subw(count, count, 1);
2301           __ bind(L_skip_align1);
2302           // Fallthrough
2303         case T_SHORT:
2304           // Two bytes misalignment happens only for byte and short (char) arrays.
2305           __ tbz(to, 1, L_skip_align2);
2306           __ strh(value, Address(__ post(to, 2)));
2307           __ subw(count, count, 2 >> shift);
2308           __ bind(L_skip_align2);
2309           // Fallthrough
2310         case T_INT:
2311           // Align to 8 bytes; the address is known to be 4-byte aligned here.
2312           __ tbz(to, 2, L_skip_align4);
2313           __ strw(value, Address(__ post(to, 4)));
2314           __ subw(count, count, 4 >> shift);
2315           __ bind(L_skip_align4);
2316           break;
2317         default: ShouldNotReachHere();
2318       }
2319     }
2320 
2321     //
2322     //  Fill large chunks
2323     //
2324     __ lsrw(cnt_words, count, 3 - shift); // number of words
2325     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2326     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2327     if (UseBlockZeroing) {
2328       Label non_block_zeroing, rest;
2329       // If the fill value is zero we can use the fast zero_words().
2330       __ cbnz(value, non_block_zeroing);
2331       __ mov(bz_base, to);
2332       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2333       __ zero_words(bz_base, cnt_words);
2334       __ b(rest);
2335       __ bind(non_block_zeroing);
2336       __ fill_words(to, cnt_words, value);
2337       __ bind(rest);
2338     } else {
2339       __ fill_words(to, cnt_words, value);
2340     }
2341 
2342     // Remaining count is less than 8 bytes. Fill it by a single store.
2343     // Note that the total length is no less than 8 bytes.
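         // For example (byte fill, 5 bytes remaining): 'to' is advanced to the
         // end of the filled region and the str(value, Address(to, -8)) below
         // rewrites the final 8 bytes, harmlessly re-storing the 3 bytes the
         // main loop already filled.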
2344     if (t == T_BYTE || t == T_SHORT) {
2345       Label L_exit1;
2346       __ cbzw(count, L_exit1);
2347       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2348       __ str(value, Address(to, -8));    // overwrite some elements
2349       __ bind(L_exit1);
2350       __ leave();
2351       __ ret(lr);
2352     }
2353 
2354     // Handle fills of less than 8 bytes.
2355     Label L_fill_2, L_fill_4, L_exit2;
2356     __ bind(L_fill_elements);
2357     switch (t) {
2358       case T_BYTE:
2359         __ tbz(count, 0, L_fill_2);
2360         __ strb(value, Address(__ post(to, 1)));
2361         __ bind(L_fill_2);
2362         __ tbz(count, 1, L_fill_4);
2363         __ strh(value, Address(__ post(to, 2)));
2364         __ bind(L_fill_4);
2365         __ tbz(count, 2, L_exit2);
2366         __ strw(value, Address(to));
2367         break;
2368       case T_SHORT:
2369         __ tbz(count, 0, L_fill_4);
2370         __ strh(value, Address(__ post(to, 2)));
2371         __ bind(L_fill_4);
2372         __ tbz(count, 1, L_exit2);
2373         __ strw(value, Address(to));
2374         break;
2375       case T_INT:
2376         __ cbzw(count, L_exit2);
2377         __ strw(value, Address(to));
2378         break;
2379       default: ShouldNotReachHere();
2380     }
2381     __ bind(L_exit2);
2382     __ leave();
2383     __ ret(lr);
2384     return start;
2385   }
2386 
2387   void generate_arraycopy_stubs() {
2388     address entry;
2389     address entry_jbyte_arraycopy;
2390     address entry_jshort_arraycopy;
2391     address entry_jint_arraycopy;
2392     address entry_oop_arraycopy;
2393     address entry_jlong_arraycopy;
2394     address entry_checkcast_arraycopy;
2395 
2396     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2397     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2398 
2399     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2400 
2401     //*** jbyte
2402     // Always need aligned and unaligned versions
2403     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2404                                                                                   "jbyte_disjoint_arraycopy");
2405     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2406                                                                                   &entry_jbyte_arraycopy,
2407                                                                                   "jbyte_arraycopy");
2408     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2409                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2410     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2411                                                                                   "arrayof_jbyte_arraycopy");
2412 
2413     //*** jshort
2414     // Always need aligned and unaligned versions
2415     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2416                                                                                     "jshort_disjoint_arraycopy");
2417     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2418                                                                                     &entry_jshort_arraycopy,
2419                                                                                     "jshort_arraycopy");
2420     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2421                                                                                     "arrayof_jshort_disjoint_arraycopy");
2422     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2423                                                                                     "arrayof_jshort_arraycopy");
2424 
2425     //*** jint
2426     // Aligned versions
2427     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2428                                                                                 "arrayof_jint_disjoint_arraycopy");
2429     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2430                                                                                 "arrayof_jint_arraycopy");
2431     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2432     // entry_jint_arraycopy always points to the unaligned version
2433     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2434                                                                                 "jint_disjoint_arraycopy");
2435     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2436                                                                                 &entry_jint_arraycopy,
2437                                                                                 "jint_arraycopy");
2438 
2439     //*** jlong
2440     // It is always aligned
2441     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2442                                                                                   "arrayof_jlong_disjoint_arraycopy");
2443     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2444                                                                                   "arrayof_jlong_arraycopy");
2445     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2446     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2447 
2448     //*** oops
2449     {
2450       // With compressed oops we need unaligned versions; notice that
2451       // we overwrite entry_oop_arraycopy.
2452       bool aligned = !UseCompressedOops;
2453 
2454       StubRoutines::_arrayof_oop_disjoint_arraycopy
2455         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2456                                      /*dest_uninitialized*/false);
2457       StubRoutines::_arrayof_oop_arraycopy
2458         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2459                                      /*dest_uninitialized*/false);
2460       // Aligned versions without pre-barriers
2461       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2462         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2463                                      /*dest_uninitialized*/true);
2464       StubRoutines::_arrayof_oop_arraycopy_uninit
2465         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2466                                      /*dest_uninitialized*/true);
2467     }
2468 
2469     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2470     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2471     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2472     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2473 
2474     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2475     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2476                                                                         /*dest_uninitialized*/true);
2477 
2478     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2479                                                               entry_jbyte_arraycopy,
2480                                                               entry_jshort_arraycopy,
2481                                                               entry_jint_arraycopy,
2482                                                               entry_jlong_arraycopy);
2483 
2484     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2485                                                                entry_jbyte_arraycopy,
2486                                                                entry_jshort_arraycopy,
2487                                                                entry_jint_arraycopy,
2488                                                                entry_oop_arraycopy,
2489                                                                entry_jlong_arraycopy,
2490                                                                entry_checkcast_arraycopy);
2491 
2492     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2493     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2494     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2495     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2496     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2497     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2498   }
2499 
2500   void generate_math_stubs() { Unimplemented(); }
2501 
2502   // Arguments:
2503   //
2504   // Inputs:
2505   //   c_rarg0   - source byte array address
2506   //   c_rarg1   - destination byte array address
2507   //   c_rarg2   - K (key) in little endian int array
2508   //
2509   address generate_aescrypt_encryptBlock() {
2510     __ align(CodeEntryAlignment);
2511     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2512 
2513     Label L_doLast;
2514 
2515     const Register from        = c_rarg0;  // source array address
2516     const Register to          = c_rarg1;  // destination array address
2517     const Register key         = c_rarg2;  // key array address
2518     const Register keylen      = rscratch1;
2519 
2520     address start = __ pc();
2521     __ enter();
2522 
2523     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
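         // keylen is the expanded key length in ints: 44 for AES-128, 52 for
         // AES-192 and 60 for AES-256 (i.e. 4 * (rounds + 1)); the comparisons
         // against 44 and 52 below skip the extra rounds for shorter keys.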
2524 
2525     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2526 
2527     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2528     __ rev32(v1, __ T16B, v1);
2529     __ rev32(v2, __ T16B, v2);
2530     __ rev32(v3, __ T16B, v3);
2531     __ rev32(v4, __ T16B, v4);
2532     __ aese(v0, v1);
2533     __ aesmc(v0, v0);
2534     __ aese(v0, v2);
2535     __ aesmc(v0, v0);
2536     __ aese(v0, v3);
2537     __ aesmc(v0, v0);
2538     __ aese(v0, v4);
2539     __ aesmc(v0, v0);
2540 
2541     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2542     __ rev32(v1, __ T16B, v1);
2543     __ rev32(v2, __ T16B, v2);
2544     __ rev32(v3, __ T16B, v3);
2545     __ rev32(v4, __ T16B, v4);
2546     __ aese(v0, v1);
2547     __ aesmc(v0, v0);
2548     __ aese(v0, v2);
2549     __ aesmc(v0, v0);
2550     __ aese(v0, v3);
2551     __ aesmc(v0, v0);
2552     __ aese(v0, v4);
2553     __ aesmc(v0, v0);
2554 
2555     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2556     __ rev32(v1, __ T16B, v1);
2557     __ rev32(v2, __ T16B, v2);
2558 
2559     __ cmpw(keylen, 44);
2560     __ br(Assembler::EQ, L_doLast);
2561 
2562     __ aese(v0, v1);
2563     __ aesmc(v0, v0);
2564     __ aese(v0, v2);
2565     __ aesmc(v0, v0);
2566 
2567     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2568     __ rev32(v1, __ T16B, v1);
2569     __ rev32(v2, __ T16B, v2);
2570 
2571     __ cmpw(keylen, 52);
2572     __ br(Assembler::EQ, L_doLast);
2573 
2574     __ aese(v0, v1);
2575     __ aesmc(v0, v0);
2576     __ aese(v0, v2);
2577     __ aesmc(v0, v0);
2578 
2579     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2580     __ rev32(v1, __ T16B, v1);
2581     __ rev32(v2, __ T16B, v2);
2582 
2583     __ BIND(L_doLast);
2584 
2585     __ aese(v0, v1);
2586     __ aesmc(v0, v0);
2587     __ aese(v0, v2);
2588 
2589     __ ld1(v1, __ T16B, key);
2590     __ rev32(v1, __ T16B, v1);
2591     __ eor(v0, __ T16B, v0, v1);
2592 
2593     __ st1(v0, __ T16B, to);
2594 
2595     __ mov(r0, 0);
2596 
2597     __ leave();
2598     __ ret(lr);
2599 
2600     return start;
2601   }
2602 
2603   // Arguments:
2604   //
2605   // Inputs:
2606   //   c_rarg0   - source byte array address
2607   //   c_rarg1   - destination byte array address
2608   //   c_rarg2   - K (key) in little endian int array
2609   //
2610   address generate_aescrypt_decryptBlock() {
2611     assert(UseAES, "need AES cryptographic instructions");
2612     __ align(CodeEntryAlignment);
2613     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2614     Label L_doLast;
2615 
2616     const Register from        = c_rarg0;  // source array address
2617     const Register to          = c_rarg1;  // destination array address
2618     const Register key         = c_rarg2;  // key array address
2619     const Register keylen      = rscratch1;
2620 
2621     address start = __ pc();
2622     __ enter(); // required for proper stackwalking of RuntimeStub frame
2623 
2624     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2625 
2626     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2627 
2628     __ ld1(v5, __ T16B, __ post(key, 16));
2629     __ rev32(v5, __ T16B, v5);
2630 
2631     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2632     __ rev32(v1, __ T16B, v1);
2633     __ rev32(v2, __ T16B, v2);
2634     __ rev32(v3, __ T16B, v3);
2635     __ rev32(v4, __ T16B, v4);
2636     __ aesd(v0, v1);
2637     __ aesimc(v0, v0);
2638     __ aesd(v0, v2);
2639     __ aesimc(v0, v0);
2640     __ aesd(v0, v3);
2641     __ aesimc(v0, v0);
2642     __ aesd(v0, v4);
2643     __ aesimc(v0, v0);
2644 
2645     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2646     __ rev32(v1, __ T16B, v1);
2647     __ rev32(v2, __ T16B, v2);
2648     __ rev32(v3, __ T16B, v3);
2649     __ rev32(v4, __ T16B, v4);
2650     __ aesd(v0, v1);
2651     __ aesimc(v0, v0);
2652     __ aesd(v0, v2);
2653     __ aesimc(v0, v0);
2654     __ aesd(v0, v3);
2655     __ aesimc(v0, v0);
2656     __ aesd(v0, v4);
2657     __ aesimc(v0, v0);
2658 
2659     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2660     __ rev32(v1, __ T16B, v1);
2661     __ rev32(v2, __ T16B, v2);
2662 
2663     __ cmpw(keylen, 44);
2664     __ br(Assembler::EQ, L_doLast);
2665 
2666     __ aesd(v0, v1);
2667     __ aesimc(v0, v0);
2668     __ aesd(v0, v2);
2669     __ aesimc(v0, v0);
2670 
2671     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2672     __ rev32(v1, __ T16B, v1);
2673     __ rev32(v2, __ T16B, v2);
2674 
2675     __ cmpw(keylen, 52);
2676     __ br(Assembler::EQ, L_doLast);
2677 
2678     __ aesd(v0, v1);
2679     __ aesimc(v0, v0);
2680     __ aesd(v0, v2);
2681     __ aesimc(v0, v0);
2682 
2683     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2684     __ rev32(v1, __ T16B, v1);
2685     __ rev32(v2, __ T16B, v2);
2686 
2687     __ BIND(L_doLast);
2688 
2689     __ aesd(v0, v1);
2690     __ aesimc(v0, v0);
2691     __ aesd(v0, v2);
2692 
2693     __ eor(v0, __ T16B, v0, v5);
2694 
2695     __ st1(v0, __ T16B, to);
2696 
2697     __ mov(r0, 0);
2698 
2699     __ leave();
2700     __ ret(lr);
2701 
2702     return start;
2703   }
2704 
2705   // Arguments:
2706   //
2707   // Inputs:
2708   //   c_rarg0   - source byte array address
2709   //   c_rarg1   - destination byte array address
2710   //   c_rarg2   - K (key) in little endian int array
2711   //   c_rarg3   - r vector byte array address
2712   //   c_rarg4   - input length
2713   //
2714   // Output:
2715   //   r0        - input length
2716   //
2717   address generate_cipherBlockChaining_encryptAESCrypt() {
2718     assert(UseAES, "need AES cryptographic instructions");
2719     __ align(CodeEntryAlignment);
2720     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2721 
2722     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2723 
2724     const Register from        = c_rarg0;  // source array address
2725     const Register to          = c_rarg1;  // destination array address
2726     const Register key         = c_rarg2;  // key array address
2727     const Register rvec        = c_rarg3;  // r byte array, initialized from the init-vector array address,
2728                                            // and left holding the last encrypted block on exit
2729     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2730     const Register keylen      = rscratch1;
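         // Per 16-byte block, the loop below computes, in effect (a sketch of
         // CBC encryption, not additional generated code):
         //
         //   v0 = rvec;                                   // running chaining value
         //   for (each plaintext block p) {
         //     v0 = AES_encrypt(v0 ^ p, key);  store v0;  // ciphertext becomes the new chain
         //   }
         //   rvec = v0;                                   // saved for the next call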
2731 
2732     address start = __ pc();
2733 
2734       __ enter();
2735 
2736       __ movw(rscratch2, len_reg);
2737 
2738       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2739 
2740       __ ld1(v0, __ T16B, rvec);
2741 
2742       __ cmpw(keylen, 52);
2743       __ br(Assembler::CC, L_loadkeys_44);
2744       __ br(Assembler::EQ, L_loadkeys_52);
2745 
2746       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2747       __ rev32(v17, __ T16B, v17);
2748       __ rev32(v18, __ T16B, v18);
2749     __ BIND(L_loadkeys_52);
2750       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2751       __ rev32(v19, __ T16B, v19);
2752       __ rev32(v20, __ T16B, v20);
2753     __ BIND(L_loadkeys_44);
2754       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2755       __ rev32(v21, __ T16B, v21);
2756       __ rev32(v22, __ T16B, v22);
2757       __ rev32(v23, __ T16B, v23);
2758       __ rev32(v24, __ T16B, v24);
2759       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2760       __ rev32(v25, __ T16B, v25);
2761       __ rev32(v26, __ T16B, v26);
2762       __ rev32(v27, __ T16B, v27);
2763       __ rev32(v28, __ T16B, v28);
2764       __ ld1(v29, v30, v31, __ T16B, key);
2765       __ rev32(v29, __ T16B, v29);
2766       __ rev32(v30, __ T16B, v30);
2767       __ rev32(v31, __ T16B, v31);
2768 
2769     __ BIND(L_aes_loop);
2770       __ ld1(v1, __ T16B, __ post(from, 16));
2771       __ eor(v0, __ T16B, v0, v1);
2772 
2773       __ br(Assembler::CC, L_rounds_44);
2774       __ br(Assembler::EQ, L_rounds_52);
2775 
2776       __ aese(v0, v17); __ aesmc(v0, v0);
2777       __ aese(v0, v18); __ aesmc(v0, v0);
2778     __ BIND(L_rounds_52);
2779       __ aese(v0, v19); __ aesmc(v0, v0);
2780       __ aese(v0, v20); __ aesmc(v0, v0);
2781     __ BIND(L_rounds_44);
2782       __ aese(v0, v21); __ aesmc(v0, v0);
2783       __ aese(v0, v22); __ aesmc(v0, v0);
2784       __ aese(v0, v23); __ aesmc(v0, v0);
2785       __ aese(v0, v24); __ aesmc(v0, v0);
2786       __ aese(v0, v25); __ aesmc(v0, v0);
2787       __ aese(v0, v26); __ aesmc(v0, v0);
2788       __ aese(v0, v27); __ aesmc(v0, v0);
2789       __ aese(v0, v28); __ aesmc(v0, v0);
2790       __ aese(v0, v29); __ aesmc(v0, v0);
2791       __ aese(v0, v30);
2792       __ eor(v0, __ T16B, v0, v31);
2793 
2794       __ st1(v0, __ T16B, __ post(to, 16));
2795 
2796       __ subw(len_reg, len_reg, 16);
2797       __ cbnzw(len_reg, L_aes_loop);
2798 
2799       __ st1(v0, __ T16B, rvec);
2800 
2801       __ mov(r0, rscratch2);
2802 
2803       __ leave();
2804       __ ret(lr);
2805 
2806     return start;
2807   }
2808 
2809   // Arguments:
2810   //
2811   // Inputs:
2812   //   c_rarg0   - source byte array address
2813   //   c_rarg1   - destination byte array address
2814   //   c_rarg2   - K (key) in little endian int array
2815   //   c_rarg3   - r vector byte array address
2816   //   c_rarg4   - input length
2817   //
2818   // Output:
2819   //   r0        - input length
2820   //
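  //
  // For reference (a descriptive sketch only, no extra code is generated): the
  // stub implements the standard CBC decryption recurrence
  //
  //   C[0] = IV,   P[i] = D_K(C[i]) ^ C[i-1]   for i = 1..n
  //
  // v2 carries the previous ciphertext block; each loaded block is saved in v1
  // before it is decrypted so it can become the next chaining value, and the
  // final chaining value is written back to rvec on exit.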
2821   address generate_cipherBlockChaining_decryptAESCrypt() {
2822     assert(UseAES, "need AES cryptographic extension support");
2823     __ align(CodeEntryAlignment);
2824     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2825 
2826     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2827 
2828     const Register from        = c_rarg0;  // source array address
2829     const Register to          = c_rarg1;  // destination array address
2830     const Register key         = c_rarg2;  // key array address
2831     const Register rvec        = c_rarg3;  // r byte array initialized from the initvector array address
2832                                            // and left holding the last ciphertext block processed (the chaining value)
2833     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2834     const Register keylen      = rscratch1;
2835 
2836     address start = __ pc();
2837 
2838       __ enter();
2839 
2840       __ movw(rscratch2, len_reg);
2841 
2842       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2843 
2844       __ ld1(v2, __ T16B, rvec);
2845 
2846       __ ld1(v31, __ T16B, __ post(key, 16));
2847       __ rev32(v31, __ T16B, v31);
2848 
2849       __ cmpw(keylen, 52);
2850       __ br(Assembler::CC, L_loadkeys_44);
2851       __ br(Assembler::EQ, L_loadkeys_52);
2852 
2853       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2854       __ rev32(v17, __ T16B, v17);
2855       __ rev32(v18, __ T16B, v18);
2856     __ BIND(L_loadkeys_52);
2857       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2858       __ rev32(v19, __ T16B, v19);
2859       __ rev32(v20, __ T16B, v20);
2860     __ BIND(L_loadkeys_44);
2861       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2862       __ rev32(v21, __ T16B, v21);
2863       __ rev32(v22, __ T16B, v22);
2864       __ rev32(v23, __ T16B, v23);
2865       __ rev32(v24, __ T16B, v24);
2866       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2867       __ rev32(v25, __ T16B, v25);
2868       __ rev32(v26, __ T16B, v26);
2869       __ rev32(v27, __ T16B, v27);
2870       __ rev32(v28, __ T16B, v28);
2871       __ ld1(v29, v30, __ T16B, key);
2872       __ rev32(v29, __ T16B, v29);
2873       __ rev32(v30, __ T16B, v30);
2874 
2875     __ BIND(L_aes_loop);
2876       __ ld1(v0, __ T16B, __ post(from, 16));
2877       __ orr(v1, __ T16B, v0, v0);
2878 
2879       __ br(Assembler::CC, L_rounds_44);
2880       __ br(Assembler::EQ, L_rounds_52);
2881 
2882       __ aesd(v0, v17); __ aesimc(v0, v0);
2883       __ aesd(v0, v18); __ aesimc(v0, v0);
2884     __ BIND(L_rounds_52);
2885       __ aesd(v0, v19); __ aesimc(v0, v0);
2886       __ aesd(v0, v20); __ aesimc(v0, v0);
2887     __ BIND(L_rounds_44);
2888       __ aesd(v0, v21); __ aesimc(v0, v0);
2889       __ aesd(v0, v22); __ aesimc(v0, v0);
2890       __ aesd(v0, v23); __ aesimc(v0, v0);
2891       __ aesd(v0, v24); __ aesimc(v0, v0);
2892       __ aesd(v0, v25); __ aesimc(v0, v0);
2893       __ aesd(v0, v26); __ aesimc(v0, v0);
2894       __ aesd(v0, v27); __ aesimc(v0, v0);
2895       __ aesd(v0, v28); __ aesimc(v0, v0);
2896       __ aesd(v0, v29); __ aesimc(v0, v0);
2897       __ aesd(v0, v30);
2898       __ eor(v0, __ T16B, v0, v31);
2899       __ eor(v0, __ T16B, v0, v2);
2900 
2901       __ st1(v0, __ T16B, __ post(to, 16));
2902       __ orr(v2, __ T16B, v1, v1);
2903 
2904       __ subw(len_reg, len_reg, 16);
2905       __ cbnzw(len_reg, L_aes_loop);
2906 
2907       __ st1(v2, __ T16B, rvec);
2908 
2909       __ mov(r0, rscratch2);
2910 
2911       __ leave();
2912       __ ret(lr);
2913 
2914     return start;
2915   }
2916 
2917   // Arguments:
2918   //
2919   // Inputs:
2920   //   c_rarg0   - byte[]  source+offset
2921   //   c_rarg1   - int[]   SHA.state
2922   //   c_rarg2   - int     offset
2923   //   c_rarg3   - int     limit
2924   //
2925   address generate_sha1_implCompress(bool multi_block, const char *name) {
2926     __ align(CodeEntryAlignment);
2927     StubCodeMark mark(this, "StubRoutines", name);
2928     address start = __ pc();
2929 
2930     Register buf   = c_rarg0;
2931     Register state = c_rarg1;
2932     Register ofs   = c_rarg2;
2933     Register limit = c_rarg3;
2934 
2935     Label keys;
2936     Label sha1_loop;
2937 
2938     // load the keys into v0..v3
2939     __ adr(rscratch1, keys);
2940     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2941     // load the 5-word state into v6, v7
2942     __ ldrq(v6, Address(state, 0));
2943     __ ldrs(v7, Address(state, 16));
2944 
2945 
2946     __ BIND(sha1_loop);
2947     // load 64 bytes of data into v16..v19
2948     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2949     __ rev32(v16, __ T16B, v16);
2950     __ rev32(v17, __ T16B, v17);
2951     __ rev32(v18, __ T16B, v18);
2952     __ rev32(v19, __ T16B, v19);
2953 
2954     // do the sha1
2955     __ addv(v4, __ T4S, v16, v0);
2956     __ orr(v20, __ T16B, v6, v6);
2957 
2958     FloatRegister d0 = v16;
2959     FloatRegister d1 = v17;
2960     FloatRegister d2 = v18;
2961     FloatRegister d3 = v19;
2962 
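    // For reference (descriptive only): each loop iteration performs four SHA-1
    // rounds, so 20 iterations cover all 80 rounds.  The crypto instructions map
    // to the standard round functions:
    //   sha1c -> Ch(x,y,z)     = (x & y) | (~x & z)          rounds  0..19
    //   sha1p -> Parity(x,y,z) = x ^ y ^ z                   rounds 20..39, 60..79
    //   sha1m -> Maj(x,y,z)    = (x & y) | (x & z) | (y & z) rounds 40..59
    // which is what the round < 5 / round < 10 || round >= 15 / else selection
    // below encodes.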
2963     for (int round = 0; round < 20; round++) {
2964       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2965       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2966       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2967       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2968       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2969 
2970       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2971       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2972       __ sha1h(tmp2, __ T4S, v20);
2973       if (round < 5)
2974         __ sha1c(v20, __ T4S, tmp3, tmp4);
2975       else if (round < 10 || round >= 15)
2976         __ sha1p(v20, __ T4S, tmp3, tmp4);
2977       else
2978         __ sha1m(v20, __ T4S, tmp3, tmp4);
2979       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2980 
2981       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2982     }
2983 
2984     __ addv(v7, __ T2S, v7, v21);
2985     __ addv(v6, __ T4S, v6, v20);
2986 
2987     if (multi_block) {
2988       __ add(ofs, ofs, 64);
2989       __ cmp(ofs, limit);
2990       __ br(Assembler::LE, sha1_loop);
2991       __ mov(c_rarg0, ofs); // return ofs
2992     }
2993 
2994     __ strq(v6, Address(state, 0));
2995     __ strs(v7, Address(state, 16));
2996 
2997     __ ret(lr);
2998 
2999     __ bind(keys);
3000     __ emit_int32(0x5a827999);
3001     __ emit_int32(0x6ed9eba1);
3002     __ emit_int32(0x8f1bbcdc);
3003     __ emit_int32(0xca62c1d6);
3004 
3005     return start;
3006   }
3007 
3008 
3009   // Arguments:
3010   //
3011   // Inputs:
3012   //   c_rarg0   - byte[]  source+offset
3013   //   c_rarg1   - int[]   SHA.state
3014   //   c_rarg2   - int     offset
3015   //   c_rarg3   - int     limit
3016   //
3017   address generate_sha256_implCompress(bool multi_block, const char *name) {
3018     static const uint32_t round_consts[64] = {
3019       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3020       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3021       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3022       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3023       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3024       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3025       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3026       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3027       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3028       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3029       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3030       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3031       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3032       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3033       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3034       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3035     };
3036     __ align(CodeEntryAlignment);
3037     StubCodeMark mark(this, "StubRoutines", name);
3038     address start = __ pc();
3039 
3040     Register buf   = c_rarg0;
3041     Register state = c_rarg1;
3042     Register ofs   = c_rarg2;
3043     Register limit = c_rarg3;
3044 
3045     Label sha1_loop;
3046 
3047     __ stpd(v8, v9, __ pre(sp, -32));
3048     __ stpd(v10, v11, Address(sp, 16));
3049 
3050 // dga == v0
3051 // dgb == v1
3052 // dg0 == v2
3053 // dg1 == v3
3054 // dg2 == v4
3055 // t0 == v6
3056 // t1 == v7
3057 
3058     // load the 64 round constants into v16..v31 (four 32-bit constants per register)
3059     __ lea(rscratch1, ExternalAddress((address)round_consts));
3060     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3061     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3062     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3063     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3064 
3065     // load the 8-word (256-bit) state
3066     __ ldpq(v0, v1, state);
3067 
3068     __ BIND(sha1_loop);
3069     // load 64 bytes of data into v8..v11
3070     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3071     __ rev32(v8, __ T16B, v8);
3072     __ rev32(v9, __ T16B, v9);
3073     __ rev32(v10, __ T16B, v10);
3074     __ rev32(v11, __ T16B, v11);
3075 
3076     __ addv(v6, __ T4S, v8, v16);
3077     __ orr(v2, __ T16B, v0, v0);
3078     __ orr(v3, __ T16B, v1, v1);
3079 
3080     FloatRegister d0 = v8;
3081     FloatRegister d1 = v9;
3082     FloatRegister d2 = v10;
3083     FloatRegister d3 = v11;
3084 
3085 
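    // For reference (descriptive only): each loop iteration performs four SHA-256
    // rounds via sha256h/sha256h2, so 16 iterations cover all 64 rounds.  The 64
    // round constants loaded above occupy v16..v31 (four per register), and
    // as_FloatRegister(round + 17) picks the constants for the next group of four
    // rounds so the addv can be issued ahead of the rounds that consume it.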
3086     for (int round = 0; round < 16; round++) {
3087       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3088       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3089       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3090       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3091 
3092       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3093       __ orr(v4, __ T16B, v2, v2);
3094       if (round < 15)
3095         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3096       __ sha256h(v2, __ T4S, v3, tmp2);
3097       __ sha256h2(v3, __ T4S, v4, tmp2);
3098       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3099 
3100       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3101     }
3102 
3103     __ addv(v0, __ T4S, v0, v2);
3104     __ addv(v1, __ T4S, v1, v3);
3105 
3106     if (multi_block) {
3107       __ add(ofs, ofs, 64);
3108       __ cmp(ofs, limit);
3109       __ br(Assembler::LE, sha1_loop);
3110       __ mov(c_rarg0, ofs); // return ofs
3111     }
3112 
3113     __ ldpd(v10, v11, Address(sp, 16));
3114     __ ldpd(v8, v9, __ post(sp, 32));
3115 
3116     __ stpq(v0, v1, state);
3117 
3118     __ ret(lr);
3119 
3120     return start;
3121   }
3122 
3123 #ifndef BUILTIN_SIM
3124   // Safefetch stubs.
3125   void generate_safefetch(const char* name, int size, address* entry,
3126                           address* fault_pc, address* continuation_pc) {
3127     // safefetch signatures:
3128     //   int      SafeFetch32(int*      adr, int      errValue);
3129     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3130     //
3131     // arguments:
3132     //   c_rarg0 = adr
3133     //   c_rarg1 = errValue
3134     //
3135     // result:
3136     //   r0       = *adr or errValue
3137 
3138     StubCodeMark mark(this, "StubRoutines", name);
3139 
3140     // Entry point, pc or function descriptor.
3141     *entry = __ pc();
3142 
3143     // Load *adr into c_rarg1, may fault.
3144     *fault_pc = __ pc();
3145     switch (size) {
3146       case 4:
3147         // int32_t
3148         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3149         break;
3150       case 8:
3151         // int64_t
3152         __ ldr(c_rarg1, Address(c_rarg0, 0));
3153         break;
3154       default:
3155         ShouldNotReachHere();
3156     }
3157 
3158     // return errValue or *adr
3159     *continuation_pc = __ pc();
3160     __ mov(r0, c_rarg1);
3161     __ ret(lr);
3162   }
3163 #endif
3164 
3165   /**
3166    *  Arguments:
3167    *
3168    * Inputs:
3169    *   c_rarg0   - int crc
3170    *   c_rarg1   - byte* buf
3171    *   c_rarg2   - int length
3172    *
3173    * Output:
3174    *       r0    - int crc result
3175    */
3176   address generate_updateBytesCRC32() {
3177     assert(UseCRC32Intrinsics, "what are we doing here?");
3178 
3179     __ align(CodeEntryAlignment);
3180     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3181 
3182     address start = __ pc();
3183 
3184     const Register crc   = c_rarg0;  // crc
3185     const Register buf   = c_rarg1;  // source java byte array address
3186     const Register len   = c_rarg2;  // length
3187     const Register table0 = c_rarg3; // crc_table address
3188     const Register table1 = c_rarg4;
3189     const Register table2 = c_rarg5;
3190     const Register table3 = c_rarg6;
3191     const Register tmp3 = c_rarg7;
3192 
3193     BLOCK_COMMENT("Entry:");
3194     __ enter(); // required for proper stackwalking of RuntimeStub frame
3195 
3196     __ kernel_crc32(crc, buf, len,
3197               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3198 
3199     __ leave(); // required for proper stackwalking of RuntimeStub frame
3200     __ ret(lr);
3201 
3202     return start;
3203   }
3204 
3205   /**
3206    *  Arguments:
3207    *
3208    * Inputs:
3209    *   c_rarg0   - int crc
3210    *   c_rarg1   - byte* buf
3211    *   c_rarg2   - int length
3212    *   c_rarg3   - int* table
3213    *
3214    * Output:
3215    *       r0   - int crc result
3216    */
3217   address generate_updateBytesCRC32C() {
3218     assert(UseCRC32CIntrinsics, "what are we doing here?");
3219 
3220     __ align(CodeEntryAlignment);
3221     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3222 
3223     address start = __ pc();
3224 
3225     const Register crc   = c_rarg0;  // crc
3226     const Register buf   = c_rarg1;  // source java byte array address
3227     const Register len   = c_rarg2;  // length
3228     const Register table0 = c_rarg3; // crc_table address
3229     const Register table1 = c_rarg4;
3230     const Register table2 = c_rarg5;
3231     const Register table3 = c_rarg6;
3232     const Register tmp3 = c_rarg7;
3233 
3234     BLOCK_COMMENT("Entry:");
3235     __ enter(); // required for proper stackwalking of RuntimeStub frame
3236 
3237     __ kernel_crc32c(crc, buf, len,
3238               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3239 
3240     __ leave(); // required for proper stackwalking of RuntimeStub frame
3241     __ ret(lr);
3242 
3243     return start;
3244   }
3245 
3246   /**
3247    *  Arguments:
3248    *
3249    *  Inputs:
3250    *   c_rarg0   - int   adler
3251    *   c_rarg1   - byte* buff
3252    *   c_rarg2   - int   len
3253    *
3254    * Output:
3255    *   c_rarg0   - int adler result
3256    */
3257   address generate_updateBytesAdler32() {
3258     __ align(CodeEntryAlignment);
3259     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3260     address start = __ pc();
3261 
3262     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3263 
3264     // Aliases
3265     Register adler  = c_rarg0;
3266     Register s1     = c_rarg0;
3267     Register s2     = c_rarg3;
3268     Register buff   = c_rarg1;
3269     Register len    = c_rarg2;
3270     Register nmax  = r4;
3271     Register base  = r5;
3272     Register count = r6;
3273     Register temp0 = rscratch1;
3274     Register temp1 = rscratch2;
3275     FloatRegister vbytes = v0;
3276     FloatRegister vs1acc = v1;
3277     FloatRegister vs2acc = v2;
3278     FloatRegister vtable = v3;
3279 
3280     // Max number of bytes we can process before having to take the mod
3281     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3282     unsigned long BASE = 0xfff1;
3283     unsigned long NMAX = 0x15B0;
3284 
3285     __ mov(base, BASE);
3286     __ mov(nmax, NMAX);
3287 
3288     // Load accumulation coefficients for the upper 16 bits
3289     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3290     __ ld1(vtable, __ T16B, Address(temp0));
3291 
3292     // s1 is initialized to the lower 16 bits of adler
3293     // s2 is initialized to the upper 16 bits of adler
3294     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3295     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3296 
3297     // The pipelined loop needs at least 16 elements for one iteration.
3298     // It checks this itself, but it is cheaper to skip straight to the cleanup loop here.
3299     __ cmp(len, (u1)16);
3300     __ br(Assembler::HS, L_nmax);
3301     __ cbz(len, L_combine);
3302 
3303     __ bind(L_simple_by1_loop);
3304     __ ldrb(temp0, Address(__ post(buff, 1)));
3305     __ add(s1, s1, temp0);
3306     __ add(s2, s2, s1);
3307     __ subs(len, len, 1);
3308     __ br(Assembler::HI, L_simple_by1_loop);
3309 
3310     // s1 = s1 % BASE
3311     __ subs(temp0, s1, base);
3312     __ csel(s1, temp0, s1, Assembler::HS);
3313 
3314     // s2 = s2 % BASE
3315     __ lsr(temp0, s2, 16);
3316     __ lsl(temp1, temp0, 4);
3317     __ sub(temp1, temp1, temp0);
3318     __ add(s2, temp1, s2, ext::uxth);
3319 
3320     __ subs(temp0, s2, base);
3321     __ csel(s2, temp0, s2, Assembler::HS);
3322 
3323     __ b(L_combine);
3324 
3325     __ bind(L_nmax);
3326     __ subs(len, len, nmax);
3327     __ sub(count, nmax, 16);
3328     __ br(Assembler::LO, L_by16);
3329 
3330     __ bind(L_nmax_loop);
3331 
3332     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3333                                       vbytes, vs1acc, vs2acc, vtable);
3334 
3335     __ subs(count, count, 16);
3336     __ br(Assembler::HS, L_nmax_loop);
3337 
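    // For reference (descriptive only): the reductions below rely on
    //   2^16 mod 65521 == 15,  so  x mod 65521 == ((x & 0xffff) + 15 * (x >> 16)) mod 65521.
    // The "lsl #4; sub" pair computes the multiplication by 15; folding twice
    // leaves a value below 2 * BASE, so one conditional subtract of BASE
    // completes the reduction.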
3338     // s1 = s1 % BASE
3339     __ lsr(temp0, s1, 16);
3340     __ lsl(temp1, temp0, 4);
3341     __ sub(temp1, temp1, temp0);
3342     __ add(temp1, temp1, s1, ext::uxth);
3343 
3344     __ lsr(temp0, temp1, 16);
3345     __ lsl(s1, temp0, 4);
3346     __ sub(s1, s1, temp0);
3347     __ add(s1, s1, temp1, ext::uxth);
3348 
3349     __ subs(temp0, s1, base);
3350     __ csel(s1, temp0, s1, Assembler::HS);
3351 
3352     // s2 = s2 % BASE
3353     __ lsr(temp0, s2, 16);
3354     __ lsl(temp1, temp0, 4);
3355     __ sub(temp1, temp1, temp0);
3356     __ add(temp1, temp1, s2, ext::uxth);
3357 
3358     __ lsr(temp0, temp1, 16);
3359     __ lsl(s2, temp0, 4);
3360     __ sub(s2, s2, temp0);
3361     __ add(s2, s2, temp1, ext::uxth);
3362 
3363     __ subs(temp0, s2, base);
3364     __ csel(s2, temp0, s2, Assembler::HS);
3365 
3366     __ subs(len, len, nmax);
3367     __ sub(count, nmax, 16);
3368     __ br(Assembler::HS, L_nmax_loop);
3369 
3370     __ bind(L_by16);
3371     __ adds(len, len, count);
3372     __ br(Assembler::LO, L_by1);
3373 
3374     __ bind(L_by16_loop);
3375 
3376     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3377                                       vbytes, vs1acc, vs2acc, vtable);
3378 
3379     __ subs(len, len, 16);
3380     __ br(Assembler::HS, L_by16_loop);
3381 
3382     __ bind(L_by1);
3383     __ adds(len, len, 15);
3384     __ br(Assembler::LO, L_do_mod);
3385 
3386     __ bind(L_by1_loop);
3387     __ ldrb(temp0, Address(__ post(buff, 1)));
3388     __ add(s1, temp0, s1);
3389     __ add(s2, s2, s1);
3390     __ subs(len, len, 1);
3391     __ br(Assembler::HS, L_by1_loop);
3392 
3393     __ bind(L_do_mod);
3394     // s1 = s1 % BASE
3395     __ lsr(temp0, s1, 16);
3396     __ lsl(temp1, temp0, 4);
3397     __ sub(temp1, temp1, temp0);
3398     __ add(temp1, temp1, s1, ext::uxth);
3399 
3400     __ lsr(temp0, temp1, 16);
3401     __ lsl(s1, temp0, 4);
3402     __ sub(s1, s1, temp0);
3403     __ add(s1, s1, temp1, ext::uxth);
3404 
3405     __ subs(temp0, s1, base);
3406     __ csel(s1, temp0, s1, Assembler::HS);
3407 
3408     // s2 = s2 % BASE
3409     __ lsr(temp0, s2, 16);
3410     __ lsl(temp1, temp0, 4);
3411     __ sub(temp1, temp1, temp0);
3412     __ add(temp1, temp1, s2, ext::uxth);
3413 
3414     __ lsr(temp0, temp1, 16);
3415     __ lsl(s2, temp0, 4);
3416     __ sub(s2, s2, temp0);
3417     __ add(s2, s2, temp1, ext::uxth);
3418 
3419     __ subs(temp0, s2, base);
3420     __ csel(s2, temp0, s2, Assembler::HS);
3421 
3422     // Combine lower bits and higher bits
3423     __ bind(L_combine);
3424     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3425 
3426     __ ret(lr);
3427 
3428     return start;
3429   }
3430 
3431   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3432           Register temp0, Register temp1, FloatRegister vbytes,
3433           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3434     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3435     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3436     // In non-vectorized code, we update s1 and s2 as:
3437     //   s1 <- s1 + b1
3438     //   s2 <- s2 + s1
3439     //   s1 <- s1 + b2
3440     //   s2 <- s2 + s1
3441     //   ...
3442     //   s1 <- s1 + b16
3443     //   s2 <- s2 + s1
3444     // Putting above assignments together, we have:
3445     //   s1_new = s1 + b1 + b2 + ... + b16
3446     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3447     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3448     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
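    //
    // A scalar sketch of the same 16-byte update (illustrative only, not part of
    // the generated code; the function name is hypothetical):
    //
    //   void update16(uint32_t &s1, uint32_t &s2, const uint8_t b[16]) {
    //     s2 += 16 * s1;
    //     for (int i = 0; i < 16; i++) {
    //       s1 += b[i];
    //       s2 += (16 - i) * b[i];   // weights 16, 15, ..., 1
    //     }
    //   }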
3449     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3450 
3451     // s2 = s2 + s1 * 16
3452     __ add(s2, s2, s1, Assembler::LSL, 4);
3453 
3454     // vs1acc = b1 + b2 + b3 + ... + b16
3455     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3456     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3457     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3458     __ uaddlv(vs1acc, __ T16B, vbytes);
3459     __ uaddlv(vs2acc, __ T8H, vs2acc);
3460 
3461     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3462     __ fmovd(temp0, vs1acc);
3463     __ fmovd(temp1, vs2acc);
3464     __ add(s1, s1, temp0);
3465     __ add(s2, s2, temp1);
3466   }
3467 
3468   /**
3469    *  Arguments:
3470    *
3471    *  Input:
3472    *    c_rarg0   - x address
3473    *    c_rarg1   - x length
3474    *    c_rarg2   - y address
3475    *    c_rarg3   - y length
3476    *    c_rarg4   - z address
3477    *    c_rarg5   - z length
3478    */
3479   address generate_multiplyToLen() {
3480     __ align(CodeEntryAlignment);
3481     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3482 
3483     address start = __ pc();
3484     const Register x     = r0;
3485     const Register xlen  = r1;
3486     const Register y     = r2;
3487     const Register ylen  = r3;
3488     const Register z     = r4;
3489     const Register zlen  = r5;
3490 
3491     const Register tmp1  = r10;
3492     const Register tmp2  = r11;
3493     const Register tmp3  = r12;
3494     const Register tmp4  = r13;
3495     const Register tmp5  = r14;
3496     const Register tmp6  = r15;
3497     const Register tmp7  = r16;
3498 
3499     BLOCK_COMMENT("Entry:");
3500     __ enter(); // required for proper stackwalking of RuntimeStub frame
3501     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3502     __ leave(); // required for proper stackwalking of RuntimeStub frame
3503     __ ret(lr);
3504 
3505     return start;
3506   }
3507 
3508   address generate_squareToLen() {
3509     // The squareToLen algorithm for sizes 1..127, as described in the Java code,
3510     // is faster than multiply_to_len on some CPUs and slower on others, but
3511     // multiply_to_len shows slightly better results overall.
3512     __ align(CodeEntryAlignment);
3513     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3514     address start = __ pc();
3515 
3516     const Register x     = r0;
3517     const Register xlen  = r1;
3518     const Register z     = r2;
3519     const Register zlen  = r3;
3520     const Register y     = r4; // == x
3521     const Register ylen  = r5; // == xlen
3522 
3523     const Register tmp1  = r10;
3524     const Register tmp2  = r11;
3525     const Register tmp3  = r12;
3526     const Register tmp4  = r13;
3527     const Register tmp5  = r14;
3528     const Register tmp6  = r15;
3529     const Register tmp7  = r16;
3530 
3531     RegSet spilled_regs = RegSet::of(y, ylen);
3532     BLOCK_COMMENT("Entry:");
3533     __ enter();
3534     __ push(spilled_regs, sp);
3535     __ mov(y, x);
3536     __ mov(ylen, xlen);
3537     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3538     __ pop(spilled_regs, sp);
3539     __ leave();
3540     __ ret(lr);
3541     return start;
3542   }
3543 
3544   address generate_mulAdd() {
3545     __ align(CodeEntryAlignment);
3546     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3547 
3548     address start = __ pc();
3549 
3550     const Register out     = r0;
3551     const Register in      = r1;
3552     const Register offset  = r2;
3553     const Register len     = r3;
3554     const Register k       = r4;
3555 
3556     BLOCK_COMMENT("Entry:");
3557     __ enter();
3558     __ mul_add(out, in, offset, len, k);
3559     __ leave();
3560     __ ret(lr);
3561 
3562     return start;
3563   }
3564 
3565   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3566                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3567                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3568     // Karatsuba multiplication performs a 128*128 -> 256-bit
3569     // multiplication in three 128-bit multiplications and a few
3570     // additions.
3571     //
3572     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3573     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3574     //
3575     // Inputs:
3576     //
3577     // A0 in a.d[0]     (subkey)
3578     // A1 in a.d[1]
3579     // (A1+A0) in a1_xor_a0.d[0]
3580     //
3581     // B0 in b.d[0]     (state)
3582     // B1 in b.d[1]
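    //
    // For reference: addition over GF(2) is XOR, so the Karatsuba middle term is
    //   A1*B0 + A0*B1 = (A1+A0)(B1+B0) + A1*B1 + A0*B0 = E + C + D.
    // The code below folds this middle term together with the C0/D1 halves it
    // overlaps (picked up via ext into tmp4) and then uses ins to place the two
    // combined 64-bit words, yielding C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 as above.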
3583 
3584     __ ext(tmp1, __ T16B, b, b, 0x08);
3585     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3586     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3587     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3588     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3589 
3590     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3591     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3592     __ eor(tmp2, __ T16B, tmp2, tmp4);
3593     __ eor(tmp2, __ T16B, tmp2, tmp3);
3594 
3595     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3596     __ ins(result_hi, __ D, tmp2, 0, 1);
3597     __ ins(result_lo, __ D, tmp2, 1, 0);
3598   }
3599 
3600   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3601                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3602     const FloatRegister t0 = result;
3603 
3604     // The GCM field polynomial f is z^128 + p(z), where p =
3605     // z^7+z^2+z+1.
3606     //
3607     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3608     //
3609     // so, given that the product we're reducing is
3610     //    a == lo + hi * z^128
3611     // substituting,
3612     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3613     //
3614     // we reduce by multiplying hi by p(z) and subtracting the result
3615     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3616     // bits we can do this with two 64-bit multiplications, lo*p and
3617     // hi*p.
3618 
3619     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3620     __ ext(t1, __ T16B, t0, z, 8);
3621     __ eor(hi, __ T16B, hi, t1);
3622     __ ext(t1, __ T16B, z, t0, 8);
3623     __ eor(lo, __ T16B, lo, t1);
3624     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3625     __ eor(result, __ T16B, lo, t0);
3626   }
3627 
3628   address generate_has_negatives(address &has_negatives_long) {
3629     const u1 large_loop_size = 64;
3630     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3631     int dcache_line = VM_Version::dcache_line_size();
3632 
3633     Register ary1 = r1, len = r2, result = r0;
3634 
3635     __ align(CodeEntryAlignment);
3636 
3637     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3638 
3639     address entry = __ pc();
3640 
3641     __ enter();
3642 
3643   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3644         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3645 
3646   __ cmp(len, (u1)15);
3647   __ br(Assembler::GT, LEN_OVER_15);
3648   // Execution only falls into this code when the pointer is near the end of a
3649   // memory page and we must avoid reading past the page boundary.
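  // To do that we load the 8 bytes ending at the end of the range (reading a few
  // bytes before the start of the data is safe here) and shift the bytes that
  // precede the range out of the register.  For example (illustrative), for
  // len == 3 the shift amount is (8 - 3) * 8 = 40 bits, so only the 3 bytes of
  // interest remain when the sign bits are tested against UPPER_BIT_MASK.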
3650   __ add(ary1, ary1, len);
3651   __ subs(len, len, 8);
3652   __ br(Assembler::GT, LEN_OVER_8);
3653   __ ldr(rscratch2, Address(ary1, -8));
3654   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3655   __ lsrv(rscratch2, rscratch2, rscratch1);
3656   __ tst(rscratch2, UPPER_BIT_MASK);
3657   __ cset(result, Assembler::NE);
3658   __ leave();
3659   __ ret(lr);
3660   __ bind(LEN_OVER_8);
3661   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3662   __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
3663   __ tst(rscratch2, UPPER_BIT_MASK);
3664   __ br(Assembler::NE, RET_TRUE_NO_POP);
3665   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3666   __ lsrv(rscratch1, rscratch1, rscratch2);
3667   __ tst(rscratch1, UPPER_BIT_MASK);
3668   __ cset(result, Assembler::NE);
3669   __ leave();
3670   __ ret(lr);
3671 
3672   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3673   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3674 
3675   has_negatives_long = __ pc(); // 2nd entry point
3676 
3677   __ enter();
3678 
3679   __ bind(LEN_OVER_15);
3680     __ push(spilled_regs, sp);
3681     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3682     __ cbz(rscratch2, ALIGNED);
3683     __ ldp(tmp6, tmp1, Address(ary1));
3684     __ mov(tmp5, 16);
3685     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the address is aligned
3686     __ add(ary1, ary1, rscratch1);
3687     __ sub(len, len, rscratch1);
3688     __ orr(tmp6, tmp6, tmp1);
3689     __ tst(tmp6, UPPER_BIT_MASK);
3690     __ br(Assembler::NE, RET_TRUE);
3691 
3692   __ bind(ALIGNED);
3693     __ cmp(len, large_loop_size);
3694     __ br(Assembler::LT, CHECK_16);
3695     // Perform a 16-byte load as an early-return pre-loop check to handle the case
3696     // where an initially aligned large array has negative values in its first bytes;
3697     // otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst case, which is
3698     // slower. Cases with negative bytes further ahead are not affected much and, in
3699     // fact, run faster thanks to the early loads, fewer instructions and fewer
3700     // branches in LARGE_LOOP.
3701     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3702     __ sub(len, len, 16);
3703     __ orr(tmp6, tmp6, tmp1);
3704     __ tst(tmp6, UPPER_BIT_MASK);
3705     __ br(Assembler::NE, RET_TRUE);
3706     __ cmp(len, large_loop_size);
3707     __ br(Assembler::LT, CHECK_16);
3708 
3709     if (SoftwarePrefetchHintDistance >= 0
3710         && SoftwarePrefetchHintDistance >= dcache_line) {
3711       // initial prefetch
3712       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3713     }
3714   __ bind(LARGE_LOOP);
3715     if (SoftwarePrefetchHintDistance >= 0) {
3716       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3717     }
3718     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
3719     // Also, instead of 4 triples of "orr(...); tst(...); br(...)" (one per ldp), it is
3720     // better to generate 7 * orr(...) + 1 tst(...) + 1 br(...), which saves 3
3721     // instructions per iteration and has fewer branches; however, this approach
3722     // disables the early return, so all 64 bytes are loaded and checked every time.
3723     __ ldp(tmp2, tmp3, Address(ary1));
3724     __ ldp(tmp4, tmp5, Address(ary1, 16));
3725     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3726     __ ldp(tmp6, tmp1, Address(ary1, 48));
3727     __ add(ary1, ary1, large_loop_size);
3728     __ sub(len, len, large_loop_size);
3729     __ orr(tmp2, tmp2, tmp3);
3730     __ orr(tmp4, tmp4, tmp5);
3731     __ orr(rscratch1, rscratch1, rscratch2);
3732     __ orr(tmp6, tmp6, tmp1);
3733     __ orr(tmp2, tmp2, tmp4);
3734     __ orr(rscratch1, rscratch1, tmp6);
3735     __ orr(tmp2, tmp2, rscratch1);
3736     __ tst(tmp2, UPPER_BIT_MASK);
3737     __ br(Assembler::NE, RET_TRUE);
3738     __ cmp(len, large_loop_size);
3739     __ br(Assembler::GE, LARGE_LOOP);
3740 
3741   __ bind(CHECK_16); // small 16-byte load pre-loop
3742     __ cmp(len, (u1)16);
3743     __ br(Assembler::LT, POST_LOOP16);
3744 
3745   __ bind(LOOP16); // small 16-byte load loop
3746     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3747     __ sub(len, len, 16);
3748     __ orr(tmp2, tmp2, tmp3);
3749     __ tst(tmp2, UPPER_BIT_MASK);
3750     __ br(Assembler::NE, RET_TRUE);
3751     __ cmp(len, (u1)16);
3752     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3753 
3754   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3755     __ cmp(len, (u1)8);
3756     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3757     __ ldr(tmp3, Address(__ post(ary1, 8)));
3758     __ sub(len, len, 8);
3759     __ tst(tmp3, UPPER_BIT_MASK);
3760     __ br(Assembler::NE, RET_TRUE);
3761 
3762   __ bind(POST_LOOP16_LOAD_TAIL);
3763     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3764     __ ldr(tmp1, Address(ary1));
3765     __ mov(tmp2, 64);
3766     __ sub(tmp4, tmp2, len, __ LSL, 3);
3767     __ lslv(tmp1, tmp1, tmp4);
3768     __ tst(tmp1, UPPER_BIT_MASK);
3769     __ br(Assembler::NE, RET_TRUE);
3770     // Fallthrough
3771 
3772   __ bind(RET_FALSE);
3773     __ pop(spilled_regs, sp);
3774     __ leave();
3775     __ mov(result, zr);
3776     __ ret(lr);
3777 
3778   __ bind(RET_TRUE);
3779     __ pop(spilled_regs, sp);
3780   __ bind(RET_TRUE_NO_POP);
3781     __ leave();
3782     __ mov(result, 1);
3783     __ ret(lr);
3784 
3785   __ bind(DONE);
3786     __ pop(spilled_regs, sp);
3787     __ leave();
3788     __ ret(lr);
3789     return entry;
3790   }
3791 
3792   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3793         bool usePrefetch, Label &NOT_EQUAL) {
3794     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3795         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3796         tmp7 = r12, tmp8 = r13;
3797     Label LOOP;
3798 
3799     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3800     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3801     __ bind(LOOP);
3802     if (usePrefetch) {
3803       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3804       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3805     }
3806     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3807     __ eor(tmp1, tmp1, tmp2);
3808     __ eor(tmp3, tmp3, tmp4);
3809     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3810     __ orr(tmp1, tmp1, tmp3);
3811     __ cbnz(tmp1, NOT_EQUAL);
3812     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3813     __ eor(tmp5, tmp5, tmp6);
3814     __ eor(tmp7, tmp7, tmp8);
3815     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3816     __ orr(tmp5, tmp5, tmp7);
3817     __ cbnz(tmp5, NOT_EQUAL);
3818     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3819     __ eor(tmp1, tmp1, tmp2);
3820     __ eor(tmp3, tmp3, tmp4);
3821     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3822     __ orr(tmp1, tmp1, tmp3);
3823     __ cbnz(tmp1, NOT_EQUAL);
3824     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3825     __ eor(tmp5, tmp5, tmp6);
3826     __ sub(cnt1, cnt1, 8 * wordSize);
3827     __ eor(tmp7, tmp7, tmp8);
3828     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3829     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3830     // cmp) because subs allows an unrestricted range of immediate operands.
3831     __ subs(tmp6, cnt1, loopThreshold);
3832     __ orr(tmp5, tmp5, tmp7);
3833     __ cbnz(tmp5, NOT_EQUAL);
3834     __ br(__ GE, LOOP);
3835     // post-loop
3836     __ eor(tmp1, tmp1, tmp2);
3837     __ eor(tmp3, tmp3, tmp4);
3838     __ orr(tmp1, tmp1, tmp3);
3839     __ sub(cnt1, cnt1, 2 * wordSize);
3840     __ cbnz(tmp1, NOT_EQUAL);
3841   }
3842 
3843   void generate_large_array_equals_loop_simd(int loopThreshold,
3844         bool usePrefetch, Label &NOT_EQUAL) {
3845     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3846         tmp2 = rscratch2;
3847     Label LOOP;
3848 
3849     __ bind(LOOP);
3850     if (usePrefetch) {
3851       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3852       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3853     }
3854     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3855     __ sub(cnt1, cnt1, 8 * wordSize);
3856     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3857     __ subs(tmp1, cnt1, loopThreshold);
3858     __ eor(v0, __ T16B, v0, v4);
3859     __ eor(v1, __ T16B, v1, v5);
3860     __ eor(v2, __ T16B, v2, v6);
3861     __ eor(v3, __ T16B, v3, v7);
3862     __ orr(v0, __ T16B, v0, v1);
3863     __ orr(v1, __ T16B, v2, v3);
3864     __ orr(v0, __ T16B, v0, v1);
3865     __ umov(tmp1, v0, __ D, 0);
3866     __ umov(tmp2, v0, __ D, 1);
3867     __ orr(tmp1, tmp1, tmp2);
3868     __ cbnz(tmp1, NOT_EQUAL);
3869     __ br(__ GE, LOOP);
3870   }
3871 
3872   // a1 = r1 - array1 address
3873   // a2 = r2 - array2 address
3874   // result = r0 - return value. Already contains "false"
3875   // cnt1 = r10 - number of elements left to check, reduced by wordSize
3876   // r3-r5 are reserved temporary registers
3877   address generate_large_array_equals() {
3878     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3879         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3880         tmp7 = r12, tmp8 = r13;
3881     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3882         SMALL_LOOP, POST_LOOP;
3883     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3884     // only use the prefetching loop when at least 32 of the prefetched bytes will be used
3885     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3886     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3887     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3888     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3889         tmp5, tmp6, tmp7, tmp8);
3890 
3891     __ align(CodeEntryAlignment);
3892 
3893     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3894 
3895     address entry = __ pc();
3896     __ enter();
3897     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3898     // also advance pointers to use post-increment instead of pre-increment
3899     __ add(a1, a1, wordSize);
3900     __ add(a2, a2, wordSize);
3901     if (AvoidUnalignedAccesses) {
3902       // Both implementations (SIMD/non-SIMD) use relatively large load
3903       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
3904       // time) on some CPUs when the address is not at least 16-byte aligned.
3905       // Arrays are currently 8-byte aligned, so, if needed, do one additional
3906       // 8-byte load for the first address to make it 16-byte aligned.
3907       Label ALIGNED16;
3908       __ tbz(a1, 3, ALIGNED16);
3909       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3910       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3911       __ sub(cnt1, cnt1, wordSize);
3912       __ eor(tmp1, tmp1, tmp2);
3913       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3914       __ bind(ALIGNED16);
3915     }
3916     if (UseSIMDForArrayEquals) {
3917       if (SoftwarePrefetchHintDistance >= 0) {
3918         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3919         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3920         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3921             /* prfm = */ true, NOT_EQUAL);
3922         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3923         __ br(__ LT, TAIL);
3924       }
3925       __ bind(NO_PREFETCH_LARGE_LOOP);
3926       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3927           /* prfm = */ false, NOT_EQUAL);
3928     } else {
3929       __ push(spilled_regs, sp);
3930       if (SoftwarePrefetchHintDistance >= 0) {
3931         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3932         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3933         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3934             /* prfm = */ true, NOT_EQUAL);
3935         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3936         __ br(__ LT, TAIL);
3937       }
3938       __ bind(NO_PREFETCH_LARGE_LOOP);
3939       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3940           /* prfm = */ false, NOT_EQUAL);
3941     }
3942     __ bind(TAIL);
3943       __ cbz(cnt1, EQUAL);
3944       __ subs(cnt1, cnt1, wordSize);
3945       __ br(__ LE, POST_LOOP);
3946     __ bind(SMALL_LOOP);
3947       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3948       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3949       __ subs(cnt1, cnt1, wordSize);
3950       __ eor(tmp1, tmp1, tmp2);
3951       __ cbnz(tmp1, NOT_EQUAL);
3952       __ br(__ GT, SMALL_LOOP);
3953     __ bind(POST_LOOP);
3954       __ ldr(tmp1, Address(a1, cnt1));
3955       __ ldr(tmp2, Address(a2, cnt1));
3956       __ eor(tmp1, tmp1, tmp2);
3957       __ cbnz(tmp1, NOT_EQUAL);
3958     __ bind(EQUAL);
3959       __ mov(result, true);
3960     __ bind(NOT_EQUAL);
3961       if (!UseSIMDForArrayEquals) {
3962         __ pop(spilled_regs, sp);
3963       }
3964     __ bind(NOT_EQUAL_NO_POP);
3965     __ leave();
3966     __ ret(lr);
3967     return entry;
3968   }
3969 
3970   address generate_dsin_dcos(bool isCos) {
3971     __ align(CodeEntryAlignment);
3972     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3973     address start = __ pc();
3974     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3975         (address)StubRoutines::aarch64::_two_over_pi,
3976         (address)StubRoutines::aarch64::_pio2,
3977         (address)StubRoutines::aarch64::_dsin_coef,
3978         (address)StubRoutines::aarch64::_dcos_coef);
3979     return start;
3980   }
3981 
3982   address generate_dlog() {
3983     __ align(CodeEntryAlignment);
3984     StubCodeMark mark(this, "StubRoutines", "dlog");
3985     address entry = __ pc();
3986     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3987         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3988     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3989     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3990         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3991     return entry;
3992   }
3993 
3994   // code for comparing 16 bytes of strings with same encoding
3995   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3996     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3997     __ ldr(rscratch1, Address(__ post(str1, 8)));
3998     __ eor(rscratch2, tmp1, tmp2);
3999     __ ldr(cnt1, Address(__ post(str2, 8)));
4000     __ cbnz(rscratch2, DIFF1);
4001     __ ldr(tmp1, Address(__ post(str1, 8)));
4002     __ eor(rscratch2, rscratch1, cnt1);
4003     __ ldr(tmp2, Address(__ post(str2, 8)));
4004     __ cbnz(rscratch2, DIFF2);
4005   }
4006 
4007   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4008   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4009       Label &DIFF2) {
4010     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4011     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4012 
4013     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4014     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4015     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4016     // vtmp3 now holds the first 8 Latin1 chars widened to UTF-16 (the upper half of vtmp is widened later with zip2)
4017 
4018     __ fmovd(tmpL, vtmp3);
4019     __ eor(rscratch2, tmp3, tmpL);
4020     __ cbnz(rscratch2, DIFF2);
4021 
4022     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4023     __ umov(tmpL, vtmp3, __ D, 1);
4024     __ eor(rscratch2, tmpU, tmpL);
4025     __ cbnz(rscratch2, DIFF1);
4026 
4027     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4028     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4029     __ fmovd(tmpL, vtmp);
4030     __ eor(rscratch2, tmp3, tmpL);
4031     __ cbnz(rscratch2, DIFF2);
4032 
4033     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4034     __ umov(tmpL, vtmp, __ D, 1);
4035     __ eor(rscratch2, tmpU, tmpL);
4036     __ cbnz(rscratch2, DIFF1);
4037   }
4038 
4039   // r0  = result
4040   // r1  = str1
4041   // r2  = cnt1
4042   // r3  = str2
4043   // r4  = cnt2
4044   // r10 = tmp1
4045   // r11 = tmp2
4046   address generate_compare_long_string_different_encoding(bool isLU) {
4047     __ align(CodeEntryAlignment);
4048     StubCodeMark mark(this, "StubRoutines", isLU
4049         ? "compare_long_string_different_encoding LU"
4050         : "compare_long_string_different_encoding UL");
4051     address entry = __ pc();
4052     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4053         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4054         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4055     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4056         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4057     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4058     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4059 
4060     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4061 
4062     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4063     // cnt2 == number of characters left to compare
4064     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4065     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4066     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4067     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4068     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4069     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4070     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4071     __ eor(rscratch2, tmp1, tmp2);
4072     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4073     __ mov(rscratch1, tmp2);
4074     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4075     Register strU = isLU ? str2 : str1,
4076              strL = isLU ? str1 : str2,
4077              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4078              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4079     __ push(spilled_regs, sp);
4080     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4081     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4082 
4083     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4084 
4085     if (SoftwarePrefetchHintDistance >= 0) {
4086       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4087       __ br(__ LT, NO_PREFETCH);
4088       __ bind(LARGE_LOOP_PREFETCH);
4089         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4090         __ mov(tmp4, 2);
4091         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4092         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4093           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4094           __ subs(tmp4, tmp4, 1);
4095           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4096           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4097           __ mov(tmp4, 2);
4098         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4099           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4100           __ subs(tmp4, tmp4, 1);
4101           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4102           __ sub(cnt2, cnt2, 64);
4103           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4104           __ br(__ GE, LARGE_LOOP_PREFETCH);
4105     }
4106     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4107     __ bind(NO_PREFETCH);
4108     __ subs(cnt2, cnt2, 16);
4109     __ br(__ LT, TAIL);
4110     __ bind(SMALL_LOOP); // smaller loop
4111       __ subs(cnt2, cnt2, 16);
4112       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4113       __ br(__ GE, SMALL_LOOP);
4114       __ cmn(cnt2, (u1)16);
4115       __ br(__ EQ, LOAD_LAST);
4116     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4117       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string
4118       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4119       __ ldr(tmp3, Address(cnt1, -8));
4120       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4121       __ b(LOAD_LAST);
4122     __ bind(DIFF2);
4123       __ mov(tmpU, tmp3);
4124     __ bind(DIFF1);
4125       __ pop(spilled_regs, sp);
4126       __ b(CALCULATE_DIFFERENCE);
4127     __ bind(LOAD_LAST);
4128       // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
4129       // No need to load them again
4130       __ mov(tmpU, tmp3);
4131       __ pop(spilled_regs, sp);
4132 
4133       __ ldrs(vtmp, Address(strL));
4134       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4135       __ fmovd(tmpL, vtmp);
4136 
4137       __ eor(rscratch2, tmpU, tmpL);
4138       __ cbz(rscratch2, DONE);
4139 
4140     // Find the first different characters in the longwords and
4141     // compute their difference.
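    // For reference (descriptive only): rscratch2 is the XOR of the two 8-byte
    // chunks, so its lowest set bit marks the first difference in memory order.
    // rev + clz therefore yields 8 * (byte index of that difference); rounding
    // down to a multiple of 16 and shifting both chunks right by that amount
    // brings the first differing UTF-16 characters into the low 16 bits, where
    // uxthw extracts them for the subtraction.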
4142     __ bind(CALCULATE_DIFFERENCE);
4143       __ rev(rscratch2, rscratch2);
4144       __ clz(rscratch2, rscratch2);
4145       __ andr(rscratch2, rscratch2, -16);
4146       __ lsrv(tmp1, tmp1, rscratch2);
4147       __ uxthw(tmp1, tmp1);
4148       __ lsrv(rscratch1, rscratch1, rscratch2);
4149       __ uxthw(rscratch1, rscratch1);
4150       __ subw(result, tmp1, rscratch1);
4151     __ bind(DONE);
4152       __ ret(lr);
4153     return entry;
4154   }
4155 
4156   // r0  = result
4157   // r1  = str1
4158   // r2  = cnt1
4159   // r3  = str2
4160   // r4  = cnt2
4161   // r10 = tmp1
4162   // r11 = tmp2
4163   address generate_compare_long_string_same_encoding(bool isLL) {
4164     __ align(CodeEntryAlignment);
4165     StubCodeMark mark(this, "StubRoutines", isLL
4166         ? "compare_long_string_same_encoding LL"
4167         : "compare_long_string_same_encoding UU");
4168     address entry = __ pc();
4169     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4170         tmp1 = r10, tmp2 = r11;
4171     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4172         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4173         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4174     // exit the large loop when fewer than 64 bytes are left to read or we are
4175     // about to prefetch memory beyond the array boundary
4176     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4177     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4178     // Update the cnt2 counter for the 8 bytes that were already loaded.
4179     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4180     // update pointers, because of previous read
4181     __ add(str1, str1, wordSize);
4182     __ add(str2, str2, wordSize);
4183     if (SoftwarePrefetchHintDistance >= 0) {
4184       __ bind(LARGE_LOOP_PREFETCH);
4185         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4186         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4187         compare_string_16_bytes_same(DIFF, DIFF2);
4188         compare_string_16_bytes_same(DIFF, DIFF2);
4189         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4190         compare_string_16_bytes_same(DIFF, DIFF2);
4191         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4192         compare_string_16_bytes_same(DIFF, DIFF2);
4193         __ br(__ GT, LARGE_LOOP_PREFETCH);
4194         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4195     }
4196     // less than 16 bytes left?
4197     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4198     __ br(__ LT, TAIL);
4199     __ bind(SMALL_LOOP);
4200       compare_string_16_bytes_same(DIFF, DIFF2);
4201       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4202       __ br(__ GE, SMALL_LOOP);
4203     __ bind(TAIL);
4204       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4205       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4206       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4207       __ br(__ LE, CHECK_LAST);
4208       __ eor(rscratch2, tmp1, tmp2);
4209       __ cbnz(rscratch2, DIFF);
4210       __ ldr(tmp1, Address(__ post(str1, 8)));
4211       __ ldr(tmp2, Address(__ post(str2, 8)));
4212       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4213     __ bind(CHECK_LAST);
4214       if (!isLL) {
4215         __ add(cnt2, cnt2, cnt2); // now in bytes
4216       }
4217       __ eor(rscratch2, tmp1, tmp2);
4218       __ cbnz(rscratch2, DIFF);
4219       __ ldr(rscratch1, Address(str1, cnt2));
4220       __ ldr(cnt1, Address(str2, cnt2));
4221       __ eor(rscratch2, rscratch1, cnt1);
4222       __ cbz(rscratch2, LENGTH_DIFF);
4223       // Find the first different characters in the longwords and
4224       // compute their difference.
4225     __ bind(DIFF2);
4226       __ rev(rscratch2, rscratch2);
4227       __ clz(rscratch2, rscratch2);
4228       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4229       __ lsrv(rscratch1, rscratch1, rscratch2);
4230       if (isLL) {
4231         __ lsrv(cnt1, cnt1, rscratch2);
4232         __ uxtbw(rscratch1, rscratch1);
4233         __ uxtbw(cnt1, cnt1);
4234       } else {
4235         __ lsrv(cnt1, cnt1, rscratch2);
4236         __ uxthw(rscratch1, rscratch1);
4237         __ uxthw(cnt1, cnt1);
4238       }
4239       __ subw(result, rscratch1, cnt1);
4240       __ b(LENGTH_DIFF);
4241     __ bind(DIFF);
4242       __ rev(rscratch2, rscratch2);
4243       __ clz(rscratch2, rscratch2);
4244       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4245       __ lsrv(tmp1, tmp1, rscratch2);
4246       if (isLL) {
4247         __ lsrv(tmp2, tmp2, rscratch2);
4248         __ uxtbw(tmp1, tmp1);
4249         __ uxtbw(tmp2, tmp2);
4250       } else {
4251         __ lsrv(tmp2, tmp2, rscratch2);
4252         __ uxthw(tmp1, tmp1);
4253         __ uxthw(tmp2, tmp2);
4254       }
4255       __ subw(result, tmp1, tmp2);
4256       __ b(LENGTH_DIFF);
4257     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4258       __ eor(rscratch2, tmp1, tmp2);
4259       __ cbnz(rscratch2, DIFF);
4260     __ bind(LENGTH_DIFF);
4261       __ ret(lr);
4262     return entry;
4263   }
4264 
4265   void generate_compare_long_strings() {
4266       StubRoutines::aarch64::_compare_long_string_LL
4267           = generate_compare_long_string_same_encoding(true);
4268       StubRoutines::aarch64::_compare_long_string_UU
4269           = generate_compare_long_string_same_encoding(false);
4270       StubRoutines::aarch64::_compare_long_string_LU
4271           = generate_compare_long_string_different_encoding(true);
4272       StubRoutines::aarch64::_compare_long_string_UL
4273           = generate_compare_long_string_different_encoding(false);
4274   }
4275 
4276   // R0 = result
4277   // R1 = str2
4278   // R2 = cnt1
4279   // R3 = str1
4280   // R4 = cnt2
4281   // This generic linear code uses a few additional ideas that make it faster:
4282   // 1) we can safely keep at least the 1st register of the pattern (since
4283   // length >= 8), skipping the initial load (this helps on systems with a
4284   // single load pipeline)
4285   // 2) we can use a "fast" algorithm to find the first character, with fewer
4286   // branches (1 branch per loaded register instead of one per character);
4287   // this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f
4288   // and 0x7fff7fff...7fff come from (see the C sketch below)
4289   // 3) after loading and analyzing the 1st register of the source string, it
4290   // can be reused to find every occurrence of the 1st character, saving a few
4291   // loads compared with a simpler-but-slower implementation
4292   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
4293   // re-initializes and compresses register values, making it larger and less
4294   // readable; most extra work overlaps loads or branches, so the penalty is minimal
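  //
  // A rough C sketch (illustrative only) of the SWAR trick used below to find
  // candidate occurrences of the pattern's first character in a whole register:
  //
  //   // 'first' = 1st pattern char replicated into every byte (LL) or every
  //   // 16-bit lane (UU/UL); 'chunk' = 8 bytes loaded from str2.
  //   uint64_t x = chunk ^ first;                       // matching lanes -> 0
  //   uint64_t t = (x - 0x0101010101010101UL) & ~(x | 0x7f7f7f7f7f7f7f7fUL);
  //   // 't' is non-zero iff the chunk contains the first character; the lowest
  //   // set bit gives the first candidate position, which the code below then
  //   // verifies with a full compare. For UU/UL the 0x0001000100010001 and
  //   // 0x7fff7fff7fff7fff constants are used instead.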
4295   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4296     const char* stubName = str1_isL
4297         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4298         : "indexof_linear_uu";
4299     __ align(CodeEntryAlignment);
4300     StubCodeMark mark(this, "StubRoutines", stubName);
4301     address entry = __ pc();
4302 
4303     int str1_chr_size = str1_isL ? 1 : 2;
4304     int str2_chr_size = str2_isL ? 1 : 2;
4305     int str1_chr_shift = str1_isL ? 0 : 1;
4306     int str2_chr_shift = str2_isL ? 0 : 1;
4307     bool isL = str1_isL && str2_isL;
4308     // parameters
4309     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4310     // temporary registers
4311     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4312     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4313     // redefinitions
4314     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4315 
4316     __ push(spilled_regs, sp);
4317     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4318         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4319         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4320         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4321         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4322         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4323     // Read whole register from str1. It is safe, because length >=8 here
4324     __ ldr(ch1, Address(str1));
4325     // Read whole register from str2. It is safe, because length >=8 here
4326     __ ldr(ch2, Address(str2));
4327     __ sub(cnt2, cnt2, cnt1);
4328     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4329     if (str1_isL != str2_isL) {
4330       __ eor(v0, __ T16B, v0, v0);
4331     }
4332     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4333     __ mul(first, first, tmp1);
4334     // check if we have less than 1 register's worth of characters to check
4335     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4336     if (str1_isL != str2_isL) {
4337       __ fmovd(v1, ch1);
4338     }
4339     __ br(__ LE, L_SMALL);
4340     __ eor(ch2, first, ch2);
4341     if (str1_isL != str2_isL) {
4342       __ zip1(v1, __ T16B, v1, v0);
4343     }
4344     __ sub(tmp2, ch2, tmp1);
4345     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4346     __ bics(tmp2, tmp2, ch2);
4347     if (str1_isL != str2_isL) {
4348       __ fmovd(ch1, v1);
4349     }
4350     __ br(__ NE, L_HAS_ZERO);
4351     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4352     __ add(result, result, wordSize/str2_chr_size);
4353     __ add(str2, str2, wordSize);
4354     __ br(__ LT, L_POST_LOOP);
4355     __ BIND(L_LOOP);
4356       __ ldr(ch2, Address(str2));
4357       __ eor(ch2, first, ch2);
4358       __ sub(tmp2, ch2, tmp1);
4359       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4360       __ bics(tmp2, tmp2, ch2);
4361       __ br(__ NE, L_HAS_ZERO);
4362     __ BIND(L_LOOP_PROCEED);
4363       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4364       __ add(str2, str2, wordSize);
4365       __ add(result, result, wordSize/str2_chr_size);
4366       __ br(__ GE, L_LOOP);
4367     __ BIND(L_POST_LOOP);
4368       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4369       __ br(__ LE, NOMATCH);
4370       __ ldr(ch2, Address(str2));
4371       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4372       __ eor(ch2, first, ch2);
4373       __ sub(tmp2, ch2, tmp1);
4374       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4375       __ mov(tmp4, -1); // all bits set
4376       __ b(L_SMALL_PROCEED);
4377     __ align(OptoLoopAlignment);
4378     __ BIND(L_SMALL);
4379       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4380       __ eor(ch2, first, ch2);
4381       if (str1_isL != str2_isL) {
4382         __ zip1(v1, __ T16B, v1, v0);
4383       }
4384       __ sub(tmp2, ch2, tmp1);
4385       __ mov(tmp4, -1); // all bits set
4386       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4387       if (str1_isL != str2_isL) {
4388         __ fmovd(ch1, v1); // move converted 4 symbols
4389       }
4390     __ BIND(L_SMALL_PROCEED);
4391       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4392       __ bic(tmp2, tmp2, ch2);
4393       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4394       __ rbit(tmp2, tmp2);
4395       __ br(__ EQ, NOMATCH);
4396     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4397       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4398       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4399       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4400       if (str2_isL) { // LL
4401         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4402         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4403         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4404         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4405         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4406       } else {
4407         __ mov(ch2, 0xE); // all bits in byte set except last one
4408         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4409         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4410         __ lslv(tmp2, tmp2, tmp4);
4411         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4412         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4413         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4414         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4415       }
4416       __ cmp(ch1, ch2);
4417       __ mov(tmp4, wordSize/str2_chr_size);
4418       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4419     __ BIND(L_SMALL_CMP_LOOP);
4420       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4421                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4422       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4423                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4424       __ add(tmp4, tmp4, 1);
4425       __ cmp(tmp4, cnt1);
4426       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4427       __ cmp(first, ch2);
4428       __ br(__ EQ, L_SMALL_CMP_LOOP);
4429     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4430       __ cbz(tmp2, NOMATCH); // no more matches. exit
4431       __ clz(tmp4, tmp2);
4432       __ add(result, result, 1); // advance index
4433       __ add(str2, str2, str2_chr_size); // advance pointer
4434       __ b(L_SMALL_HAS_ZERO_LOOP);
4435     __ align(OptoLoopAlignment);
4436     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4437       __ cmp(first, ch2);
4438       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4439       __ b(DONE);
4440     __ align(OptoLoopAlignment);
4441     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4442       if (str2_isL) { // LL
4443         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4444         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4445         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4446         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4447         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4448       } else {
4449         __ mov(ch2, 0xE); // all bits in byte set except last one
4450         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4451         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4452         __ lslv(tmp2, tmp2, tmp4);
4453         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4454         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4455         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4456         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4457       }
4458       __ cmp(ch1, ch2);
4459       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4460       __ b(DONE);
4461     __ align(OptoLoopAlignment);
4462     __ BIND(L_HAS_ZERO);
4463       __ rbit(tmp2, tmp2);
4464       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4465       // Now, compress the counters (cnt2 and cnt1) into one register.
4466       // This is fine because both counters are 32-bit and are not changed in
4467       // this loop; they are restored on exit, so cnt1 can be re-used here.
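      // i.e., cnt2 := (cnt1 << 32) | cnt2; cnt1 is later recovered with a
      // 32-bit right shift and cnt2 with a 32-bit move (see
      // L_HAS_ZERO_LOOP_NOMATCH below).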
4468       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4469       __ sub(result, result, 1);
4470     __ BIND(L_HAS_ZERO_LOOP);
4471       __ mov(cnt1, wordSize/str2_chr_size);
4472       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4473       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4474       if (str2_isL) {
4475         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4476         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4477         __ lslv(tmp2, tmp2, tmp4);
4478         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4479         __ add(tmp4, tmp4, 1);
4480         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4481         __ lsl(tmp2, tmp2, 1);
4482         __ mov(tmp4, wordSize/str2_chr_size);
4483       } else {
4484         __ mov(ch2, 0xE);
4485         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4486         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4487         __ lslv(tmp2, tmp2, tmp4);
4488         __ add(tmp4, tmp4, 1);
4489         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4490         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4491         __ lsl(tmp2, tmp2, 1);
4492         __ mov(tmp4, wordSize/str2_chr_size);
4493         __ sub(str2, str2, str2_chr_size);
4494       }
4495       __ cmp(ch1, ch2);
4496       __ mov(tmp4, wordSize/str2_chr_size);
4497       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4498     __ BIND(L_CMP_LOOP);
4499       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4500                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4501       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4502                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4503       __ add(tmp4, tmp4, 1);
4504       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4505       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4506       __ cmp(cnt1, ch2);
4507       __ br(__ EQ, L_CMP_LOOP);
4508     __ BIND(L_CMP_LOOP_NOMATCH);
4509       // no match at this candidate position
4510       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4511       __ clz(tmp4, tmp2);
4512       __ add(str2, str2, str2_chr_size); // advance pointer
4513       __ b(L_HAS_ZERO_LOOP);
4514     __ align(OptoLoopAlignment);
4515     __ BIND(L_CMP_LOOP_LAST_CMP);
4516       __ cmp(cnt1, ch2);
4517       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4518       __ b(DONE);
4519     __ align(OptoLoopAlignment);
4520     __ BIND(L_CMP_LOOP_LAST_CMP2);
4521       if (str2_isL) {
4522         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4523         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4524         __ lslv(tmp2, tmp2, tmp4);
4525         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4526         __ add(tmp4, tmp4, 1);
4527         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4528         __ lsl(tmp2, tmp2, 1);
4529       } else {
4530         __ mov(ch2, 0xE);
4531         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4532         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4533         __ lslv(tmp2, tmp2, tmp4);
4534         __ add(tmp4, tmp4, 1);
4535         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4536         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4537         __ lsl(tmp2, tmp2, 1);
4538         __ sub(str2, str2, str2_chr_size);
4539       }
4540       __ cmp(ch1, ch2);
4541       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4542       __ b(DONE);
4543     __ align(OptoLoopAlignment);
4544     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4545       // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
4546       // until the L_HAS_ZERO block. A byte octet was analyzed in L_HAS_ZERO_LOOP,
4547       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4548       // respective higher bits were not changed. L_LOOP_PROCEED will increase
4549       // result by the number of analyzed characters, so we can simply clear the
4550       // lower bits of result here (2 lower bits for UU/UL, 3 bits for LL).
4551       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4552       // 3) Advance str2 to the next str2 octet: result & 7 (LL) or & 3 (UU/UL)
4553       // is the index of the last analyzed substring inside the current octet,
4554       // so str2 is at its start address and must be advanced to the next octet.
4555       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4556       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4557       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4558       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4559       __ movw(cnt2, cnt2);
4560       __ b(L_LOOP_PROCEED);
4561     __ align(OptoLoopAlignment);
4562     __ BIND(NOMATCH);
4563       __ mov(result, -1);
4564     __ BIND(DONE);
4565       __ pop(spilled_regs, sp);
4566       __ ret(lr);
4567     return entry;
4568   }
4569 
4570   void generate_string_indexof_stubs() {
4571     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4572     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4573     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4574   }
4575 
4576   void inflate_and_store_2_fp_registers(bool generatePrfm,
4577       FloatRegister src1, FloatRegister src2) {
4578     Register dst = r1;
4579     __ zip1(v1, __ T16B, src1, v0);
4580     __ zip2(v2, __ T16B, src1, v0);
4581     if (generatePrfm) {
4582       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4583     }
4584     __ zip1(v3, __ T16B, src2, v0);
4585     __ zip2(v4, __ T16B, src2, v0);
4586     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4587   }
4588 
4589   // R0 = src
4590   // R1 = dst
4591   // R2 = len
4592   // R3 = len >> 3
4593   // V0 = 0
4594   // v1 = loaded 8 bytes
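  //
  // In C, the inflation performed here is approximately (a sketch; the stub
  // itself works 64 bytes per iteration with NEON zip1/zip2 and prefetching):
  //
  //   void inflate(const unsigned char* src, unsigned short* dst, size_t len) {
  //     for (size_t i = 0; i < len; i++)
  //       dst[i] = src[i];   // zero-extend each Latin-1 byte to a UTF-16 char
  //   }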
4595   address generate_large_byte_array_inflate() {
4596     __ align(CodeEntryAlignment);
4597     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4598     address entry = __ pc();
4599     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4600     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4601     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4602 
4603     // Do one more 8-byte read so that the address is 16-byte aligned in most
4604     // cases; this also lets us use a single store instruction.
4605     __ ldrd(v2, __ post(src, 8));
4606     __ sub(octetCounter, octetCounter, 2);
4607     __ zip1(v1, __ T16B, v1, v0);
4608     __ zip1(v2, __ T16B, v2, v0);
4609     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4610     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4611     __ subs(rscratch1, octetCounter, large_loop_threshold);
4612     __ br(__ LE, LOOP_START);
4613     __ b(LOOP_PRFM_START);
4614     __ bind(LOOP_PRFM);
4615       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4616     __ bind(LOOP_PRFM_START);
4617       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4618       __ sub(octetCounter, octetCounter, 8);
4619       __ subs(rscratch1, octetCounter, large_loop_threshold);
4620       inflate_and_store_2_fp_registers(true, v3, v4);
4621       inflate_and_store_2_fp_registers(true, v5, v6);
4622       __ br(__ GT, LOOP_PRFM);
4623       __ cmp(octetCounter, (u1)8);
4624       __ br(__ LT, DONE);
4625     __ bind(LOOP);
4626       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4627       __ bind(LOOP_START);
4628       __ sub(octetCounter, octetCounter, 8);
4629       __ cmp(octetCounter, (u1)8);
4630       inflate_and_store_2_fp_registers(false, v3, v4);
4631       inflate_and_store_2_fp_registers(false, v5, v6);
4632       __ br(__ GE, LOOP);
4633     __ bind(DONE);
4634       __ ret(lr);
4635     return entry;
4636   }
4637 
4638   /**
4639    *  Arguments:
4640    *
4641    *  Input:
4642    *  c_rarg0   - current state address
4643    *  c_rarg1   - H key address
4644    *  c_rarg2   - data address
4645    *  c_rarg3   - number of blocks
4646    *
4647    *  Output:
4648    *  Updated state at c_rarg0
4649    */
4650   address generate_ghash_processBlocks() {
4651     // Bafflingly, GCM uses little-endian for the byte order, but
4652     // big-endian for the bit order.  For example, the polynomial 1 is
4653     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4654     //
4655     // So, we must either reverse the bytes in each word and do
4656     // everything big-endian or reverse the bits in each byte and do
4657     // it little-endian.  On AArch64 it's more idiomatic to reverse
4658     // the bits in each byte (we have an instruction, RBIT, to do
4659     // that) and keep the data in little-endian bit order throughout the
4660     // calculation, bit-reversing the inputs and outputs.
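    //
    // In C-like pseudocode, the loop below computes approximately (a sketch;
    // gf128_mul is an illustrative name for the carry-less multiplication done
    // by ghash_multiply followed by the reduction in ghash_reduce):
    //
    //   for (int i = 0; i < blocks; i++)
    //     state = gf128_mul(state ^ block[i], subkeyH);   // in GF(2^128)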
4661 
4662     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4663     __ align(wordSize * 2);
4664     address p = __ pc();
4665     __ emit_int64(0x87);  // The low-order bits of the field
4666                           // polynomial (i.e. p = z^7+z^2+z+1)
4667                           // repeated in the low and high parts of a
4668                           // 128-bit vector
4669     __ emit_int64(0x87);
4670 
4671     __ align(CodeEntryAlignment);
4672     address start = __ pc();
4673 
4674     Register state   = c_rarg0;
4675     Register subkeyH = c_rarg1;
4676     Register data    = c_rarg2;
4677     Register blocks  = c_rarg3;
4678 
4679     FloatRegister vzr = v30;
4680     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4681 
4682     __ ldrq(v0, Address(state));
4683     __ ldrq(v1, Address(subkeyH));
4684 
4685     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4686     __ rbit(v0, __ T16B, v0);
4687     __ rev64(v1, __ T16B, v1);
4688     __ rbit(v1, __ T16B, v1);
4689 
4690     __ ldrq(v26, p);
4691 
4692     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4693     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4694 
4695     {
4696       Label L_ghash_loop;
4697       __ bind(L_ghash_loop);
4698 
4699       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4700                                                  // reversing each byte
4701       __ rbit(v2, __ T16B, v2);
4702       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4703 
4704       // Multiply state in v2 by subkey in v1
4705       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4706                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4707                      /*temps*/v6, v20, v18, v21);
4708       // Reduce v7:v5 by the field polynomial
4709       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4710 
4711       __ sub(blocks, blocks, 1);
4712       __ cbnz(blocks, L_ghash_loop);
4713     }
4714 
4715     // The bit-reversed result is at this point in v0
4716     __ rev64(v1, __ T16B, v0);
4717     __ rbit(v1, __ T16B, v1);
4718 
4719     __ st1(v1, __ T16B, state);
4720     __ ret(lr);
4721 
4722     return start;
4723   }
4724 
4725   // Continuation point for throwing of implicit exceptions that are
4726   // not handled in the current activation. Fabricates an exception
4727   // oop and initiates normal exception dispatching in this
4728   // frame. Since we need to preserve callee-saved values (currently
4729   // only for C2, but done for C1 as well) we need a callee-saved oop
4730   // map and therefore have to make these stubs into RuntimeStubs
4731   // rather than BufferBlobs.  If the compiler needs all registers to
4732   // be preserved between the fault point and the exception handler
4733   // then it must assume responsibility for that in
4734   // AbstractCompiler::continuation_for_implicit_null_exception or
4735   // continuation_for_implicit_division_by_zero_exception. All other
4736   // implicit exceptions (e.g., NullPointerException or
4737   // AbstractMethodError on entry) are either at call sites or
4738   // otherwise assume that stack unwinding will be initiated, so
4739   // caller saved registers were assumed volatile in the compiler.
4740 
4741 #undef __
4742 #define __ masm->
4743 
4744   address generate_throw_exception(const char* name,
4745                                    address runtime_entry,
4746                                    Register arg1 = noreg,
4747                                    Register arg2 = noreg) {
4748     // Information about frame layout at time of blocking runtime call.
4749     // Note that we only have to preserve callee-saved registers since
4750     // the compilers are responsible for supplying a continuation point
4751     // if they expect all registers to be preserved.
4752     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4753     enum layout {
4754       rfp_off = 0,
4755       rfp_off2,
4756       return_off,
4757       return_off2,
4758       framesize // inclusive of return address
4759     };
4760 
4761     int insts_size = 512;
4762     int locs_size  = 64;
4763 
4764     CodeBuffer code(name, insts_size, locs_size);
4765     OopMapSet* oop_maps  = new OopMapSet();
4766     MacroAssembler* masm = new MacroAssembler(&code);
4767 
4768     address start = __ pc();
4769 
4770     // This is an inlined and slightly modified version of call_VM
4771     // which has the ability to fetch the return PC out of
4772     // thread-local storage and also sets up last_Java_sp slightly
4773     // differently than the real call_VM
4774 
4775     __ enter(); // Save FP and LR before call
4776 
4777     assert(is_even(framesize/2), "sp not 16-byte aligned");
4778 
4779     // lr and fp are already in place
4780     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4781 
4782     int frame_complete = __ pc() - start;
4783 
4784     // Set up last_Java_sp and last_Java_fp
4785     address the_pc = __ pc();
4786     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4787 
4788     // Call runtime
4789     if (arg1 != noreg) {
4790       assert(arg2 != c_rarg1, "clobbered");
4791       __ mov(c_rarg1, arg1);
4792     }
4793     if (arg2 != noreg) {
4794       __ mov(c_rarg2, arg2);
4795     }
4796     __ mov(c_rarg0, rthread);
4797     BLOCK_COMMENT("call runtime_entry");
4798     __ mov(rscratch1, runtime_entry);
4799     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4800 
4801     // Generate oop map
4802     OopMap* map = new OopMap(framesize, 0);
4803 
4804     oop_maps->add_gc_map(the_pc - start, map);
4805 
4806     __ reset_last_Java_frame(true);
4807     __ maybe_isb();
4808 
4809     __ leave();
4810 
4811     // check for pending exceptions
4812 #ifdef ASSERT
4813     Label L;
4814     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4815     __ cbnz(rscratch1, L);
4816     __ should_not_reach_here();
4817     __ bind(L);
4818 #endif // ASSERT
4819     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4820 
4821 
4822     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4823     RuntimeStub* stub =
4824       RuntimeStub::new_runtime_stub(name,
4825                                     &code,
4826                                     frame_complete,
4827                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4828                                     oop_maps, false);
4829     return stub->entry_point();
4830   }
4831 
4832   class MontgomeryMultiplyGenerator : public MacroAssembler {
4833 
4834     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4835       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4836 
4837     RegSet _toSave;
4838     bool _squaring;
4839 
4840   public:
4841     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4842       : MacroAssembler(as->code()), _squaring(squaring) {
4843 
4844       // Register allocation
4845 
4846       Register reg = c_rarg0;
4847       Pa_base = reg;       // Argument registers
4848       if (squaring)
4849         Pb_base = Pa_base;
4850       else
4851         Pb_base = ++reg;
4852       Pn_base = ++reg;
4853       Rlen = ++reg;
4854       inv = ++reg;
4855       Pm_base = ++reg;
4856 
4857                           // Working registers:
4858       Ra =  ++reg;        // The current digit of a, b, n, and m.
4859       Rb =  ++reg;
4860       Rm =  ++reg;
4861       Rn =  ++reg;
4862 
4863       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4864       Pb =  ++reg;
4865       Pm =  ++reg;
4866       Pn =  ++reg;
4867 
4868       t0 =  ++reg;        // Three registers which form a
4869       t1 =  ++reg;        // triple-precision accumulator.
4870       t2 =  ++reg;
4871 
4872       Ri =  ++reg;        // Inner and outer loop indexes.
4873       Rj =  ++reg;
4874 
4875       Rhi_ab = ++reg;     // Product registers: low and high parts
4876       Rlo_ab = ++reg;     // of a*b and m*n.
4877       Rhi_mn = ++reg;
4878       Rlo_mn = ++reg;
4879 
4880       // r19 and up are callee-saved.
4881       _toSave = RegSet::range(r19, reg) + Pm_base;
4882     }
4883 
4884   private:
4885     void save_regs() {
4886       push(_toSave, sp);
4887     }
4888 
4889     void restore_regs() {
4890       pop(_toSave, sp);
4891     }
4892 
4893     template <typename T>
4894     void unroll_2(Register count, T block) {
4895       Label loop, end, odd;
4896       tbnz(count, 0, odd);
4897       cbz(count, end);
4898       align(16);
4899       bind(loop);
4900       (this->*block)();
4901       bind(odd);
4902       (this->*block)();
4903       subs(count, count, 2);
4904       br(Assembler::GT, loop);
4905       bind(end);
4906     }
4907 
4908     template <typename T>
4909     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4910       Label loop, end, odd;
4911       tbnz(count, 0, odd);
4912       cbz(count, end);
4913       align(16);
4914       bind(loop);
4915       (this->*block)(d, s, tmp);
4916       bind(odd);
4917       (this->*block)(d, s, tmp);
4918       subs(count, count, 2);
4919       br(Assembler::GT, loop);
4920       bind(end);
4921     }
4922 
4923     void pre1(RegisterOrConstant i) {
4924       block_comment("pre1");
4925       // Pa = Pa_base;
4926       // Pb = Pb_base + i;
4927       // Pm = Pm_base;
4928       // Pn = Pn_base + i;
4929       // Ra = *Pa;
4930       // Rb = *Pb;
4931       // Rm = *Pm;
4932       // Rn = *Pn;
4933       ldr(Ra, Address(Pa_base));
4934       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4935       ldr(Rm, Address(Pm_base));
4936       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4937       lea(Pa, Address(Pa_base));
4938       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4939       lea(Pm, Address(Pm_base));
4940       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4941 
4942       // Zero the m*n result.
4943       mov(Rhi_mn, zr);
4944       mov(Rlo_mn, zr);
4945     }
4946 
4947     // The core multiply-accumulate step of a Montgomery
4948     // multiplication.  The idea is to schedule operations as a
4949     // pipeline so that instructions with long latencies (loads and
4950     // multiplies) have time to complete before their results are
4951     // used.  This most benefits in-order implementations of the
4952     // architecture but out-of-order ones also benefit.
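    //
    // A sketch of the MACC(a, b, t0, t1, t2) step referenced in the comments
    // below (C++-style pseudocode; the __int128 extension is used purely for
    // clarity):
    //
    //   void MACC(unsigned long a, unsigned long b,
    //             unsigned long &t0, unsigned long &t1, unsigned long &t2) {
    //     unsigned __int128 p = (unsigned __int128)a * b;      // 128-bit a*b
    //     unsigned __int128 s = (unsigned __int128)t0 + (unsigned long)p;
    //     t0 = (unsigned long)s;
    //     s = (unsigned __int128)t1 + (unsigned long)(p >> 64)
    //                               + (unsigned long)(s >> 64);
    //     t1 = (unsigned long)s;
    //     t2 += (unsigned long)(s >> 64);                      // final carry
    //   }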
4953     void step() {
4954       block_comment("step");
4955       // MACC(Ra, Rb, t0, t1, t2);
4956       // Ra = *++Pa;
4957       // Rb = *--Pb;
4958       umulh(Rhi_ab, Ra, Rb);
4959       mul(Rlo_ab, Ra, Rb);
4960       ldr(Ra, pre(Pa, wordSize));
4961       ldr(Rb, pre(Pb, -wordSize));
4962       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4963                                        // previous iteration.
4964       // MACC(Rm, Rn, t0, t1, t2);
4965       // Rm = *++Pm;
4966       // Rn = *--Pn;
4967       umulh(Rhi_mn, Rm, Rn);
4968       mul(Rlo_mn, Rm, Rn);
4969       ldr(Rm, pre(Pm, wordSize));
4970       ldr(Rn, pre(Pn, -wordSize));
4971       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4972     }
4973 
4974     void post1() {
4975       block_comment("post1");
4976 
4977       // MACC(Ra, Rb, t0, t1, t2);
4978       // Ra = *++Pa;
4979       // Rb = *--Pb;
4980       umulh(Rhi_ab, Ra, Rb);
4981       mul(Rlo_ab, Ra, Rb);
4982       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4983       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4984 
4985       // *Pm = Rm = t0 * inv;
4986       mul(Rm, t0, inv);
4987       str(Rm, Address(Pm));
4988 
4989       // MACC(Rm, Rn, t0, t1, t2);
4990       // t0 = t1; t1 = t2; t2 = 0;
4991       umulh(Rhi_mn, Rm, Rn);
4992 
4993 #ifndef PRODUCT
4994       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4995       {
4996         mul(Rlo_mn, Rm, Rn);
4997         add(Rlo_mn, t0, Rlo_mn);
4998         Label ok;
4999         cbz(Rlo_mn, ok); {
5000           stop("broken Montgomery multiply");
5001         } bind(ok);
5002       }
5003 #endif
5004       // We have very carefully set things up so that
5005       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5006       // the lower half of Rm * Rn because we know the result already:
5007       // it must be -t0.  t0 + (-t0) must generate a carry iff
5008       // t0 != 0.  So, rather than do a mul and an adds we just set
5009       // the carry flag iff t0 is nonzero.
5010       //
5011       // mul(Rlo_mn, Rm, Rn);
5012       // adds(zr, t0, Rlo_mn);
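      //
      // For example (mod b = 2^64): with t0 = 5, -t0 = 2^64 - 5 and
      // t0 + (-t0) = 2^64, which wraps and sets the carry; with t0 = 0 the sum
      // is 0 and there is no carry -- exactly what subs(zr, t0, 1) produces.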
5013       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5014       adcs(t0, t1, Rhi_mn);
5015       adc(t1, t2, zr);
5016       mov(t2, zr);
5017     }
5018 
5019     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5020       block_comment("pre2");
5021       // Pa = Pa_base + i-len;
5022       // Pb = Pb_base + len;
5023       // Pm = Pm_base + i-len;
5024       // Pn = Pn_base + len;
5025 
5026       if (i.is_register()) {
5027         sub(Rj, i.as_register(), len);
5028       } else {
5029         mov(Rj, i.as_constant());
5030         sub(Rj, Rj, len);
5031       }
5032       // Rj == i-len
5033 
5034       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5035       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5036       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5037       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5038 
5039       // Ra = *++Pa;
5040       // Rb = *--Pb;
5041       // Rm = *++Pm;
5042       // Rn = *--Pn;
5043       ldr(Ra, pre(Pa, wordSize));
5044       ldr(Rb, pre(Pb, -wordSize));
5045       ldr(Rm, pre(Pm, wordSize));
5046       ldr(Rn, pre(Pn, -wordSize));
5047 
5048       mov(Rhi_mn, zr);
5049       mov(Rlo_mn, zr);
5050     }
5051 
5052     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5053       block_comment("post2");
5054       if (i.is_constant()) {
5055         mov(Rj, i.as_constant()-len.as_constant());
5056       } else {
5057         sub(Rj, i.as_register(), len);
5058       }
5059 
5060       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5061 
5062       // As soon as we know the least significant digit of our result,
5063       // store it.
5064       // Pm_base[i-len] = t0;
5065       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5066 
5067       // t0 = t1; t1 = t2; t2 = 0;
5068       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5069       adc(t1, t2, zr);
5070       mov(t2, zr);
5071     }
5072 
5073     // A carry in t0 after Montgomery multiplication means that we
5074     // should subtract multiples of n from our result in m.  We'll
5075     // keep doing that until there is no carry.
5076     void normalize(RegisterOrConstant len) {
5077       block_comment("normalize");
5078       // while (t0)
5079       //   t0 = sub(Pm_base, Pn_base, t0, len);
5080       Label loop, post, again;
5081       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5082       cbz(t0, post); {
5083         bind(again); {
5084           mov(i, zr);
5085           mov(cnt, len);
5086           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5087           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5088           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5089           align(16);
5090           bind(loop); {
5091             sbcs(Rm, Rm, Rn);
5092             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5093             add(i, i, 1);
5094             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5095             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5096             sub(cnt, cnt, 1);
5097           } cbnz(cnt, loop);
5098           sbc(t0, t0, zr);
5099         } cbnz(t0, again);
5100       } bind(post);
5101     }
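
    // In C, the sub() helper referenced in normalize() above behaves
    // approximately as follows (a sketch of the behavior the comments assume;
    // it subtracts n from m as len-word integers and folds the final borrow
    // into t0):
    //
    //   unsigned long sub(unsigned long m[], unsigned long n[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long nb = (m[i] < n[i]) || (m[i] == n[i] && borrow);
    //       m[i] = m[i] - n[i] - borrow;
    //       borrow = nb;
    //     }
    //     return t0 - borrow;
    //   }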
5102 
5103     // Move memory at s to d, reversing words.
5104     //    Increments d to end of copied memory
5105     //    Destroys tmp1, tmp2
5106     //    Preserves len
5107     //    Leaves s pointing to the address which was in d at start
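    //
    //    In C, approximately (a sketch; len is in 64-bit words here, matching
    //    the call sites, and the net effect is to reverse the order of the
    //    2*len 32-bit digits):
    //
    //      for (int i = 0; i < len; i++) {
    //        unsigned long w = ((unsigned long*)s)[len - 1 - i];
    //        ((unsigned long*)d)[i] = (w << 32) | (w >> 32);
    //      }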
5108     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5109       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5110 
5111       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5112       mov(tmp1, len);
5113       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5114       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5115     }
5116     // where
5117     void reverse1(Register d, Register s, Register tmp) {
5118       ldr(tmp, pre(s, -wordSize));
5119       ror(tmp, tmp, 32);
5120       str(tmp, post(d, wordSize));
5121     }
5122 
5123     void step_squaring() {
5124       // An extra ACC
5125       step();
5126       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5127     }
5128 
5129     void last_squaring(RegisterOrConstant i) {
5130       Label dont;
5131       // if ((i & 1) == 0) {
5132       tbnz(i.as_register(), 0, dont); {
5133         // MACC(Ra, Rb, t0, t1, t2);
5134         // Ra = *++Pa;
5135         // Rb = *--Pb;
5136         umulh(Rhi_ab, Ra, Rb);
5137         mul(Rlo_ab, Ra, Rb);
5138         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5139       } bind(dont);
5140     }
5141 
5142     void extra_step_squaring() {
5143       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5144 
5145       // MACC(Rm, Rn, t0, t1, t2);
5146       // Rm = *++Pm;
5147       // Rn = *--Pn;
5148       umulh(Rhi_mn, Rm, Rn);
5149       mul(Rlo_mn, Rm, Rn);
5150       ldr(Rm, pre(Pm, wordSize));
5151       ldr(Rn, pre(Pn, -wordSize));
5152     }
5153 
5154     void post1_squaring() {
5155       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5156 
5157       // *Pm = Rm = t0 * inv;
5158       mul(Rm, t0, inv);
5159       str(Rm, Address(Pm));
5160 
5161       // MACC(Rm, Rn, t0, t1, t2);
5162       // t0 = t1; t1 = t2; t2 = 0;
5163       umulh(Rhi_mn, Rm, Rn);
5164 
5165 #ifndef PRODUCT
5166       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5167       {
5168         mul(Rlo_mn, Rm, Rn);
5169         add(Rlo_mn, t0, Rlo_mn);
5170         Label ok;
5171         cbz(Rlo_mn, ok); {
5172           stop("broken Montgomery multiply");
5173         } bind(ok);
5174       }
5175 #endif
5176       // We have very carefully set things up so that
5177       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5178       // the lower half of Rm * Rn because we know the result already:
5179       // it must be -t0.  t0 + (-t0) must generate a carry iff
5180       // t0 != 0.  So, rather than do a mul and an adds we just set
5181       // the carry flag iff t0 is nonzero.
5182       //
5183       // mul(Rlo_mn, Rm, Rn);
5184       // adds(zr, t0, Rlo_mn);
5185       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5186       adcs(t0, t1, Rhi_mn);
5187       adc(t1, t2, zr);
5188       mov(t2, zr);
5189     }
5190 
5191     void acc(Register Rhi, Register Rlo,
5192              Register t0, Register t1, Register t2) {
5193       adds(t0, t0, Rlo);
5194       adcs(t1, t1, Rhi);
5195       adc(t2, t2, zr);
5196     }
5197 
5198   public:
5199     /**
5200      * Fast Montgomery multiplication.  The derivation of the
5201      * algorithm is in A Cryptographic Library for the Motorola
5202      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5203      *
5204      * Arguments:
5205      *
5206      * Inputs for multiplication:
5207      *   c_rarg0   - int array elements a
5208      *   c_rarg1   - int array elements b
5209      *   c_rarg2   - int array elements n (the modulus)
5210      *   c_rarg3   - int length
5211      *   c_rarg4   - int inv
5212      *   c_rarg5   - int array elements m (the result)
5213      *
5214      * Inputs for squaring:
5215      *   c_rarg0   - int array elements a
5216      *   c_rarg1   - int array elements n (the modulus)
5217      *   c_rarg2   - int length
5218      *   c_rarg3   - int inv
5219      *   c_rarg4   - int array elements m (the result)
5220      *
5221      */
5222     address generate_multiply() {
5223       Label argh, nothing;
5224       bind(argh);
5225       stop("MontgomeryMultiply total_allocation must be <= 8192");
5226 
5227       align(CodeEntryAlignment);
5228       address entry = pc();
5229 
5230       cbzw(Rlen, nothing);
5231 
5232       enter();
5233 
5234       // Make room.
5235       cmpw(Rlen, 512);
5236       br(Assembler::HI, argh);
5237       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5238       andr(sp, Ra, -2 * wordSize);
5239 
5240       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5241 
5242       {
5243         // Copy input args, reversing as we go.  We use Ra as a
5244         // temporary variable.
5245         reverse(Ra, Pa_base, Rlen, t0, t1);
5246         if (!_squaring)
5247           reverse(Ra, Pb_base, Rlen, t0, t1);
5248         reverse(Ra, Pn_base, Rlen, t0, t1);
5249       }
5250 
5251       // Push all call-saved registers and also Pm_base which we'll need
5252       // at the end.
5253       save_regs();
5254 
5255 #ifndef PRODUCT
5256       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5257       {
5258         ldr(Rn, Address(Pn_base, 0));
5259         mul(Rlo_mn, Rn, inv);
5260         subs(zr, Rlo_mn, -1);
5261         Label ok;
5262         br(EQ, ok); {
5263           stop("broken inverse in Montgomery multiply");
5264         } bind(ok);
5265       }
5266 #endif
5267 
5268       mov(Pm_base, Ra);
5269 
5270       mov(t0, zr);
5271       mov(t1, zr);
5272       mov(t2, zr);
5273 
5274       block_comment("for (int i = 0; i < len; i++) {");
5275       mov(Ri, zr); {
5276         Label loop, end;
5277         cmpw(Ri, Rlen);
5278         br(Assembler::GE, end);
5279 
5280         bind(loop);
5281         pre1(Ri);
5282 
5283         block_comment("  for (j = i; j; j--) {"); {
5284           movw(Rj, Ri);
5285           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5286         } block_comment("  } // j");
5287 
5288         post1();
5289         addw(Ri, Ri, 1);
5290         cmpw(Ri, Rlen);
5291         br(Assembler::LT, loop);
5292         bind(end);
5293         block_comment("} // i");
5294       }
5295 
5296       block_comment("for (int i = len; i < 2*len; i++) {");
5297       mov(Ri, Rlen); {
5298         Label loop, end;
5299         cmpw(Ri, Rlen, Assembler::LSL, 1);
5300         br(Assembler::GE, end);
5301 
5302         bind(loop);
5303         pre2(Ri, Rlen);
5304 
5305         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5306           lslw(Rj, Rlen, 1);
5307           subw(Rj, Rj, Ri);
5308           subw(Rj, Rj, 1);
5309           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5310         } block_comment("  } // j");
5311 
5312         post2(Ri, Rlen);
5313         addw(Ri, Ri, 1);
5314         cmpw(Ri, Rlen, Assembler::LSL, 1);
5315         br(Assembler::LT, loop);
5316         bind(end);
5317       }
5318       block_comment("} // i");
5319 
5320       normalize(Rlen);
5321 
5322       mov(Ra, Pm_base);  // Save Pm_base in Ra
5323       restore_regs();  // Restore caller's Pm_base
5324 
5325       // Copy our result into caller's Pm_base
5326       reverse(Pm_base, Ra, Rlen, t0, t1);
5327 
5328       leave();
5329       bind(nothing);
5330       ret(lr);
5331 
5332       return entry;
5333     }
5334     // In C, approximately:
5335 
5336     // void
5337     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5338     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5339     //                     unsigned long inv, int len) {
5340     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5341     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5342     //   unsigned long Ra, Rb, Rn, Rm;
5343 
5344     //   int i;
5345 
5346     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5347 
5348     //   for (i = 0; i < len; i++) {
5349     //     int j;
5350 
5351     //     Pa = Pa_base;
5352     //     Pb = Pb_base + i;
5353     //     Pm = Pm_base;
5354     //     Pn = Pn_base + i;
5355 
5356     //     Ra = *Pa;
5357     //     Rb = *Pb;
5358     //     Rm = *Pm;
5359     //     Rn = *Pn;
5360 
5361     //     int iters = i;
5362     //     for (j = 0; iters--; j++) {
5363     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5364     //       MACC(Ra, Rb, t0, t1, t2);
5365     //       Ra = *++Pa;
5366     //       Rb = *--Pb;
5367     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5368     //       MACC(Rm, Rn, t0, t1, t2);
5369     //       Rm = *++Pm;
5370     //       Rn = *--Pn;
5371     //     }
5372 
5373     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5374     //     MACC(Ra, Rb, t0, t1, t2);
5375     //     *Pm = Rm = t0 * inv;
5376     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5377     //     MACC(Rm, Rn, t0, t1, t2);
5378 
5379     //     assert(t0 == 0, "broken Montgomery multiply");
5380 
5381     //     t0 = t1; t1 = t2; t2 = 0;
5382     //   }
5383 
5384     //   for (i = len; i < 2*len; i++) {
5385     //     int j;
5386 
5387     //     Pa = Pa_base + i-len;
5388     //     Pb = Pb_base + len;
5389     //     Pm = Pm_base + i-len;
5390     //     Pn = Pn_base + len;
5391 
5392     //     Ra = *++Pa;
5393     //     Rb = *--Pb;
5394     //     Rm = *++Pm;
5395     //     Rn = *--Pn;
5396 
5397     //     int iters = len*2-i-1;
5398     //     for (j = i-len+1; iters--; j++) {
5399     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5400     //       MACC(Ra, Rb, t0, t1, t2);
5401     //       Ra = *++Pa;
5402     //       Rb = *--Pb;
5403     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5404     //       MACC(Rm, Rn, t0, t1, t2);
5405     //       Rm = *++Pm;
5406     //       Rn = *--Pn;
5407     //     }
5408 
5409     //     Pm_base[i-len] = t0;
5410     //     t0 = t1; t1 = t2; t2 = 0;
5411     //   }
5412 
5413     //   while (t0)
5414     //     t0 = sub(Pm_base, Pn_base, t0, len);
5415     // }
5416 
5417     /**
5418      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5419      * multiplies than Montgomery multiplication so it should be up to
5420      * 25% faster.  However, its loop control is more complex and it
5421      * may actually run slower on some machines.
5422      *
5423      * Arguments:
5424      *
5425      * Inputs:
5426      *   c_rarg0   - int array elements a
5427      *   c_rarg1   - int array elements n (the modulus)
5428      *   c_rarg2   - int length
5429      *   c_rarg3   - int inv
5430      *   c_rarg4   - int array elements m (the result)
5431      *
5432      */
5433     address generate_square() {
5434       Label argh;
5435       bind(argh);
5436       stop("MontgomeryMultiply total_allocation must be <= 8192");
5437 
5438       align(CodeEntryAlignment);
5439       address entry = pc();
5440 
5441       enter();
5442 
5443       // Make room.
5444       cmpw(Rlen, 512);
5445       br(Assembler::HI, argh);
5446       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5447       andr(sp, Ra, -2 * wordSize);
5448 
5449       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5450 
5451       {
5452         // Copy input args, reversing as we go.  We use Ra as a
5453         // temporary variable.
5454         reverse(Ra, Pa_base, Rlen, t0, t1);
5455         reverse(Ra, Pn_base, Rlen, t0, t1);
5456       }
5457 
5458       // Push all call-saved registers and also Pm_base which we'll need
5459       // at the end.
5460       save_regs();
5461 
5462       mov(Pm_base, Ra);
5463 
5464       mov(t0, zr);
5465       mov(t1, zr);
5466       mov(t2, zr);
5467 
5468       block_comment("for (int i = 0; i < len; i++) {");
5469       mov(Ri, zr); {
5470         Label loop, end;
5471         bind(loop);
5472         cmp(Ri, Rlen);
5473         br(Assembler::GE, end);
5474 
5475         pre1(Ri);
5476 
5477         block_comment("for (j = (i+1)/2; j; j--) {"); {
5478           add(Rj, Ri, 1);
5479           lsr(Rj, Rj, 1);
5480           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5481         } block_comment("  } // j");
5482 
5483         last_squaring(Ri);
5484 
5485         block_comment("  for (j = i/2; j; j--) {"); {
5486           lsr(Rj, Ri, 1);
5487           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5488         } block_comment("  } // j");
5489 
5490         post1_squaring();
5491         add(Ri, Ri, 1);
5492         cmp(Ri, Rlen);
5493         br(Assembler::LT, loop);
5494 
5495         bind(end);
5496         block_comment("} // i");
5497       }
5498 
5499       block_comment("for (int i = len; i < 2*len; i++) {");
5500       mov(Ri, Rlen); {
5501         Label loop, end;
5502         bind(loop);
5503         cmp(Ri, Rlen, Assembler::LSL, 1);
5504         br(Assembler::GE, end);
5505 
5506         pre2(Ri, Rlen);
5507 
5508         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5509           lsl(Rj, Rlen, 1);
5510           sub(Rj, Rj, Ri);
5511           sub(Rj, Rj, 1);
5512           lsr(Rj, Rj, 1);
5513           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5514         } block_comment("  } // j");
5515 
5516         last_squaring(Ri);
5517 
5518         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5519           lsl(Rj, Rlen, 1);
5520           sub(Rj, Rj, Ri);
5521           lsr(Rj, Rj, 1);
5522           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5523         } block_comment("  } // j");
5524 
5525         post2(Ri, Rlen);
5526         add(Ri, Ri, 1);
5527         cmp(Ri, Rlen, Assembler::LSL, 1);
5528 
5529         br(Assembler::LT, loop);
5530         bind(end);
5531         block_comment("} // i");
5532       }
5533 
5534       normalize(Rlen);
5535 
5536       mov(Ra, Pm_base);  // Save Pm_base in Ra
5537       restore_regs();  // Restore caller's Pm_base
5538 
5539       // Copy our result into caller's Pm_base
5540       reverse(Pm_base, Ra, Rlen, t0, t1);
5541 
5542       leave();
5543       ret(lr);
5544 
5545       return entry;
5546     }
5547     // In C, approximately:
5548 
5549     // void
5550     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5551     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5552     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5553     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5554     //   unsigned long Ra, Rb, Rn, Rm;
5555 
5556     //   int i;
5557 
5558     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5559 
5560     //   for (i = 0; i < len; i++) {
5561     //     int j;
5562 
5563     //     Pa = Pa_base;
5564     //     Pb = Pa_base + i;
5565     //     Pm = Pm_base;
5566     //     Pn = Pn_base + i;
5567 
5568     //     Ra = *Pa;
5569     //     Rb = *Pb;
5570     //     Rm = *Pm;
5571     //     Rn = *Pn;
5572 
5573     //     int iters = (i+1)/2;
5574     //     for (j = 0; iters--; j++) {
5575     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5576     //       MACC2(Ra, Rb, t0, t1, t2);
5577     //       Ra = *++Pa;
5578     //       Rb = *--Pb;
5579     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5580     //       MACC(Rm, Rn, t0, t1, t2);
5581     //       Rm = *++Pm;
5582     //       Rn = *--Pn;
5583     //     }
5584     //     if ((i & 1) == 0) {
5585     //       assert(Ra == Pa_base[j], "must be");
5586     //       MACC(Ra, Ra, t0, t1, t2);
5587     //     }
5588     //     iters = i/2;
5589     //     assert(iters == i-j, "must be");
5590     //     for (; iters--; j++) {
5591     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5592     //       MACC(Rm, Rn, t0, t1, t2);
5593     //       Rm = *++Pm;
5594     //       Rn = *--Pn;
5595     //     }
5596 
5597     //     *Pm = Rm = t0 * inv;
5598     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5599     //     MACC(Rm, Rn, t0, t1, t2);
5600 
5601     //     assert(t0 == 0, "broken Montgomery multiply");
5602 
5603     //     t0 = t1; t1 = t2; t2 = 0;
5604     //   }
5605 
5606     //   for (i = len; i < 2*len; i++) {
5607     //     int start = i-len+1;
5608     //     int end = start + (len - start)/2;
5609     //     int j;
5610 
5611     //     Pa = Pa_base + i-len;
5612     //     Pb = Pa_base + len;
5613     //     Pm = Pm_base + i-len;
5614     //     Pn = Pn_base + len;
5615 
5616     //     Ra = *++Pa;
5617     //     Rb = *--Pb;
5618     //     Rm = *++Pm;
5619     //     Rn = *--Pn;
5620 
5621     //     int iters = (2*len-i-1)/2;
5622     //     assert(iters == end-start, "must be");
5623     //     for (j = start; iters--; j++) {
5624     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5625     //       MACC2(Ra, Rb, t0, t1, t2);
5626     //       Ra = *++Pa;
5627     //       Rb = *--Pb;
5628     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5629     //       MACC(Rm, Rn, t0, t1, t2);
5630     //       Rm = *++Pm;
5631     //       Rn = *--Pn;
5632     //     }
5633     //     if ((i & 1) == 0) {
5634     //       assert(Ra == Pa_base[j], "must be");
5635     //       MACC(Ra, Ra, t0, t1, t2);
5636     //     }
5637     //     iters =  (2*len-i)/2;
5638     //     assert(iters == len-j, "must be");
5639     //     for (; iters--; j++) {
5640     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5641     //       MACC(Rm, Rn, t0, t1, t2);
5642     //       Rm = *++Pm;
5643     //       Rn = *--Pn;
5644     //     }
5645     //     Pm_base[i-len] = t0;
5646     //     t0 = t1; t1 = t2; t2 = 0;
5647     //   }
5648 
5649     //   while (t0)
5650     //     t0 = sub(Pm_base, Pn_base, t0, len);
5651     // }
5652   };
5653 
5654 
5655   // Call here from the interpreter or compiled code to either load
5656   // multiple returned values from the value type instance being
5657   // returned to registers or to store returned values to a newly
5658   // allocated value type instance.
5659   address generate_return_value_stub(address destination, const char* name, bool has_res) {
5660 
5661     // Information about frame layout at time of blocking runtime call.
5662     // Note that we only have to preserve callee-saved registers since
5663     // the compilers are responsible for supplying a continuation point
5664     // if they expect all registers to be preserved.
5665     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
5666     enum layout {
5667       rfp_off = 0, rfp_off2,
5668 
5669       j_rarg7_off, j_rarg7_2,
5670       j_rarg6_off, j_rarg6_2,
5671       j_rarg5_off, j_rarg5_2,
5672       j_rarg4_off, j_rarg4_2,
5673       j_rarg3_off, j_rarg3_2,
5674       j_rarg2_off, j_rarg2_2,
5675       j_rarg1_off, j_rarg1_2,
5676       j_rarg0_off, j_rarg0_2,
5677 
5678       j_farg0_off, j_farg0_2,
5679       j_farg1_off, j_farg1_2,
5680       j_farg2_off, j_farg2_2,
5681       j_farg3_off, j_farg3_2,
5682       j_farg4_off, j_farg4_2,
5683       j_farg5_off, j_farg5_2,
5684       j_farg6_off, j_farg6_2,
5685       j_farg7_off, j_farg7_2,
5686  
5687       return_off, return_off2,
5688       framesize // inclusive of return address
5689     };
5690 
5691     int insts_size = 512;
5692     int locs_size  = 64;
5693 
5694     CodeBuffer code(name, insts_size, locs_size);
5695     OopMapSet* oop_maps  = new OopMapSet();
5696     MacroAssembler* masm = new MacroAssembler(&code);
5697 
5698     address start = __ pc();
5699 
5700     const Address f7_save       (rfp, j_farg7_off * wordSize);
5701     const Address f6_save       (rfp, j_farg6_off * wordSize);
5702     const Address f5_save       (rfp, j_farg5_off * wordSize);
5703     const Address f4_save       (rfp, j_farg4_off * wordSize);
5704     const Address f3_save       (rfp, j_farg3_off * wordSize);
5705     const Address f2_save       (rfp, j_farg2_off * wordSize);
5706     const Address f1_save       (rfp, j_farg1_off * wordSize);
5707     const Address f0_save       (rfp, j_farg0_off * wordSize);
5708 
5709     const Address r0_save      (rfp, j_rarg0_off * wordSize);
5710     const Address r1_save      (rfp, j_rarg1_off * wordSize);
5711     const Address r2_save      (rfp, j_rarg2_off * wordSize);
5712     const Address r3_save      (rfp, j_rarg3_off * wordSize);
5713     const Address r4_save      (rfp, j_rarg4_off * wordSize);
5714     const Address r5_save      (rfp, j_rarg5_off * wordSize);
5715     const Address r6_save      (rfp, j_rarg6_off * wordSize);
5716     const Address r7_save      (rfp, j_rarg7_off * wordSize);
5717 
5718     // Generate oop map
5719     OopMap* map = new OopMap(framesize, 0);
5720 
5721     map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg());
5722     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
5723     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
5724     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
5725     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
5726     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
5727     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
5728     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
5729     map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
5730 
5731     map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
5732     map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
5733     map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
5734     map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
5735     map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
5736     map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
5737     map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
5738     map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
5739 
5740     // This is an inlined and slightly modified version of call_VM
5741     // which has the ability to fetch the return PC out of
5742     // thread-local storage and also sets up last_Java_sp slightly
5743     // differently from the real call_VM.
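         // The sequence is: enter() saves rfp/lr, the Java argument registers
         // are spilled so the oop map above can describe them, last_Java_frame
         // is recorded at the_pc, and only then is the runtime entry called
         // with (current thread, r0) as its two arguments.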
5744 
5745     __ enter(); // Save FP and LR before call
5746 
5747     assert(is_even(framesize/2), "sp not 16-byte aligned");
5748 
5749     // lr and fp are already in place
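         // enter() pushed them as one 16-byte pair, i.e. 4 of the framesize
         // slots, so only (framesize - 4) 32-bit slots still need to be
         // reserved below rfp for the register save area.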
5750     __ sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt); // prolog
5751 
5752     __ strd(j_farg7, f7_save);
5753     __ strd(j_farg6, f6_save);
5754     __ strd(j_farg5, f5_save);
5755     __ strd(j_farg4, f4_save);
5756     __ strd(j_farg3, f3_save);
5757     __ strd(j_farg2, f2_save);
5758     __ strd(j_farg1, f1_save);
5759     __ strd(j_farg0, f0_save);
5760 
5761     __ str(j_rarg0, r0_save);
5762     __ str(j_rarg1, r1_save);
5763     __ str(j_rarg2, r2_save);
5764     __ str(j_rarg3, r3_save);
5765     __ str(j_rarg4, r4_save);
5766     __ str(j_rarg5, r5_save);
5767     __ str(j_rarg6, r6_save);
5768     __ str(j_rarg7, r7_save);
5769 
5770     int frame_complete = __ pc() - start;
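         // Everything from this offset on runs with a fully built frame; the
         // value is passed to RuntimeStub::new_runtime_stub() below as
         // frame_complete.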
5771 
5772     // Set up last_Java_sp and last_Java_fp
5773     address the_pc = __ pc();
5774     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
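         // Recording the frame anchor at the_pc lets the runtime walk back
         // into this stub frame; the same pc offset keys the gc map added
         // after the call below.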
5775 
5776     // Call runtime
5777     __ mov(c_rarg0, rthread);
5778     __ mov(c_rarg1, r0);
5779 
5780     BLOCK_COMMENT("call runtime_entry");
5781     __ mov(rscratch1, destination);
5782     __ blrt(rscratch1, 2 /* number_of_arguments */, 0, 1);
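         // On a non-BUILTIN_SIM build blrt() amounts to a plain
         // blr(rscratch1); the trailing arguments only describe the call
         // signature for the benefit of the builtin simulator.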
5783 
5784     oop_maps->add_gc_map(the_pc - start, map);
5785 
5786     __ reset_last_Java_frame(false);
5787     __ maybe_isb();
5788 
5789     __ ldrd(j_farg7, f7_save);
5790     __ ldrd(j_farg6, f6_save);
5791     __ ldrd(j_farg5, f5_save);
5792     __ ldrd(j_farg4, f4_save);
5793     __ ldrd(j_farg3, f3_save);
5794     __ ldrd(j_farg2, f2_save);
5795     __ ldrd(j_farg1, f1_save);
5796     __ ldrd(j_farg0, f0_save);
5797 
5798     __ ldr(j_rarg0, r0_save);
5799     __ ldr(j_rarg1, r1_save);
5800     __ ldr(j_rarg2, r2_save);
5801     __ ldr(j_rarg3, r3_save);
5802     __ ldr(j_rarg4, r4_save);
5803     __ ldr(j_rarg5, r5_save);
5804     __ ldr(j_rarg6, r6_save);
5805     __ ldr(j_rarg7, r7_save);
5806 
5807     __ leave();
5808 
5809     // check for pending exceptions
5810     Label pending;
5811     __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5812     __ cmp(rscratch1, (u1)NULL_WORD);
5813     __ br(Assembler::NE, pending);
5814 
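         // Stubs generated with has_res pick their result up from
         // thread->vm_result: get_vm_result() moves it into r0 and clears
         // the thread-local field.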
5815     if (has_res) {
5816       __ get_vm_result(r0, rthread);
5817     }
5818     __ ret(lr);
5819 
5820     __ bind(pending);
5821     __ ldr(r0, Address(rthread, in_bytes(Thread::pending_exception_offset())));
5822     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5823 
5824 
5825     // codeBlob framesize is in words (not VMRegImpl::slot_size)
5826     int frame_size_in_words = (framesize >> (LogBytesPerWord - LogBytesPerInt));
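         // For the 36-slot layout above that is 36 >> 1 == 18 words.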
5827     RuntimeStub* stub =
5828       RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
5829 
5830     return stub->entry_point();
5831   }
5832 
5833   // Initialization
5834   void generate_initial() {
5835     // Generate the initial stubs and initialize the entry points
5836 
5837     // Entry points that exist on all platforms. Note: this is code
5838     // that could be shared among different platforms - however the
5839     // benefit seems to be smaller than the disadvantage of having a
5840     // much more complicated generator structure. See also the comment
5841     // in stubRoutines.hpp.
5842 
5843     StubRoutines::_forward_exception_entry = generate_forward_exception();
5844 
5845     StubRoutines::_call_stub_entry =
5846       generate_call_stub(StubRoutines::_call_stub_return_address);
5847 
5848     // This entry is referenced by megamorphic call sites
5849     StubRoutines::_catch_exception_entry = generate_catch_exception();
5850 
5851     // Build this early so it's available for the interpreter.
5852     StubRoutines::_throw_StackOverflowError_entry =
5853       generate_throw_exception("StackOverflowError throw_exception",
5854                                CAST_FROM_FN_PTR(address,
5855                                                 SharedRuntime::throw_StackOverflowError));
5856     StubRoutines::_throw_delayed_StackOverflowError_entry =
5857       generate_throw_exception("delayed StackOverflowError throw_exception",
5858                                CAST_FROM_FN_PTR(address,
5859                                                 SharedRuntime::throw_delayed_StackOverflowError));
5860     if (UseCRC32Intrinsics) {
5861       // Set the table address before generating the stubs that use it
5862       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5863       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5864     }
5865 
5866     if (UseCRC32CIntrinsics) {
5867       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5868     }
5869 
5870     // Disabled until JDK-8210858 is fixed
5871     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5872     //   StubRoutines::_dlog = generate_dlog();
5873     // }
5874 
5875     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5876       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5877     }
5878 
5879     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5880       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5881     }
5882 
5883 
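         // Value-type return helpers, both built with
         // generate_return_value_stub(): store_value_type_fields_to_buf
         // returns an oop (the buffered value) through vm_result, hence
         // has_res == true, while load_value_type_fields_in_regs produces
         // no oop result.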
5884     StubRoutines::_load_value_type_fields_in_regs =
5885          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_value_type_fields_in_regs), "load_value_type_fields_in_regs", false);
5886     StubRoutines::_store_value_type_fields_to_buf =
5887          generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_value_type_fields_to_buf), "store_value_type_fields_to_buf", true);
5888   }
5889 
5890   void generate_all() {
5891     // support for verify_oop (must happen after universe_init)
5892     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5893     StubRoutines::_throw_AbstractMethodError_entry =
5894       generate_throw_exception("AbstractMethodError throw_exception",
5895                                CAST_FROM_FN_PTR(address,
5896                                                 SharedRuntime::
5897                                                 throw_AbstractMethodError));
5898 
5899     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5900       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5901                                CAST_FROM_FN_PTR(address,
5902                                                 SharedRuntime::
5903                                                 throw_IncompatibleClassChangeError));
5904 
5905     StubRoutines::_throw_NullPointerException_at_call_entry =
5906       generate_throw_exception("NullPointerException at call throw_exception",
5907                                CAST_FROM_FN_PTR(address,
5908                                                 SharedRuntime::
5909                                                 throw_NullPointerException_at_call));
5910 
5911     // arraycopy stubs used by compilers
5912     generate_arraycopy_stubs();
5913 
5914     // has_negatives stub for large arrays.
5915     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5916 
5917     // array equals stub for large arrays.
5918     if (!UseSimpleArrayEquals) {
5919       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5920     }
5921 
5922     generate_compare_long_strings();
5923 
5924     generate_string_indexof_stubs();
5925 
5926     // byte_array_inflate stub for large arrays.
5927     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5928 
5929 #ifdef COMPILER2
5930     if (UseMultiplyToLenIntrinsic) {
5931       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5932     }
5933 
5934     if (UseSquareToLenIntrinsic) {
5935       StubRoutines::_squareToLen = generate_squareToLen();
5936     }
5937 
5938     if (UseMulAddIntrinsic) {
5939       StubRoutines::_mulAdd = generate_mulAdd();
5940     }
5941 
5942     if (UseMontgomeryMultiplyIntrinsic) {
5943       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5944       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5945       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5946     }
5947 
5948     if (UseMontgomerySquareIntrinsic) {
5949       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5950       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5951       // We use generate_multiply() rather than generate_square()
5952       // because it's faster for the sizes of modulus we care about.
5953       StubRoutines::_montgomerySquare = g.generate_multiply();
5954     }
5955 #endif // COMPILER2
5956 
5957 #ifndef BUILTIN_SIM
5958     // generate GHASH intrinsics code
5959     if (UseGHASHIntrinsics) {
5960       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5961     }
5962 
5963     if (UseAESIntrinsics) {
5964       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5965       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5966       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5967       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5968     }
5969 
5970     if (UseSHA1Intrinsics) {
5971       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5972       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5973     }
5974     if (UseSHA256Intrinsics) {
5975       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5976       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5977     }
5978 
5979     // generate Adler32 intrinsics code
5980     if (UseAdler32Intrinsics) {
5981       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5982     }
5983 
5984     // Safefetch stubs.
5985     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5986                                                        &StubRoutines::_safefetch32_fault_pc,
5987                                                        &StubRoutines::_safefetch32_continuation_pc);
5988     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5989                                                        &StubRoutines::_safefetchN_fault_pc,
5990                                                        &StubRoutines::_safefetchN_continuation_pc);
5991 #endif
5992     StubRoutines::aarch64::set_completed();
5993   }
5994 
5995  public:
5996   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5997     if (all) {
5998       generate_all();
5999     } else {
6000       generate_initial();
6001     }
6002   }
6003 }; // end class declaration
6004 
6005 void StubGenerator_generate(CodeBuffer* code, bool all) {
6006   StubGenerator g(code, all);
6007 }