1 /*
   2  * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
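// TIMES_OOP scales an array index by the size of an oop element: a
// sign-extended 32-bit index shifted left by log2 of the element size
// (4 bytes with compressed oops, 8 bytes otherwise).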
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
 104   // link r29 (fp) below it as the frame pointer installing sp (r31)
 105   // into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
  114   // we don't need to save r8 which C uses as the indirect result
  115   // location register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
  159   //  -1 [ thread          (r7) ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
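  // n.b. the save area is written with stp/stpd pairs, so some slots
  // (r19/r21/r23/r25/r27, v8/v10/v12/v14 and the parameters word) have
  // no named offset of their own: each sits one word above its partner,
  // e.g. r19 at r20_off + 1 and the parameters pointer at
  // entry_point_off + 1.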
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (u1)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
  293     // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, (u1)T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, (u1)T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, (u1)T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, (u1)T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
  475   // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
  519   // frame and then call into the VM, and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
 591 
  592     // make sure klass is 'reasonable', i.e. non-zero.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
 635 
 636   address generate_zero_blocks() {
 637     Label done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
  649       // Ensure the ZVA length is a multiple of 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
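      // base is HeapWord (8 byte) aligned on entry; if bit 3 is set,
      // zero one word by hand so base becomes 16 byte aligned before
      // handing the bulk of the work to zero_dcache_blocks.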
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728 
 729     __ align(CodeEntryAlignment);
 730 
 731     StubCodeMark mark(this, "StubRoutines", stub_name);
 732 
 733     __ bind(start);
 734 
 735     Label unaligned_copy_long;
 736     if (AvoidUnalignedAccesses) {
 737       __ tbnz(d, 3, unaligned_copy_long);
 738     }
 739 
 740     if (direction == copy_forwards) {
 741       __ sub(s, s, bias);
 742       __ sub(d, d, bias);
 743     }
 744 
 745 #ifdef ASSERT
 746     // Make sure we are never given < 8 words
 747     {
 748       Label L;
 749       __ cmp(count, (u1)8);
 750       __ br(Assembler::GE, L);
  751       __ stop("generate_copy_longs called with < 8 words");
 752       __ bind(L);
 753     }
 754 #endif
 755 
 756     // Fill 8 registers
 757     if (UseSIMDForMemoryOps) {
 758       __ ldpq(v0, v1, Address(s, 4 * unit));
 759       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 760     } else {
 761       __ ldp(t0, t1, Address(s, 2 * unit));
 762       __ ldp(t2, t3, Address(s, 4 * unit));
 763       __ ldp(t4, t5, Address(s, 6 * unit));
 764       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 765     }
 766 
 767     __ subs(count, count, 16);
 768     __ br(Assembler::LO, drain);
 769 
 770     int prefetch = PrefetchCopyIntervalInBytes;
 771     bool use_stride = false;
 772     if (direction == copy_backwards) {
 773        use_stride = prefetch > 256;
 774        prefetch = -prefetch;
 775        if (use_stride) __ mov(stride, prefetch);
 776     }
 777 
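    // The main loop is software-pipelined: each pass stores the eight
    // words loaded on the previous pass and immediately reloads the
    // registers for the next pass; the final batch is written out by
    // the drain code below.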
 778     __ bind(again);
 779 
 780     if (PrefetchCopyIntervalInBytes > 0)
 781       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 782 
 783     if (UseSIMDForMemoryOps) {
 784       __ stpq(v0, v1, Address(d, 4 * unit));
 785       __ ldpq(v0, v1, Address(s, 4 * unit));
 786       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 787       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 788     } else {
 789       __ stp(t0, t1, Address(d, 2 * unit));
 790       __ ldp(t0, t1, Address(s, 2 * unit));
 791       __ stp(t2, t3, Address(d, 4 * unit));
 792       __ ldp(t2, t3, Address(s, 4 * unit));
 793       __ stp(t4, t5, Address(d, 6 * unit));
 794       __ ldp(t4, t5, Address(s, 6 * unit));
 795       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 796       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 797     }
 798 
 799     __ subs(count, count, 8);
 800     __ br(Assembler::HS, again);
 801 
 802     // Drain
 803     __ bind(drain);
 804     if (UseSIMDForMemoryOps) {
 805       __ stpq(v0, v1, Address(d, 4 * unit));
 806       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 807     } else {
 808       __ stp(t0, t1, Address(d, 2 * unit));
 809       __ stp(t2, t3, Address(d, 4 * unit));
 810       __ stp(t4, t5, Address(d, 6 * unit));
 811       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 812     }
 813 
 814     {
 815       Label L1, L2;
 816       __ tbz(count, exact_log2(4), L1);
 817       if (UseSIMDForMemoryOps) {
 818         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 819         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 820       } else {
 821         __ ldp(t0, t1, Address(s, 2 * unit));
 822         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 823         __ stp(t0, t1, Address(d, 2 * unit));
 824         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 825       }
 826       __ bind(L1);
 827 
 828       if (direction == copy_forwards) {
 829         __ add(s, s, bias);
 830         __ add(d, d, bias);
 831       }
 832 
 833       __ tbz(count, 1, L2);
 834       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 835       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 836       __ bind(L2);
 837     }
 838 
 839     __ ret(lr);
 840 
 841     if (AvoidUnalignedAccesses) {
 842       Label drain, again;
 843       // Register order for storing. Order is different for backward copy.
 844 
 845       __ bind(unaligned_copy_long);
 846 
 847       // source address is even aligned, target odd aligned
 848       //
 849       // when forward copying word pairs we read long pairs at offsets
 850       // {0, 2, 4, 6} (in long words). when backwards copying we read
 851       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 852       // address by -2 in the forwards case so we can compute the
 853       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 854       // or -1.
 855       //
 856       // when forward copying we need to store 1 word, 3 pairs and
  857       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
  858       // zero offset we adjust the destination by -1 which means we
 859       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 860       //
  861       // When backwards copying we need to store 1 word, 3 pairs and
 862       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 863       // offsets {1, 3, 5, 7, 8} * unit.
 864 
 865       if (direction == copy_forwards) {
 866         __ sub(s, s, 16);
 867         __ sub(d, d, 8);
 868       }
 869 
 870       // Fill 8 registers
 871       //
 872       // for forwards copy s was offset by -16 from the original input
 873       // value of s so the register contents are at these offsets
  874       // relative to the 64 byte block addressed by that original input
 875       // and so on for each successive 64 byte block when s is updated
 876       //
 877       // t0 at offset 0,  t1 at offset 8
 878       // t2 at offset 16, t3 at offset 24
 879       // t4 at offset 32, t5 at offset 40
 880       // t6 at offset 48, t7 at offset 56
 881 
 882       // for backwards copy s was not offset so the register contents
 883       // are at these offsets into the preceding 64 byte block
 884       // relative to that original input and so on for each successive
 885       // preceding 64 byte block when s is updated. this explains the
 886       // slightly counter-intuitive looking pattern of register usage
 887       // in the stp instructions for backwards copy.
 888       //
 889       // t0 at offset -16, t1 at offset -8
 890       // t2 at offset -32, t3 at offset -24
 891       // t4 at offset -48, t5 at offset -40
 892       // t6 at offset -64, t7 at offset -56
 893 
 894       __ ldp(t0, t1, Address(s, 2 * unit));
 895       __ ldp(t2, t3, Address(s, 4 * unit));
 896       __ ldp(t4, t5, Address(s, 6 * unit));
 897       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 898 
 899       __ subs(count, count, 16);
 900       __ br(Assembler::LO, drain);
 901 
 902       int prefetch = PrefetchCopyIntervalInBytes;
 903       bool use_stride = false;
 904       if (direction == copy_backwards) {
 905          use_stride = prefetch > 256;
 906          prefetch = -prefetch;
 907          if (use_stride) __ mov(stride, prefetch);
 908       }
 909 
 910       __ bind(again);
 911 
 912       if (PrefetchCopyIntervalInBytes > 0)
 913         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 914 
 915       if (direction == copy_forwards) {
 916        // allowing for the offset of -8 the store instructions place
  917        // registers into the target 64 byte block at the following
 918        // offsets
 919        //
 920        // t0 at offset 0
 921        // t1 at offset 8,  t2 at offset 16
 922        // t3 at offset 24, t4 at offset 32
 923        // t5 at offset 40, t6 at offset 48
 924        // t7 at offset 56
 925 
 926         __ str(t0, Address(d, 1 * unit));
 927         __ stp(t1, t2, Address(d, 2 * unit));
 928         __ ldp(t0, t1, Address(s, 2 * unit));
 929         __ stp(t3, t4, Address(d, 4 * unit));
 930         __ ldp(t2, t3, Address(s, 4 * unit));
 931         __ stp(t5, t6, Address(d, 6 * unit));
 932         __ ldp(t4, t5, Address(s, 6 * unit));
 933         __ str(t7, Address(__ pre(d, 8 * unit)));
 934         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 935       } else {
 936        // d was not offset when we started so the registers are
  937        // written into the 64 byte block preceding d with the following
 938        // offsets
 939        //
 940        // t1 at offset -8
 941        // t3 at offset -24, t0 at offset -16
  942        // t5 at offset -40, t2 at offset -32
 943        // t7 at offset -56, t4 at offset -48
 944        //                   t6 at offset -64
 945        //
 946        // note that this matches the offsets previously noted for the
 947        // loads
 948 
 949         __ str(t1, Address(d, 1 * unit));
 950         __ stp(t3, t0, Address(d, 3 * unit));
 951         __ ldp(t0, t1, Address(s, 2 * unit));
 952         __ stp(t5, t2, Address(d, 5 * unit));
 953         __ ldp(t2, t3, Address(s, 4 * unit));
 954         __ stp(t7, t4, Address(d, 7 * unit));
 955         __ ldp(t4, t5, Address(s, 6 * unit));
 956         __ str(t6, Address(__ pre(d, 8 * unit)));
 957         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 958       }
 959 
 960       __ subs(count, count, 8);
 961       __ br(Assembler::HS, again);
 962 
 963       // Drain
 964       //
 965       // this uses the same pattern of offsets and register arguments
 966       // as above
 967       __ bind(drain);
 968       if (direction == copy_forwards) {
 969         __ str(t0, Address(d, 1 * unit));
 970         __ stp(t1, t2, Address(d, 2 * unit));
 971         __ stp(t3, t4, Address(d, 4 * unit));
 972         __ stp(t5, t6, Address(d, 6 * unit));
 973         __ str(t7, Address(__ pre(d, 8 * unit)));
 974       } else {
 975         __ str(t1, Address(d, 1 * unit));
 976         __ stp(t3, t0, Address(d, 3 * unit));
 977         __ stp(t5, t2, Address(d, 5 * unit));
 978         __ stp(t7, t4, Address(d, 7 * unit));
 979         __ str(t6, Address(__ pre(d, 8 * unit)));
 980       }
 981       // now we need to copy any remaining part block which may
  982       // include a 4 word subblock and/or a 2 word subblock.
  983       // bits 2 and 1 in the count are the tell-tale for whether we
 984       // have each such subblock
 985       {
 986         Label L1, L2;
 987         __ tbz(count, exact_log2(4), L1);
 988        // this is the same as above but copying only 4 longs hence
  989        // with only one intervening stp between the str instructions
 990        // but note that the offsets and registers still follow the
 991        // same pattern
 992         __ ldp(t0, t1, Address(s, 2 * unit));
 993         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 994         if (direction == copy_forwards) {
 995           __ str(t0, Address(d, 1 * unit));
 996           __ stp(t1, t2, Address(d, 2 * unit));
 997           __ str(t3, Address(__ pre(d, 4 * unit)));
 998         } else {
 999           __ str(t1, Address(d, 1 * unit));
1000           __ stp(t3, t0, Address(d, 3 * unit));
1001           __ str(t2, Address(__ pre(d, 4 * unit)));
1002         }
1003         __ bind(L1);
1004 
1005         __ tbz(count, 1, L2);
1006        // this is the same as above but copying only 2 longs hence
1007        // there is no intervening stp between the str instructions
1008        // but note that the offset and register patterns are still
1009        // the same
1010         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1011         if (direction == copy_forwards) {
1012           __ str(t0, Address(d, 1 * unit));
1013           __ str(t1, Address(__ pre(d, 2 * unit)));
1014         } else {
1015           __ str(t1, Address(d, 1 * unit));
1016           __ str(t0, Address(__ pre(d, 2 * unit)));
1017         }
1018         __ bind(L2);
1019 
1020        // for forwards copy we need to re-adjust the offsets we
 1021        // applied so that s and d follow the last words written
1022 
1023        if (direction == copy_forwards) {
1024          __ add(s, s, 16);
1025          __ add(d, d, 8);
1026        }
1027 
1028       }
1029 
1030       __ ret(lr);
 1031     }
1032   }
1033 
1034   // Small copy: less than 16 bytes.
1035   //
1036   // NB: Ignores all of the bits of count which represent more than 15
1037   // bytes, so a caller doesn't have to mask them.
1038 
1039   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1040     bool is_backwards = step < 0;
1041     size_t granularity = uabs(step);
1042     int direction = is_backwards ? -1 : 1;
1043     int unit = wordSize * direction;
1044 
1045     Label Lword, Lint, Lshort, Lbyte;
1046 
1047     assert(granularity
1048            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1049 
1050     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1051 
1052     // ??? I don't know if this bit-test-and-branch is the right thing
1053     // to do.  It does a lot of jumping, resulting in several
1054     // mispredicted branches.  It might make more sense to do this
1055     // with something like Duff's device with a single computed branch.
1056 
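    // Each tbz below tests one bit of the residual count: if the bit
    // for a given chunk size (word, int, short, byte) is clear, the
    // copy of that chunk is skipped.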
1057     __ tbz(count, 3 - exact_log2(granularity), Lword);
1058     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1059     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1060     __ bind(Lword);
1061 
1062     if (granularity <= sizeof (jint)) {
1063       __ tbz(count, 2 - exact_log2(granularity), Lint);
1064       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1065       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1066       __ bind(Lint);
1067     }
1068 
1069     if (granularity <= sizeof (jshort)) {
1070       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1071       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1072       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1073       __ bind(Lshort);
1074     }
1075 
1076     if (granularity <= sizeof (jbyte)) {
1077       __ tbz(count, 0, Lbyte);
1078       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1079       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1080       __ bind(Lbyte);
1081     }
1082   }
1083 
1084   Label copy_f, copy_b;
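  // copy_f and copy_b label the bulk word-copy blocks (bound when
  // generate_copy_longs() is run for each direction); copy_memory()
  // below branches and links to them for the forward and backward
  // cases respectively.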
1085 
1086   // All-singing all-dancing memory copy.
1087   //
1088   // Copy count units of memory from s to d.  The size of a unit is
1089   // step, which can be positive or negative depending on the direction
1090   // of copy.  If is_aligned is false, we align the source address.
1091   //
1092 
1093   void copy_memory(bool is_aligned, Register s, Register d,
1094                    Register count, Register tmp, int step) {
1095     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1096     bool is_backwards = step < 0;
1097     int granularity = uabs(step);
1098     const Register t0 = r3, t1 = r4;
1099 
 1100     // <= 96 bytes are copied inline. Direction doesn't matter because we always
1101     // load all the data before writing anything
1102     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1103     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1104     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1105     const Register send = r17, dend = r18;
1106 
1107     if (PrefetchCopyIntervalInBytes > 0)
1108       __ prfm(Address(s, 0), PLDL1KEEP);
1109     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1110     __ br(Assembler::HI, copy_big);
1111 
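    // send/dend point just past the last source/destination byte to be
    // copied. The fixed-size cases below copy from both the front and
    // the back of the range, which is safe because every load is issued
    // before any store.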
1112     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1113     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1114 
1115     __ cmp(count, u1(16/granularity));
1116     __ br(Assembler::LS, copy16);
1117 
1118     __ cmp(count, u1(64/granularity));
1119     __ br(Assembler::HI, copy80);
1120 
1121     __ cmp(count, u1(32/granularity));
1122     __ br(Assembler::LS, copy32);
1123 
1124     // 33..64 bytes
1125     if (UseSIMDForMemoryOps) {
1126       __ ldpq(v0, v1, Address(s, 0));
1127       __ ldpq(v2, v3, Address(send, -32));
1128       __ stpq(v0, v1, Address(d, 0));
1129       __ stpq(v2, v3, Address(dend, -32));
1130     } else {
1131       __ ldp(t0, t1, Address(s, 0));
1132       __ ldp(t2, t3, Address(s, 16));
1133       __ ldp(t4, t5, Address(send, -32));
1134       __ ldp(t6, t7, Address(send, -16));
1135 
1136       __ stp(t0, t1, Address(d, 0));
1137       __ stp(t2, t3, Address(d, 16));
1138       __ stp(t4, t5, Address(dend, -32));
1139       __ stp(t6, t7, Address(dend, -16));
1140     }
1141     __ b(finish);
1142 
1143     // 17..32 bytes
1144     __ bind(copy32);
1145     __ ldp(t0, t1, Address(s, 0));
1146     __ ldp(t2, t3, Address(send, -16));
1147     __ stp(t0, t1, Address(d, 0));
1148     __ stp(t2, t3, Address(dend, -16));
1149     __ b(finish);
1150 
1151     // 65..80/96 bytes
 1152     // (96 bytes if SIMD because we do 32 bytes per instruction)
1153     __ bind(copy80);
1154     if (UseSIMDForMemoryOps) {
1155       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1156       __ ldpq(v4, v5, Address(send, -32));
1157       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1158       __ stpq(v4, v5, Address(dend, -32));
1159     } else {
1160       __ ldp(t0, t1, Address(s, 0));
1161       __ ldp(t2, t3, Address(s, 16));
1162       __ ldp(t4, t5, Address(s, 32));
1163       __ ldp(t6, t7, Address(s, 48));
1164       __ ldp(t8, t9, Address(send, -16));
1165 
1166       __ stp(t0, t1, Address(d, 0));
1167       __ stp(t2, t3, Address(d, 16));
1168       __ stp(t4, t5, Address(d, 32));
1169       __ stp(t6, t7, Address(d, 48));
1170       __ stp(t8, t9, Address(dend, -16));
1171     }
1172     __ b(finish);
1173 
1174     // 0..16 bytes
1175     __ bind(copy16);
1176     __ cmp(count, u1(8/granularity));
1177     __ br(Assembler::LO, copy8);
1178 
1179     // 8..16 bytes
1180     __ ldr(t0, Address(s, 0));
1181     __ ldr(t1, Address(send, -8));
1182     __ str(t0, Address(d, 0));
1183     __ str(t1, Address(dend, -8));
1184     __ b(finish);
1185 
1186     if (granularity < 8) {
1187       // 4..7 bytes
1188       __ bind(copy8);
1189       __ tbz(count, 2 - exact_log2(granularity), copy4);
1190       __ ldrw(t0, Address(s, 0));
1191       __ ldrw(t1, Address(send, -4));
1192       __ strw(t0, Address(d, 0));
1193       __ strw(t1, Address(dend, -4));
1194       __ b(finish);
1195       if (granularity < 4) {
1196         // 0..3 bytes
1197         __ bind(copy4);
1198         __ cbz(count, finish); // get rid of 0 case
1199         if (granularity == 2) {
1200           __ ldrh(t0, Address(s, 0));
1201           __ strh(t0, Address(d, 0));
1202         } else { // granularity == 1
1203           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1204           // the first and last byte.
1205           // Handle the 3 byte case by loading and storing base + count/2
1206           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1207           // This does mean in the 1 byte case we load/store the same
1208           // byte 3 times.
1209           __ lsr(count, count, 1);
1210           __ ldrb(t0, Address(s, 0));
1211           __ ldrb(t1, Address(send, -1));
1212           __ ldrb(t2, Address(s, count));
1213           __ strb(t0, Address(d, 0));
1214           __ strb(t1, Address(dend, -1));
1215           __ strb(t2, Address(d, count));
1216         }
1217         __ b(finish);
1218       }
1219     }
1220 
1221     __ bind(copy_big);
1222     if (is_backwards) {
1223       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1224       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1225     }
1226 
1227     // Now we've got the small case out of the way we can align the
1228     // source address on a 2-word boundary.
1229 
1230     Label aligned;
1231 
1232     if (is_aligned) {
1233       // We may have to adjust by 1 word to get s 2-word-aligned.
1234       __ tbz(s, exact_log2(wordSize), aligned);
1235       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1236       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1237       __ sub(count, count, wordSize/granularity);
1238     } else {
1239       if (is_backwards) {
1240         __ andr(rscratch2, s, 2 * wordSize - 1);
1241       } else {
1242         __ neg(rscratch2, s);
1243         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1244       }
1245       // rscratch2 is the byte adjustment needed to align s.
1246       __ cbz(rscratch2, aligned);
1247       int shift = exact_log2(granularity);
1248       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1249       __ sub(count, count, rscratch2);
1250 
1251 #if 0
1252       // ?? This code is only correct for a disjoint copy.  It may or
1253       // may not make sense to use it in that case.
1254 
1255       // Copy the first pair; s and d may not be aligned.
1256       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1257       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1258 
1259       // Align s and d, adjust count
1260       if (is_backwards) {
1261         __ sub(s, s, rscratch2);
1262         __ sub(d, d, rscratch2);
1263       } else {
1264         __ add(s, s, rscratch2);
1265         __ add(d, d, rscratch2);
1266       }
1267 #else
1268       copy_memory_small(s, d, rscratch2, rscratch1, step);
1269 #endif
1270     }
1271 
1272     __ bind(aligned);
1273 
1274     // s is now 2-word-aligned.
1275 
1276     // We have a count of units and some trailing bytes.  Adjust the
1277     // count and do a bulk copy of words.
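    // n.b. the bulk copy blocks expect their word count in rscratch2
    // and leave s and d pointing at the remaining tail, which
    // copy_memory_small() below finishes off.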
1278     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1279     if (direction == copy_forwards)
1280       __ bl(copy_f);
1281     else
1282       __ bl(copy_b);
1283 
1284     // And the tail.
1285     copy_memory_small(s, d, count, tmp, step);
1286 
1287     if (granularity >= 8) __ bind(copy8);
1288     if (granularity >= 4) __ bind(copy4);
1289     __ bind(finish);
1290   }
1291 
1292 
1293   void clobber_registers() {
1294 #ifdef ASSERT
1295     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1296     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1297     for (Register r = r3; r <= r18; r++)
1298       if (r != rscratch1) __ mov(r, rscratch1);
1299 #endif
1300   }
1301 
1302   // Scan over array at a for count oops, verifying each one.
1303   // Preserves a and count, clobbers rscratch1 and rscratch2.
1304   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1305     Label loop, end;
1306     __ mov(rscratch1, a);
1307     __ mov(rscratch2, zr);
1308     __ bind(loop);
1309     __ cmp(rscratch2, count);
1310     __ br(Assembler::HS, end);
1311     if (size == (size_t)wordSize) {
1312       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ verify_oop(temp);
1314     } else {
 1315       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1316       __ decode_heap_oop(temp); // calls verify_oop
1317     }
1318     __ add(rscratch2, rscratch2, size);
1319     __ b(loop);
1320     __ bind(end);
1321   }
1322 
1323   // Arguments:
1324   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1325   //             ignored
1326   //   is_oop  - true => oop array, so generate store check code
1327   //   name    - stub name string
1328   //
1329   // Inputs:
1330   //   c_rarg0   - source array address
1331   //   c_rarg1   - destination array address
1332   //   c_rarg2   - element count, treated as ssize_t, can be zero
1333   //
1334   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1335   // the hardware handle it.  The two dwords within qwords that span
 1336   // cache line boundaries will still be loaded and stored atomically.
1337   //
1338   // Side Effects:
 1339   //   *entry is set to the no-overlap entry point, used by the
 1340   //   corresponding conjoint copy stub (see generate_conjoint_copy()).
1341   //
1342   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1343                                   const char *name, bool dest_uninitialized = false) {
1344     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1345     RegSet saved_reg = RegSet::of(s, d, count);
1346     __ align(CodeEntryAlignment);
1347     StubCodeMark mark(this, "StubRoutines", name);
1348     address start = __ pc();
1349     __ enter();
1350 
1351     if (entry != NULL) {
1352       *entry = __ pc();
1353       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1354       BLOCK_COMMENT("Entry:");
1355     }
1356 
1357     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1358     if (dest_uninitialized) {
1359       decorators |= IS_DEST_UNINITIALIZED;
1360     }
1361     if (aligned) {
1362       decorators |= ARRAYCOPY_ALIGNED;
1363     }
1364 
1365     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1366     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1367 
1368     if (is_oop) {
1369       // save regs before copy_memory
1370       __ push(RegSet::of(d, count), sp);
1371     }
1372     copy_memory(aligned, s, d, count, rscratch1, size);
1373 
1374     if (is_oop) {
1375       __ pop(RegSet::of(d, count), sp);
1376       if (VerifyOops)
1377         verify_oop_array(size, d, count, r16);
1378       __ sub(count, count, 1); // make an inclusive end pointer
1379       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1380     }
1381 
1382     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1383 
1384     __ leave();
1385     __ mov(r0, zr); // return 0
1386     __ ret(lr);
1387 #ifdef BUILTIN_SIM
1388     {
1389       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1390       sim->notifyCompile(const_cast<char*>(name), start);
1391     }
1392 #endif
1393     return start;
1394   }
1395 
1396   // Arguments:
1397   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1398   //             ignored
1399   //   is_oop  - true => oop array, so generate store check code
1400   //   name    - stub name string
1401   //
1402   // Inputs:
1403   //   c_rarg0   - source array address
1404   //   c_rarg1   - destination array address
1405   //   c_rarg2   - element count, treated as ssize_t, can be zero
1406   //
1407   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1408   // the hardware handle it.  The two dwords within qwords that span
 1409   // cache line boundaries will still be loaded and stored atomically.
1410   //
1411   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1412                                  address *entry, const char *name,
1413                                  bool dest_uninitialized = false) {
1414     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1415     RegSet saved_regs = RegSet::of(s, d, count);
1416     StubCodeMark mark(this, "StubRoutines", name);
1417     address start = __ pc();
1418     __ enter();
1419 
1420     if (entry != NULL) {
1421       *entry = __ pc();
1422       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1423       BLOCK_COMMENT("Entry:");
1424     }
1425 
1426     // use fwd copy when (d-s) above_equal (count*size)
1427     __ sub(rscratch1, d, s);
1428     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1429     __ br(Assembler::HS, nooverlap_target);
1430 
1431     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1432     if (dest_uninitialized) {
1433       decorators |= IS_DEST_UNINITIALIZED;
1434     }
1435     if (aligned) {
1436       decorators |= ARRAYCOPY_ALIGNED;
1437     }
1438 
1439     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1440     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1441 
1442     if (is_oop) {
1443       // save regs before copy_memory
1444       __ push(RegSet::of(d, count), sp);
1445     }
1446     copy_memory(aligned, s, d, count, rscratch1, -size);
1447     if (is_oop) {
1448       __ pop(RegSet::of(d, count), sp);
1449       if (VerifyOops)
1450         verify_oop_array(size, d, count, r16);
1451       __ sub(count, count, 1); // make an inclusive end pointer
1452       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1453     }
1454     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1455     __ leave();
1456     __ mov(r0, zr); // return 0
1457     __ ret(lr);
1458 #ifdef BUILTIN_SIM
1459     {
1460       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1461       sim->notifyCompile(const_cast<char*>(name), start);
1462     }
1463 #endif
1464     return start;
 1465   }
1466 
1467   // Arguments:
1468   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1469   //             ignored
1470   //   name    - stub name string
1471   //
1472   // Inputs:
1473   //   c_rarg0   - source array address
1474   //   c_rarg1   - destination array address
1475   //   c_rarg2   - element count, treated as ssize_t, can be zero
1476   //
1477   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1478   // we let the hardware handle it.  The one to eight bytes within words,
1479   // dwords or qwords that span cache line boundaries will still be loaded
1480   // and stored atomically.
1481   //
1482   // Side Effects:
1490   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1491   //   used by generate_conjoint_byte_copy().
1492   //
1493   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1494     const bool not_oop = false;
1495     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1496   }
1497 
1498   // Arguments:
1499   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1500   //             ignored
1501   //   name    - stub name string
1502   //
1503   // Inputs:
1504   //   c_rarg0   - source array address
1505   //   c_rarg1   - destination array address
1506   //   c_rarg2   - element count, treated as ssize_t, can be zero
1507   //
1508   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1509   // we let the hardware handle it.  The one to eight bytes within words,
1510   // dwords or qwords that span cache line boundaries will still be loaded
1511   // and stored atomically.
1512   //
1513   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1514                                       address* entry, const char *name) {
1515     const bool not_oop = false;
1516     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1517   }
1518 
1519   // Arguments:
1520   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1521   //             ignored
1522   //   name    - stub name string
1523   //
1524   // Inputs:
1525   //   c_rarg0   - source array address
1526   //   c_rarg1   - destination array address
1527   //   c_rarg2   - element count, treated as ssize_t, can be zero
1528   //
1529   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1530   // let the hardware handle it.  The two or four words within dwords
1531   // or qwords that span cache line boundaries will still be loaded
1532   // and stored atomically.
1533   //
1534   // Side Effects:
1535   //   disjoint_short_copy_entry is set to the no-overlap entry point
1536   //   used by generate_conjoint_short_copy().
1537   //
1538   address generate_disjoint_short_copy(bool aligned,
1539                                        address* entry, const char *name) {
1540     const bool not_oop = false;
1541     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1542   }
1543 
1544   // Arguments:
1545   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1546   //             ignored
1547   //   name    - stub name string
1548   //
1549   // Inputs:
1550   //   c_rarg0   - source array address
1551   //   c_rarg1   - destination array address
1552   //   c_rarg2   - element count, treated as ssize_t, can be zero
1553   //
1554   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1555   // let the hardware handle it.  The two or four words within dwords
1556   // or qwords that span cache line boundaries will still be loaded
1557   // and stored atomically.
1558   //
1559   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1560                                        address *entry, const char *name) {
1561     const bool not_oop = false;
1562     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1564   }

1565   // Arguments:
1566   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1567   //             ignored
1568   //   name    - stub name string
1569   //
1570   // Inputs:
1571   //   c_rarg0   - source array address
1572   //   c_rarg1   - destination array address
1573   //   c_rarg2   - element count, treated as ssize_t, can be zero
1574   //
1575   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1576   // the hardware handle it.  The two dwords within qwords that span
1577   // cache line boundaries will still be loaded and stored atomically.
1578   //
1579   // Side Effects:
1580   //   disjoint_int_copy_entry is set to the no-overlap entry point
1581   //   used by generate_conjoint_int_copy().
1582   //
1583   address generate_disjoint_int_copy(bool aligned, address *entry,
1584                                          const char *name, bool dest_uninitialized = false) {
1585     const bool not_oop = false;
1586     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1587   }
1588 
1589   // Arguments:
1590   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1591   //             ignored
1592   //   name    - stub name string
1593   //
1594   // Inputs:
1595   //   c_rarg0   - source array address
1596   //   c_rarg1   - destination array address
1597   //   c_rarg2   - element count, treated as ssize_t, can be zero
1598   //
1599   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1600   // the hardware handle it.  The two dwords within qwords that span
1601   // cache line boundaries will still be loaded and stored atomically.
1602   //
1603   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1604                                      address *entry, const char *name,
1605                                      bool dest_uninitialized = false) {
1606     const bool not_oop = false;
1607     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1608   }
1609 
1610 
1611   // Arguments:
1612   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1613   //             ignored
1614   //   name    - stub name string
1615   //
1616   // Inputs:
1617   //   c_rarg0   - source array address
1618   //   c_rarg1   - destination array address
1619   //   c_rarg2   - element count, treated as size_t, can be zero
1620   //
1621   // Side Effects:
1622   //   disjoint_long_copy_entry is set to the no-overlap entry point
1623   //   used by generate_conjoint_long_copy().
1624   //
1625   address generate_disjoint_long_copy(bool aligned, address *entry,
1626                                           const char *name, bool dest_uninitialized = false) {
1627     const bool not_oop = false;
1628     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1629   }
1630 
1631   // Arguments:
1632   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1633   //             ignored
1634   //   name    - stub name string
1635   //
1636   // Inputs:
1637   //   c_rarg0   - source array address
1638   //   c_rarg1   - destination array address
1639   //   c_rarg2   - element count, treated as size_t, can be zero
1640   //
1641   address generate_conjoint_long_copy(bool aligned,
1642                                       address nooverlap_target, address *entry,
1643                                       const char *name, bool dest_uninitialized = false) {
1644     const bool not_oop = false;
1645     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1646   }
1647 
1648   // Arguments:
1649   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1650   //             ignored
1651   //   name    - stub name string
1652   //
1653   // Inputs:
1654   //   c_rarg0   - source array address
1655   //   c_rarg1   - destination array address
1656   //   c_rarg2   - element count, treated as size_t, can be zero
1657   //
1658   // Side Effects:
1659   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1660   //   used by generate_conjoint_oop_copy().
1661   //
1662   address generate_disjoint_oop_copy(bool aligned, address *entry,
1663                                      const char *name, bool dest_uninitialized) {
1664     const bool is_oop = true;
1665     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1666     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1667   }
1668 
1669   // Arguments:
1670   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1671   //             ignored
1672   //   name    - stub name string
1673   //
1674   // Inputs:
1675   //   c_rarg0   - source array address
1676   //   c_rarg1   - destination array address
1677   //   c_rarg2   - element count, treated as size_t, can be zero
1678   //
1679   address generate_conjoint_oop_copy(bool aligned,
1680                                      address nooverlap_target, address *entry,
1681                                      const char *name, bool dest_uninitialized) {
1682     const bool is_oop = true;
1683     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1684     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1685                                   name, dest_uninitialized);
1686   }
1687 
1688 
1689   // Helper for generating a dynamic type check.
1690   // Smashes rscratch1, rscratch2.
1691   void generate_type_check(Register sub_klass,
1692                            Register super_check_offset,
1693                            Register super_klass,
1694                            Label& L_success) {
1695     assert_different_registers(sub_klass, super_check_offset, super_klass);
1696 
1697     BLOCK_COMMENT("type_check:");
1698 
1699     Label L_miss;
1700 
1701     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1702                                      super_check_offset);
1703     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1704 
1705     // Fall through on failure!
1706     __ BIND(L_miss);
1707   }
1708 
1709   //
1710   //  Generate checkcasting array copy stub
1711   //
1712   //  Input:
1713   //    c_rarg0   - source array address
1714   //    c_rarg1   - destination array address
1715   //    c_rarg2   - element count, treated as ssize_t, can be zero
1716   //    c_rarg3   - size_t ckoff (super_check_offset)
1717   //    c_rarg4   - oop ckval (super_klass)
1718   //
1719   //  Output:
1720   //    r0 ==  0  -  success
1721   //    r0 == -1^K - failure, where K is partial transfer count
1722   //
1723   address generate_checkcast_copy(const char *name, address *entry,
1724                                   bool dest_uninitialized = false) {
1725 
1726     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1727 
1728     // Input registers (after setup_arg_regs)
1729     const Register from        = c_rarg0;   // source array address
1730     const Register to          = c_rarg1;   // destination array address
1731     const Register count       = c_rarg2;   // elements count
1732     const Register ckoff       = c_rarg3;   // super_check_offset
1733     const Register ckval       = c_rarg4;   // super_klass
1734 
1735     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1736     RegSet wb_post_saved_regs = RegSet::of(count);
1737 
1738     // Registers used as temps (r18, r19, r20 are save-on-entry)
1739     const Register count_save  = r21;       // original elements count
1740     const Register start_to    = r20;       // destination array start address
1741     const Register copied_oop  = r18;       // actual oop copied
1742     const Register r19_klass   = r19;       // oop._klass
1743 
1744     //---------------------------------------------------------------
1745     // Assembler stub will be used for this call to arraycopy
1746     // if the two arrays are subtypes of Object[] but the
1747     // destination array type is not equal to or a supertype
1748     // of the source type.  Each element must be separately
1749     // checked.
1750 
1751     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1752                                copied_oop, r19_klass, count_save);
1753 
1754     __ align(CodeEntryAlignment);
1755     StubCodeMark mark(this, "StubRoutines", name);
1756     address start = __ pc();
1757 
1758     __ enter(); // required for proper stackwalking of RuntimeStub frame
1759 
1760 #ifdef ASSERT
1761     // caller guarantees that the arrays really are different
1762     // otherwise, we would have to make conjoint checks
1763     { Label L;
1764       array_overlap_test(L, TIMES_OOP);
1765       __ stop("checkcast_copy within a single array");
1766       __ bind(L);
1767     }
1768 #endif //ASSERT
1769 
1770     // Caller of this entry point must set up the argument registers.
1771     if (entry != NULL) {
1772       *entry = __ pc();
1773       BLOCK_COMMENT("Entry:");
1774     }
1775 
1776      // Empty array:  Nothing to do.
1777     __ cbz(count, L_done);
1778 
1779     __ push(RegSet::of(r18, r19, r20, r21), sp);
1780 
1781 #ifdef ASSERT
1782     BLOCK_COMMENT("assert consistent ckoff/ckval");
1783     // The ckoff and ckval must be mutually consistent,
1784     // even though caller generates both.
1785     { Label L;
1786       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1787       __ ldrw(start_to, Address(ckval, sco_offset));
1788       __ cmpw(ckoff, start_to);
1789       __ br(Assembler::EQ, L);
1790       __ stop("super_check_offset inconsistent");
1791       __ bind(L);
1792     }
1793 #endif //ASSERT
1794 
1795     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1796     bool is_oop = true;
1797     if (dest_uninitialized) {
1798       decorators |= IS_DEST_UNINITIALIZED;
1799     }
1800 
1801     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1802     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1803 
1804     // save the original count
1805     __ mov(count_save, count);
1806 
1807     // Copy from low to high addresses
1808     __ mov(start_to, to);              // Save destination array start address
1809     __ b(L_load_element);
1810 
1811     // ======== begin loop ========
1812     // (Loop is rotated; its entry is L_load_element.)
1813     // Loop control:
1814     //   for (; count != 0; count--) {
1815     //     copied_oop = load_heap_oop(from++);
1816     //     ... generate_type_check ...;
1817     //     store_heap_oop(to++, copied_oop);
1818     //   }
1819     __ align(OptoLoopAlignment);
1820 
1821     __ BIND(L_store_element);
1822     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1823     __ sub(count, count, 1);
1824     __ cbz(count, L_do_card_marks);
1825 
1826     // ======== loop entry is here ========
1827     __ BIND(L_load_element);
1828     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1829     __ cbz(copied_oop, L_store_element);
1830 
1831     __ load_klass(r19_klass, copied_oop);// query the object klass
1832     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1833     // ======== end loop ========
1834 
1835     // It was a real error; we must depend on the caller to finish the job.
1836     // Register count = remaining oops, count_orig = total oops.
1837     // Emit GC store barriers for the oops we have copied and report
1838     // their number to the caller.
1839 
1840     __ subs(count, count_save, count);     // K = partially copied oop count
1841     __ eon(count, count, zr);                   // report (-1^K) to caller
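         // The flags still reflect the subs above: EQ means K == 0, i.e. no oops
         // were copied, so the card-marking epilogue can be skipped.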
1842     __ br(Assembler::EQ, L_done_pop);
1843 
1844     __ BIND(L_do_card_marks);
1845     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1846     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1847 
1848     __ bind(L_done_pop);
1849     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1850     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1851 
1852     __ bind(L_done);
1853     __ mov(r0, count);
1854     __ leave();
1855     __ ret(lr);
1856 
1857     return start;
1858   }
1859 
1860   // Perform range checks on the proposed arraycopy.
1861   // Kills temp, but nothing else.
1862   // Also, clean the sign bits of src_pos and dst_pos.
1863   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1864                               Register src_pos, // source position (c_rarg1)
1865                               Register dst,     // destination array oop (c_rarg2)
1866                               Register dst_pos, // destination position (c_rarg3)
1867                               Register length,
1868                               Register temp,
1869                               Label& L_failed) {
1870     BLOCK_COMMENT("arraycopy_range_checks:");
1871 
1872     assert_different_registers(rscratch1, temp);
1873 
1874     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1875     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1876     __ addw(temp, length, src_pos);
1877     __ cmpw(temp, rscratch1);
1878     __ br(Assembler::HI, L_failed);
1879 
1880     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1881     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1882     __ addw(temp, length, dst_pos);
1883     __ cmpw(temp, rscratch1);
1884     __ br(Assembler::HI, L_failed);
1885 
1886     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1887     __ movw(src_pos, src_pos);
1888     __ movw(dst_pos, dst_pos);
1889 
1890     BLOCK_COMMENT("arraycopy_range_checks done");
1891   }
1892 
1893   // These stubs get called from some dumb test routine.
1894   // I'll write them properly when they're called from
1895   // something that's actually doing something.
1896   static void fake_arraycopy_stub(address src, address dst, int count) {
1897     assert(count == 0, "huh?");
1898   }
1899 
1900 
1901   //
1902   //  Generate 'unsafe' array copy stub
1903   //  Though just as safe as the other stubs, it takes an unscaled
1904   //  size_t argument instead of an element count.
1905   //
1906   //  Input:
1907   //    c_rarg0   - source array address
1908   //    c_rarg1   - destination array address
1909   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1910   //
1911   // Examines the alignment of the operands and dispatches
1912   // to a long, int, short, or byte copy loop.
1913   //
1914   address generate_unsafe_copy(const char *name,
1915                                address byte_copy_entry,
1916                                address short_copy_entry,
1917                                address int_copy_entry,
1918                                address long_copy_entry) {
1919     Label L_long_aligned, L_int_aligned, L_short_aligned;
1920     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1921 
1922     __ align(CodeEntryAlignment);
1923     StubCodeMark mark(this, "StubRoutines", name);
1924     address start = __ pc();
1925     __ enter(); // required for proper stackwalking of RuntimeStub frame
1926 
1927     // bump this on entry, not on exit:
1928     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1929 
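         // OR the source address, destination address and byte count together:
         // a low bit is clear in the result only if it is clear in all three,
         // so the low bits give the coarsest common alignment to dispatch on.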
1930     __ orr(rscratch1, s, d);
1931     __ orr(rscratch1, rscratch1, count);
1932 
1933     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1934     __ cbz(rscratch1, L_long_aligned);
1935     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1936     __ cbz(rscratch1, L_int_aligned);
1937     __ tbz(rscratch1, 0, L_short_aligned);
1938     __ b(RuntimeAddress(byte_copy_entry));
1939 
1940     __ BIND(L_short_aligned);
1941     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1942     __ b(RuntimeAddress(short_copy_entry));
1943     __ BIND(L_int_aligned);
1944     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1945     __ b(RuntimeAddress(int_copy_entry));
1946     __ BIND(L_long_aligned);
1947     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1948     __ b(RuntimeAddress(long_copy_entry));
1949 
1950     return start;
1951   }
1952 
1953   //
1954   //  Generate generic array copy stubs
1955   //
1956   //  Input:
1957   //    c_rarg0    -  src oop
1958   //    c_rarg1    -  src_pos (32-bits)
1959   //    c_rarg2    -  dst oop
1960   //    c_rarg3    -  dst_pos (32-bits)
1961   //    c_rarg4    -  element count (32-bits)
1962   //
1963   //  Output:
1964   //    r0 ==  0  -  success
1965   //    r0 == -1^K - failure, where K is partial transfer count
1966   //
1967   address generate_generic_copy(const char *name,
1968                                 address byte_copy_entry, address short_copy_entry,
1969                                 address int_copy_entry, address oop_copy_entry,
1970                                 address long_copy_entry, address checkcast_copy_entry) {
1971 
1972     Label L_failed, L_objArray;
1973     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1974 
1975     // Input registers
1976     const Register src        = c_rarg0;  // source array oop
1977     const Register src_pos    = c_rarg1;  // source position
1978     const Register dst        = c_rarg2;  // destination array oop
1979     const Register dst_pos    = c_rarg3;  // destination position
1980     const Register length     = c_rarg4;
1981 
1982 
1983     // Registers used as temps
1984     const Register dst_klass  = c_rarg5;
1985 
1986     __ align(CodeEntryAlignment);
1987 
1988     StubCodeMark mark(this, "StubRoutines", name);
1989 
1990     address start = __ pc();
1991 
1992     __ enter(); // required for proper stackwalking of RuntimeStub frame
1993 
1994     // bump this on entry, not on exit:
1995     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1996 
1997     //-----------------------------------------------------------------------
1998     // Assembler stub will be used for this call to arraycopy
1999     // if the following conditions are met:
2000     //
2001     // (1) src and dst must not be null.
2002     // (2) src_pos must not be negative.
2003     // (3) dst_pos must not be negative.
2004     // (4) length  must not be negative.
2005     // (5) src klass and dst klass should be the same and not NULL.
2006     // (6) src and dst should be arrays.
2007     // (7) src_pos + length must not exceed length of src.
2008     // (8) dst_pos + length must not exceed length of dst.
2009     //
2010 
2011     //  if (src == NULL) return -1;
2012     __ cbz(src, L_failed);
2013 
2014     //  if (src_pos < 0) return -1;
2015     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2016 
2017     //  if (dst == NULL) return -1;
2018     __ cbz(dst, L_failed);
2019 
2020     //  if (dst_pos < 0) return -1;
2021     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2022 
2023     // registers used as temp
2024     const Register scratch_length    = r16; // elements count to copy
2025     const Register scratch_src_klass = r17; // array klass
2026     const Register lh                = r18; // layout helper
2027 
2028     //  if (length < 0) return -1;
2029     __ movw(scratch_length, length);        // length (elements count, 32-bit value)
2030     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2031 
2032     __ load_klass(scratch_src_klass, src);
2033 #ifdef ASSERT
2034     //  assert(src->klass() != NULL);
2035     {
2036       BLOCK_COMMENT("assert klasses not null {");
2037       Label L1, L2;
2038       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2039       __ bind(L1);
2040       __ stop("broken null klass");
2041       __ bind(L2);
2042       __ load_klass(rscratch1, dst);
2043       __ cbz(rscratch1, L1);     // this would be broken also
2044       BLOCK_COMMENT("} assert klasses not null done");
2045     }
2046 #endif
2047 
2048     // Load layout helper (32-bits)
2049     //
2050     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2051     // 32        30    24            16              8     2                 0
2052     //
2053     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2054     //
2055 
2056     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2057 
2058     // Handle objArrays completely differently...
2059     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2060     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2061     __ movw(rscratch1, objArray_lh);
2062     __ eorw(rscratch2, lh, rscratch1);
2063     __ cbzw(rscratch2, L_objArray);
2064 
2065     //  if (src->klass() != dst->klass()) return -1;
2066     __ load_klass(rscratch2, dst);
2067     __ eor(rscratch2, rscratch2, scratch_src_klass);
2068     __ cbnz(rscratch2, L_failed);
2069 
2070     //  if (!src->is_Array()) return -1;
2071     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2072 
2073     // At this point, it is known to be a typeArray (array_tag 0x3).
2074 #ifdef ASSERT
2075     {
2076       BLOCK_COMMENT("assert primitive array {");
2077       Label L;
2078       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2079       __ cmpw(lh, rscratch2);
2080       __ br(Assembler::GE, L);
2081       __ stop("must be a primitive array");
2082       __ bind(L);
2083       BLOCK_COMMENT("} assert primitive array done");
2084     }
2085 #endif
2086 
2087     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2088                            rscratch2, L_failed);
2089 
2090     // TypeArrayKlass
2091     //
2092     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2093     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2094     //
2095 
2096     const Register rscratch1_offset = rscratch1;    // array offset
2097     const Register r18_elsize = lh; // element size
2098 
2099     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2100            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2101     __ add(src, src, rscratch1_offset);           // src array offset
2102     __ add(dst, dst, rscratch1_offset);           // dst array offset
2103     BLOCK_COMMENT("choose copy loop based on element size");
2104 
2105     // next registers should be set before the jump to corresponding stub
2106     const Register from     = c_rarg0;  // source array address
2107     const Register to       = c_rarg1;  // destination array address
2108     const Register count    = c_rarg2;  // elements count
2109 
2110     // 'from', 'to' and 'count' must be written in this order: they alias
2111     // 'src', 'src_pos' and 'dst', so each input is read before it is clobbered.
2112 
2113     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2114 
2115     // The possible values of elsize are 0-3, i.e. exact_log2(element
2116     // size in bytes).  We do a simple bitwise binary search.
2117   __ BIND(L_copy_bytes);
2118     __ tbnz(r18_elsize, 1, L_copy_ints);
2119     __ tbnz(r18_elsize, 0, L_copy_shorts);
2120     __ lea(from, Address(src, src_pos));// src_addr
2121     __ lea(to,   Address(dst, dst_pos));// dst_addr
2122     __ movw(count, scratch_length); // length
2123     __ b(RuntimeAddress(byte_copy_entry));
2124 
2125   __ BIND(L_copy_shorts);
2126     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2127     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2128     __ movw(count, scratch_length); // length
2129     __ b(RuntimeAddress(short_copy_entry));
2130 
2131   __ BIND(L_copy_ints);
2132     __ tbnz(r18_elsize, 0, L_copy_longs);
2133     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2134     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2135     __ movw(count, scratch_length); // length
2136     __ b(RuntimeAddress(int_copy_entry));
2137 
2138   __ BIND(L_copy_longs);
2139 #ifdef ASSERT
2140     {
2141       BLOCK_COMMENT("assert long copy {");
2142       Label L;
2143       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2144       __ cmpw(r18_elsize, LogBytesPerLong);
2145       __ br(Assembler::EQ, L);
2146       __ stop("must be long copy, but elsize is wrong");
2147       __ bind(L);
2148       BLOCK_COMMENT("} assert long copy done");
2149     }
2150 #endif
2151     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2152     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2153     __ movw(count, scratch_length); // length
2154     __ b(RuntimeAddress(long_copy_entry));
2155 
2156     // ObjArrayKlass
2157   __ BIND(L_objArray);
2158     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2159 
2160     Label L_plain_copy, L_checkcast_copy;
2161     //  test array classes for subtyping
2162     __ load_klass(r18, dst);
2163     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2164     __ br(Assembler::NE, L_checkcast_copy);
2165 
2166     // Identically typed arrays can be copied without element-wise checks.
2167     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2168                            rscratch2, L_failed);
2169 
2170     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2171     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2172     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2173     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2174     __ movw(count, scratch_length); // length
2175   __ BIND(L_plain_copy);
2176     __ b(RuntimeAddress(oop_copy_entry));
2177 
2178   __ BIND(L_checkcast_copy);
2179     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2180     {
2181       // Before looking at dst.length, make sure dst is also an objArray.
2182       __ ldrw(rscratch1, Address(r18, lh_offset));
2183       __ movw(rscratch2, objArray_lh);
2184       __ eorw(rscratch1, rscratch1, rscratch2);
2185       __ cbnzw(rscratch1, L_failed);
2186 
2187       // It is safe to examine both src.length and dst.length.
2188       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2189                              r18, L_failed);
2190 
2191       __ load_klass(dst_klass, dst); // reload
2192 
2193       // Marshal the base address arguments now, freeing registers.
2194       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2195       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2196       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2197       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2198       __ movw(count, length);           // length (reloaded)
2199       Register sco_temp = c_rarg3;      // this register is free now
2200       assert_different_registers(from, to, count, sco_temp,
2201                                  dst_klass, scratch_src_klass);
2202       // assert_clean_int(count, sco_temp);
2203 
2204       // Generate the type check.
2205       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2206       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2207 
2208       // Smashes rscratch1, rscratch2
2209       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2210 
2211       // Fetch destination element klass from the ObjArrayKlass header.
2212       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2213       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2214       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2215 
2216       // the checkcast_copy loop needs two extra arguments:
2217       assert(c_rarg3 == sco_temp, "#3 already in place");
2218       // Set up arguments for checkcast_copy_entry.
2219       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2220       __ b(RuntimeAddress(checkcast_copy_entry));
2221     }
2222 
2223   __ BIND(L_failed);
2224     __ mov(r0, -1);
2225     __ leave();   // required for proper stackwalking of RuntimeStub frame
2226     __ ret(lr);
2227 
2228     return start;
2229   }
2230 
2231   //
2232   // Generate stub for array fill. If "aligned" is true, the
2233   // "to" address is assumed to be heapword aligned.
2234   //
2235   // Arguments for generated stub:
2236   //   to:    c_rarg0
2237   //   value: c_rarg1
2238   //   count: c_rarg2 treated as signed
2239   //
2240   address generate_fill(BasicType t, bool aligned, const char *name) {
2241     __ align(CodeEntryAlignment);
2242     StubCodeMark mark(this, "StubRoutines", name);
2243     address start = __ pc();
2244 
2245     BLOCK_COMMENT("Entry:");
2246 
2247     const Register to        = c_rarg0;  // source array address
2248     const Register value     = c_rarg1;  // value
2249     const Register count     = c_rarg2;  // elements count
2250 
2251     const Register bz_base = r10;        // base for block_zero routine
2252     const Register cnt_words = r11;      // temp register
2253 
2254     __ enter();
2255 
2256     Label L_fill_elements, L_exit1;
2257 
2258     int shift = -1;
2259     switch (t) {
2260       case T_BYTE:
2261         shift = 0;
2262         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2263         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2264         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2265         __ br(Assembler::LO, L_fill_elements);
2266         break;
2267       case T_SHORT:
2268         shift = 1;
2269         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2270         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2271         __ br(Assembler::LO, L_fill_elements);
2272         break;
2273       case T_INT:
2274         shift = 2;
2275         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2276         __ br(Assembler::LO, L_fill_elements);
2277         break;
2278       default: ShouldNotReachHere();
2279     }
2280 
2281     // Align source address at 8 bytes address boundary.
2282     Label L_skip_align1, L_skip_align2, L_skip_align4;
2283     if (!aligned) {
2284       switch (t) {
2285         case T_BYTE:
2286           // One byte misalignment happens only for byte arrays.
2287           __ tbz(to, 0, L_skip_align1);
2288           __ strb(value, Address(__ post(to, 1)));
2289           __ subw(count, count, 1);
2290           __ bind(L_skip_align1);
2291           // Fallthrough
2292         case T_SHORT:
2293           // Two bytes misalignment happens only for byte and short (char) arrays.
2294           __ tbz(to, 1, L_skip_align2);
2295           __ strh(value, Address(__ post(to, 2)));
2296           __ subw(count, count, 2 >> shift);
2297           __ bind(L_skip_align2);
2298           // Fallthrough
2299         case T_INT:
2300           // Align to 8 bytes, we know we are 4 byte aligned to start.
2301           __ tbz(to, 2, L_skip_align4);
2302           __ strw(value, Address(__ post(to, 4)));
2303           __ subw(count, count, 4 >> shift);
2304           __ bind(L_skip_align4);
2305           break;
2306         default: ShouldNotReachHere();
2307       }
2308     }
2309 
2310     //
2311     //  Fill large chunks
2312     //
2313     __ lsrw(cnt_words, count, 3 - shift); // number of words
2314     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2315     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
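         // count now holds only the tail elements, i.e. whatever is left over
         // after filling cnt_words 8-byte words.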
2316     if (UseBlockZeroing) {
2317       Label non_block_zeroing, rest;
2318       // If the fill value is zero we can use the fast zero_words().
2319       __ cbnz(value, non_block_zeroing);
2320       __ mov(bz_base, to);
2321       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2322       __ zero_words(bz_base, cnt_words);
2323       __ b(rest);
2324       __ bind(non_block_zeroing);
2325       __ fill_words(to, cnt_words, value);
2326       __ bind(rest);
2327     } else {
2328       __ fill_words(to, cnt_words, value);
2329     }
2330 
2331     // Remaining count is less than 8 bytes. Fill it by a single store.
2332     // Note that the total length is no less than 8 bytes.
2333     if (t == T_BYTE || t == T_SHORT) {
2334       Label L_exit1;
2335       __ cbzw(count, L_exit1);
2336       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2337       __ str(value, Address(to, -8));    // overwrite some elements
2338       __ bind(L_exit1);
2339       __ leave();
2340       __ ret(lr);
2341     }
2342 
2343     // Handle copies less than 8 bytes.
2344     Label L_fill_2, L_fill_4, L_exit2;
2345     __ bind(L_fill_elements);
2346     switch (t) {
2347       case T_BYTE:
2348         __ tbz(count, 0, L_fill_2);
2349         __ strb(value, Address(__ post(to, 1)));
2350         __ bind(L_fill_2);
2351         __ tbz(count, 1, L_fill_4);
2352         __ strh(value, Address(__ post(to, 2)));
2353         __ bind(L_fill_4);
2354         __ tbz(count, 2, L_exit2);
2355         __ strw(value, Address(to));
2356         break;
2357       case T_SHORT:
2358         __ tbz(count, 0, L_fill_4);
2359         __ strh(value, Address(__ post(to, 2)));
2360         __ bind(L_fill_4);
2361         __ tbz(count, 1, L_exit2);
2362         __ strw(value, Address(to));
2363         break;
2364       case T_INT:
2365         __ cbzw(count, L_exit2);
2366         __ strw(value, Address(to));
2367         break;
2368       default: ShouldNotReachHere();
2369     }
2370     __ bind(L_exit2);
2371     __ leave();
2372     __ ret(lr);
2373     return start;
2374   }
2375 
2376   void generate_arraycopy_stubs() {
2377     address entry;
2378     address entry_jbyte_arraycopy;
2379     address entry_jshort_arraycopy;
2380     address entry_jint_arraycopy;
2381     address entry_oop_arraycopy;
2382     address entry_jlong_arraycopy;
2383     address entry_checkcast_arraycopy;
2384 
2385     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2386     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2387 
2388     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2389 
2390     //*** jbyte
2391     // Always need aligned and unaligned versions
2392     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2393                                                                                   "jbyte_disjoint_arraycopy");
2394     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2395                                                                                   &entry_jbyte_arraycopy,
2396                                                                                   "jbyte_arraycopy");
2397     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2398                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2399     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2400                                                                                   "arrayof_jbyte_arraycopy");
2401 
2402     //*** jshort
2403     // Always need aligned and unaligned versions
2404     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2405                                                                                     "jshort_disjoint_arraycopy");
2406     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2407                                                                                     &entry_jshort_arraycopy,
2408                                                                                     "jshort_arraycopy");
2409     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2410                                                                                     "arrayof_jshort_disjoint_arraycopy");
2411     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2412                                                                                     "arrayof_jshort_arraycopy");
2413 
2414     //*** jint
2415     // Aligned versions
2416     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2417                                                                                 "arrayof_jint_disjoint_arraycopy");
2418     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2419                                                                                 "arrayof_jint_arraycopy");
2420     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2421     // entry_jint_arraycopy always points to the unaligned version
2422     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2423                                                                                 "jint_disjoint_arraycopy");
2424     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2425                                                                                 &entry_jint_arraycopy,
2426                                                                                 "jint_arraycopy");
2427 
2428     //*** jlong
2429     // It is always aligned
2430     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2431                                                                                   "arrayof_jlong_disjoint_arraycopy");
2432     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2433                                                                                   "arrayof_jlong_arraycopy");
2434     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2435     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2436 
2437     //*** oops
2438     {
2439       // With compressed oops we need unaligned versions; notice that
2440       // we overwrite entry_oop_arraycopy.
2441       bool aligned = !UseCompressedOops;
2442 
2443       StubRoutines::_arrayof_oop_disjoint_arraycopy
2444         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2445                                      /*dest_uninitialized*/false);
2446       StubRoutines::_arrayof_oop_arraycopy
2447         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2448                                      /*dest_uninitialized*/false);
2449       // Aligned versions without pre-barriers
2450       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2451         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2452                                      /*dest_uninitialized*/true);
2453       StubRoutines::_arrayof_oop_arraycopy_uninit
2454         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2455                                      /*dest_uninitialized*/true);
2456     }
2457 
2458     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2459     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2460     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2461     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2462 
2463     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2464     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2465                                                                         /*dest_uninitialized*/true);
2466 
2467     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2468                                                               entry_jbyte_arraycopy,
2469                                                               entry_jshort_arraycopy,
2470                                                               entry_jint_arraycopy,
2471                                                               entry_jlong_arraycopy);
2472 
2473     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2474                                                                entry_jbyte_arraycopy,
2475                                                                entry_jshort_arraycopy,
2476                                                                entry_jint_arraycopy,
2477                                                                entry_oop_arraycopy,
2478                                                                entry_jlong_arraycopy,
2479                                                                entry_checkcast_arraycopy);
2480 
2481     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2482     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2483     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2484     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2485     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2486     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2487   }
2488 
2489   void generate_math_stubs() { Unimplemented(); }
2490 
2491   // Arguments:
2492   //
2493   // Inputs:
2494   //   c_rarg0   - source byte array address
2495   //   c_rarg1   - destination byte array address
2496   //   c_rarg2   - K (key) in little endian int array
2497   //
2498   address generate_aescrypt_encryptBlock() {
2499     __ align(CodeEntryAlignment);
2500     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2501 
2502     Label L_doLast;
2503 
2504     const Register from        = c_rarg0;  // source array address
2505     const Register to          = c_rarg1;  // destination array address
2506     const Register key         = c_rarg2;  // key array address
2507     const Register keylen      = rscratch1;
2508 
2509     address start = __ pc();
2510     __ enter();
2511 
2512     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2513 
2514     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2515 
2516     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2517     __ rev32(v1, __ T16B, v1);
2518     __ rev32(v2, __ T16B, v2);
2519     __ rev32(v3, __ T16B, v3);
2520     __ rev32(v4, __ T16B, v4);
2521     __ aese(v0, v1);
2522     __ aesmc(v0, v0);
2523     __ aese(v0, v2);
2524     __ aesmc(v0, v0);
2525     __ aese(v0, v3);
2526     __ aesmc(v0, v0);
2527     __ aese(v0, v4);
2528     __ aesmc(v0, v0);
2529 
2530     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2531     __ rev32(v1, __ T16B, v1);
2532     __ rev32(v2, __ T16B, v2);
2533     __ rev32(v3, __ T16B, v3);
2534     __ rev32(v4, __ T16B, v4);
2535     __ aese(v0, v1);
2536     __ aesmc(v0, v0);
2537     __ aese(v0, v2);
2538     __ aesmc(v0, v0);
2539     __ aese(v0, v3);
2540     __ aesmc(v0, v0);
2541     __ aese(v0, v4);
2542     __ aesmc(v0, v0);
2543 
2544     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2545     __ rev32(v1, __ T16B, v1);
2546     __ rev32(v2, __ T16B, v2);
2547 
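         // The expanded key has 44, 52 or 60 32-bit words for AES-128, -192 and
         // -256 respectively; shorter keys branch past the extra rounds below.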
2548     __ cmpw(keylen, 44);
2549     __ br(Assembler::EQ, L_doLast);
2550 
2551     __ aese(v0, v1);
2552     __ aesmc(v0, v0);
2553     __ aese(v0, v2);
2554     __ aesmc(v0, v0);
2555 
2556     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2557     __ rev32(v1, __ T16B, v1);
2558     __ rev32(v2, __ T16B, v2);
2559 
2560     __ cmpw(keylen, 52);
2561     __ br(Assembler::EQ, L_doLast);
2562 
2563     __ aese(v0, v1);
2564     __ aesmc(v0, v0);
2565     __ aese(v0, v2);
2566     __ aesmc(v0, v0);
2567 
2568     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2569     __ rev32(v1, __ T16B, v1);
2570     __ rev32(v2, __ T16B, v2);
2571 
2572     __ BIND(L_doLast);
2573 
2574     __ aese(v0, v1);
2575     __ aesmc(v0, v0);
2576     __ aese(v0, v2);
2577 
2578     __ ld1(v1, __ T16B, key);
2579     __ rev32(v1, __ T16B, v1);
2580     __ eor(v0, __ T16B, v0, v1);
2581 
2582     __ st1(v0, __ T16B, to);
2583 
2584     __ mov(r0, 0);
2585 
2586     __ leave();
2587     __ ret(lr);
2588 
2589     return start;
2590   }
2591 
2592   // Arguments:
2593   //
2594   // Inputs:
2595   //   c_rarg0   - source byte array address
2596   //   c_rarg1   - destination byte array address
2597   //   c_rarg2   - K (key) in little endian int array
2598   //
2599   address generate_aescrypt_decryptBlock() {
2600     assert(UseAES, "need AES instructions");
2601     __ align(CodeEntryAlignment);
2602     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2603     Label L_doLast;
2604 
2605     const Register from        = c_rarg0;  // source array address
2606     const Register to          = c_rarg1;  // destination array address
2607     const Register key         = c_rarg2;  // key array address
2608     const Register keylen      = rscratch1;
2609 
2610     address start = __ pc();
2611     __ enter(); // required for proper stackwalking of RuntimeStub frame
2612 
2613     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2614 
2615     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2616 
2617     __ ld1(v5, __ T16B, __ post(key, 16));
2618     __ rev32(v5, __ T16B, v5);
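         // v5 now holds the first 16 bytes of the key schedule; it is set aside
         // and applied as the final XOR after the last aesd round.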
2619 
2620     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2621     __ rev32(v1, __ T16B, v1);
2622     __ rev32(v2, __ T16B, v2);
2623     __ rev32(v3, __ T16B, v3);
2624     __ rev32(v4, __ T16B, v4);
2625     __ aesd(v0, v1);
2626     __ aesimc(v0, v0);
2627     __ aesd(v0, v2);
2628     __ aesimc(v0, v0);
2629     __ aesd(v0, v3);
2630     __ aesimc(v0, v0);
2631     __ aesd(v0, v4);
2632     __ aesimc(v0, v0);
2633 
2634     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2635     __ rev32(v1, __ T16B, v1);
2636     __ rev32(v2, __ T16B, v2);
2637     __ rev32(v3, __ T16B, v3);
2638     __ rev32(v4, __ T16B, v4);
2639     __ aesd(v0, v1);
2640     __ aesimc(v0, v0);
2641     __ aesd(v0, v2);
2642     __ aesimc(v0, v0);
2643     __ aesd(v0, v3);
2644     __ aesimc(v0, v0);
2645     __ aesd(v0, v4);
2646     __ aesimc(v0, v0);
2647 
2648     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2649     __ rev32(v1, __ T16B, v1);
2650     __ rev32(v2, __ T16B, v2);
2651 
2652     __ cmpw(keylen, 44);
2653     __ br(Assembler::EQ, L_doLast);
2654 
2655     __ aesd(v0, v1);
2656     __ aesimc(v0, v0);
2657     __ aesd(v0, v2);
2658     __ aesimc(v0, v0);
2659 
2660     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2661     __ rev32(v1, __ T16B, v1);
2662     __ rev32(v2, __ T16B, v2);
2663 
2664     __ cmpw(keylen, 52);
2665     __ br(Assembler::EQ, L_doLast);
2666 
2667     __ aesd(v0, v1);
2668     __ aesimc(v0, v0);
2669     __ aesd(v0, v2);
2670     __ aesimc(v0, v0);
2671 
2672     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2673     __ rev32(v1, __ T16B, v1);
2674     __ rev32(v2, __ T16B, v2);
2675 
2676     __ BIND(L_doLast);
2677 
2678     __ aesd(v0, v1);
2679     __ aesimc(v0, v0);
2680     __ aesd(v0, v2);
2681 
2682     __ eor(v0, __ T16B, v0, v5);
2683 
2684     __ st1(v0, __ T16B, to);
2685 
2686     __ mov(r0, 0);
2687 
2688     __ leave();
2689     __ ret(lr);
2690 
2691     return start;
2692   }
2693 
2694   // Arguments:
2695   //
2696   // Inputs:
2697   //   c_rarg0   - source byte array address
2698   //   c_rarg1   - destination byte array address
2699   //   c_rarg2   - K (key) in little endian int array
2700   //   c_rarg3   - r vector byte array address
2701   //   c_rarg4   - input length
2702   //
2703   // Output:
2704   //   r0        - input length
2705   //
2706   address generate_cipherBlockChaining_encryptAESCrypt() {
2707     assert(UseAES, "need AES instructions");
2708     __ align(CodeEntryAlignment);
2709     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2710 
2711     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2712 
2713     const Register from        = c_rarg0;  // source array address
2714     const Register to          = c_rarg1;  // destination array address
2715     const Register key         = c_rarg2;  // key array address
2716     const Register rvec        = c_rarg3;  // r vector byte array address; initialized from the init
2717                                            // vector and left holding the last ciphertext block
2718     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2719     const Register keylen      = rscratch1;
2720 
2721     address start = __ pc();
2722 
2723       __ enter();
2724 
2725       __ movw(rscratch2, len_reg);
2726 
2727       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2728 
2729       __ ld1(v0, __ T16B, rvec);
2730 
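           // Load only the round keys this key size needs: AES-256 loads them
           // all, AES-192 skips the first pair (v17, v18) and AES-128 skips the
           // first two pairs.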
2731       __ cmpw(keylen, 52);
2732       __ br(Assembler::CC, L_loadkeys_44);
2733       __ br(Assembler::EQ, L_loadkeys_52);
2734 
2735       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2736       __ rev32(v17, __ T16B, v17);
2737       __ rev32(v18, __ T16B, v18);
2738     __ BIND(L_loadkeys_52);
2739       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2740       __ rev32(v19, __ T16B, v19);
2741       __ rev32(v20, __ T16B, v20);
2742     __ BIND(L_loadkeys_44);
2743       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2744       __ rev32(v21, __ T16B, v21);
2745       __ rev32(v22, __ T16B, v22);
2746       __ rev32(v23, __ T16B, v23);
2747       __ rev32(v24, __ T16B, v24);
2748       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2749       __ rev32(v25, __ T16B, v25);
2750       __ rev32(v26, __ T16B, v26);
2751       __ rev32(v27, __ T16B, v27);
2752       __ rev32(v28, __ T16B, v28);
2753       __ ld1(v29, v30, v31, __ T16B, key);
2754       __ rev32(v29, __ T16B, v29);
2755       __ rev32(v30, __ T16B, v30);
2756       __ rev32(v31, __ T16B, v31);
2757 
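         // CBC chaining: v0 carries the previous ciphertext block (initially the
         // IV loaded from rvec above) and is XORed into each plaintext block
         // before it is encrypted.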
2758     __ BIND(L_aes_loop);
2759       __ ld1(v1, __ T16B, __ post(from, 16));
2760       __ eor(v0, __ T16B, v0, v1);
2761 
2762       __ br(Assembler::CC, L_rounds_44);
2763       __ br(Assembler::EQ, L_rounds_52);
2764 
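           // Key sizes fall through: AES-256 runs every round below, AES-192
           // starts at L_rounds_52 and AES-128 at L_rounds_44.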
2765       __ aese(v0, v17); __ aesmc(v0, v0);
2766       __ aese(v0, v18); __ aesmc(v0, v0);
2767     __ BIND(L_rounds_52);
2768       __ aese(v0, v19); __ aesmc(v0, v0);
2769       __ aese(v0, v20); __ aesmc(v0, v0);
2770     __ BIND(L_rounds_44);
2771       __ aese(v0, v21); __ aesmc(v0, v0);
2772       __ aese(v0, v22); __ aesmc(v0, v0);
2773       __ aese(v0, v23); __ aesmc(v0, v0);
2774       __ aese(v0, v24); __ aesmc(v0, v0);
2775       __ aese(v0, v25); __ aesmc(v0, v0);
2776       __ aese(v0, v26); __ aesmc(v0, v0);
2777       __ aese(v0, v27); __ aesmc(v0, v0);
2778       __ aese(v0, v28); __ aesmc(v0, v0);
2779       __ aese(v0, v29); __ aesmc(v0, v0);
2780       __ aese(v0, v30);
2781       __ eor(v0, __ T16B, v0, v31);
2782 
2783       __ st1(v0, __ T16B, __ post(to, 16));
2784 
2785       __ subw(len_reg, len_reg, 16);
2786       __ cbnzw(len_reg, L_aes_loop);
2787 
2788       __ st1(v0, __ T16B, rvec);
2789 
2790       __ mov(r0, rscratch2);
2791 
2792       __ leave();
2793       __ ret(lr);
2794 
2795       return start;
2796   }
2797 
2798   // Arguments:
2799   //
2800   // Inputs:
2801   //   c_rarg0   - source byte array address
2802   //   c_rarg1   - destination byte array address
2803   //   c_rarg2   - K (key) in little endian int array
2804   //   c_rarg3   - r vector byte array address
2805   //   c_rarg4   - input length
2806   //
2807   // Output:
2808   //   r0        - input length
2809   //
2810   address generate_cipherBlockChaining_decryptAESCrypt() {
2811     assert(UseAES, "need AES cryptographic extension support");
2812     __ align(CodeEntryAlignment);
2813     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2814 
2815     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2816 
2817     const Register from        = c_rarg0;  // source array address
2818     const Register to          = c_rarg1;  // destination array address
2819     const Register key         = c_rarg2;  // key array address
2820     const Register rvec        = c_rarg3;  // r byte array initialized from the initvector array address
2821                                            // and left holding the last ciphertext block consumed
2822     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2823     const Register keylen      = rscratch1;
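         // Illustrative (non-normative) C-style sketch of the CBC decryption loop below.
         // Unlike encryption, the chain value is the previous *ciphertext* block, which is
         // why the code keeps a copy of each input block (v1/v2) before decrypting in place.
         // aes_decrypt_block/block_xor are hypothetical helpers for the AESD/NEON sequences:
         //
         //   memcpy(prev, rvec, 16);
         //   for (int i = 0; i < len; i += 16) {
         //     memcpy(c, from + i, 16);        // current ciphertext block
         //     memcpy(p, c, 16);
         //     aes_decrypt_block(p, key);      // p = D_K(c)
         //     block_xor(p, prev);             // p ^= previous ciphertext (or IV)
         //     memcpy(to + i, p, 16);
         //     memcpy(prev, c, 16);            // chain with the ciphertext just consumed
         //   }
         //   memcpy(rvec, prev, 16);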
2824 
2825     address start = __ pc();
2826 
2827       __ enter();
2828 
2829       __ movw(rscratch2, len_reg);
2830 
2831       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2832 
2833       __ ld1(v2, __ T16B, rvec);
2834 
2835       __ ld1(v31, __ T16B, __ post(key, 16));
2836       __ rev32(v31, __ T16B, v31);
2837 
2838       __ cmpw(keylen, 52);
2839       __ br(Assembler::CC, L_loadkeys_44);
2840       __ br(Assembler::EQ, L_loadkeys_52);
2841 
2842       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2843       __ rev32(v17, __ T16B, v17);
2844       __ rev32(v18, __ T16B, v18);
2845     __ BIND(L_loadkeys_52);
2846       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2847       __ rev32(v19, __ T16B, v19);
2848       __ rev32(v20, __ T16B, v20);
2849     __ BIND(L_loadkeys_44);
2850       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2851       __ rev32(v21, __ T16B, v21);
2852       __ rev32(v22, __ T16B, v22);
2853       __ rev32(v23, __ T16B, v23);
2854       __ rev32(v24, __ T16B, v24);
2855       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2856       __ rev32(v25, __ T16B, v25);
2857       __ rev32(v26, __ T16B, v26);
2858       __ rev32(v27, __ T16B, v27);
2859       __ rev32(v28, __ T16B, v28);
2860       __ ld1(v29, v30, __ T16B, key);
2861       __ rev32(v29, __ T16B, v29);
2862       __ rev32(v30, __ T16B, v30);
2863 
2864     __ BIND(L_aes_loop);
2865       __ ld1(v0, __ T16B, __ post(from, 16));
2866       __ orr(v1, __ T16B, v0, v0);
2867 
2868       __ br(Assembler::CC, L_rounds_44);
2869       __ br(Assembler::EQ, L_rounds_52);
2870 
2871       __ aesd(v0, v17); __ aesimc(v0, v0);
2872       __ aesd(v0, v18); __ aesimc(v0, v0);
2873     __ BIND(L_rounds_52);
2874       __ aesd(v0, v19); __ aesimc(v0, v0);
2875       __ aesd(v0, v20); __ aesimc(v0, v0);
2876     __ BIND(L_rounds_44);
2877       __ aesd(v0, v21); __ aesimc(v0, v0);
2878       __ aesd(v0, v22); __ aesimc(v0, v0);
2879       __ aesd(v0, v23); __ aesimc(v0, v0);
2880       __ aesd(v0, v24); __ aesimc(v0, v0);
2881       __ aesd(v0, v25); __ aesimc(v0, v0);
2882       __ aesd(v0, v26); __ aesimc(v0, v0);
2883       __ aesd(v0, v27); __ aesimc(v0, v0);
2884       __ aesd(v0, v28); __ aesimc(v0, v0);
2885       __ aesd(v0, v29); __ aesimc(v0, v0);
2886       __ aesd(v0, v30);
2887       __ eor(v0, __ T16B, v0, v31);
2888       __ eor(v0, __ T16B, v0, v2);
2889 
2890       __ st1(v0, __ T16B, __ post(to, 16));
2891       __ orr(v2, __ T16B, v1, v1);
2892 
2893       __ subw(len_reg, len_reg, 16);
2894       __ cbnzw(len_reg, L_aes_loop);
2895 
2896       __ st1(v2, __ T16B, rvec);
2897 
2898       __ mov(r0, rscratch2);
2899 
2900       __ leave();
2901       __ ret(lr);
2902 
2903     return start;
2904   }
2905 
2906   // Arguments:
2907   //
2908   // Inputs:
2909   //   c_rarg0   - byte[]  source+offset
2910   //   c_rarg1   - int[]   SHA.state
2911   //   c_rarg2   - int     offset
2912   //   c_rarg3   - int     limit
2913   //
2914   address generate_sha1_implCompress(bool multi_block, const char *name) {
2915     __ align(CodeEntryAlignment);
2916     StubCodeMark mark(this, "StubRoutines", name);
2917     address start = __ pc();
2918 
2919     Register buf   = c_rarg0;
2920     Register state = c_rarg1;
2921     Register ofs   = c_rarg2;
2922     Register limit = c_rarg3;
2923 
2924     Label keys;
2925     Label sha1_loop;
2926 
2927     // load the keys into v0..v3
2928     __ adr(rscratch1, keys);
2929     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2930     // load the 5-word state into v6, v7
2931     __ ldrq(v6, Address(state, 0));
2932     __ ldrs(v7, Address(state, 16));
2933 
2934 
2935     __ BIND(sha1_loop);
2936     // load 64 bytes of data into v16..v19
2937     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2938     __ rev32(v16, __ T16B, v16);
2939     __ rev32(v17, __ T16B, v17);
2940     __ rev32(v18, __ T16B, v18);
2941     __ rev32(v19, __ T16B, v19);
2942 
2943     // do the sha1
2944     __ addv(v4, __ T4S, v16, v0);
2945     __ orr(v20, __ T16B, v6, v6);
2946 
2947     FloatRegister d0 = v16;
2948     FloatRegister d1 = v17;
2949     FloatRegister d2 = v18;
2950     FloatRegister d3 = v19;
2951 
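         // Each iteration of the loop below retires four SHA-1 rounds (20 iterations
         // for the 80 rounds total): sha1c/sha1p/sha1m select the Ch/Parity/Maj round
         // functions, the per-group constant comes from v0..v3, and sha1su0/sha1su1
         // extend the message schedule held in d0..d3.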
2952     for (int round = 0; round < 20; round++) {
2953       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2954       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2955       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2956       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2957       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2958 
2959       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2960       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2961       __ sha1h(tmp2, __ T4S, v20);
2962       if (round < 5)
2963         __ sha1c(v20, __ T4S, tmp3, tmp4);
2964       else if (round < 10 || round >= 15)
2965         __ sha1p(v20, __ T4S, tmp3, tmp4);
2966       else
2967         __ sha1m(v20, __ T4S, tmp3, tmp4);
2968       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2969 
2970       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2971     }
2972 
2973     __ addv(v7, __ T2S, v7, v21);
2974     __ addv(v6, __ T4S, v6, v20);
2975 
2976     if (multi_block) {
2977       __ add(ofs, ofs, 64);
2978       __ cmp(ofs, limit);
2979       __ br(Assembler::LE, sha1_loop);
2980       __ mov(c_rarg0, ofs); // return ofs
2981     }
2982 
2983     __ strq(v6, Address(state, 0));
2984     __ strs(v7, Address(state, 16));
2985 
2986     __ ret(lr);
2987 
2988     __ bind(keys);
2989     __ emit_int32(0x5a827999);
2990     __ emit_int32(0x6ed9eba1);
2991     __ emit_int32(0x8f1bbcdc);
2992     __ emit_int32(0xca62c1d6);
2993 
2994     return start;
2995   }
2996 
2997 
2998   // Arguments:
2999   //
3000   // Inputs:
3001   //   c_rarg0   - byte[]  source+offset
3002   //   c_rarg1   - int[]   SHA.state
3003   //   c_rarg2   - int     offset
3004   //   c_rarg3   - int     limit
3005   //
3006   address generate_sha256_implCompress(bool multi_block, const char *name) {
3007     static const uint32_t round_consts[64] = {
3008       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3009       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3010       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3011       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3012       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3013       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3014       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3015       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3016       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3017       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3018       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3019       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3020       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3021       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3022       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3023       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3024     };
3025     __ align(CodeEntryAlignment);
3026     StubCodeMark mark(this, "StubRoutines", name);
3027     address start = __ pc();
3028 
3029     Register buf   = c_rarg0;
3030     Register state = c_rarg1;
3031     Register ofs   = c_rarg2;
3032     Register limit = c_rarg3;
3033 
3034     Label sha1_loop;
3035 
3036     __ stpd(v8, v9, __ pre(sp, -32));
3037     __ stpd(v10, v11, Address(sp, 16));
3038 
3039 // dga == v0
3040 // dgb == v1
3041 // dg0 == v2
3042 // dg1 == v3
3043 // dg2 == v4
3044 // t0 == v6
3045 // t1 == v7
3046 
3047     // load 16 keys to v16..v31
3048     __ lea(rscratch1, ExternalAddress((address)round_consts));
3049     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3050     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3051     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3052     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3053 
3054     // load the 8-word (256-bit) state
3055     __ ldpq(v0, v1, state);
3056 
3057     __ BIND(sha1_loop);
3058     // load 64 bytes of data into v8..v11
3059     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3060     __ rev32(v8, __ T16B, v8);
3061     __ rev32(v9, __ T16B, v9);
3062     __ rev32(v10, __ T16B, v10);
3063     __ rev32(v11, __ T16B, v11);
3064 
3065     __ addv(v6, __ T4S, v8, v16);
3066     __ orr(v2, __ T16B, v0, v0);
3067     __ orr(v3, __ T16B, v1, v1);
3068 
3069     FloatRegister d0 = v8;
3070     FloatRegister d1 = v9;
3071     FloatRegister d2 = v10;
3072     FloatRegister d3 = v11;
3073 
3074 
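         // Each iteration of the loop below retires four SHA-256 rounds (16 iterations
         // for the 64 rounds total): sha256h/sha256h2 update the two state halves,
         // sha256su0/sha256su1 extend the message schedule in d0..d3, and the round
         // constants come from v16..v31 (loaded above, four constants per vector).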
3075     for (int round = 0; round < 16; round++) {
3076       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3077       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3078       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3079       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3080 
3081       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3082        __ orr(v4, __ T16B, v2, v2);
3083       if (round < 15)
3084         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3085       __ sha256h(v2, __ T4S, v3, tmp2);
3086       __ sha256h2(v3, __ T4S, v4, tmp2);
3087       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3088 
3089       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3090     }
3091 
3092     __ addv(v0, __ T4S, v0, v2);
3093     __ addv(v1, __ T4S, v1, v3);
3094 
3095     if (multi_block) {
3096       __ add(ofs, ofs, 64);
3097       __ cmp(ofs, limit);
3098       __ br(Assembler::LE, sha1_loop);
3099       __ mov(c_rarg0, ofs); // return ofs
3100     }
3101 
3102     __ ldpd(v10, v11, Address(sp, 16));
3103     __ ldpd(v8, v9, __ post(sp, 32));
3104 
3105     __ stpq(v0, v1, state);
3106 
3107     __ ret(lr);
3108 
3109     return start;
3110   }
3111 
3112 #ifndef BUILTIN_SIM
3113   // Safefetch stubs.
3114   void generate_safefetch(const char* name, int size, address* entry,
3115                           address* fault_pc, address* continuation_pc) {
3116     // safefetch signatures:
3117     //   int      SafeFetch32(int*      adr, int      errValue);
3118     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3119     //
3120     // arguments:
3121     //   c_rarg0 = adr
3122     //   c_rarg1 = errValue
3123     //
3124     // result:
3125     //   r0       = *adr or errValue
3126 
3127     StubCodeMark mark(this, "StubRoutines", name);
3128 
3129     // Entry point, pc or function descriptor.
3130     *entry = __ pc();
3131 
3132     // Load *adr into c_rarg1, may fault.
3133     *fault_pc = __ pc();
3134     switch (size) {
3135       case 4:
3136         // int32_t
3137         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3138         break;
3139       case 8:
3140         // int64_t
3141         __ ldr(c_rarg1, Address(c_rarg0, 0));
3142         break;
3143       default:
3144         ShouldNotReachHere();
3145     }
3146 
3147     // return errValue or *adr
3148     *continuation_pc = __ pc();
3149     __ mov(r0, c_rarg1);
3150     __ ret(lr);
3151   }
3152 #endif
3153 
3154   /**
3155    *  Arguments:
3156    *
3157    * Inputs:
3158    *   c_rarg0   - int crc
3159    *   c_rarg1   - byte* buf
3160    *   c_rarg2   - int length
3161    *
3162    * Output:
3163    *       r0   - int crc result
3164    */
3165   address generate_updateBytesCRC32() {
3166     assert(UseCRC32Intrinsics, "what are we doing here?");
3167 
3168     __ align(CodeEntryAlignment);
3169     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3170 
3171     address start = __ pc();
3172 
3173     const Register crc   = c_rarg0;  // crc
3174     const Register buf   = c_rarg1;  // source java byte array address
3175     const Register len   = c_rarg2;  // length
3176     const Register table0 = c_rarg3; // crc_table address
3177     const Register table1 = c_rarg4;
3178     const Register table2 = c_rarg5;
3179     const Register table3 = c_rarg6;
3180     const Register tmp3 = c_rarg7;
3181 
3182     BLOCK_COMMENT("Entry:");
3183     __ enter(); // required for proper stackwalking of RuntimeStub frame
3184 
3185     __ kernel_crc32(crc, buf, len,
3186               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3187 
3188     __ leave(); // required for proper stackwalking of RuntimeStub frame
3189     __ ret(lr);
3190 
3191     return start;
3192   }
3193 
3194   /**
3195    *  Arguments:
3196    *
3197    * Inputs:
3198    *   c_rarg0   - int crc
3199    *   c_rarg1   - byte* buf
3200    *   c_rarg2   - int length
3201    *   c_rarg3   - int* table
3202    *
3203    * Output:
3204    *       r0   - int crc result
3205    */
3206   address generate_updateBytesCRC32C() {
3207     assert(UseCRC32CIntrinsics, "what are we doing here?");
3208 
3209     __ align(CodeEntryAlignment);
3210     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3211 
3212     address start = __ pc();
3213 
3214     const Register crc   = c_rarg0;  // crc
3215     const Register buf   = c_rarg1;  // source java byte array address
3216     const Register len   = c_rarg2;  // length
3217     const Register table0 = c_rarg3; // crc_table address
3218     const Register table1 = c_rarg4;
3219     const Register table2 = c_rarg5;
3220     const Register table3 = c_rarg6;
3221     const Register tmp3 = c_rarg7;
3222 
3223     BLOCK_COMMENT("Entry:");
3224     __ enter(); // required for proper stackwalking of RuntimeStub frame
3225 
3226     __ kernel_crc32c(crc, buf, len,
3227               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3228 
3229     __ leave(); // required for proper stackwalking of RuntimeStub frame
3230     __ ret(lr);
3231 
3232     return start;
3233   }
3234 
3235   /***
3236    *  Arguments:
3237    *
3238    *  Inputs:
3239    *   c_rarg0   - int   adler
3240    *   c_rarg1   - byte* buff
3241    *   c_rarg2   - int   len
3242    *
3243    * Output:
3244    *   c_rarg0   - int adler result
3245    */
3246   address generate_updateBytesAdler32() {
3247     __ align(CodeEntryAlignment);
3248     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3249     address start = __ pc();
3250 
3251     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3252 
3253     // Aliases
3254     Register adler  = c_rarg0;
3255     Register s1     = c_rarg0;
3256     Register s2     = c_rarg3;
3257     Register buff   = c_rarg1;
3258     Register len    = c_rarg2;
3259     Register nmax  = r4;
3260     Register base  = r5;
3261     Register count = r6;
3262     Register temp0 = rscratch1;
3263     Register temp1 = rscratch2;
3264     FloatRegister vbytes = v0;
3265     FloatRegister vs1acc = v1;
3266     FloatRegister vs2acc = v2;
3267     FloatRegister vtable = v3;
3268 
3269     // Max number of bytes we can process before having to take the mod
3270     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3271     unsigned long BASE = 0xfff1;
3272     unsigned long NMAX = 0x15B0;
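         // BASE is 65521, so 2^16 == 15 (mod BASE).  The lsr/lsl/sub/add reduction
         // sequences below exploit this identity instead of issuing a division; an
         // equivalent C sketch (illustrative only) of the full reduction used after
         // the accumulation loops is:
         //
         //   x = (x >> 16) * 15 + (x & 0xffff);   // x < 2^32  ->  x < 2^20
         //   x = (x >> 16) * 15 + (x & 0xffff);   // x < 2^20  ->  x < 2 * BASE
         //   if (x >= BASE) x -= BASE;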
3273 
3274     __ mov(base, BASE);
3275     __ mov(nmax, NMAX);
3276 
3277     // Load accumulation coefficients for the upper 16 bits
3278     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3279     __ ld1(vtable, __ T16B, Address(temp0));
3280 
3281     // s1 is initialized to the lower 16 bits of adler
3282     // s2 is initialized to the upper 16 bits of adler
3283     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3284     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3285 
3286     // The pipelined loop needs at least 16 elements for one iteration
3287     // It does check this, but it is more effective to skip straight to the cleanup loop
3288     __ cmp(len, (u1)16);
3289     __ br(Assembler::HS, L_nmax);
3290     __ cbz(len, L_combine);
3291 
3292     __ bind(L_simple_by1_loop);
3293     __ ldrb(temp0, Address(__ post(buff, 1)));
3294     __ add(s1, s1, temp0);
3295     __ add(s2, s2, s1);
3296     __ subs(len, len, 1);
3297     __ br(Assembler::HI, L_simple_by1_loop);
3298 
3299     // s1 = s1 % BASE
3300     __ subs(temp0, s1, base);
3301     __ csel(s1, temp0, s1, Assembler::HS);
3302 
3303     // s2 = s2 % BASE
3304     __ lsr(temp0, s2, 16);
3305     __ lsl(temp1, temp0, 4);
3306     __ sub(temp1, temp1, temp0);
3307     __ add(s2, temp1, s2, ext::uxth);
3308 
3309     __ subs(temp0, s2, base);
3310     __ csel(s2, temp0, s2, Assembler::HS);
3311 
3312     __ b(L_combine);
3313 
3314     __ bind(L_nmax);
3315     __ subs(len, len, nmax);
3316     __ sub(count, nmax, 16);
3317     __ br(Assembler::LO, L_by16);
3318 
3319     __ bind(L_nmax_loop);
3320 
3321     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3322                                       vbytes, vs1acc, vs2acc, vtable);
3323 
3324     __ subs(count, count, 16);
3325     __ br(Assembler::HS, L_nmax_loop);
3326 
3327     // s1 = s1 % BASE
3328     __ lsr(temp0, s1, 16);
3329     __ lsl(temp1, temp0, 4);
3330     __ sub(temp1, temp1, temp0);
3331     __ add(temp1, temp1, s1, ext::uxth);
3332 
3333     __ lsr(temp0, temp1, 16);
3334     __ lsl(s1, temp0, 4);
3335     __ sub(s1, s1, temp0);
3336     __ add(s1, s1, temp1, ext:: uxth);
3337 
3338     __ subs(temp0, s1, base);
3339     __ csel(s1, temp0, s1, Assembler::HS);
3340 
3341     // s2 = s2 % BASE
3342     __ lsr(temp0, s2, 16);
3343     __ lsl(temp1, temp0, 4);
3344     __ sub(temp1, temp1, temp0);
3345     __ add(temp1, temp1, s2, ext::uxth);
3346 
3347     __ lsr(temp0, temp1, 16);
3348     __ lsl(s2, temp0, 4);
3349     __ sub(s2, s2, temp0);
3350     __ add(s2, s2, temp1, ext:: uxth);
3351 
3352     __ subs(temp0, s2, base);
3353     __ csel(s2, temp0, s2, Assembler::HS);
3354 
3355     __ subs(len, len, nmax);
3356     __ sub(count, nmax, 16);
3357     __ br(Assembler::HS, L_nmax_loop);
3358 
3359     __ bind(L_by16);
3360     __ adds(len, len, count);
3361     __ br(Assembler::LO, L_by1);
3362 
3363     __ bind(L_by16_loop);
3364 
3365     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3366                                       vbytes, vs1acc, vs2acc, vtable);
3367 
3368     __ subs(len, len, 16);
3369     __ br(Assembler::HS, L_by16_loop);
3370 
3371     __ bind(L_by1);
3372     __ adds(len, len, 15);
3373     __ br(Assembler::LO, L_do_mod);
3374 
3375     __ bind(L_by1_loop);
3376     __ ldrb(temp0, Address(__ post(buff, 1)));
3377     __ add(s1, temp0, s1);
3378     __ add(s2, s2, s1);
3379     __ subs(len, len, 1);
3380     __ br(Assembler::HS, L_by1_loop);
3381 
3382     __ bind(L_do_mod);
3383     // s1 = s1 % BASE
3384     __ lsr(temp0, s1, 16);
3385     __ lsl(temp1, temp0, 4);
3386     __ sub(temp1, temp1, temp0);
3387     __ add(temp1, temp1, s1, ext::uxth);
3388 
3389     __ lsr(temp0, temp1, 16);
3390     __ lsl(s1, temp0, 4);
3391     __ sub(s1, s1, temp0);
3392     __ add(s1, s1, temp1, ext:: uxth);
3393 
3394     __ subs(temp0, s1, base);
3395     __ csel(s1, temp0, s1, Assembler::HS);
3396 
3397     // s2 = s2 % BASE
3398     __ lsr(temp0, s2, 16);
3399     __ lsl(temp1, temp0, 4);
3400     __ sub(temp1, temp1, temp0);
3401     __ add(temp1, temp1, s2, ext::uxth);
3402 
3403     __ lsr(temp0, temp1, 16);
3404     __ lsl(s2, temp0, 4);
3405     __ sub(s2, s2, temp0);
3406     __ add(s2, s2, temp1, ext:: uxth);
3407 
3408     __ subs(temp0, s2, base);
3409     __ csel(s2, temp0, s2, Assembler::HS);
3410 
3411     // Combine lower bits and higher bits
3412     __ bind(L_combine);
3413     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3414 
3415     __ ret(lr);
3416 
3417     return start;
3418   }
3419 
3420   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3421           Register temp0, Register temp1, FloatRegister vbytes,
3422           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3423     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3424     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3425     // In non-vectorized code, we update s1 and s2 as:
3426     //   s1 <- s1 + b1
3427     //   s2 <- s2 + s1
3428     //   s1 <- s1 + b2
3429     //   s2 <- s2 + s1
3430     //   ...
3431     //   s1 <- s1 + b16
3432     //   s2 <- s2 + s1
3433     // Putting above assignments together, we have:
3434     //   s1_new = s1 + b1 + b2 + ... + b16
3435     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3436     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3437     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
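         //
         // For reference, a scalar C sketch of the same 16-byte step (illustrative only):
         //
         //   for (int i = 0; i < 16; i++) { s1 += b[i]; s2 += s1; }
         //
         // which, per the derivation above, the vector code evaluates as
         //   s2 += 16 * s1;  s1 += sum(b[i]);  s2 += dot(b, {16, 15, ..., 1});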
3438     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3439 
3440     // s2 = s2 + s1 * 16
3441     __ add(s2, s2, s1, Assembler::LSL, 4);
3442 
3443     // vs1acc = b1 + b2 + b3 + ... + b16
3444     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3445     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3446     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3447     __ uaddlv(vs1acc, __ T16B, vbytes);
3448     __ uaddlv(vs2acc, __ T8H, vs2acc);
3449 
3450     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3451     __ fmovd(temp0, vs1acc);
3452     __ fmovd(temp1, vs2acc);
3453     __ add(s1, s1, temp0);
3454     __ add(s2, s2, temp1);
3455   }
3456 
3457   /**
3458    *  Arguments:
3459    *
3460    *  Input:
3461    *    c_rarg0   - x address
3462    *    c_rarg1   - x length
3463    *    c_rarg2   - y address
3464    *    c_rarg3   - y length
3465    *    c_rarg4   - z address
3466    *    c_rarg5   - z length
3467    */
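       //
       // A simplified scalar sketch of what the intrinsic computes (schoolbook
       // multiplication of 32-bit limbs stored most-significant first, as in
       // java.math.BigInteger; illustrative only and assumes z[] is zero-initialized):
       //
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     uint64_t carry = 0;
       //     for (int j = ylen - 1, k = ylen + i; j >= 0; j--, k--) {
       //       uint64_t product = (uint64_t)y[j] * x[i] + z[k] + carry;
       //       z[k]  = (uint32_t)product;
       //       carry = product >> 32;
       //     }
       //     z[i] = (uint32_t)carry;
       //   }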
3468   address generate_multiplyToLen() {
3469     __ align(CodeEntryAlignment);
3470     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3471 
3472     address start = __ pc();
3473     const Register x     = r0;
3474     const Register xlen  = r1;
3475     const Register y     = r2;
3476     const Register ylen  = r3;
3477     const Register z     = r4;
3478     const Register zlen  = r5;
3479 
3480     const Register tmp1  = r10;
3481     const Register tmp2  = r11;
3482     const Register tmp3  = r12;
3483     const Register tmp4  = r13;
3484     const Register tmp5  = r14;
3485     const Register tmp6  = r15;
3486     const Register tmp7  = r16;
3487 
3488     BLOCK_COMMENT("Entry:");
3489     __ enter(); // required for proper stackwalking of RuntimeStub frame
3490     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3491     __ leave(); // required for proper stackwalking of RuntimeStub frame
3492     __ ret(lr);
3493 
3494     return start;
3495   }
3496 
3497   address generate_squareToLen() {
3498     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
3499     // faster than multiply_to_len on some CPUs and slower on others, but
3500     // multiply_to_len shows slightly better results overall
3501     __ align(CodeEntryAlignment);
3502     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3503     address start = __ pc();
3504 
3505     const Register x     = r0;
3506     const Register xlen  = r1;
3507     const Register z     = r2;
3508     const Register zlen  = r3;
3509     const Register y     = r4; // == x
3510     const Register ylen  = r5; // == xlen
3511 
3512     const Register tmp1  = r10;
3513     const Register tmp2  = r11;
3514     const Register tmp3  = r12;
3515     const Register tmp4  = r13;
3516     const Register tmp5  = r14;
3517     const Register tmp6  = r15;
3518     const Register tmp7  = r16;
3519 
3520     RegSet spilled_regs = RegSet::of(y, ylen);
3521     BLOCK_COMMENT("Entry:");
3522     __ enter();
3523     __ push(spilled_regs, sp);
3524     __ mov(y, x);
3525     __ mov(ylen, xlen);
3526     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3527     __ pop(spilled_regs, sp);
3528     __ leave();
3529     __ ret(lr);
3530     return start;
3531   }
3532 
3533   address generate_mulAdd() {
3534     __ align(CodeEntryAlignment);
3535     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3536 
3537     address start = __ pc();
3538 
3539     const Register out     = r0;
3540     const Register in      = r1;
3541     const Register offset  = r2;
3542     const Register len     = r3;
3543     const Register k       = r4;
3544 
3545     BLOCK_COMMENT("Entry:");
3546     __ enter();
3547     __ mul_add(out, in, offset, len, k);
3548     __ leave();
3549     __ ret(lr);
3550 
3551     return start;
3552   }
3553 
3554   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3555                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3556                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3557     // Karatsuba multiplication performs a 128*128 -> 256-bit
3558     // multiplication in three 128-bit multiplications and a few
3559     // additions.
3560     //
3561     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3562     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3563     //
3564     // Inputs:
3565     //
3566     // A0 in a.d[0]     (subkey)
3567     // A1 in a.d[1]
3568     // (A1+A0) in a1_xor_a0.d[0]
3569     //
3570     // B0 in b.d[0]     (state)
3571     // B1 in b.d[1]
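         //
         // Since addition in GF(2)[z] is XOR, the middle 128-bit term is recovered as
         //   A1*B0 + A0*B1 = (A1+A0)(B1+B0) + A1*B1 + A0*B0 = E + C + D,
         // which is why tmp2 below is XORed with pieces of both C (= result_hi)
         // and D (= result_lo) before being folded into the final 256-bit result.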
3572 
3573     __ ext(tmp1, __ T16B, b, b, 0x08);
3574     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3575     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3576     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3577     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3578 
3579     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3580     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3581     __ eor(tmp2, __ T16B, tmp2, tmp4);
3582     __ eor(tmp2, __ T16B, tmp2, tmp3);
3583 
3584     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3585     __ ins(result_hi, __ D, tmp2, 0, 1);
3586     __ ins(result_lo, __ D, tmp2, 1, 0);
3587   }
3588 
3589   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3590                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3591     const FloatRegister t0 = result;
3592 
3593     // The GCM field polynomial f is z^128 + p(z), where p =
3594     // z^7+z^2+z+1.
3595     //
3596     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3597     //
3598     // so, given that the product we're reducing is
3599     //    a == lo + hi * z^128
3600     // substituting,
3601     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3602     //
3603     // we reduce by multiplying hi by p(z) and subtracting the result
3604     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3605     // bits we can do this with two 64-bit multiplications, lo*p and
3606     // hi*p.
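         //
         // The folding is done in two 64-bit steps: pmull2 multiplies the upper half
         // of hi by p, and its 128-bit product, which straddles the hi:lo boundary,
         // is XORed in via the two ext/eor pairs; the remaining pmull then folds the
         // (updated) lower half of hi into lo to produce the reduced 128-bit result.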
3607 
3608     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3609     __ ext(t1, __ T16B, t0, z, 8);
3610     __ eor(hi, __ T16B, hi, t1);
3611     __ ext(t1, __ T16B, z, t0, 8);
3612     __ eor(lo, __ T16B, lo, t1);
3613     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3614     __ eor(result, __ T16B, lo, t0);
3615   }
3616 
3617   address generate_has_negatives(address &has_negatives_long) {
3618     const u1 large_loop_size = 64;
3619     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3620     int dcache_line = VM_Version::dcache_line_size();
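         // A byte is "negative" exactly when its top bit is set, so the whole check
         // reduces to OR-ing chunks of the array together and testing them against
         // UPPER_BIT_MASK.  Illustrative C sketch of the idea (the stub below does the
         // same thing 8 to 64 bytes at a time, with alignment and tail handling):
         //
         //   bool has_negatives(const int8_t* a, size_t len) {
         //     uint8_t acc = 0;
         //     for (size_t i = 0; i < len; i++) acc |= (uint8_t)a[i];
         //     return (acc & 0x80) != 0;
         //   }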
3621 
3622     Register ary1 = r1, len = r2, result = r0;
3623 
3624     __ align(CodeEntryAlignment);
3625 
3626     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3627 
3628     address entry = __ pc();
3629 
3630     __ enter();
3631 
3632   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3633         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3634 
3635   __ cmp(len, (u1)15);
3636   __ br(Assembler::GT, LEN_OVER_15);
3637   // The only case when execution falls into this code is when the pointer is near
3638   // the end of a memory page and we have to avoid reading the next page
3639   __ add(ary1, ary1, len);
3640   __ subs(len, len, 8);
3641   __ br(Assembler::GT, LEN_OVER_8);
3642   __ ldr(rscratch2, Address(ary1, -8));
3643   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3644   __ lsrv(rscratch2, rscratch2, rscratch1);
3645   __ tst(rscratch2, UPPER_BIT_MASK);
3646   __ cset(result, Assembler::NE);
3647   __ leave();
3648   __ ret(lr);
3649   __ bind(LEN_OVER_8);
3650   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3651   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3652   __ tst(rscratch2, UPPER_BIT_MASK);
3653   __ br(Assembler::NE, RET_TRUE_NO_POP);
3654   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3655   __ lsrv(rscratch1, rscratch1, rscratch2);
3656   __ tst(rscratch1, UPPER_BIT_MASK);
3657   __ cset(result, Assembler::NE);
3658   __ leave();
3659   __ ret(lr);
3660 
3661   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3662   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3663 
3664   has_negatives_long = __ pc(); // 2nd entry point
3665 
3666   __ enter();
3667 
3668   __ bind(LEN_OVER_15);
3669     __ push(spilled_regs, sp);
3670     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3671     __ cbz(rscratch2, ALIGNED);
3672     __ ldp(tmp6, tmp1, Address(ary1));
3673     __ mov(tmp5, 16);
3674     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3675     __ add(ary1, ary1, rscratch1);
3676     __ sub(len, len, rscratch1);
3677     __ orr(tmp6, tmp6, tmp1);
3678     __ tst(tmp6, UPPER_BIT_MASK);
3679     __ br(Assembler::NE, RET_TRUE);
3680 
3681   __ bind(ALIGNED);
3682     __ cmp(len, large_loop_size);
3683     __ br(Assembler::LT, CHECK_16);
3684     // Perform a 16-byte load as an early return in the pre-loop to handle the situation
3685     // when an initially aligned large array has negative values at its starting bytes,
3686     // in which case LARGE_LOOP would do 4 reads instead of 1 (in the worst case), which is
3687     // slower. Cases with negative bytes further ahead won't be affected that
3688     // much. In fact, they'll be faster due to the early loads, fewer instructions and
3689     // fewer branches in LARGE_LOOP.
3690     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3691     __ sub(len, len, 16);
3692     __ orr(tmp6, tmp6, tmp1);
3693     __ tst(tmp6, UPPER_BIT_MASK);
3694     __ br(Assembler::NE, RET_TRUE);
3695     __ cmp(len, large_loop_size);
3696     __ br(Assembler::LT, CHECK_16);
3697 
3698     if (SoftwarePrefetchHintDistance >= 0
3699         && SoftwarePrefetchHintDistance >= dcache_line) {
3700       // initial prefetch
3701       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3702     }
3703   __ bind(LARGE_LOOP);
3704     if (SoftwarePrefetchHintDistance >= 0) {
3705       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3706     }
3707     // Issue the load instructions first, since that can save a few CPU/MEM cycles. Also,
3708     // instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one for each ldp), it is
3709     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
3710     // instructions and has fewer branches, but this approach disables the
3711     // early return, so all 64 bytes are loaded and checked every time.
3712     __ ldp(tmp2, tmp3, Address(ary1));
3713     __ ldp(tmp4, tmp5, Address(ary1, 16));
3714     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3715     __ ldp(tmp6, tmp1, Address(ary1, 48));
3716     __ add(ary1, ary1, large_loop_size);
3717     __ sub(len, len, large_loop_size);
3718     __ orr(tmp2, tmp2, tmp3);
3719     __ orr(tmp4, tmp4, tmp5);
3720     __ orr(rscratch1, rscratch1, rscratch2);
3721     __ orr(tmp6, tmp6, tmp1);
3722     __ orr(tmp2, tmp2, tmp4);
3723     __ orr(rscratch1, rscratch1, tmp6);
3724     __ orr(tmp2, tmp2, rscratch1);
3725     __ tst(tmp2, UPPER_BIT_MASK);
3726     __ br(Assembler::NE, RET_TRUE);
3727     __ cmp(len, large_loop_size);
3728     __ br(Assembler::GE, LARGE_LOOP);
3729 
3730   __ bind(CHECK_16); // small 16-byte load pre-loop
3731     __ cmp(len, (u1)16);
3732     __ br(Assembler::LT, POST_LOOP16);
3733 
3734   __ bind(LOOP16); // small 16-byte load loop
3735     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3736     __ sub(len, len, 16);
3737     __ orr(tmp2, tmp2, tmp3);
3738     __ tst(tmp2, UPPER_BIT_MASK);
3739     __ br(Assembler::NE, RET_TRUE);
3740     __ cmp(len, (u1)16);
3741     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3742 
3743   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3744     __ cmp(len, (u1)8);
3745     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3746     __ ldr(tmp3, Address(__ post(ary1, 8)));
3747     __ sub(len, len, 8);
3748     __ tst(tmp3, UPPER_BIT_MASK);
3749     __ br(Assembler::NE, RET_TRUE);
3750 
3751   __ bind(POST_LOOP16_LOAD_TAIL);
3752     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3753     __ ldr(tmp1, Address(ary1));
3754     __ mov(tmp2, 64);
3755     __ sub(tmp4, tmp2, len, __ LSL, 3);
3756     __ lslv(tmp1, tmp1, tmp4);
3757     __ tst(tmp1, UPPER_BIT_MASK);
3758     __ br(Assembler::NE, RET_TRUE);
3759     // Fallthrough
3760 
3761   __ bind(RET_FALSE);
3762     __ pop(spilled_regs, sp);
3763     __ leave();
3764     __ mov(result, zr);
3765     __ ret(lr);
3766 
3767   __ bind(RET_TRUE);
3768     __ pop(spilled_regs, sp);
3769   __ bind(RET_TRUE_NO_POP);
3770     __ leave();
3771     __ mov(result, 1);
3772     __ ret(lr);
3773 
3774   __ bind(DONE);
3775     __ pop(spilled_regs, sp);
3776     __ leave();
3777     __ ret(lr);
3778     return entry;
3779   }
3780 
3781   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3782         bool usePrefetch, Label &NOT_EQUAL) {
3783     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3784         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3785         tmp7 = r12, tmp8 = r13;
3786     Label LOOP;
3787 
3788     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3789     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3790     __ bind(LOOP);
3791     if (usePrefetch) {
3792       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3793       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3794     }
3795     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3796     __ eor(tmp1, tmp1, tmp2);
3797     __ eor(tmp3, tmp3, tmp4);
3798     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3799     __ orr(tmp1, tmp1, tmp3);
3800     __ cbnz(tmp1, NOT_EQUAL);
3801     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3802     __ eor(tmp5, tmp5, tmp6);
3803     __ eor(tmp7, tmp7, tmp8);
3804     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3805     __ orr(tmp5, tmp5, tmp7);
3806     __ cbnz(tmp5, NOT_EQUAL);
3807     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3808     __ eor(tmp1, tmp1, tmp2);
3809     __ eor(tmp3, tmp3, tmp4);
3810     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3811     __ orr(tmp1, tmp1, tmp3);
3812     __ cbnz(tmp1, NOT_EQUAL);
3813     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3814     __ eor(tmp5, tmp5, tmp6);
3815     __ sub(cnt1, cnt1, 8 * wordSize);
3816     __ eor(tmp7, tmp7, tmp8);
3817     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3818     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3819     // cmp) because subs allows an unlimited range for the immediate operand.
3820     __ subs(tmp6, cnt1, loopThreshold);
3821     __ orr(tmp5, tmp5, tmp7);
3822     __ cbnz(tmp5, NOT_EQUAL);
3823     __ br(__ GE, LOOP);
3824     // post-loop
3825     __ eor(tmp1, tmp1, tmp2);
3826     __ eor(tmp3, tmp3, tmp4);
3827     __ orr(tmp1, tmp1, tmp3);
3828     __ sub(cnt1, cnt1, 2 * wordSize);
3829     __ cbnz(tmp1, NOT_EQUAL);
3830   }
3831 
3832   void generate_large_array_equals_loop_simd(int loopThreshold,
3833         bool usePrefetch, Label &NOT_EQUAL) {
3834     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3835         tmp2 = rscratch2;
3836     Label LOOP;
3837 
3838     __ bind(LOOP);
3839     if (usePrefetch) {
3840       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3841       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3842     }
3843     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3844     __ sub(cnt1, cnt1, 8 * wordSize);
3845     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3846     __ subs(tmp1, cnt1, loopThreshold);
3847     __ eor(v0, __ T16B, v0, v4);
3848     __ eor(v1, __ T16B, v1, v5);
3849     __ eor(v2, __ T16B, v2, v6);
3850     __ eor(v3, __ T16B, v3, v7);
3851     __ orr(v0, __ T16B, v0, v1);
3852     __ orr(v1, __ T16B, v2, v3);
3853     __ orr(v0, __ T16B, v0, v1);
3854     __ umov(tmp1, v0, __ D, 0);
3855     __ umov(tmp2, v0, __ D, 1);
3856     __ orr(tmp1, tmp1, tmp2);
3857     __ cbnz(tmp1, NOT_EQUAL);
3858     __ br(__ GE, LOOP);
3859   }
3860 
3861   // a1 = r1 - array1 address
3862   // a2 = r2 - array2 address
3863   // result = r0 - return value. Already contains "false"
3864   // cnt1 = r10 - number of elements left to check, reduced by wordSize
3865   // r3-r5 are reserved temporary registers
3866   address generate_large_array_equals() {
3867     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3868         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3869         tmp7 = r12, tmp8 = r13;
3870     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3871         SMALL_LOOP, POST_LOOP;
3872     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3873     // threshold chosen so that at least 32 of the prefetched bytes are actually used
3874     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3875     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3876     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3877     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3878         tmp5, tmp6, tmp7, tmp8);
3879 
3880     __ align(CodeEntryAlignment);
3881 
3882     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3883 
3884     address entry = __ pc();
3885     __ enter();
3886     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3887     // also advance pointers to use post-increment instead of pre-increment
3888     __ add(a1, a1, wordSize);
3889     __ add(a2, a2, wordSize);
3890     if (AvoidUnalignedAccesses) {
3891       // Both implementations (SIMD/non-SIMD) use relatively large load
3892       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
3893       // on some CPUs when the address is not at least 16-byte aligned.
3894       // Arrays are currently 8-byte aligned, so, if needed, we can do an additional
3895       // 8-byte load for the 1st address to make it 16-byte aligned.
3896       Label ALIGNED16;
3897       __ tbz(a1, 3, ALIGNED16);
3898       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3899       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3900       __ sub(cnt1, cnt1, wordSize);
3901       __ eor(tmp1, tmp1, tmp2);
3902       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3903       __ bind(ALIGNED16);
3904     }
3905     if (UseSIMDForArrayEquals) {
3906       if (SoftwarePrefetchHintDistance >= 0) {
3907         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3908         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3909         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3910             /* prfm = */ true, NOT_EQUAL);
3911         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3912         __ br(__ LT, TAIL);
3913       }
3914       __ bind(NO_PREFETCH_LARGE_LOOP);
3915       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3916           /* prfm = */ false, NOT_EQUAL);
3917     } else {
3918       __ push(spilled_regs, sp);
3919       if (SoftwarePrefetchHintDistance >= 0) {
3920         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3921         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3922         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3923             /* prfm = */ true, NOT_EQUAL);
3924         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3925         __ br(__ LT, TAIL);
3926       }
3927       __ bind(NO_PREFETCH_LARGE_LOOP);
3928       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3929           /* prfm = */ false, NOT_EQUAL);
3930     }
3931     __ bind(TAIL);
3932       __ cbz(cnt1, EQUAL);
3933       __ subs(cnt1, cnt1, wordSize);
3934       __ br(__ LE, POST_LOOP);
3935     __ bind(SMALL_LOOP);
3936       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3937       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3938       __ subs(cnt1, cnt1, wordSize);
3939       __ eor(tmp1, tmp1, tmp2);
3940       __ cbnz(tmp1, NOT_EQUAL);
3941       __ br(__ GT, SMALL_LOOP);
3942     __ bind(POST_LOOP);
3943       __ ldr(tmp1, Address(a1, cnt1));
3944       __ ldr(tmp2, Address(a2, cnt1));
3945       __ eor(tmp1, tmp1, tmp2);
3946       __ cbnz(tmp1, NOT_EQUAL);
3947     __ bind(EQUAL);
3948       __ mov(result, true);
3949     __ bind(NOT_EQUAL);
3950       if (!UseSIMDForArrayEquals) {
3951         __ pop(spilled_regs, sp);
3952       }
3953     __ bind(NOT_EQUAL_NO_POP);
3954     __ leave();
3955     __ ret(lr);
3956     return entry;
3957   }
3958 
3959   address generate_dsin_dcos(bool isCos) {
3960     __ align(CodeEntryAlignment);
3961     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3962     address start = __ pc();
3963     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3964         (address)StubRoutines::aarch64::_two_over_pi,
3965         (address)StubRoutines::aarch64::_pio2,
3966         (address)StubRoutines::aarch64::_dsin_coef,
3967         (address)StubRoutines::aarch64::_dcos_coef);
3968     return start;
3969   }
3970 
3971   address generate_dlog() {
3972     __ align(CodeEntryAlignment);
3973     StubCodeMark mark(this, "StubRoutines", "dlog");
3974     address entry = __ pc();
3975     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
3976         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
3977     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
3978     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
3979         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
3980     return entry;
3981   }
3982 
3983   // code for comparing 16 bytes of strings with same encoding
3984   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
3985     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
3986     __ ldr(rscratch1, Address(__ post(str1, 8)));
3987     __ eor(rscratch2, tmp1, tmp2);
3988     __ ldr(cnt1, Address(__ post(str2, 8)));
3989     __ cbnz(rscratch2, DIFF1);
3990     __ ldr(tmp1, Address(__ post(str1, 8)));
3991     __ eor(rscratch2, rscratch1, cnt1);
3992     __ ldr(tmp2, Address(__ post(str2, 8)));
3993     __ cbnz(rscratch2, DIFF2);
3994   }
3995 
3996   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
3997   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
3998       Label &DIFF2) {
3999     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4000     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4001 
4002     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4003     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4004     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4005     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4006 
4007     __ fmovd(tmpL, vtmp3);
4008     __ eor(rscratch2, tmp3, tmpL);
4009     __ cbnz(rscratch2, DIFF2);
4010 
4011     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4012     __ umov(tmpL, vtmp3, __ D, 1);
4013     __ eor(rscratch2, tmpU, tmpL);
4014     __ cbnz(rscratch2, DIFF1);
4015 
4016     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4017     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4018     __ fmovd(tmpL, vtmp);
4019     __ eor(rscratch2, tmp3, tmpL);
4020     __ cbnz(rscratch2, DIFF2);
4021 
4022     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4023     __ umov(tmpL, vtmp, __ D, 1);
4024     __ eor(rscratch2, tmpU, tmpL);
4025     __ cbnz(rscratch2, DIFF1);
4026   }
4027 
4028   // r0  = result
4029   // r1  = str1
4030   // r2  = cnt1
4031   // r3  = str2
4032   // r4  = cnt2
4033   // r10 = tmp1
4034   // r11 = tmp2
4035   address generate_compare_long_string_different_encoding(bool isLU) {
4036     __ align(CodeEntryAlignment);
4037     StubCodeMark mark(this, "StubRoutines", isLU
4038         ? "compare_long_string_different_encoding LU"
4039         : "compare_long_string_different_encoding UL");
4040     address entry = __ pc();
4041     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4042         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4043         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4044     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4045         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4046     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4047     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4048 
4049     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4050 
4051     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4052     // cnt2 == number of characters left to compare
4053     // Check the already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
4054     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4055     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4056     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4057     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4058     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4059     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4060     __ eor(rscratch2, tmp1, tmp2);
4061     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4062     __ mov(rscratch1, tmp2);
4063     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4064     Register strU = isLU ? str2 : str1,
4065              strL = isLU ? str1 : str2,
4066              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4067              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4068     __ push(spilled_regs, sp);
4069     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4070     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4071 
4072     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4073 
4074     if (SoftwarePrefetchHintDistance >= 0) {
4075       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4076       __ br(__ LT, SMALL_LOOP);
4077       __ bind(LARGE_LOOP_PREFETCH);
4078         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4079         __ mov(tmp4, 2);
4080         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4081         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4082           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4083           __ subs(tmp4, tmp4, 1);
4084           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4085           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4086           __ mov(tmp4, 2);
4087         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4088           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4089           __ subs(tmp4, tmp4, 1);
4090           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4091           __ sub(cnt2, cnt2, 64);
4092           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4093           __ br(__ GE, LARGE_LOOP_PREFETCH);
4094     }
4095     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4096     __ subs(cnt2, cnt2, 16);
4097     __ br(__ LT, TAIL);
4098     __ b(SMALL_LOOP_ENTER);
4099     __ bind(SMALL_LOOP); // smaller loop
4100       __ subs(cnt2, cnt2, 16);
4101     __ bind(SMALL_LOOP_ENTER);
4102       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4103       __ br(__ GE, SMALL_LOOP);
4104       __ cbz(cnt2, LOAD_LAST);
4105     __ bind(TAIL); // 1..15 characters left
4106       __ subs(zr, cnt2, -8);
4107       __ br(__ GT, TAIL_LOAD_16);
4108       __ ldrd(vtmp, Address(tmp2));
4109       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4110 
4111       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4112       __ fmovd(tmpL, vtmp3);
4113       __ eor(rscratch2, tmp3, tmpL);
4114       __ cbnz(rscratch2, DIFF2);
4115       __ umov(tmpL, vtmp3, __ D, 1);
4116       __ eor(rscratch2, tmpU, tmpL);
4117       __ cbnz(rscratch2, DIFF1);
4118       __ b(LOAD_LAST);
4119     __ bind(TAIL_LOAD_16);
4120       __ ldrq(vtmp, Address(tmp2));
4121       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4122       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4123       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4124       __ fmovd(tmpL, vtmp3);
4125       __ eor(rscratch2, tmp3, tmpL);
4126       __ cbnz(rscratch2, DIFF2);
4127 
4128       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4129       __ umov(tmpL, vtmp3, __ D, 1);
4130       __ eor(rscratch2, tmpU, tmpL);
4131       __ cbnz(rscratch2, DIFF1);
4132 
4133       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4134       __ fmovd(tmpL, vtmp);
4135       __ eor(rscratch2, tmp3, tmpL);
4136       __ cbnz(rscratch2, DIFF2);
4137 
4138       __ umov(tmpL, vtmp, __ D, 1);
4139       __ eor(rscratch2, tmpU, tmpL);
4140       __ cbnz(rscratch2, DIFF1);
4141       __ b(LOAD_LAST);
4142     __ bind(DIFF2);
4143       __ mov(tmpU, tmp3);
4144     __ bind(DIFF1);
4145       __ pop(spilled_regs, sp);
4146       __ b(CALCULATE_DIFFERENCE);
4147     __ bind(LOAD_LAST);
4148       __ pop(spilled_regs, sp);
4149 
4150       __ ldrs(vtmp, Address(strL));
4151       __ ldr(tmpU, Address(strU));
4152       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4153       __ fmovd(tmpL, vtmp);
4154 
4155       __ eor(rscratch2, tmpU, tmpL);
4156       __ cbz(rscratch2, DONE);
4157 
4158     // Find the first different characters in the longwords and
4159     // compute their difference.
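         //
         // Scalar sketch of the idea (illustrative only): with x = w1 ^ w2 != 0,
         //
         //   int shift = __builtin_ctzll(x) & ~15;   // char-aligned bit offset of first difference
         //   int diff  = (int)((w1 >> shift) & 0xffff) - (int)((w2 >> shift) & 0xffff);
         //
         // The code below obtains the same char-aligned offset with rev+clz, since
         // base AArch64 has no count-trailing-zeros instruction.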
4160     __ bind(CALCULATE_DIFFERENCE);
4161       __ rev(rscratch2, rscratch2);
4162       __ clz(rscratch2, rscratch2);
4163       __ andr(rscratch2, rscratch2, -16);
4164       __ lsrv(tmp1, tmp1, rscratch2);
4165       __ uxthw(tmp1, tmp1);
4166       __ lsrv(rscratch1, rscratch1, rscratch2);
4167       __ uxthw(rscratch1, rscratch1);
4168       __ subw(result, tmp1, rscratch1);
4169     __ bind(DONE);
4170       __ ret(lr);
4171     return entry;
4172   }
4173 
4174   // r0  = result
4175   // r1  = str1
4176   // r2  = cnt1
4177   // r3  = str2
4178   // r4  = cnt2
4179   // r10 = tmp1
4180   // r11 = tmp2
4181   address generate_compare_long_string_same_encoding(bool isLL) {
4182     __ align(CodeEntryAlignment);
4183     StubCodeMark mark(this, "StubRoutines", isLL
4184         ? "compare_long_string_same_encoding LL"
4185         : "compare_long_string_same_encoding UU");
4186     address entry = __ pc();
4187     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4188         tmp1 = r10, tmp2 = r11;
4189     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4190         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4191         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4192     // exit from the large loop when fewer than 64 bytes are left to read or we're about
4193     // to prefetch memory past the array boundary
4194     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4195     // cnt1/cnt2 contain the number of characters to compare. cnt1 can be re-used.
4196     // Update the cnt2 counter for the 8 bytes already loaded.
4197     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4198     // update pointers, because of previous read
4199     __ add(str1, str1, wordSize);
4200     __ add(str2, str2, wordSize);
4201     if (SoftwarePrefetchHintDistance >= 0) {
4202       __ bind(LARGE_LOOP_PREFETCH);
4203         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4204         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4205         compare_string_16_bytes_same(DIFF, DIFF2);
4206         compare_string_16_bytes_same(DIFF, DIFF2);
4207         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4208         compare_string_16_bytes_same(DIFF, DIFF2);
4209         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4210         compare_string_16_bytes_same(DIFF, DIFF2);
4211         __ br(__ GT, LARGE_LOOP_PREFETCH);
4212         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4213         // less than 16 bytes left?
4214         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4215         __ br(__ LT, TAIL);
4216     }
4217     __ bind(SMALL_LOOP);
4218       compare_string_16_bytes_same(DIFF, DIFF2);
4219       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4220       __ br(__ GE, SMALL_LOOP);
4221     __ bind(TAIL);
4222       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4223       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4224       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4225       __ br(__ LE, CHECK_LAST);
4226       __ eor(rscratch2, tmp1, tmp2);
4227       __ cbnz(rscratch2, DIFF);
4228       __ ldr(tmp1, Address(__ post(str1, 8)));
4229       __ ldr(tmp2, Address(__ post(str2, 8)));
4230       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4231     __ bind(CHECK_LAST);
4232       if (!isLL) {
4233         __ add(cnt2, cnt2, cnt2); // now in bytes
4234       }
4235       __ eor(rscratch2, tmp1, tmp2);
4236       __ cbnz(rscratch2, DIFF);
4237       __ ldr(rscratch1, Address(str1, cnt2));
4238       __ ldr(cnt1, Address(str2, cnt2));
4239       __ eor(rscratch2, rscratch1, cnt1);
4240       __ cbz(rscratch2, LENGTH_DIFF);
4241       // Find the first different characters in the longwords and
4242       // compute their difference.
4243     __ bind(DIFF2);
4244       __ rev(rscratch2, rscratch2);
4245       __ clz(rscratch2, rscratch2);
4246       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4247       __ lsrv(rscratch1, rscratch1, rscratch2);
4248       if (isLL) {
4249         __ lsrv(cnt1, cnt1, rscratch2);
4250         __ uxtbw(rscratch1, rscratch1);
4251         __ uxtbw(cnt1, cnt1);
4252       } else {
4253         __ lsrv(cnt1, cnt1, rscratch2);
4254         __ uxthw(rscratch1, rscratch1);
4255         __ uxthw(cnt1, cnt1);
4256       }
4257       __ subw(result, rscratch1, cnt1);
4258       __ b(LENGTH_DIFF);
4259     __ bind(DIFF);
4260       __ rev(rscratch2, rscratch2);
4261       __ clz(rscratch2, rscratch2);
4262       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4263       __ lsrv(tmp1, tmp1, rscratch2);
4264       if (isLL) {
4265         __ lsrv(tmp2, tmp2, rscratch2);
4266         __ uxtbw(tmp1, tmp1);
4267         __ uxtbw(tmp2, tmp2);
4268       } else {
4269         __ lsrv(tmp2, tmp2, rscratch2);
4270         __ uxthw(tmp1, tmp1);
4271         __ uxthw(tmp2, tmp2);
4272       }
4273       __ subw(result, tmp1, tmp2);
4274       __ b(LENGTH_DIFF);
4275     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4276       __ eor(rscratch2, tmp1, tmp2);
4277       __ cbnz(rscratch2, DIFF);
4278     __ bind(LENGTH_DIFF);
4279       __ ret(lr);
4280     return entry;
4281   }
4282 
4283   void generate_compare_long_strings() {
4284       StubRoutines::aarch64::_compare_long_string_LL
4285           = generate_compare_long_string_same_encoding(true);
4286       StubRoutines::aarch64::_compare_long_string_UU
4287           = generate_compare_long_string_same_encoding(false);
4288       StubRoutines::aarch64::_compare_long_string_LU
4289           = generate_compare_long_string_different_encoding(true);
4290       StubRoutines::aarch64::_compare_long_string_UL
4291           = generate_compare_long_string_different_encoding(false);
4292   }
4293 
4294   // R0 = result
4295   // R1 = str2
4296   // R2 = cnt1
4297   // R3 = str1
4298   // R4 = cnt2
4299   // This generic linear code uses a few additional ideas which make it faster:
4300   // 1) we can safely keep at least the 1st register of the pattern (since
4301   // length >= 8) to skip the initial load (helps on systems with 1 ld pipeline)
4302   // 2) we can use a "fast" algorithm for finding the first character to
4303   // search for, with fewer branches (1 branch per loaded register instead of
4304   // a branch per symbol); this is where constants like
4305   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4306   // 3) after loading and analyzing the 1st register of the source string, it
4307   // can be used to search for every occurrence of the 1st character, saving
4308   // a few loads compared with a "simpler-but-slower" implementation
4309   // 4) to avoid lots of push/pop operations the code below heavily re-uses,
4310   // re-initializes and compresses register values; this makes the code larger
4311   // and a bit less readable, but most of the extra operations are issued
4312   // during loads or branches, so the penalty is minimal
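       //
       // In C, the "fast" scan from idea 2) is approximately the following for
       // the Latin-1 case (a sketch of the bit trick only, not the exact stub
       // code; variable names are illustrative):
       //
       //   uint64_t rep   = first_char * 0x0101010101010101UL; // replicate char
       //   uint64_t x     = loaded8 ^ rep;         // matching bytes become 0x00
       //   uint64_t found = (x - 0x0101010101010101UL)
       //                    & ~(x | 0x7f7f7f7f7f7f7f7fUL);
       //
       //   // found != 0 iff some byte of loaded8 equals first_char; the lowest
       //   // set 0x80 marker bit locates the first match. The UTF-16 variant
       //   // uses 0x0001000100010001 and 0x7fff7fff7fff7fff instead.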
4313   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4314     const char* stubName = str1_isL
4315         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4316         : "indexof_linear_uu";
4317     __ align(CodeEntryAlignment);
4318     StubCodeMark mark(this, "StubRoutines", stubName);
4319     address entry = __ pc();
4320 
4321     int str1_chr_size = str1_isL ? 1 : 2;
4322     int str2_chr_size = str2_isL ? 1 : 2;
4323     int str1_chr_shift = str1_isL ? 0 : 1;
4324     int str2_chr_shift = str2_isL ? 0 : 1;
4325     bool isL = str1_isL && str2_isL;
4326     // parameters
4327     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4328     // temporary registers
4329     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4330     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4331     // redefinitions
4332     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4333 
4334     __ push(spilled_regs, sp);
4335     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4336         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4337         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4338         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4339         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4340         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4341     // Read whole register from str1. It is safe, because length >=8 here
4342     __ ldr(ch1, Address(str1));
4343     // Read whole register from str2. It is safe, because length >=8 here
4344     __ ldr(ch2, Address(str2));
4345     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4346     if (str1_isL != str2_isL) {
4347       __ eor(v0, __ T16B, v0, v0);
4348     }
4349     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4350     __ mul(first, first, tmp1);
4351     // check if we have less than 1 register to check
4352     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4353     if (str1_isL != str2_isL) {
4354       __ fmovd(v1, ch1);
4355     }
4356     __ br(__ LE, L_SMALL);
4357     __ eor(ch2, first, ch2);
4358     if (str1_isL != str2_isL) {
4359       __ zip1(v1, __ T16B, v1, v0);
4360     }
4361     __ sub(tmp2, ch2, tmp1);
4362     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4363     __ bics(tmp2, tmp2, ch2);
4364     if (str1_isL != str2_isL) {
4365       __ fmovd(ch1, v1);
4366     }
4367     __ br(__ NE, L_HAS_ZERO);
4368     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4369     __ add(result, result, wordSize/str2_chr_size);
4370     __ add(str2, str2, wordSize);
4371     __ br(__ LT, L_POST_LOOP);
4372     __ BIND(L_LOOP);
4373       __ ldr(ch2, Address(str2));
4374       __ eor(ch2, first, ch2);
4375       __ sub(tmp2, ch2, tmp1);
4376       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4377       __ bics(tmp2, tmp2, ch2);
4378       __ br(__ NE, L_HAS_ZERO);
4379     __ BIND(L_LOOP_PROCEED);
4380       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4381       __ add(str2, str2, wordSize);
4382       __ add(result, result, wordSize/str2_chr_size);
4383       __ br(__ GE, L_LOOP);
4384     __ BIND(L_POST_LOOP);
4385       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4386       __ br(__ LE, NOMATCH);
4387       __ ldr(ch2, Address(str2));
4388       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4389       __ eor(ch2, first, ch2);
4390       __ sub(tmp2, ch2, tmp1);
4391       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4392       __ mov(tmp4, -1); // all bits set
4393       __ b(L_SMALL_PROCEED);
4394     __ align(OptoLoopAlignment);
4395     __ BIND(L_SMALL);
4396       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4397       __ eor(ch2, first, ch2);
4398       if (str1_isL != str2_isL) {
4399         __ zip1(v1, __ T16B, v1, v0);
4400       }
4401       __ sub(tmp2, ch2, tmp1);
4402       __ mov(tmp4, -1); // all bits set
4403       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4404       if (str1_isL != str2_isL) {
4405         __ fmovd(ch1, v1); // move converted 4 symbols
4406       }
4407     __ BIND(L_SMALL_PROCEED);
4408       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4409       __ bic(tmp2, tmp2, ch2);
4410       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4411       __ rbit(tmp2, tmp2);
4412       __ br(__ EQ, NOMATCH);
4413     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4414       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4415       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4416       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4417       if (str2_isL) { // LL
4418         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4419         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4420         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4421         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4422         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4423       } else {
4424         __ mov(ch2, 0xE); // all bits in byte set except last one
4425         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4426         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4427         __ lslv(tmp2, tmp2, tmp4);
4428         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4429         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4430         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4431         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4432       }
4433       __ cmp(ch1, ch2);
4434       __ mov(tmp4, wordSize/str2_chr_size);
4435       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4436     __ BIND(L_SMALL_CMP_LOOP);
4437       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4438                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4439       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4440                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4441       __ add(tmp4, tmp4, 1);
4442       __ cmp(tmp4, cnt1);
4443       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4444       __ cmp(first, ch2);
4445       __ br(__ EQ, L_SMALL_CMP_LOOP);
4446     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4447       __ cbz(tmp2, NOMATCH); // no more matches. exit
4448       __ clz(tmp4, tmp2);
4449       __ add(result, result, 1); // advance index
4450       __ add(str2, str2, str2_chr_size); // advance pointer
4451       __ b(L_SMALL_HAS_ZERO_LOOP);
4452     __ align(OptoLoopAlignment);
4453     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4454       __ cmp(first, ch2);
4455       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4456       __ b(DONE);
4457     __ align(OptoLoopAlignment);
4458     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4459       if (str2_isL) { // LL
4460         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4461         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4462         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4463         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4464         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4465       } else {
4466         __ mov(ch2, 0xE); // all bits in byte set except last one
4467         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4468         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4469         __ lslv(tmp2, tmp2, tmp4);
4470         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4471         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4472         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4473         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4474       }
4475       __ cmp(ch1, ch2);
4476       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4477       __ b(DONE);
4478     __ align(OptoLoopAlignment);
4479     __ BIND(L_HAS_ZERO);
4480       __ rbit(tmp2, tmp2);
4481       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4482       // Now compress the two counters (cnt2 and cnt1) into one register.
4483       // This is fine because both counters are 32-bit and are not changed in
4484       // this loop; they are simply restored on exit, so cnt1 can be re-used.
4485       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4486       __ sub(result, result, 1);
4487     __ BIND(L_HAS_ZERO_LOOP);
4488       __ mov(cnt1, wordSize/str2_chr_size);
4489       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4490       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4491       if (str2_isL) {
4492         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4493         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4494         __ lslv(tmp2, tmp2, tmp4);
4495         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4496         __ add(tmp4, tmp4, 1);
4497         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4498         __ lsl(tmp2, tmp2, 1);
4499         __ mov(tmp4, wordSize/str2_chr_size);
4500       } else {
4501         __ mov(ch2, 0xE);
4502         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4503         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4504         __ lslv(tmp2, tmp2, tmp4);
4505         __ add(tmp4, tmp4, 1);
4506         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4507         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4508         __ lsl(tmp2, tmp2, 1);
4509         __ mov(tmp4, wordSize/str2_chr_size);
4510         __ sub(str2, str2, str2_chr_size);
4511       }
4512       __ cmp(ch1, ch2);
4513       __ mov(tmp4, wordSize/str2_chr_size);
4514       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4515     __ BIND(L_CMP_LOOP);
4516       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4517                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4518       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4519                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4520       __ add(tmp4, tmp4, 1);
4521       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4522       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4523       __ cmp(cnt1, ch2);
4524       __ br(__ EQ, L_CMP_LOOP);
4525     __ BIND(L_CMP_LOOP_NOMATCH);
4526       // no match at this position
4527       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4528       __ clz(tmp4, tmp2);
4529       __ add(str2, str2, str2_chr_size); // advance pointer
4530       __ b(L_HAS_ZERO_LOOP);
4531     __ align(OptoLoopAlignment);
4532     __ BIND(L_CMP_LOOP_LAST_CMP);
4533       __ cmp(cnt1, ch2);
4534       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4535       __ b(DONE);
4536     __ align(OptoLoopAlignment);
4537     __ BIND(L_CMP_LOOP_LAST_CMP2);
4538       if (str2_isL) {
4539         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4540         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4541         __ lslv(tmp2, tmp2, tmp4);
4542         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4543         __ add(tmp4, tmp4, 1);
4544         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4545         __ lsl(tmp2, tmp2, 1);
4546       } else {
4547         __ mov(ch2, 0xE);
4548         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4549         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4550         __ lslv(tmp2, tmp2, tmp4);
4551         __ add(tmp4, tmp4, 1);
4552         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4553         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4554         __ lsl(tmp2, tmp2, 1);
4555         __ sub(str2, str2, str2_chr_size);
4556       }
4557       __ cmp(ch1, ch2);
4558       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4559       __ b(DONE);
4560     __ align(OptoLoopAlignment);
4561     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4562       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4563       // until the L_HAS_ZERO block. A byte octet was analyzed in L_HAS_ZERO_LOOP,
4564       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4565       // respective high bits were not changed. L_LOOP_PROCEED will increase
4566       // result by the number of analyzed characters, so we can just reset the
4567       // lower bits of result here: clear 2 lower bits for UU/UL, 3 bits for LL.
4568       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4569       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or & 3 (UU/UL)
4570       // is the index of the last analyzed substring inside the current octet,
4571       // so str2 is at the respective start address; advance it to the next octet.
4572       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4573       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4574       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4575       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4576       __ movw(cnt2, cnt2);
4577       __ b(L_LOOP_PROCEED);
4578     __ align(OptoLoopAlignment);
4579     __ BIND(NOMATCH);
4580       __ mov(result, -1);
4581     __ BIND(DONE);
4582       __ pop(spilled_regs, sp);
4583       __ ret(lr);
4584     return entry;
4585   }
4586 
4587   void generate_string_indexof_stubs() {
4588     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4589     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4590     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4591   }
4592 
4593   void inflate_and_store_2_fp_registers(bool generatePrfm,
4594       FloatRegister src1, FloatRegister src2) {
4595     Register dst = r1;
4596     __ zip1(v1, __ T16B, src1, v0);
4597     __ zip2(v2, __ T16B, src1, v0);
4598     if (generatePrfm) {
4599       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4600     }
4601     __ zip1(v3, __ T16B, src2, v0);
4602     __ zip2(v4, __ T16B, src2, v0);
4603     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4604   }
4605 
4606   // R0 = src
4607   // R1 = dst
4608   // R2 = len
4609   // R3 = len >> 3
4610   // V0 = 0
4611   // v1 = loaded 8 bytes
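       //
       // In C, the stub approximately performs the following inflation of a
       // Latin-1 byte array into a UTF-16 char array (zip1/zip2 with the zero
       // vector in v0 interleave a zero byte after each source byte, i.e. they
       // zero-extend 8-bit elements to 16 bits):
       //
       //   for (int i = 0; i < len; i++)
       //     dst[i] = (jchar)(src[i] & 0xff);   // src is jbyte*, dst is jchar*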
4612   address generate_large_byte_array_inflate() {
4613     __ align(CodeEntryAlignment);
4614     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4615     address entry = __ pc();
4616     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4617     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4618     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4619 
4620     // Do one more 8-byte read so the address is 16-byte aligned in most
4621     // cases; this also allows a single store instruction to be used.
4622     __ ldrd(v2, __ post(src, 8));
4623     __ sub(octetCounter, octetCounter, 2);
4624     __ zip1(v1, __ T16B, v1, v0);
4625     __ zip1(v2, __ T16B, v2, v0);
4626     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4627     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4628     __ subs(rscratch1, octetCounter, large_loop_threshold);
4629     __ br(__ LE, LOOP_START);
4630     __ b(LOOP_PRFM_START);
4631     __ bind(LOOP_PRFM);
4632       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4633     __ bind(LOOP_PRFM_START);
4634       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4635       __ sub(octetCounter, octetCounter, 8);
4636       __ subs(rscratch1, octetCounter, large_loop_threshold);
4637       inflate_and_store_2_fp_registers(true, v3, v4);
4638       inflate_and_store_2_fp_registers(true, v5, v6);
4639       __ br(__ GT, LOOP_PRFM);
4640       __ cmp(octetCounter, (u1)8);
4641       __ br(__ LT, DONE);
4642     __ bind(LOOP);
4643       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4644       __ bind(LOOP_START);
4645       __ sub(octetCounter, octetCounter, 8);
4646       __ cmp(octetCounter, (u1)8);
4647       inflate_and_store_2_fp_registers(false, v3, v4);
4648       inflate_and_store_2_fp_registers(false, v5, v6);
4649       __ br(__ GE, LOOP);
4650     __ bind(DONE);
4651       __ ret(lr);
4652     return entry;
4653   }
4654 
4655   /**
4656    *  Arguments:
4657    *
4658    *  Input:
4659    *  c_rarg0   - current state address
4660    *  c_rarg1   - H key address
4661    *  c_rarg2   - data address
4662    *  c_rarg3   - number of blocks
4663    *
4664    *  Output:
4665    *  Updated state at c_rarg0
4666    */
4667   address generate_ghash_processBlocks() {
4668     // Bafflingly, GCM uses little-endian for the byte order, but
4669     // big-endian for the bit order.  For example, the polynomial 1 is
4670     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4671     //
4672     // So, we must either reverse the bytes in each word and do
4673     // everything big-endian or reverse the bits in each byte and do
4674     // it little-endian.  On AArch64 it's more idiomatic to reverse
4675     // the bits in each byte (we have an instruction, RBIT, to do
4676     // that) and keep the data in little-endian bit order throughout the
4677     // calculation, bit-reversing the inputs and outputs.
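         //
         // Per 16-byte block, the loop below computes (approximately, in
         // GF(2^128)):
         //
         //   state = (state ^ block) * H   mod  x^128 + x^7 + x^2 + x + 1
         //
         // ghash_multiply forms the 256-bit carry-less product and ghash_reduce
         // folds it back to 128 bits using the 0x87 constant emitted just below.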
4678 
4679     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4680     __ align(wordSize * 2);
4681     address p = __ pc();
4682     __ emit_int64(0x87);  // The low-order bits of the field
4683                           // polynomial (i.e. p = z^7+z^2+z+1)
4684                           // repeated in the low and high parts of a
4685                           // 128-bit vector
4686     __ emit_int64(0x87);
4687 
4688     __ align(CodeEntryAlignment);
4689     address start = __ pc();
4690 
4691     Register state   = c_rarg0;
4692     Register subkeyH = c_rarg1;
4693     Register data    = c_rarg2;
4694     Register blocks  = c_rarg3;
4695 
4696     FloatRegister vzr = v30;
4697     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4698 
4699     __ ldrq(v0, Address(state));
4700     __ ldrq(v1, Address(subkeyH));
4701 
4702     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4703     __ rbit(v0, __ T16B, v0);
4704     __ rev64(v1, __ T16B, v1);
4705     __ rbit(v1, __ T16B, v1);
4706 
4707     __ ldrq(v26, p);
4708 
4709     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4710     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4711 
4712     {
4713       Label L_ghash_loop;
4714       __ bind(L_ghash_loop);
4715 
4716       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4717                                                  // reversing each byte
4718       __ rbit(v2, __ T16B, v2);
4719       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4720 
4721       // Multiply state in v2 by subkey in v1
4722       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4723                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4724                      /*temps*/v6, v20, v18, v21);
4725       // Reduce v7:v5 by the field polynomial
4726       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4727 
4728       __ sub(blocks, blocks, 1);
4729       __ cbnz(blocks, L_ghash_loop);
4730     }
4731 
4732     // The bit-reversed result is at this point in v0
4733     __ rev64(v1, __ T16B, v0);
4734     __ rbit(v1, __ T16B, v1);
4735 
4736     __ st1(v1, __ T16B, state);
4737     __ ret(lr);
4738 
4739     return start;
4740   }
4741 
4742   // Continuation point for throwing of implicit exceptions that are
4743   // not handled in the current activation. Fabricates an exception
4744   // oop and initiates normal exception dispatching in this
4745   // frame. Since we need to preserve callee-saved values (currently
4746   // only for C2, but done for C1 as well) we need a callee-saved oop
4747   // map and therefore have to make these stubs into RuntimeStubs
4748   // rather than BufferBlobs.  If the compiler needs all registers to
4749   // be preserved between the fault point and the exception handler
4750   // then it must assume responsibility for that in
4751   // AbstractCompiler::continuation_for_implicit_null_exception or
4752   // continuation_for_implicit_division_by_zero_exception. All other
4753   // implicit exceptions (e.g., NullPointerException or
4754   // AbstractMethodError on entry) are either at call sites or
4755   // otherwise assume that stack unwinding will be initiated, so
4756   // caller saved registers were assumed volatile in the compiler.
4757 
4758 #undef __
4759 #define __ masm->
4760 
4761   address generate_throw_exception(const char* name,
4762                                    address runtime_entry,
4763                                    Register arg1 = noreg,
4764                                    Register arg2 = noreg) {
4765     // Information about frame layout at time of blocking runtime call.
4766     // Note that we only have to preserve callee-saved registers since
4767     // the compilers are responsible for supplying a continuation point
4768     // if they expect all registers to be preserved.
4769     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4770     enum layout {
4771       rfp_off = 0,
4772       rfp_off2,
4773       return_off,
4774       return_off2,
4775       framesize // inclusive of return address
4776     };
4777 
4778     int insts_size = 512;
4779     int locs_size  = 64;
4780 
4781     CodeBuffer code(name, insts_size, locs_size);
4782     OopMapSet* oop_maps  = new OopMapSet();
4783     MacroAssembler* masm = new MacroAssembler(&code);
4784 
4785     address start = __ pc();
4786 
4787     // This is an inlined and slightly modified version of call_VM
4788     // which has the ability to fetch the return PC out of
4789     // thread-local storage and also sets up last_Java_sp slightly
4790     // differently than the real call_VM
4791 
4792     __ enter(); // Save FP and LR before call
4793 
4794     assert(is_even(framesize/2), "sp not 16-byte aligned");
4795 
4796     // lr and fp are already in place
4797     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4798 
4799     int frame_complete = __ pc() - start;
4800 
4801     // Set up last_Java_sp and last_Java_fp
4802     address the_pc = __ pc();
4803     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4804 
4805     // Call runtime
4806     if (arg1 != noreg) {
4807       assert(arg2 != c_rarg1, "clobbered");
4808       __ mov(c_rarg1, arg1);
4809     }
4810     if (arg2 != noreg) {
4811       __ mov(c_rarg2, arg2);
4812     }
4813     __ mov(c_rarg0, rthread);
4814     BLOCK_COMMENT("call runtime_entry");
4815     __ mov(rscratch1, runtime_entry);
4816     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4817 
4818     // Generate oop map
4819     OopMap* map = new OopMap(framesize, 0);
4820 
4821     oop_maps->add_gc_map(the_pc - start, map);
4822 
4823     __ reset_last_Java_frame(true);
4824     __ maybe_isb();
4825 
4826     __ leave();
4827 
4828     // check for pending exceptions
4829 #ifdef ASSERT
4830     Label L;
4831     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4832     __ cbnz(rscratch1, L);
4833     __ should_not_reach_here();
4834     __ bind(L);
4835 #endif // ASSERT
4836     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4837 
4838 
4839     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4840     RuntimeStub* stub =
4841       RuntimeStub::new_runtime_stub(name,
4842                                     &code,
4843                                     frame_complete,
4844                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4845                                     oop_maps, false);
4846     return stub->entry_point();
4847   }
4848 
4849   class MontgomeryMultiplyGenerator : public MacroAssembler {
4850 
4851     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4852       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4853 
4854     RegSet _toSave;
4855     bool _squaring;
4856 
4857   public:
4858     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4859       : MacroAssembler(as->code()), _squaring(squaring) {
4860 
4861       // Register allocation
4862 
4863       Register reg = c_rarg0;
4864       Pa_base = reg;       // Argument registers
4865       if (squaring)
4866         Pb_base = Pa_base;
4867       else
4868         Pb_base = ++reg;
4869       Pn_base = ++reg;
4870       Rlen= ++reg;
4871       inv = ++reg;
4872       Pm_base = ++reg;
4873 
4874                           // Working registers:
4875       Ra =  ++reg;        // The current digit of a, b, n, and m.
4876       Rb =  ++reg;
4877       Rm =  ++reg;
4878       Rn =  ++reg;
4879 
4880       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4881       Pb =  ++reg;
4882       Pm =  ++reg;
4883       Pn =  ++reg;
4884 
4885       t0 =  ++reg;        // Three registers which form a
4886       t1 =  ++reg;        // triple-precision accumulator.
4887       t2 =  ++reg;
4888 
4889       Ri =  ++reg;        // Inner and outer loop indexes.
4890       Rj =  ++reg;
4891 
4892       Rhi_ab = ++reg;     // Product registers: low and high parts
4893       Rlo_ab = ++reg;     // of a*b and m*n.
4894       Rhi_mn = ++reg;
4895       Rlo_mn = ++reg;
4896 
4897       // r19 and up are callee-saved.
4898       _toSave = RegSet::range(r19, reg) + Pm_base;
4899     }
4900 
4901   private:
4902     void save_regs() {
4903       push(_toSave, sp);
4904     }
4905 
4906     void restore_regs() {
4907       pop(_toSave, sp);
4908     }
4909 
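         // unroll_2 executes `block` exactly `count` times, unrolled by a factor
         // of two; an odd count runs its first iteration via the `odd` entry.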
4910     template <typename T>
4911     void unroll_2(Register count, T block) {
4912       Label loop, end, odd;
4913       tbnz(count, 0, odd);
4914       cbz(count, end);
4915       align(16);
4916       bind(loop);
4917       (this->*block)();
4918       bind(odd);
4919       (this->*block)();
4920       subs(count, count, 2);
4921       br(Assembler::GT, loop);
4922       bind(end);
4923     }
4924 
4925     template <typename T>
4926     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4927       Label loop, end, odd;
4928       tbnz(count, 0, odd);
4929       cbz(count, end);
4930       align(16);
4931       bind(loop);
4932       (this->*block)(d, s, tmp);
4933       bind(odd);
4934       (this->*block)(d, s, tmp);
4935       subs(count, count, 2);
4936       br(Assembler::GT, loop);
4937       bind(end);
4938     }
4939 
4940     void pre1(RegisterOrConstant i) {
4941       block_comment("pre1");
4942       // Pa = Pa_base;
4943       // Pb = Pb_base + i;
4944       // Pm = Pm_base;
4945       // Pn = Pn_base + i;
4946       // Ra = *Pa;
4947       // Rb = *Pb;
4948       // Rm = *Pm;
4949       // Rn = *Pn;
4950       ldr(Ra, Address(Pa_base));
4951       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4952       ldr(Rm, Address(Pm_base));
4953       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4954       lea(Pa, Address(Pa_base));
4955       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4956       lea(Pm, Address(Pm_base));
4957       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4958 
4959       // Zero the m*n result.
4960       mov(Rhi_mn, zr);
4961       mov(Rlo_mn, zr);
4962     }
4963 
4964     // The core multiply-accumulate step of a Montgomery
4965     // multiplication.  The idea is to schedule operations as a
4966     // pipeline so that instructions with long latencies (loads and
4967     // multiplies) have time to complete before their results are
4968     // used.  This most benefits in-order implementations of the
4969     // architecture but out-of-order ones also benefit.
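         //
         // The MACC(A, B, t0, t1, t2) referenced in the comments is, in C,
         // approximately (t2:t1:t0 is a 192-bit accumulator):
         //
         //   unsigned __int128 p   = (unsigned __int128)A * B;
         //   unsigned __int128 acc = ((unsigned __int128)t1 << 64) | t0;
         //   acc += p;
         //   t2 += (acc < p);              // carry out of the 128-bit addition
         //   t0 = (unsigned long)acc;
         //   t1 = (unsigned long)(acc >> 64);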
4970     void step() {
4971       block_comment("step");
4972       // MACC(Ra, Rb, t0, t1, t2);
4973       // Ra = *++Pa;
4974       // Rb = *--Pb;
4975       umulh(Rhi_ab, Ra, Rb);
4976       mul(Rlo_ab, Ra, Rb);
4977       ldr(Ra, pre(Pa, wordSize));
4978       ldr(Rb, pre(Pb, -wordSize));
4979       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4980                                        // previous iteration.
4981       // MACC(Rm, Rn, t0, t1, t2);
4982       // Rm = *++Pm;
4983       // Rn = *--Pn;
4984       umulh(Rhi_mn, Rm, Rn);
4985       mul(Rlo_mn, Rm, Rn);
4986       ldr(Rm, pre(Pm, wordSize));
4987       ldr(Rn, pre(Pn, -wordSize));
4988       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4989     }
4990 
4991     void post1() {
4992       block_comment("post1");
4993 
4994       // MACC(Ra, Rb, t0, t1, t2);
4995       // Ra = *++Pa;
4996       // Rb = *--Pb;
4997       umulh(Rhi_ab, Ra, Rb);
4998       mul(Rlo_ab, Ra, Rb);
4999       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5000       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5001 
5002       // *Pm = Rm = t0 * inv;
5003       mul(Rm, t0, inv);
5004       str(Rm, Address(Pm));
5005 
5006       // MACC(Rm, Rn, t0, t1, t2);
5007       // t0 = t1; t1 = t2; t2 = 0;
5008       umulh(Rhi_mn, Rm, Rn);
5009 
5010 #ifndef PRODUCT
5011       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5012       {
5013         mul(Rlo_mn, Rm, Rn);
5014         add(Rlo_mn, t0, Rlo_mn);
5015         Label ok;
5016         cbz(Rlo_mn, ok); {
5017           stop("broken Montgomery multiply");
5018         } bind(ok);
5019       }
5020 #endif
5021       // We have very carefully set things up so that
5022       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5023       // the lower half of Rm * Rn because we know the result already:
5024       // it must be -t0.  t0 + (-t0) must generate a carry iff
5025       // t0 != 0.  So, rather than do a mul and an adds we just set
5026       // the carry flag iff t0 is nonzero.
5027       //
5028       // mul(Rlo_mn, Rm, Rn);
5029       // adds(zr, t0, Rlo_mn);
5030       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5031       adcs(t0, t1, Rhi_mn);
5032       adc(t1, t2, zr);
5033       mov(t2, zr);
5034     }
5035 
5036     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5037       block_comment("pre2");
5038       // Pa = Pa_base + i-len;
5039       // Pb = Pb_base + len;
5040       // Pm = Pm_base + i-len;
5041       // Pn = Pn_base + len;
5042 
5043       if (i.is_register()) {
5044         sub(Rj, i.as_register(), len);
5045       } else {
5046         mov(Rj, i.as_constant());
5047         sub(Rj, Rj, len);
5048       }
5049       // Rj == i-len
5050 
5051       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5052       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5053       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5054       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5055 
5056       // Ra = *++Pa;
5057       // Rb = *--Pb;
5058       // Rm = *++Pm;
5059       // Rn = *--Pn;
5060       ldr(Ra, pre(Pa, wordSize));
5061       ldr(Rb, pre(Pb, -wordSize));
5062       ldr(Rm, pre(Pm, wordSize));
5063       ldr(Rn, pre(Pn, -wordSize));
5064 
5065       mov(Rhi_mn, zr);
5066       mov(Rlo_mn, zr);
5067     }
5068 
5069     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5070       block_comment("post2");
5071       if (i.is_constant()) {
5072         mov(Rj, i.as_constant()-len.as_constant());
5073       } else {
5074         sub(Rj, i.as_register(), len);
5075       }
5076 
5077       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5078 
5079       // As soon as we know the least significant digit of our result,
5080       // store it.
5081       // Pm_base[i-len] = t0;
5082       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5083 
5084       // t0 = t1; t1 = t2; t2 = 0;
5085       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5086       adc(t1, t2, zr);
5087       mov(t2, zr);
5088     }
5089 
5090     // A carry in t0 after Montgomery multiplication means that we
5091     // should subtract multiples of n from our result in m.  We'll
5092     // keep doing that until there is no carry.
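         //
         // The sub() referenced below is, in C, approximately a multi-word
         // subtract of n from m that returns t0 minus the final borrow:
         //
         //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
         //                     unsigned long t0, int len) {
         //     unsigned long borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       unsigned long m = Pm[i], n = Pn[i];
         //       Pm[i] = m - n - borrow;
         //       borrow = (m < n) || (m == n && borrow);
         //     }
         //     return t0 - borrow;
         //   }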
5093     void normalize(RegisterOrConstant len) {
5094       block_comment("normalize");
5095       // while (t0)
5096       //   t0 = sub(Pm_base, Pn_base, t0, len);
5097       Label loop, post, again;
5098       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5099       cbz(t0, post); {
5100         bind(again); {
5101           mov(i, zr);
5102           mov(cnt, len);
5103           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5104           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5105           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5106           align(16);
5107           bind(loop); {
5108             sbcs(Rm, Rm, Rn);
5109             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5110             add(i, i, 1);
5111             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5112             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5113             sub(cnt, cnt, 1);
5114           } cbnz(cnt, loop);
5115           sbc(t0, t0, zr);
5116         } cbnz(t0, again);
5117       } bind(post);
5118     }
5119 
5120     // Move memory at s to d, reversing words.
5121     //    Increments d to end of copied memory
5122     //    Destroys tmp1, tmp2
5123     //    Preserves len
5124     //    Leaves s pointing to the address which was in d at start
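         //
         // In C, approximately (treating both buffers as arrays of 2*len ints):
         //
         //   juint *S = (juint *)s, *D = (juint *)d;
         //   for (int i = 0; i < 2 * len; i++)
         //     D[i] = S[2 * len - 1 - i];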
5125     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5126       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5127 
5128       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5129       mov(tmp1, len);
5130       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5131       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5132     }
5133     // where
5134     void reverse1(Register d, Register s, Register tmp) {
5135       ldr(tmp, pre(s, -wordSize));
5136       ror(tmp, tmp, 32);
5137       str(tmp, post(d, wordSize));
5138     }
5139 
5140     void step_squaring() {
5141       // An extra ACC
5142       step();
5143       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5144     }
5145 
5146     void last_squaring(RegisterOrConstant i) {
5147       Label dont;
5148       // if ((i & 1) == 0) {
5149       tbnz(i.as_register(), 0, dont); {
5150         // MACC(Ra, Rb, t0, t1, t2);
5151         // Ra = *++Pa;
5152         // Rb = *--Pb;
5153         umulh(Rhi_ab, Ra, Rb);
5154         mul(Rlo_ab, Ra, Rb);
5155         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5156       } bind(dont);
5157     }
5158 
5159     void extra_step_squaring() {
5160       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5161 
5162       // MACC(Rm, Rn, t0, t1, t2);
5163       // Rm = *++Pm;
5164       // Rn = *--Pn;
5165       umulh(Rhi_mn, Rm, Rn);
5166       mul(Rlo_mn, Rm, Rn);
5167       ldr(Rm, pre(Pm, wordSize));
5168       ldr(Rn, pre(Pn, -wordSize));
5169     }
5170 
5171     void post1_squaring() {
5172       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5173 
5174       // *Pm = Rm = t0 * inv;
5175       mul(Rm, t0, inv);
5176       str(Rm, Address(Pm));
5177 
5178       // MACC(Rm, Rn, t0, t1, t2);
5179       // t0 = t1; t1 = t2; t2 = 0;
5180       umulh(Rhi_mn, Rm, Rn);
5181 
5182 #ifndef PRODUCT
5183       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5184       {
5185         mul(Rlo_mn, Rm, Rn);
5186         add(Rlo_mn, t0, Rlo_mn);
5187         Label ok;
5188         cbz(Rlo_mn, ok); {
5189           stop("broken Montgomery multiply");
5190         } bind(ok);
5191       }
5192 #endif
5193       // We have very carefully set things up so that
5194       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5195       // the lower half of Rm * Rn because we know the result already:
5196       // it must be -t0.  t0 + (-t0) must generate a carry iff
5197       // t0 != 0.  So, rather than do a mul and an adds we just set
5198       // the carry flag iff t0 is nonzero.
5199       //
5200       // mul(Rlo_mn, Rm, Rn);
5201       // adds(zr, t0, Rlo_mn);
5202       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5203       adcs(t0, t1, Rhi_mn);
5204       adc(t1, t2, zr);
5205       mov(t2, zr);
5206     }
5207 
5208     void acc(Register Rhi, Register Rlo,
5209              Register t0, Register t1, Register t2) {
5210       adds(t0, t0, Rlo);
5211       adcs(t1, t1, Rhi);
5212       adc(t2, t2, zr);
5213     }
5214 
5215   public:
5216     /**
5217      * Fast Montgomery multiplication.  The derivation of the
5218      * algorithm is in A Cryptographic Library for the Motorola
5219      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5220      *
5221      * Arguments:
5222      *
5223      * Inputs for multiplication:
5224      *   c_rarg0   - int array elements a
5225      *   c_rarg1   - int array elements b
5226      *   c_rarg2   - int array elements n (the modulus)
5227      *   c_rarg3   - int length
5228      *   c_rarg4   - int inv
5229      *   c_rarg5   - int array elements m (the result)
5230      *
5231      * Inputs for squaring:
5232      *   c_rarg0   - int array elements a
5233      *   c_rarg1   - int array elements n (the modulus)
5234      *   c_rarg2   - int length
5235      *   c_rarg3   - int inv
5236      *   c_rarg4   - int array elements m (the result)
5237      *
5238      */
5239     address generate_multiply() {
5240       Label argh, nothing;
5241       bind(argh);
5242       stop("MontgomeryMultiply total_allocation must be <= 8192");
5243 
5244       align(CodeEntryAlignment);
5245       address entry = pc();
5246 
5247       cbzw(Rlen, nothing);
5248 
5249       enter();
5250 
5251       // Make room.
5252       cmpw(Rlen, 512);
5253       br(Assembler::HI, argh);
5254       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5255       andr(sp, Ra, -2 * wordSize);
5256 
5257       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5258 
5259       {
5260         // Copy input args, reversing as we go.  We use Ra as a
5261         // temporary variable.
5262         reverse(Ra, Pa_base, Rlen, t0, t1);
5263         if (!_squaring)
5264           reverse(Ra, Pb_base, Rlen, t0, t1);
5265         reverse(Ra, Pn_base, Rlen, t0, t1);
5266       }
5267 
5268       // Push all call-saved registers and also Pm_base which we'll need
5269       // at the end.
5270       save_regs();
5271 
5272 #ifndef PRODUCT
5273       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5274       {
5275         ldr(Rn, Address(Pn_base, 0));
5276         mul(Rlo_mn, Rn, inv);
5277         subs(zr, Rlo_mn, -1);
5278         Label ok;
5279         br(EQ, ok); {
5280           stop("broken inverse in Montgomery multiply");
5281         } bind(ok);
5282       }
5283 #endif
5284 
5285       mov(Pm_base, Ra);
5286 
5287       mov(t0, zr);
5288       mov(t1, zr);
5289       mov(t2, zr);
5290 
5291       block_comment("for (int i = 0; i < len; i++) {");
5292       mov(Ri, zr); {
5293         Label loop, end;
5294         cmpw(Ri, Rlen);
5295         br(Assembler::GE, end);
5296 
5297         bind(loop);
5298         pre1(Ri);
5299 
5300         block_comment("  for (j = i; j; j--) {"); {
5301           movw(Rj, Ri);
5302           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5303         } block_comment("  } // j");
5304 
5305         post1();
5306         addw(Ri, Ri, 1);
5307         cmpw(Ri, Rlen);
5308         br(Assembler::LT, loop);
5309         bind(end);
5310         block_comment("} // i");
5311       }
5312 
5313       block_comment("for (int i = len; i < 2*len; i++) {");
5314       mov(Ri, Rlen); {
5315         Label loop, end;
5316         cmpw(Ri, Rlen, Assembler::LSL, 1);
5317         br(Assembler::GE, end);
5318 
5319         bind(loop);
5320         pre2(Ri, Rlen);
5321 
5322         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5323           lslw(Rj, Rlen, 1);
5324           subw(Rj, Rj, Ri);
5325           subw(Rj, Rj, 1);
5326           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5327         } block_comment("  } // j");
5328 
5329         post2(Ri, Rlen);
5330         addw(Ri, Ri, 1);
5331         cmpw(Ri, Rlen, Assembler::LSL, 1);
5332         br(Assembler::LT, loop);
5333         bind(end);
5334       }
5335       block_comment("} // i");
5336 
5337       normalize(Rlen);
5338 
5339       mov(Ra, Pm_base);  // Save Pm_base in Ra
5340       restore_regs();  // Restore caller's Pm_base
5341 
5342       // Copy our result into caller's Pm_base
5343       reverse(Pm_base, Ra, Rlen, t0, t1);
5344 
5345       leave();
5346       bind(nothing);
5347       ret(lr);
5348 
5349       return entry;
5350     }
5351     // In C, approximately:
5352 
5353     // void
5354     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5355     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5356     //                     unsigned long inv, int len) {
5357     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5358     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5359     //   unsigned long Ra, Rb, Rn, Rm;
5360 
5361     //   int i;
5362 
5363     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5364 
5365     //   for (i = 0; i < len; i++) {
5366     //     int j;
5367 
5368     //     Pa = Pa_base;
5369     //     Pb = Pb_base + i;
5370     //     Pm = Pm_base;
5371     //     Pn = Pn_base + i;
5372 
5373     //     Ra = *Pa;
5374     //     Rb = *Pb;
5375     //     Rm = *Pm;
5376     //     Rn = *Pn;
5377 
5378     //     int iters = i;
5379     //     for (j = 0; iters--; j++) {
5380     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5381     //       MACC(Ra, Rb, t0, t1, t2);
5382     //       Ra = *++Pa;
5383     //       Rb = *--Pb;
5384     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5385     //       MACC(Rm, Rn, t0, t1, t2);
5386     //       Rm = *++Pm;
5387     //       Rn = *--Pn;
5388     //     }
5389 
5390     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5391     //     MACC(Ra, Rb, t0, t1, t2);
5392     //     *Pm = Rm = t0 * inv;
5393     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5394     //     MACC(Rm, Rn, t0, t1, t2);
5395 
5396     //     assert(t0 == 0, "broken Montgomery multiply");
5397 
5398     //     t0 = t1; t1 = t2; t2 = 0;
5399     //   }
5400 
5401     //   for (i = len; i < 2*len; i++) {
5402     //     int j;
5403 
5404     //     Pa = Pa_base + i-len;
5405     //     Pb = Pb_base + len;
5406     //     Pm = Pm_base + i-len;
5407     //     Pn = Pn_base + len;
5408 
5409     //     Ra = *++Pa;
5410     //     Rb = *--Pb;
5411     //     Rm = *++Pm;
5412     //     Rn = *--Pn;
5413 
5414     //     int iters = len*2-i-1;
5415     //     for (j = i-len+1; iters--; j++) {
5416     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5417     //       MACC(Ra, Rb, t0, t1, t2);
5418     //       Ra = *++Pa;
5419     //       Rb = *--Pb;
5420     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5421     //       MACC(Rm, Rn, t0, t1, t2);
5422     //       Rm = *++Pm;
5423     //       Rn = *--Pn;
5424     //     }
5425 
5426     //     Pm_base[i-len] = t0;
5427     //     t0 = t1; t1 = t2; t2 = 0;
5428     //   }
5429 
5430     //   while (t0)
5431     //     t0 = sub(Pm_base, Pn_base, t0, len);
5432     // }
5433 
5434     /**
5435      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5436      * multiplies than Montgomery multiplication so it should be up to
5437      * 25% faster.  However, its loop control is more complex and it
5438      * may actually run slower on some machines.
5439      *
5440      * Arguments:
5441      *
5442      * Inputs:
5443      *   c_rarg0   - int array elements a
5444      *   c_rarg1   - int array elements n (the modulus)
5445      *   c_rarg2   - int length
5446      *   c_rarg3   - int inv
5447      *   c_rarg4   - int array elements m (the result)
5448      *
5449      */
5450     address generate_square() {
5451       Label argh;
5452       bind(argh);
5453       stop("MontgomeryMultiply total_allocation must be <= 8192");
5454 
5455       align(CodeEntryAlignment);
5456       address entry = pc();
5457 
5458       enter();
5459 
5460       // Make room.
5461       cmpw(Rlen, 512);
5462       br(Assembler::HI, argh);
5463       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5464       andr(sp, Ra, -2 * wordSize);
5465 
5466       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5467 
5468       {
5469         // Copy input args, reversing as we go.  We use Ra as a
5470         // temporary variable.
5471         reverse(Ra, Pa_base, Rlen, t0, t1);
5472         reverse(Ra, Pn_base, Rlen, t0, t1);
5473       }
5474 
5475       // Push all call-saved registers and also Pm_base which we'll need
5476       // at the end.
5477       save_regs();
5478 
5479       mov(Pm_base, Ra);
5480 
5481       mov(t0, zr);
5482       mov(t1, zr);
5483       mov(t2, zr);
5484 
5485       block_comment("for (int i = 0; i < len; i++) {");
5486       mov(Ri, zr); {
5487         Label loop, end;
5488         bind(loop);
5489         cmp(Ri, Rlen);
5490         br(Assembler::GE, end);
5491 
5492         pre1(Ri);
5493 
5494         block_comment("for (j = (i+1)/2; j; j--) {"); {
5495           add(Rj, Ri, 1);
5496           lsr(Rj, Rj, 1);
5497           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5498         } block_comment("  } // j");
5499 
5500         last_squaring(Ri);
5501 
5502         block_comment("  for (j = i/2; j; j--) {"); {
5503           lsr(Rj, Ri, 1);
5504           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5505         } block_comment("  } // j");
5506 
5507         post1_squaring();
5508         add(Ri, Ri, 1);
5509         cmp(Ri, Rlen);
5510         br(Assembler::LT, loop);
5511 
5512         bind(end);
5513         block_comment("} // i");
5514       }
5515 
5516       block_comment("for (int i = len; i < 2*len; i++) {");
5517       mov(Ri, Rlen); {
5518         Label loop, end;
5519         bind(loop);
5520         cmp(Ri, Rlen, Assembler::LSL, 1);
5521         br(Assembler::GE, end);
5522 
5523         pre2(Ri, Rlen);
5524 
5525         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5526           lsl(Rj, Rlen, 1);
5527           sub(Rj, Rj, Ri);
5528           sub(Rj, Rj, 1);
5529           lsr(Rj, Rj, 1);
5530           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5531         } block_comment("  } // j");
5532 
5533         last_squaring(Ri);
5534 
5535         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5536           lsl(Rj, Rlen, 1);
5537           sub(Rj, Rj, Ri);
5538           lsr(Rj, Rj, 1);
5539           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5540         } block_comment("  } // j");
5541 
5542         post2(Ri, Rlen);
5543         add(Ri, Ri, 1);
5544         cmp(Ri, Rlen, Assembler::LSL, 1);
5545 
5546         br(Assembler::LT, loop);
5547         bind(end);
5548         block_comment("} // i");
5549       }
5550 
5551       normalize(Rlen);
5552 
5553       mov(Ra, Pm_base);  // Save Pm_base in Ra
5554       restore_regs();  // Restore caller's Pm_base
5555 
5556       // Copy our result into caller's Pm_base
5557       reverse(Pm_base, Ra, Rlen, t0, t1);
5558 
5559       leave();
5560       ret(lr);
5561 
5562       return entry;
5563     }
5564     // In C, approximately:
5565 
5566     // void
5567     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5568     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5569     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5570     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5571     //   unsigned long Ra, Rb, Rn, Rm;
5572 
5573     //   int i;
5574 
5575     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5576 
5577     //   for (i = 0; i < len; i++) {
5578     //     int j;
5579 
5580     //     Pa = Pa_base;
5581     //     Pb = Pa_base + i;
5582     //     Pm = Pm_base;
5583     //     Pn = Pn_base + i;
5584 
5585     //     Ra = *Pa;
5586     //     Rb = *Pb;
5587     //     Rm = *Pm;
5588     //     Rn = *Pn;
5589 
5590     //     int iters = (i+1)/2;
5591     //     for (j = 0; iters--; j++) {
5592     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5593     //       MACC2(Ra, Rb, t0, t1, t2);
5594     //       Ra = *++Pa;
5595     //       Rb = *--Pb;
5596     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5597     //       MACC(Rm, Rn, t0, t1, t2);
5598     //       Rm = *++Pm;
5599     //       Rn = *--Pn;
5600     //     }
5601     //     if ((i & 1) == 0) {
5602     //       assert(Ra == Pa_base[j], "must be");
5603     //       MACC(Ra, Ra, t0, t1, t2);
5604     //     }
5605     //     iters = i/2;
5606     //     assert(iters == i-j, "must be");
5607     //     for (; iters--; j++) {
5608     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5609     //       MACC(Rm, Rn, t0, t1, t2);
5610     //       Rm = *++Pm;
5611     //       Rn = *--Pn;
5612     //     }
5613 
5614     //     *Pm = Rm = t0 * inv;
5615     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5616     //     MACC(Rm, Rn, t0, t1, t2);
5617 
5618     //     assert(t0 == 0, "broken Montgomery multiply");
5619 
5620     //     t0 = t1; t1 = t2; t2 = 0;
5621     //   }
5622 
5623     //   for (i = len; i < 2*len; i++) {
5624     //     int start = i-len+1;
5625     //     int end = start + (len - start)/2;
5626     //     int j;
5627 
5628     //     Pa = Pa_base + i-len;
5629     //     Pb = Pa_base + len;
5630     //     Pm = Pm_base + i-len;
5631     //     Pn = Pn_base + len;
5632 
5633     //     Ra = *++Pa;
5634     //     Rb = *--Pb;
5635     //     Rm = *++Pm;
5636     //     Rn = *--Pn;
5637 
5638     //     int iters = (2*len-i-1)/2;
5639     //     assert(iters == end-start, "must be");
5640     //     for (j = start; iters--; j++) {
5641     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5642     //       MACC2(Ra, Rb, t0, t1, t2);
5643     //       Ra = *++Pa;
5644     //       Rb = *--Pb;
5645     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5646     //       MACC(Rm, Rn, t0, t1, t2);
5647     //       Rm = *++Pm;
5648     //       Rn = *--Pn;
5649     //     }
5650     //     if ((i & 1) == 0) {
5651     //       assert(Ra == Pa_base[j], "must be");
5652     //       MACC(Ra, Ra, t0, t1, t2);
5653     //     }
5654     //     iters =  (2*len-i)/2;
5655     //     assert(iters == len-j, "must be");
5656     //     for (; iters--; j++) {
5657     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5658     //       MACC(Rm, Rn, t0, t1, t2);
5659     //       Rm = *++Pm;
5660     //       Rn = *--Pn;
5661     //     }
5662     //     Pm_base[i-len] = t0;
5663     //     t0 = t1; t1 = t2; t2 = 0;
5664     //   }
5665 
5666     //   while (t0)
5667     //     t0 = sub(Pm_base, Pn_base, t0, len);
5668     // }
5669   };
5670 
5671 
5672   // Initialization
5673   void generate_initial() {
5674     // Generate initial stubs and initialize the entry points
5675 
5676     // entry points that exist in all platforms. Note: This is code
5677     // that could be shared among different platforms - however the
5678     // benefit seems to be smaller than the disadvantage of having a
5679     // much more complicated generator structure. See also comment in
5680     // stubRoutines.hpp.
5681 
5682     StubRoutines::_forward_exception_entry = generate_forward_exception();
5683 
5684     StubRoutines::_call_stub_entry =
5685       generate_call_stub(StubRoutines::_call_stub_return_address);
5686 
5687     // is referenced by megamorphic call
5688     StubRoutines::_catch_exception_entry = generate_catch_exception();
5689 
5690     // Build this early so it's available for the interpreter.
5691     StubRoutines::_throw_StackOverflowError_entry =
5692       generate_throw_exception("StackOverflowError throw_exception",
5693                                CAST_FROM_FN_PTR(address,
5694                                                 SharedRuntime::throw_StackOverflowError));
5695     StubRoutines::_throw_delayed_StackOverflowError_entry =
5696       generate_throw_exception("delayed StackOverflowError throw_exception",
5697                                CAST_FROM_FN_PTR(address,
5698                                                 SharedRuntime::throw_delayed_StackOverflowError));
5699     if (UseCRC32Intrinsics) {
5700       // set table address before stub generation, which uses it
5701       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5702       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5703     }
5704 
5705     if (UseCRC32CIntrinsics) {
5706       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5707     }
5708 
5709     // Disabled until JDK-8210858 is fixed
5710     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5711     //   StubRoutines::_dlog = generate_dlog();
5712     // }
5713 
5714     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5715       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5716     }
5717 
5718     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5719       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5720     }
5721   }
5722 
5723   void generate_all() {
5724     // support for verify_oop (must happen after universe_init)
5725     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5726     StubRoutines::_throw_AbstractMethodError_entry =
5727       generate_throw_exception("AbstractMethodError throw_exception",
5728                                CAST_FROM_FN_PTR(address,
5729                                                 SharedRuntime::
5730                                                 throw_AbstractMethodError));
5731 
5732     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5733       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5734                                CAST_FROM_FN_PTR(address,
5735                                                 SharedRuntime::
5736                                                 throw_IncompatibleClassChangeError));
5737 
5738     StubRoutines::_throw_NullPointerException_at_call_entry =
5739       generate_throw_exception("NullPointerException at call throw_exception",
5740                                CAST_FROM_FN_PTR(address,
5741                                                 SharedRuntime::
5742                                                 throw_NullPointerException_at_call));
5743 
5744     // arraycopy stubs used by compilers
5745     generate_arraycopy_stubs();
5746 
5747     // has_negatives stub for large arrays.
5748     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5749 
5750     // array equals stub for large arrays.
5751     if (!UseSimpleArrayEquals) {
5752       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5753     }
5754 
5755     generate_compare_long_strings();
5756 
5757     generate_string_indexof_stubs();
5758 
5759     // byte_array_inflate stub for large arrays.
5760     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5761 
5762 #ifdef COMPILER2
5763     if (UseMultiplyToLenIntrinsic) {
5764       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5765     }
5766 
5767     if (UseSquareToLenIntrinsic) {
5768       StubRoutines::_squareToLen = generate_squareToLen();
5769     }
5770 
5771     if (UseMulAddIntrinsic) {
5772       StubRoutines::_mulAdd = generate_mulAdd();
5773     }
5774 
5775     if (UseMontgomeryMultiplyIntrinsic) {
5776       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5777       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5778       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5779     }
5780 
5781     if (UseMontgomerySquareIntrinsic) {
5782       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5783       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5784       // We use generate_multiply() rather than generate_square()
5785       // because it's faster for the sizes of modulus we care about.
5786       StubRoutines::_montgomerySquare = g.generate_multiply();
5787     }
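         // For reference, both Montgomery entry points compute
         //
         //   result = a * b * R^-1 (mod n),  where R = r^len and r is the
         //   machine word radix (2^64 here),
         //
         // with a == b in the squaring case.  The callers also supply the
         // usual precomputed word inverse (often written n' = -n^-1 mod r);
         // see the MontgomeryMultiplyGenerator pseudocode above for how it
         // is folded into each column.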
5788 #endif // COMPILER2
5789 
5790 #ifndef BUILTIN_SIM
5791     // generate GHASH intrinsics code
5792     if (UseGHASHIntrinsics) {
5793       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5794     }
5795 
5796     if (UseAESIntrinsics) {
5797       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5798       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5799       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5800       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5801     }
5802 
5803     if (UseSHA1Intrinsics) {
5804       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5805       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5806     }
5807     if (UseSHA256Intrinsics) {
5808       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5809       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5810     }
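         // The plain implCompress entries digest a single input block; as
         // the naming suggests, the MB ("multi-block") variants loop over a
         // run of whole blocks per call and back the multi-block DigestBase
         // intrinsic.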
5811 
5812     // generate Adler32 intrinsics code
5813     if (UseAdler32Intrinsics) {
5814       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5815     }
5816 
5817     // SafeFetch stubs.
5818     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5819                                                        &StubRoutines::_safefetch32_fault_pc,
5820                                                        &StubRoutines::_safefetch32_continuation_pc);
5821     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5822                                                        &StubRoutines::_safefetchN_fault_pc,
5823                                                        &StubRoutines::_safefetchN_continuation_pc);
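         // SafeFetch32/SafeFetchN read a word from a possibly invalid
         // address without taking the VM down: if the load at the recorded
         // fault pc faults, the signal handler resumes execution at the
         // recorded continuation pc, which returns the caller-supplied
         // error value instead.  Rough usage sketch from C++ code:
         //
         //   int v = SafeFetch32((int*) maybe_bad_ptr, 0xBAADF00D);
         //   // v == *maybe_bad_ptr if readable, 0xBAADF00D otherwise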
5824 #endif
5825     StubRoutines::aarch64::set_completed();
5826   }
5827 
5828  public:
5829   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5830     if (all) {
5831       generate_all();
5832     } else {
5833       generate_initial();
5834     }
5835   }
5836 }; // end class declaration
5837 
5838 void StubGenerator_generate(CodeBuffer* code, bool all) {
5839   StubGenerator g(code, all);
5840 }
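
     // Roughly, the platform-independent stub initialization in
     // stubRoutines.cpp drives this entry point twice: once early in VM
     // startup with all == false for the stubs the interpreter needs, and
     // again after universe_init with all == true for everything else.
     // A hedged sketch of that call sequence (the exact caller names live
     // outside this file):
     //
     //   StubGenerator_generate(initial_code_buffer, false);  // initial stubs
     //   ...
     //   StubGenerator_generate(final_code_buffer,   true);   // remaining stubs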