1 /*
   2  * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "memory/universe.hpp"
  33 #include "nativeInst_aarch64.hpp"
  34 #include "oops/instanceOop.hpp"
  35 #include "oops/method.hpp"
  36 #include "oops/objArrayKlass.hpp"
  37 #include "oops/oop.inline.hpp"
  38 #include "prims/methodHandles.hpp"
  39 #include "runtime/frame.inline.hpp"
  40 #include "runtime/handles.inline.hpp"
  41 #include "runtime/sharedRuntime.hpp"
  42 #include "runtime/stubCodeGenerator.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "runtime/thread.inline.hpp"
  45 #include "utilities/align.hpp"
  46 #include "utilities/powerOfTwo.hpp"
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 #if INCLUDE_ZGC
  51 #include "gc/z/zThreadLocalData.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
 105   // link r29 (fp) below it as the frame pointer installing sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
 160   //  -1 [ thread (r7)          ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     address aarch64_entry = __ pc();
 222 
 223     // set up frame and move sp to end of save area
 224     __ enter();
 225     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 226 
 227     // save register parameters and Java scratch/global registers
 228     // n.b. we save thread even though it gets installed in
 229     // rthread because we want to sanity check rthread later
 230     __ str(c_rarg7,  thread);
 231     __ strw(c_rarg6, parameter_size);
 232     __ stp(c_rarg4, c_rarg5,  entry_point);
 233     __ stp(c_rarg2, c_rarg3,  result_type);
 234     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 235 
 236     __ stp(r20, r19,   r20_save);
 237     __ stp(r22, r21,   r22_save);
 238     __ stp(r24, r23,   r24_save);
 239     __ stp(r26, r25,   r26_save);
 240     __ stp(r28, r27,   r28_save);
 241 
 242     __ stpd(v9,  v8,   d9_save);
 243     __ stpd(v11, v10,  d11_save);
 244     __ stpd(v13, v12,  d13_save);
 245     __ stpd(v15, v14,  d15_save);
 246 
 247     // install Java thread in global register now we have saved
 248     // whatever value it held
 249     __ mov(rthread, c_rarg7);
 250     // And method
 251     __ mov(rmethod, c_rarg3);
 252 
 253     // set up the heapbase register
 254     __ reinit_heapbase();
 255 
 256 #ifdef ASSERT
 257     // make sure we have no pending exceptions
 258     {
 259       Label L;
 260       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 261       __ cmp(rscratch1, (u1)NULL_WORD);
 262       __ br(Assembler::EQ, L);
 263       __ stop("StubRoutines::call_stub: entered with pending exception");
 264       __ BIND(L);
 265     }
 266 #endif
 267     // pass parameters if any
 268     __ mov(esp, sp);
 269     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 270     __ andr(sp, rscratch1, -2 * wordSize);
 271 
 272     BLOCK_COMMENT("pass parameters if any");
 273     Label parameters_done;
 274     // parameter count is still in c_rarg6
 275     // and parameter pointer identifying param 1 is in c_rarg5
 276     __ cbzw(c_rarg6, parameters_done);
 277 
 278     address loop = __ pc();
 279     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 280     __ subsw(c_rarg6, c_rarg6, 1);
 281     __ push(rscratch1);
 282     __ br(Assembler::GT, loop);
 283 
 284     __ BIND(parameters_done);
 285 
 286     // call Java entry -- passing methdoOop, and current sp
 287     //      rmethod: Method*
 288     //      r13: sender sp
 289     BLOCK_COMMENT("call Java function");
 290     __ mov(r13, sp);
 291     __ blr(c_rarg4);
 292 
 293     // we do this here because the notify will already have been done
 294     // if we get to the next instruction via an exception
 295     //
 296     // n.b. adding this instruction here affects the calculation of
 297     // whether or not a routine returns to the call stub (used when
 298     // doing stack walks) since the normal test is to check the return
 299     // pc against the address saved below. so we may need to allow for
 300     // this extra instruction in the check.
 301 
 302     // save current address for use by exception handling code
 303 
 304     return_address = __ pc();
 305 
 306     // store result depending on type (everything that is not
 307     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 308     // n.b. this assumes Java returns an integral result in r0
 309     // and a floating result in j_farg0
 310     __ ldr(j_rarg2, result);
 311     Label is_long, is_float, is_double, exit;
 312     __ ldr(j_rarg1, result_type);
 313     __ cmp(j_rarg1, (u1)T_OBJECT);
 314     __ br(Assembler::EQ, is_long);
 315     __ cmp(j_rarg1, (u1)T_LONG);
 316     __ br(Assembler::EQ, is_long);
 317     __ cmp(j_rarg1, (u1)T_FLOAT);
 318     __ br(Assembler::EQ, is_float);
 319     __ cmp(j_rarg1, (u1)T_DOUBLE);
 320     __ br(Assembler::EQ, is_double);
 321 
 322     // handle T_INT case
 323     __ strw(r0, Address(j_rarg2));
 324 
 325     __ BIND(exit);
 326 
 327     // pop parameters
 328     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 329 
 330 #ifdef ASSERT
 331     // verify that threads correspond
 332     {
 333       Label L, S;
 334       __ ldr(rscratch1, thread);
 335       __ cmp(rthread, rscratch1);
 336       __ br(Assembler::NE, S);
 337       __ get_thread(rscratch1);
 338       __ cmp(rthread, rscratch1);
 339       __ br(Assembler::EQ, L);
 340       __ BIND(S);
 341       __ stop("StubRoutines::call_stub: threads must correspond");
 342       __ BIND(L);
 343     }
 344 #endif
 345 
 346     // restore callee-save registers
 347     __ ldpd(v15, v14,  d15_save);
 348     __ ldpd(v13, v12,  d13_save);
 349     __ ldpd(v11, v10,  d11_save);
 350     __ ldpd(v9,  v8,   d9_save);
 351 
 352     __ ldp(r28, r27,   r28_save);
 353     __ ldp(r26, r25,   r26_save);
 354     __ ldp(r24, r23,   r24_save);
 355     __ ldp(r22, r21,   r22_save);
 356     __ ldp(r20, r19,   r20_save);
 357 
 358     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 359     __ ldrw(c_rarg2, result_type);
 360     __ ldr(c_rarg3,  method);
 361     __ ldp(c_rarg4, c_rarg5,  entry_point);
 362     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 363 
 364     // leave frame and return to caller
 365     __ leave();
 366     __ ret(lr);
 367 
 368     // handle return types different from T_INT
 369 
 370     __ BIND(is_long);
 371     __ str(r0, Address(j_rarg2, 0));
 372     __ br(Assembler::AL, exit);
 373 
 374     __ BIND(is_float);
 375     __ strs(j_farg0, Address(j_rarg2, 0));
 376     __ br(Assembler::AL, exit);
 377 
 378     __ BIND(is_double);
 379     __ strd(j_farg0, Address(j_rarg2, 0));
 380     __ br(Assembler::AL, exit);
 381 
 382     return start;
 383   }
 384 
 385   // Return point for a Java call if there's an exception thrown in
 386   // Java code.  The exception is caught and transformed into a
 387   // pending exception stored in JavaThread that can be tested from
 388   // within the VM.
 389   //
 390   // Note: Usually the parameters are removed by the callee. In case
 391   // of an exception crossing an activation frame boundary, that is
 392   // not the case if the callee is compiled code => need to setup the
 393   // rsp.
 394   //
 395   // r0: exception oop
 396 
 397   address generate_catch_exception() {
 398     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 399     address start = __ pc();
 400 
 401     // same as in generate_call_stub():
 402     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 403     const Address thread        (rfp, thread_off         * wordSize);
 404 
 405 #ifdef ASSERT
 406     // verify that threads correspond
 407     {
 408       Label L, S;
 409       __ ldr(rscratch1, thread);
 410       __ cmp(rthread, rscratch1);
 411       __ br(Assembler::NE, S);
 412       __ get_thread(rscratch1);
 413       __ cmp(rthread, rscratch1);
 414       __ br(Assembler::EQ, L);
 415       __ bind(S);
 416       __ stop("StubRoutines::catch_exception: threads must correspond");
 417       __ bind(L);
 418     }
 419 #endif
 420 
 421     // set pending exception
 422     __ verify_oop(r0);
 423 
 424     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 425     __ mov(rscratch1, (address)__FILE__);
 426     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 427     __ movw(rscratch1, (int)__LINE__);
 428     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 429 
 430     // complete return to VM
 431     assert(StubRoutines::_call_stub_return_address != NULL,
 432            "_call_stub_return_address must have been generated before");
 433     __ b(StubRoutines::_call_stub_return_address);
 434 
 435     return start;
 436   }
 437 
 438   // Continuation point for runtime calls returning with a pending
 439   // exception.  The pending exception check happened in the runtime
 440   // or native call stub.  The pending exception in Thread is
 441   // converted into a Java-level exception.
 442   //
 443   // Contract with Java-level exception handlers:
 444   // r0: exception
 445   // r3: throwing pc
 446   //
 447   // NOTE: At entry of this stub, exception-pc must be in LR !!
 448 
 449   // NOTE: this is always used as a jump target within generated code
 450   // so it just needs to be generated code wiht no x86 prolog
 451 
 452   address generate_forward_exception() {
 453     StubCodeMark mark(this, "StubRoutines", "forward exception");
 454     address start = __ pc();
 455 
 456     // Upon entry, LR points to the return address returning into
 457     // Java (interpreted or compiled) code; i.e., the return address
 458     // becomes the throwing pc.
 459     //
 460     // Arguments pushed before the runtime call are still on the stack
 461     // but the exception handler will reset the stack pointer ->
 462     // ignore them.  A potential result in registers can be ignored as
 463     // well.
 464 
 465 #ifdef ASSERT
 466     // make sure this code is only executed if there is a pending exception
 467     {
 468       Label L;
 469       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 470       __ cbnz(rscratch1, L);
 471       __ stop("StubRoutines::forward exception: no pending exception (1)");
 472       __ bind(L);
 473     }
 474 #endif
 475 
 476     // compute exception handler into r19
 477 
 478     // call the VM to find the handler address associated with the
 479     // caller address. pass thread in r0 and caller pc (ret address)
 480     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 481     // the stack.
 482     __ mov(c_rarg1, lr);
 483     // lr will be trashed by the VM call so we move it to R19
 484     // (callee-saved) because we also need to pass it to the handler
 485     // returned by this call.
 486     __ mov(r19, lr);
 487     BLOCK_COMMENT("call exception_handler_for_return_address");
 488     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 489                          SharedRuntime::exception_handler_for_return_address),
 490                     rthread, c_rarg1);
 491     // we should not really care that lr is no longer the callee
 492     // address. we saved the value the handler needs in r19 so we can
 493     // just copy it to r3. however, the C2 handler will push its own
 494     // frame and then calls into the VM and the VM code asserts that
 495     // the PC for the frame above the handler belongs to a compiled
 496     // Java method. So, we restore lr here to satisfy that assert.
 497     __ mov(lr, r19);
 498     // setup r0 & r3 & clear pending exception
 499     __ mov(r3, r19);
 500     __ mov(r19, r0);
 501     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 502     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 503 
 504 #ifdef ASSERT
 505     // make sure exception is set
 506     {
 507       Label L;
 508       __ cbnz(r0, L);
 509       __ stop("StubRoutines::forward exception: no pending exception (2)");
 510       __ bind(L);
 511     }
 512 #endif
 513 
 514     // continue at exception handler
 515     // r0: exception
 516     // r3: throwing pc
 517     // r19: exception handler
 518     __ verify_oop(r0);
 519     __ br(r19);
 520 
 521     return start;
 522   }
 523 
 524   // Non-destructive plausibility checks for oops
 525   //
 526   // Arguments:
 527   //    r0: oop to verify
 528   //    rscratch1: error message
 529   //
 530   // Stack after saving c_rarg3:
 531   //    [tos + 0]: saved c_rarg3
 532   //    [tos + 1]: saved c_rarg2
 533   //    [tos + 2]: saved lr
 534   //    [tos + 3]: saved rscratch2
 535   //    [tos + 4]: saved r0
 536   //    [tos + 5]: saved rscratch1
 537   address generate_verify_oop() {
 538 
 539     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 540     address start = __ pc();
 541 
 542     Label exit, error;
 543 
 544     // save c_rarg2 and c_rarg3
 545     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 546 
 547     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 548     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 549     __ ldr(c_rarg3, Address(c_rarg2));
 550     __ add(c_rarg3, c_rarg3, 1);
 551     __ str(c_rarg3, Address(c_rarg2));
 552 
 553     // object is in r0
 554     // make sure object is 'reasonable'
 555     __ cbz(r0, exit); // if obj is NULL it is OK
 556 
 557 #if INCLUDE_ZGC
 558     if (UseZGC) {
 559       // Check if mask is good.
 560       // verifies that ZAddressBadMask & r0 == 0
 561       __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
 562       __ andr(c_rarg2, r0, c_rarg3);
 563       __ cbnz(c_rarg2, error);
 564     }
 565 #endif
 566 
 567     // Check if the oop is in the right area of memory
 568     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 569     __ andr(c_rarg2, r0, c_rarg3);
 570     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 571 
 572     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 573     // instruction here because the flags register is live.
 574     __ eor(c_rarg2, c_rarg2, c_rarg3);
 575     __ cbnz(c_rarg2, error);
 576 
 577     // make sure klass is 'reasonable', which is not zero.
 578     __ load_klass(r0, r0);  // get klass
 579     __ cbz(r0, error);      // if klass is NULL it is broken
 580 
 581     // return if everything seems ok
 582     __ bind(exit);
 583 
 584     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 585     __ ret(lr);
 586 
 587     // handle errors
 588     __ bind(error);
 589     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 590 
 591     __ push(RegSet::range(r0, r29), sp);
 592     // debug(char* msg, int64_t pc, int64_t regs[])
 593     __ mov(c_rarg0, rscratch1);      // pass address of error message
 594     __ mov(c_rarg1, lr);             // pass return address
 595     __ mov(c_rarg2, sp);             // pass address of regs on stack
 596 #ifndef PRODUCT
 597     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 598 #endif
 599     BLOCK_COMMENT("call MacroAssembler::debug");
 600     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 601     __ blr(rscratch1);
 602     __ hlt(0);
 603 
 604     return start;
 605   }
 606 
 607   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 608 
 609   // Generate indices for iota vector.
 610   address generate_iota_indices(const char *stub_name) {
 611     __ align(CodeEntryAlignment);
 612     StubCodeMark mark(this, "StubRoutines", stub_name);
 613     address start = __ pc();
 614     __ emit_data64(0x0706050403020100, relocInfo::none);
 615     __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
 616     return start;
 617   }
 618 
 619   // The inner part of zero_words().  This is the bulk operation,
 620   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 621   // caller is responsible for zeroing the last few words.
 622   //
 623   // Inputs:
 624   // r10: the HeapWord-aligned base address of an array to zero.
 625   // r11: the count in HeapWords, r11 > 0.
 626   //
 627   // Returns r10 and r11, adjusted for the caller to clear.
 628   // r10: the base address of the tail of words left to clear.
 629   // r11: the number of words in the tail.
 630   //      r11 < MacroAssembler::zero_words_block_size.
 631 
 632   address generate_zero_blocks() {
 633     Label done;
 634     Label base_aligned;
 635 
 636     Register base = r10, cnt = r11;
 637 
 638     __ align(CodeEntryAlignment);
 639     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 640     address start = __ pc();
 641 
 642     if (UseBlockZeroing) {
 643       int zva_length = VM_Version::zva_length();
 644 
 645       // Ensure ZVA length can be divided by 16. This is required by
 646       // the subsequent operations.
 647       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 648 
 649       __ tbz(base, 3, base_aligned);
 650       __ str(zr, Address(__ post(base, 8)));
 651       __ sub(cnt, cnt, 1);
 652       __ bind(base_aligned);
 653 
 654       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 655       // alignment.
 656       Label small;
 657       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 658       __ subs(rscratch1, cnt, low_limit >> 3);
 659       __ br(Assembler::LT, small);
 660       __ zero_dcache_blocks(base, cnt);
 661       __ bind(small);
 662     }
 663 
 664     {
 665       // Number of stp instructions we'll unroll
 666       const int unroll =
 667         MacroAssembler::zero_words_block_size / 2;
 668       // Clear the remaining blocks.
 669       Label loop;
 670       __ subs(cnt, cnt, unroll * 2);
 671       __ br(Assembler::LT, done);
 672       __ bind(loop);
 673       for (int i = 0; i < unroll; i++)
 674         __ stp(zr, zr, __ post(base, 16));
 675       __ subs(cnt, cnt, unroll * 2);
 676       __ br(Assembler::GE, loop);
 677       __ bind(done);
 678       __ add(cnt, cnt, unroll * 2);
 679     }
 680 
 681     __ ret(lr);
 682 
 683     return start;
 684   }
 685 
 686 
 687   typedef enum {
 688     copy_forwards = 1,
 689     copy_backwards = -1
 690   } copy_direction;
 691 
 692   // Bulk copy of blocks of 8 words.
 693   //
 694   // count is a count of words.
 695   //
 696   // Precondition: count >= 8
 697   //
 698   // Postconditions:
 699   //
 700   // The least significant bit of count contains the remaining count
 701   // of words to copy.  The rest of count is trash.
 702   //
 703   // s and d are adjusted to point to the remaining words to copy
 704   //
 705   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 706                            copy_direction direction) {
 707     int unit = wordSize * direction;
 708     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 709 
 710     int offset;
 711     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 712       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 713     const Register stride = r13;
 714 
 715     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 716     assert_different_registers(s, d, count, rscratch1);
 717 
 718     Label again, drain;
 719     const char *stub_name;
 720     if (direction == copy_forwards)
 721       stub_name = "forward_copy_longs";
 722     else
 723       stub_name = "backward_copy_longs";
 724 
 725     __ align(CodeEntryAlignment);
 726 
 727     StubCodeMark mark(this, "StubRoutines", stub_name);
 728 
 729     __ bind(start);
 730 
 731     Label unaligned_copy_long;
 732     if (AvoidUnalignedAccesses) {
 733       __ tbnz(d, 3, unaligned_copy_long);
 734     }
 735 
 736     if (direction == copy_forwards) {
 737       __ sub(s, s, bias);
 738       __ sub(d, d, bias);
 739     }
 740 
 741 #ifdef ASSERT
 742     // Make sure we are never given < 8 words
 743     {
 744       Label L;
 745       __ cmp(count, (u1)8);
 746       __ br(Assembler::GE, L);
 747       __ stop("genrate_copy_longs called with < 8 words");
 748       __ bind(L);
 749     }
 750 #endif
 751 
 752     // Fill 8 registers
 753     if (UseSIMDForMemoryOps) {
 754       __ ldpq(v0, v1, Address(s, 4 * unit));
 755       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 756     } else {
 757       __ ldp(t0, t1, Address(s, 2 * unit));
 758       __ ldp(t2, t3, Address(s, 4 * unit));
 759       __ ldp(t4, t5, Address(s, 6 * unit));
 760       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 761     }
 762 
 763     __ subs(count, count, 16);
 764     __ br(Assembler::LO, drain);
 765 
 766     int prefetch = PrefetchCopyIntervalInBytes;
 767     bool use_stride = false;
 768     if (direction == copy_backwards) {
 769        use_stride = prefetch > 256;
 770        prefetch = -prefetch;
 771        if (use_stride) __ mov(stride, prefetch);
 772     }
 773 
 774     __ bind(again);
 775 
 776     if (PrefetchCopyIntervalInBytes > 0)
 777       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 778 
 779     if (UseSIMDForMemoryOps) {
 780       __ stpq(v0, v1, Address(d, 4 * unit));
 781       __ ldpq(v0, v1, Address(s, 4 * unit));
 782       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 783       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 784     } else {
 785       __ stp(t0, t1, Address(d, 2 * unit));
 786       __ ldp(t0, t1, Address(s, 2 * unit));
 787       __ stp(t2, t3, Address(d, 4 * unit));
 788       __ ldp(t2, t3, Address(s, 4 * unit));
 789       __ stp(t4, t5, Address(d, 6 * unit));
 790       __ ldp(t4, t5, Address(s, 6 * unit));
 791       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 792       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 793     }
 794 
 795     __ subs(count, count, 8);
 796     __ br(Assembler::HS, again);
 797 
 798     // Drain
 799     __ bind(drain);
 800     if (UseSIMDForMemoryOps) {
 801       __ stpq(v0, v1, Address(d, 4 * unit));
 802       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 803     } else {
 804       __ stp(t0, t1, Address(d, 2 * unit));
 805       __ stp(t2, t3, Address(d, 4 * unit));
 806       __ stp(t4, t5, Address(d, 6 * unit));
 807       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 808     }
 809 
 810     {
 811       Label L1, L2;
 812       __ tbz(count, exact_log2(4), L1);
 813       if (UseSIMDForMemoryOps) {
 814         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 815         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 816       } else {
 817         __ ldp(t0, t1, Address(s, 2 * unit));
 818         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 819         __ stp(t0, t1, Address(d, 2 * unit));
 820         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 821       }
 822       __ bind(L1);
 823 
 824       if (direction == copy_forwards) {
 825         __ add(s, s, bias);
 826         __ add(d, d, bias);
 827       }
 828 
 829       __ tbz(count, 1, L2);
 830       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 831       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 832       __ bind(L2);
 833     }
 834 
 835     __ ret(lr);
 836 
 837     if (AvoidUnalignedAccesses) {
 838       Label drain, again;
 839       // Register order for storing. Order is different for backward copy.
 840 
 841       __ bind(unaligned_copy_long);
 842 
 843       // source address is even aligned, target odd aligned
 844       //
 845       // when forward copying word pairs we read long pairs at offsets
 846       // {0, 2, 4, 6} (in long words). when backwards copying we read
 847       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 848       // address by -2 in the forwards case so we can compute the
 849       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 850       // or -1.
 851       //
 852       // when forward copying we need to store 1 word, 3 pairs and
 853       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a
 854       // zero offset We adjust the destination by -1 which means we
 855       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 856       //
 857       // When backwards copyng we need to store 1 word, 3 pairs and
 858       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 859       // offsets {1, 3, 5, 7, 8} * unit.
 860 
 861       if (direction == copy_forwards) {
 862         __ sub(s, s, 16);
 863         __ sub(d, d, 8);
 864       }
 865 
 866       // Fill 8 registers
 867       //
 868       // for forwards copy s was offset by -16 from the original input
 869       // value of s so the register contents are at these offsets
 870       // relative to the 64 bit block addressed by that original input
 871       // and so on for each successive 64 byte block when s is updated
 872       //
 873       // t0 at offset 0,  t1 at offset 8
 874       // t2 at offset 16, t3 at offset 24
 875       // t4 at offset 32, t5 at offset 40
 876       // t6 at offset 48, t7 at offset 56
 877 
 878       // for backwards copy s was not offset so the register contents
 879       // are at these offsets into the preceding 64 byte block
 880       // relative to that original input and so on for each successive
 881       // preceding 64 byte block when s is updated. this explains the
 882       // slightly counter-intuitive looking pattern of register usage
 883       // in the stp instructions for backwards copy.
 884       //
 885       // t0 at offset -16, t1 at offset -8
 886       // t2 at offset -32, t3 at offset -24
 887       // t4 at offset -48, t5 at offset -40
 888       // t6 at offset -64, t7 at offset -56
 889 
 890       __ ldp(t0, t1, Address(s, 2 * unit));
 891       __ ldp(t2, t3, Address(s, 4 * unit));
 892       __ ldp(t4, t5, Address(s, 6 * unit));
 893       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 894 
 895       __ subs(count, count, 16);
 896       __ br(Assembler::LO, drain);
 897 
 898       int prefetch = PrefetchCopyIntervalInBytes;
 899       bool use_stride = false;
 900       if (direction == copy_backwards) {
 901          use_stride = prefetch > 256;
 902          prefetch = -prefetch;
 903          if (use_stride) __ mov(stride, prefetch);
 904       }
 905 
 906       __ bind(again);
 907 
 908       if (PrefetchCopyIntervalInBytes > 0)
 909         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 910 
 911       if (direction == copy_forwards) {
 912        // allowing for the offset of -8 the store instructions place
 913        // registers into the target 64 bit block at the following
 914        // offsets
 915        //
 916        // t0 at offset 0
 917        // t1 at offset 8,  t2 at offset 16
 918        // t3 at offset 24, t4 at offset 32
 919        // t5 at offset 40, t6 at offset 48
 920        // t7 at offset 56
 921 
 922         __ str(t0, Address(d, 1 * unit));
 923         __ stp(t1, t2, Address(d, 2 * unit));
 924         __ ldp(t0, t1, Address(s, 2 * unit));
 925         __ stp(t3, t4, Address(d, 4 * unit));
 926         __ ldp(t2, t3, Address(s, 4 * unit));
 927         __ stp(t5, t6, Address(d, 6 * unit));
 928         __ ldp(t4, t5, Address(s, 6 * unit));
 929         __ str(t7, Address(__ pre(d, 8 * unit)));
 930         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 931       } else {
 932        // d was not offset when we started so the registers are
 933        // written into the 64 bit block preceding d with the following
 934        // offsets
 935        //
 936        // t1 at offset -8
 937        // t3 at offset -24, t0 at offset -16
 938        // t5 at offset -48, t2 at offset -32
 939        // t7 at offset -56, t4 at offset -48
 940        //                   t6 at offset -64
 941        //
 942        // note that this matches the offsets previously noted for the
 943        // loads
 944 
 945         __ str(t1, Address(d, 1 * unit));
 946         __ stp(t3, t0, Address(d, 3 * unit));
 947         __ ldp(t0, t1, Address(s, 2 * unit));
 948         __ stp(t5, t2, Address(d, 5 * unit));
 949         __ ldp(t2, t3, Address(s, 4 * unit));
 950         __ stp(t7, t4, Address(d, 7 * unit));
 951         __ ldp(t4, t5, Address(s, 6 * unit));
 952         __ str(t6, Address(__ pre(d, 8 * unit)));
 953         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 954       }
 955 
 956       __ subs(count, count, 8);
 957       __ br(Assembler::HS, again);
 958 
 959       // Drain
 960       //
 961       // this uses the same pattern of offsets and register arguments
 962       // as above
 963       __ bind(drain);
 964       if (direction == copy_forwards) {
 965         __ str(t0, Address(d, 1 * unit));
 966         __ stp(t1, t2, Address(d, 2 * unit));
 967         __ stp(t3, t4, Address(d, 4 * unit));
 968         __ stp(t5, t6, Address(d, 6 * unit));
 969         __ str(t7, Address(__ pre(d, 8 * unit)));
 970       } else {
 971         __ str(t1, Address(d, 1 * unit));
 972         __ stp(t3, t0, Address(d, 3 * unit));
 973         __ stp(t5, t2, Address(d, 5 * unit));
 974         __ stp(t7, t4, Address(d, 7 * unit));
 975         __ str(t6, Address(__ pre(d, 8 * unit)));
 976       }
 977       // now we need to copy any remaining part block which may
 978       // include a 4 word block subblock and/or a 2 word subblock.
 979       // bits 2 and 1 in the count are the tell-tale for whetehr we
 980       // have each such subblock
 981       {
 982         Label L1, L2;
 983         __ tbz(count, exact_log2(4), L1);
 984        // this is the same as above but copying only 4 longs hence
 985        // with ony one intervening stp between the str instructions
 986        // but note that the offsets and registers still follow the
 987        // same pattern
 988         __ ldp(t0, t1, Address(s, 2 * unit));
 989         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 990         if (direction == copy_forwards) {
 991           __ str(t0, Address(d, 1 * unit));
 992           __ stp(t1, t2, Address(d, 2 * unit));
 993           __ str(t3, Address(__ pre(d, 4 * unit)));
 994         } else {
 995           __ str(t1, Address(d, 1 * unit));
 996           __ stp(t3, t0, Address(d, 3 * unit));
 997           __ str(t2, Address(__ pre(d, 4 * unit)));
 998         }
 999         __ bind(L1);
1000 
1001         __ tbz(count, 1, L2);
1002        // this is the same as above but copying only 2 longs hence
1003        // there is no intervening stp between the str instructions
1004        // but note that the offset and register patterns are still
1005        // the same
1006         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1007         if (direction == copy_forwards) {
1008           __ str(t0, Address(d, 1 * unit));
1009           __ str(t1, Address(__ pre(d, 2 * unit)));
1010         } else {
1011           __ str(t1, Address(d, 1 * unit));
1012           __ str(t0, Address(__ pre(d, 2 * unit)));
1013         }
1014         __ bind(L2);
1015 
1016        // for forwards copy we need to re-adjust the offsets we
1017        // applied so that s and d are follow the last words written
1018 
1019        if (direction == copy_forwards) {
1020          __ add(s, s, 16);
1021          __ add(d, d, 8);
1022        }
1023 
1024       }
1025 
1026       __ ret(lr);
1027       }
1028   }
1029 
1030   // Small copy: less than 16 bytes.
1031   //
1032   // NB: Ignores all of the bits of count which represent more than 15
1033   // bytes, so a caller doesn't have to mask them.
1034 
1035   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1036     bool is_backwards = step < 0;
1037     size_t granularity = uabs(step);
1038     int direction = is_backwards ? -1 : 1;
1039     int unit = wordSize * direction;
1040 
1041     Label Lword, Lint, Lshort, Lbyte;
1042 
1043     assert(granularity
1044            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1045 
1046     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1047 
1048     // ??? I don't know if this bit-test-and-branch is the right thing
1049     // to do.  It does a lot of jumping, resulting in several
1050     // mispredicted branches.  It might make more sense to do this
1051     // with something like Duff's device with a single computed branch.
1052 
1053     __ tbz(count, 3 - exact_log2(granularity), Lword);
1054     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1055     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1056     __ bind(Lword);
1057 
1058     if (granularity <= sizeof (jint)) {
1059       __ tbz(count, 2 - exact_log2(granularity), Lint);
1060       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1061       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1062       __ bind(Lint);
1063     }
1064 
1065     if (granularity <= sizeof (jshort)) {
1066       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1067       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1068       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1069       __ bind(Lshort);
1070     }
1071 
1072     if (granularity <= sizeof (jbyte)) {
1073       __ tbz(count, 0, Lbyte);
1074       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1075       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1076       __ bind(Lbyte);
1077     }
1078   }
1079 
1080   Label copy_f, copy_b;
1081 
1082   // All-singing all-dancing memory copy.
1083   //
1084   // Copy count units of memory from s to d.  The size of a unit is
1085   // step, which can be positive or negative depending on the direction
1086   // of copy.  If is_aligned is false, we align the source address.
1087   //
1088 
1089   void copy_memory(bool is_aligned, Register s, Register d,
1090                    Register count, Register tmp, int step) {
1091     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1092     bool is_backwards = step < 0;
1093     int granularity = uabs(step);
1094     const Register t0 = r3, t1 = r4;
1095 
1096     // <= 96 bytes do inline. Direction doesn't matter because we always
1097     // load all the data before writing anything
1098     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1099     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1100     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1101     const Register send = r17, dend = r18;
1102 
1103     if (PrefetchCopyIntervalInBytes > 0)
1104       __ prfm(Address(s, 0), PLDL1KEEP);
1105     __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1106     __ br(Assembler::HI, copy_big);
1107 
1108     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1109     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1110 
1111     __ cmp(count, u1(16/granularity));
1112     __ br(Assembler::LS, copy16);
1113 
1114     __ cmp(count, u1(64/granularity));
1115     __ br(Assembler::HI, copy80);
1116 
1117     __ cmp(count, u1(32/granularity));
1118     __ br(Assembler::LS, copy32);
1119 
1120     // 33..64 bytes
1121     if (UseSIMDForMemoryOps) {
1122       __ ldpq(v0, v1, Address(s, 0));
1123       __ ldpq(v2, v3, Address(send, -32));
1124       __ stpq(v0, v1, Address(d, 0));
1125       __ stpq(v2, v3, Address(dend, -32));
1126     } else {
1127       __ ldp(t0, t1, Address(s, 0));
1128       __ ldp(t2, t3, Address(s, 16));
1129       __ ldp(t4, t5, Address(send, -32));
1130       __ ldp(t6, t7, Address(send, -16));
1131 
1132       __ stp(t0, t1, Address(d, 0));
1133       __ stp(t2, t3, Address(d, 16));
1134       __ stp(t4, t5, Address(dend, -32));
1135       __ stp(t6, t7, Address(dend, -16));
1136     }
1137     __ b(finish);
1138 
1139     // 17..32 bytes
1140     __ bind(copy32);
1141     __ ldp(t0, t1, Address(s, 0));
1142     __ ldp(t2, t3, Address(send, -16));
1143     __ stp(t0, t1, Address(d, 0));
1144     __ stp(t2, t3, Address(dend, -16));
1145     __ b(finish);
1146 
1147     // 65..80/96 bytes
1148     // (96 bytes if SIMD because we do 32 byes per instruction)
1149     __ bind(copy80);
1150     if (UseSIMDForMemoryOps) {
1151       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1152       __ ldpq(v4, v5, Address(send, -32));
1153       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1154       __ stpq(v4, v5, Address(dend, -32));
1155     } else {
1156       __ ldp(t0, t1, Address(s, 0));
1157       __ ldp(t2, t3, Address(s, 16));
1158       __ ldp(t4, t5, Address(s, 32));
1159       __ ldp(t6, t7, Address(s, 48));
1160       __ ldp(t8, t9, Address(send, -16));
1161 
1162       __ stp(t0, t1, Address(d, 0));
1163       __ stp(t2, t3, Address(d, 16));
1164       __ stp(t4, t5, Address(d, 32));
1165       __ stp(t6, t7, Address(d, 48));
1166       __ stp(t8, t9, Address(dend, -16));
1167     }
1168     __ b(finish);
1169 
1170     // 0..16 bytes
1171     __ bind(copy16);
1172     __ cmp(count, u1(8/granularity));
1173     __ br(Assembler::LO, copy8);
1174 
1175     // 8..16 bytes
1176     __ ldr(t0, Address(s, 0));
1177     __ ldr(t1, Address(send, -8));
1178     __ str(t0, Address(d, 0));
1179     __ str(t1, Address(dend, -8));
1180     __ b(finish);
1181 
1182     if (granularity < 8) {
1183       // 4..7 bytes
1184       __ bind(copy8);
1185       __ tbz(count, 2 - exact_log2(granularity), copy4);
1186       __ ldrw(t0, Address(s, 0));
1187       __ ldrw(t1, Address(send, -4));
1188       __ strw(t0, Address(d, 0));
1189       __ strw(t1, Address(dend, -4));
1190       __ b(finish);
1191       if (granularity < 4) {
1192         // 0..3 bytes
1193         __ bind(copy4);
1194         __ cbz(count, finish); // get rid of 0 case
1195         if (granularity == 2) {
1196           __ ldrh(t0, Address(s, 0));
1197           __ strh(t0, Address(d, 0));
1198         } else { // granularity == 1
1199           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1200           // the first and last byte.
1201           // Handle the 3 byte case by loading and storing base + count/2
1202           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1203           // This does means in the 1 byte case we load/store the same
1204           // byte 3 times.
1205           __ lsr(count, count, 1);
1206           __ ldrb(t0, Address(s, 0));
1207           __ ldrb(t1, Address(send, -1));
1208           __ ldrb(t2, Address(s, count));
1209           __ strb(t0, Address(d, 0));
1210           __ strb(t1, Address(dend, -1));
1211           __ strb(t2, Address(d, count));
1212         }
1213         __ b(finish);
1214       }
1215     }
1216 
1217     __ bind(copy_big);
1218     if (is_backwards) {
1219       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1220       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1221     }
1222 
1223     // Now we've got the small case out of the way we can align the
1224     // source address on a 2-word boundary.
1225 
1226     Label aligned;
1227 
1228     if (is_aligned) {
1229       // We may have to adjust by 1 word to get s 2-word-aligned.
1230       __ tbz(s, exact_log2(wordSize), aligned);
1231       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1232       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1233       __ sub(count, count, wordSize/granularity);
1234     } else {
1235       if (is_backwards) {
1236         __ andr(rscratch2, s, 2 * wordSize - 1);
1237       } else {
1238         __ neg(rscratch2, s);
1239         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1240       }
1241       // rscratch2 is the byte adjustment needed to align s.
1242       __ cbz(rscratch2, aligned);
1243       int shift = exact_log2(granularity);
1244       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1245       __ sub(count, count, rscratch2);
1246 
1247 #if 0
1248       // ?? This code is only correct for a disjoint copy.  It may or
1249       // may not make sense to use it in that case.
1250 
1251       // Copy the first pair; s and d may not be aligned.
1252       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1253       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1254 
1255       // Align s and d, adjust count
1256       if (is_backwards) {
1257         __ sub(s, s, rscratch2);
1258         __ sub(d, d, rscratch2);
1259       } else {
1260         __ add(s, s, rscratch2);
1261         __ add(d, d, rscratch2);
1262       }
1263 #else
1264       copy_memory_small(s, d, rscratch2, rscratch1, step);
1265 #endif
1266     }
1267 
1268     __ bind(aligned);
1269 
1270     // s is now 2-word-aligned.
1271 
1272     // We have a count of units and some trailing bytes.  Adjust the
1273     // count and do a bulk copy of words.
1274     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1275     if (direction == copy_forwards)
1276       __ bl(copy_f);
1277     else
1278       __ bl(copy_b);
1279 
1280     // And the tail.
1281     copy_memory_small(s, d, count, tmp, step);
1282 
1283     if (granularity >= 8) __ bind(copy8);
1284     if (granularity >= 4) __ bind(copy4);
1285     __ bind(finish);
1286   }
1287 
1288 
1289   void clobber_registers() {
1290 #ifdef ASSERT
1291     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1292     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1293     for (Register r = r3; r <= r18; r++)
1294       if (r != rscratch1) __ mov(r, rscratch1);
1295 #endif
1296   }
1297 
1298   // Scan over array at a for count oops, verifying each one.
1299   // Preserves a and count, clobbers rscratch1 and rscratch2.
1300   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1301     Label loop, end;
1302     __ mov(rscratch1, a);
1303     __ mov(rscratch2, zr);
1304     __ bind(loop);
1305     __ cmp(rscratch2, count);
1306     __ br(Assembler::HS, end);
1307     if (size == (size_t)wordSize) {
1308       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1309       __ verify_oop(temp);
1310     } else {
1311       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1312       __ decode_heap_oop(temp); // calls verify_oop
1313     }
1314     __ add(rscratch2, rscratch2, size);
1315     __ b(loop);
1316     __ bind(end);
1317   }
1318 
1319   // Arguments:
1320   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1321   //             ignored
1322   //   is_oop  - true => oop array, so generate store check code
1323   //   name    - stub name string
1324   //
1325   // Inputs:
1326   //   c_rarg0   - source array address
1327   //   c_rarg1   - destination array address
1328   //   c_rarg2   - element count, treated as ssize_t, can be zero
1329   //
1330   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1331   // the hardware handle it.  The two dwords within qwords that span
1332   // cache line boundaries will still be loaded and stored atomicly.
1333   //
1334   // Side Effects:
1335   //   disjoint_int_copy_entry is set to the no-overlap entry point
1336   //   used by generate_conjoint_int_oop_copy().
1337   //
1338   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1339                                   const char *name, bool dest_uninitialized = false) {
1340     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1341     RegSet saved_reg = RegSet::of(s, d, count);
1342     __ align(CodeEntryAlignment);
1343     StubCodeMark mark(this, "StubRoutines", name);
1344     address start = __ pc();
1345     __ enter();
1346 
1347     if (entry != NULL) {
1348       *entry = __ pc();
1349       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1350       BLOCK_COMMENT("Entry:");
1351     }
1352 
1353     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1354     if (dest_uninitialized) {
1355       decorators |= IS_DEST_UNINITIALIZED;
1356     }
1357     if (aligned) {
1358       decorators |= ARRAYCOPY_ALIGNED;
1359     }
1360 
1361     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1362     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1363 
1364     if (is_oop) {
1365       // save regs before copy_memory
1366       __ push(RegSet::of(d, count), sp);
1367     }
1368     {
1369       // UnsafeCopyMemory page error: continue after ucm
1370       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1371       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1372       copy_memory(aligned, s, d, count, rscratch1, size);
1373     }
1374 
1375     if (is_oop) {
1376       __ pop(RegSet::of(d, count), sp);
1377       if (VerifyOops)
1378         verify_oop_array(size, d, count, r16);
1379     }
1380 
1381     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1382 
1383     __ leave();
1384     __ mov(r0, zr); // return 0
1385     __ ret(lr);
1386     return start;
1387   }
1388 
1389   // Arguments:
1390   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1391   //             ignored
1392   //   is_oop  - true => oop array, so generate store check code
1393   //   name    - stub name string
1394   //
1395   // Inputs:
1396   //   c_rarg0   - source array address
1397   //   c_rarg1   - destination array address
1398   //   c_rarg2   - element count, treated as ssize_t, can be zero
1399   //
1400   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1401   // the hardware handle it.  The two dwords within qwords that span
1402   // cache line boundaries will still be loaded and stored atomicly.
1403   //
1404   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1405                                  address *entry, const char *name,
1406                                  bool dest_uninitialized = false) {
1407     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1408     RegSet saved_regs = RegSet::of(s, d, count);
1409     StubCodeMark mark(this, "StubRoutines", name);
1410     address start = __ pc();
1411     __ enter();
1412 
1413     if (entry != NULL) {
1414       *entry = __ pc();
1415       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1416       BLOCK_COMMENT("Entry:");
1417     }
1418 
1419     // use fwd copy when (d-s) above_equal (count*size)
1420     __ sub(rscratch1, d, s);
1421     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1422     __ br(Assembler::HS, nooverlap_target);
1423 
1424     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1425     if (dest_uninitialized) {
1426       decorators |= IS_DEST_UNINITIALIZED;
1427     }
1428     if (aligned) {
1429       decorators |= ARRAYCOPY_ALIGNED;
1430     }
1431 
1432     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1433     bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1434 
1435     if (is_oop) {
1436       // save regs before copy_memory
1437       __ push(RegSet::of(d, count), sp);
1438     }
1439     {
1440       // UnsafeCopyMemory page error: continue after ucm
1441       bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1442       UnsafeCopyMemoryMark ucmm(this, add_entry, true);
1443       copy_memory(aligned, s, d, count, rscratch1, -size);
1444     }
1445     if (is_oop) {
1446       __ pop(RegSet::of(d, count), sp);
1447       if (VerifyOops)
1448         verify_oop_array(size, d, count, r16);
1449     }
1450     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1451     __ leave();
1452     __ mov(r0, zr); // return 0
1453     __ ret(lr);
1454     return start;
1455 }
1456 
1457   // Arguments:
1458   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1459   //             ignored
1460   //   name    - stub name string
1461   //
1462   // Inputs:
1463   //   c_rarg0   - source array address
1464   //   c_rarg1   - destination array address
1465   //   c_rarg2   - element count, treated as ssize_t, can be zero
1466   //
1467   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1468   // we let the hardware handle it.  The one to eight bytes within words,
1469   // dwords or qwords that span cache line boundaries will still be loaded
1470   // and stored atomically.
1471   //
1479   // Side Effects:
1480   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1481   //   used by generate_conjoint_byte_copy().
1482   //
1483   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1484     const bool not_oop = false;
1485     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1486   }
1487 
1488   // Arguments:
1489   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1490   //             ignored
1491   //   name    - stub name string
1492   //
1493   // Inputs:
1494   //   c_rarg0   - source array address
1495   //   c_rarg1   - destination array address
1496   //   c_rarg2   - element count, treated as ssize_t, can be zero
1497   //
1498   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1499   // we let the hardware handle it.  The one to eight bytes within words,
1500   // dwords or qwords that span cache line boundaries will still be loaded
1501   // and stored atomically.
1502   //
1503   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1504                                       address* entry, const char *name) {
1505     const bool not_oop = false;
1506     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1507   }
1508 
1509   // Arguments:
1510   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1511   //             ignored
1512   //   name    - stub name string
1513   //
1514   // Inputs:
1515   //   c_rarg0   - source array address
1516   //   c_rarg1   - destination array address
1517   //   c_rarg2   - element count, treated as ssize_t, can be zero
1518   //
1519   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1520   // let the hardware handle it.  The two or four words within dwords
1521   // or qwords that span cache line boundaries will still be loaded
1522   // and stored atomically.
1523   //
1524   // Side Effects:
1525   //   disjoint_short_copy_entry is set to the no-overlap entry point
1526   //   used by generate_conjoint_short_copy().
1527   //
1528   address generate_disjoint_short_copy(bool aligned,
1529                                        address* entry, const char *name) {
1530     const bool not_oop = false;
1531     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1532   }
1533 
1534   // Arguments:
1535   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1536   //             ignored
1537   //   name    - stub name string
1538   //
1539   // Inputs:
1540   //   c_rarg0   - source array address
1541   //   c_rarg1   - destination array address
1542   //   c_rarg2   - element count, treated as ssize_t, can be zero
1543   //
1544   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1545   // let the hardware handle it.  The two or four words within dwords
1546   // or qwords that span cache line boundaries will still be loaded
1547   // and stored atomically.
1548   //
1549   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1550                                        address *entry, const char *name) {
1551     const bool not_oop = false;
1552     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1554   }

1555   // Arguments:
1556   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1557   //             ignored
1558   //   name    - stub name string
1559   //
1560   // Inputs:
1561   //   c_rarg0   - source array address
1562   //   c_rarg1   - destination array address
1563   //   c_rarg2   - element count, treated as ssize_t, can be zero
1564   //
1565   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1566   // the hardware handle it.  The two dwords within qwords that span
1567   // cache line boundaries will still be loaded and stored atomically.
1568   //
1569   // Side Effects:
1570   //   disjoint_int_copy_entry is set to the no-overlap entry point
1571   //   used by generate_conjoint_int_oop_copy().
1572   //
1573   address generate_disjoint_int_copy(bool aligned, address *entry,
1574                                          const char *name, bool dest_uninitialized = false) {
1575     const bool not_oop = false;
1576     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1577   }
1578 
1579   // Arguments:
1580   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1581   //             ignored
1582   //   name    - stub name string
1583   //
1584   // Inputs:
1585   //   c_rarg0   - source array address
1586   //   c_rarg1   - destination array address
1587   //   c_rarg2   - element count, treated as ssize_t, can be zero
1588   //
1589   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1590   // the hardware handle it.  The two dwords within qwords that span
1591   // cache line boundaries will still be loaded and stored atomically.
1592   //
1593   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1594                                      address *entry, const char *name,
1595                                      bool dest_uninitialized = false) {
1596     const bool not_oop = false;
1597     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1598   }
1599 
1600 
1601   // Arguments:
1602   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1603   //             ignored
1604   //   name    - stub name string
1605   //
1606   // Inputs:
1607   //   c_rarg0   - source array address
1608   //   c_rarg1   - destination array address
1609   //   c_rarg2   - element count, treated as size_t, can be zero
1610   //
1611   // Side Effects:
1612   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1613   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1614   //
1615   address generate_disjoint_long_copy(bool aligned, address *entry,
1616                                           const char *name, bool dest_uninitialized = false) {
1617     const bool not_oop = false;
1618     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1619   }
1620 
1621   // Arguments:
1622   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1623   //             ignored
1624   //   name    - stub name string
1625   //
1626   // Inputs:
1627   //   c_rarg0   - source array address
1628   //   c_rarg1   - destination array address
1629   //   c_rarg2   - element count, treated as size_t, can be zero
1630   //
1631   address generate_conjoint_long_copy(bool aligned,
1632                                       address nooverlap_target, address *entry,
1633                                       const char *name, bool dest_uninitialized = false) {
1634     const bool not_oop = false;
1635     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1636   }
1637 
1638   // Arguments:
1639   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1640   //             ignored
1641   //   name    - stub name string
1642   //
1643   // Inputs:
1644   //   c_rarg0   - source array address
1645   //   c_rarg1   - destination array address
1646   //   c_rarg2   - element count, treated as size_t, can be zero
1647   //
1648   // Side Effects:
1649   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1650   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1651   //
1652   address generate_disjoint_oop_copy(bool aligned, address *entry,
1653                                      const char *name, bool dest_uninitialized) {
1654     const bool is_oop = true;
1655     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1656     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1657   }
1658 
1659   // Arguments:
1660   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1661   //             ignored
1662   //   name    - stub name string
1663   //
1664   // Inputs:
1665   //   c_rarg0   - source array address
1666   //   c_rarg1   - destination array address
1667   //   c_rarg2   - element count, treated as size_t, can be zero
1668   //
1669   address generate_conjoint_oop_copy(bool aligned,
1670                                      address nooverlap_target, address *entry,
1671                                      const char *name, bool dest_uninitialized) {
1672     const bool is_oop = true;
1673     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1674     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1675                                   name, dest_uninitialized);
1676   }
1677 
1678 
1679   // Helper for generating a dynamic type check.
1680   // Smashes rscratch1, rscratch2.
1681   void generate_type_check(Register sub_klass,
1682                            Register super_check_offset,
1683                            Register super_klass,
1684                            Label& L_success) {
1685     assert_different_registers(sub_klass, super_check_offset, super_klass);
1686 
1687     BLOCK_COMMENT("type_check:");
1688 
1689     Label L_miss;
1690 
1691     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1692                                      super_check_offset);
1693     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1694 
1695     // Fall through on failure!
1696     __ BIND(L_miss);
1697   }
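
  // A rough, hedged sketch of the check emitted above (the real work is done by
  // check_klass_subtype_fast_path/slow_path; names here are illustrative only):
  //
  //   if (sub_klass == super_klass) goto L_success;              // trivial hit
  //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
  //     goto L_success;                                          // cached supertype
  //   if (secondary supers of sub_klass contain super_klass)     // slow path scan
  //     goto L_success;
  //   // otherwise fall through to L_miss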
1698 
1699   //
1700   //  Generate checkcasting array copy stub
1701   //
1702   //  Input:
1703   //    c_rarg0   - source array address
1704   //    c_rarg1   - destination array address
1705   //    c_rarg2   - element count, treated as ssize_t, can be zero
1706   //    c_rarg3   - size_t ckoff (super_check_offset)
1707   //    c_rarg4   - oop ckval (super_klass)
1708   //
1709   //  Output:
1710   //    r0 ==  0  -  success
1711   //    r0 == -1^K - failure, where K is partial transfer count
1712   //
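  //  A caller can recover the partial transfer count K from the return value
  //  by bitwise negation; a hedged, hypothetical caller-side sketch:
  //
  //    intptr_t ret = checkcast_copy(from, to, count, ckoff, ckval);
  //    if (ret != 0) {
  //      size_t copied = ~(size_t)ret;   // K elements copied before the failure
  //    }
  //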
1713   address generate_checkcast_copy(const char *name, address *entry,
1714                                   bool dest_uninitialized = false) {
1715 
1716     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1717 
1718     // Input registers (after setup_arg_regs)
1719     const Register from        = c_rarg0;   // source array address
1720     const Register to          = c_rarg1;   // destination array address
1721     const Register count       = c_rarg2;   // elements count
1722     const Register ckoff       = c_rarg3;   // super_check_offset
1723     const Register ckval       = c_rarg4;   // super_klass
1724 
1725     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1726     RegSet wb_post_saved_regs = RegSet::of(count);
1727 
1728     // Registers used as temps (r18, r19, r20 are save-on-entry)
1729     const Register count_save  = r21;       // original elements count
1730     const Register start_to    = r20;       // destination array start address
1731     const Register copied_oop  = r18;       // actual oop copied
1732     const Register r19_klass   = r19;       // oop._klass
1733 
1734     //---------------------------------------------------------------
1735     // Assembler stub will be used for this call to arraycopy
1736     // if the two arrays are subtypes of Object[] but the
1737     // destination array type is not equal to or a supertype
1738     // of the source type.  Each element must be separately
1739     // checked.
1740 
1741     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1742                                copied_oop, r19_klass, count_save);
1743 
1744     __ align(CodeEntryAlignment);
1745     StubCodeMark mark(this, "StubRoutines", name);
1746     address start = __ pc();
1747 
1748     __ enter(); // required for proper stackwalking of RuntimeStub frame
1749 
1750 #ifdef ASSERT
1751     // caller guarantees that the arrays really are different
1752     // otherwise, we would have to make conjoint checks
1753     { Label L;
1754       array_overlap_test(L, TIMES_OOP);
1755       __ stop("checkcast_copy within a single array");
1756       __ bind(L);
1757     }
1758 #endif //ASSERT
1759 
1760     // Caller of this entry point must set up the argument registers.
1761     if (entry != NULL) {
1762       *entry = __ pc();
1763       BLOCK_COMMENT("Entry:");
1764     }
1765 
1766     // Empty array:  Nothing to do.
1767     __ cbz(count, L_done);
1768 
1769     __ push(RegSet::of(r18, r19, r20, r21), sp);
1770 
1771 #ifdef ASSERT
1772     BLOCK_COMMENT("assert consistent ckoff/ckval");
1773     // The ckoff and ckval must be mutually consistent,
1774     // even though caller generates both.
1775     { Label L;
1776       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1777       __ ldrw(start_to, Address(ckval, sco_offset));
1778       __ cmpw(ckoff, start_to);
1779       __ br(Assembler::EQ, L);
1780       __ stop("super_check_offset inconsistent");
1781       __ bind(L);
1782     }
1783 #endif //ASSERT
1784 
1785     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1786     bool is_oop = true;
1787     if (dest_uninitialized) {
1788       decorators |= IS_DEST_UNINITIALIZED;
1789     }
1790 
1791     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1792     bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1793 
1794     // save the original count
1795     __ mov(count_save, count);
1796 
1797     // Copy from low to high addresses
1798     __ mov(start_to, to);              // Save destination array start address
1799     __ b(L_load_element);
1800 
1801     // ======== begin loop ========
1802     // (Loop is rotated; its entry is L_load_element.)
1803     // Loop control:
1804     //   for (; count != 0; count--) {
1805     //     copied_oop = load_heap_oop(from++);
1806     //     ... generate_type_check ...;
1807     //     store_heap_oop(to++, copied_oop);
1808     //   }
1809     __ align(OptoLoopAlignment);
1810 
1811     __ BIND(L_store_element);
1812     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1813     __ sub(count, count, 1);
1814     __ cbz(count, L_do_card_marks);
1815 
1816     // ======== loop entry is here ========
1817     __ BIND(L_load_element);
1818     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1819     __ cbz(copied_oop, L_store_element);
1820 
1821     __ load_klass(r19_klass, copied_oop); // query the object klass
1822     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1823     // ======== end loop ========
1824 
1825     // It was a real error; we must depend on the caller to finish the job.
1826     // Register count = remaining oops, count_save = total oops.
1827     // Emit GC store barriers for the oops we have copied and report
1828     // their number to the caller.
1829 
1830     __ subs(count, count_save, count);     // K = partially copied oop count
1831     __ eon(count, count, zr);                   // report (-1^K) to caller
1832     __ br(Assembler::EQ, L_done_pop);
1833 
1834     __ BIND(L_do_card_marks);
1835     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1836 
1837     __ bind(L_done_pop);
1838     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1839     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1840 
1841     __ bind(L_done);
1842     __ mov(r0, count);
1843     __ leave();
1844     __ ret(lr);
1845 
1846     return start;
1847   }
1848 
1849   // Perform range checks on the proposed arraycopy.
1850   // Kills temp, but nothing else.
1851   // Also, clean the sign bits of src_pos and dst_pos.
1852   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1853                               Register src_pos, // source position (c_rarg1)
1854                               Register dst,     // destination array oop (c_rarg2)
1855                               Register dst_pos, // destination position (c_rarg3)
1856                               Register length,
1857                               Register temp,
1858                               Label& L_failed) {
1859     BLOCK_COMMENT("arraycopy_range_checks:");
1860 
1861     assert_different_registers(rscratch1, temp);
1862 
1863     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1864     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1865     __ addw(temp, length, src_pos);
1866     __ cmpw(temp, rscratch1);
1867     __ br(Assembler::HI, L_failed);
1868 
1869     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1870     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1871     __ addw(temp, length, dst_pos);
1872     __ cmpw(temp, rscratch1);
1873     __ br(Assembler::HI, L_failed);
1874 
1875     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1876     __ movw(src_pos, src_pos);
1877     __ movw(dst_pos, dst_pos);
1878 
1879     BLOCK_COMMENT("arraycopy_range_checks done");
1880   }
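
  // A hedged C sketch of the checks above; the unsigned 32-bit compares also
  // catch overflow of src_pos + length and dst_pos + length:
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)arrayOop(src)->length()) goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)arrayOop(dst)->length()) goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // movw clears the upper 32 bits
  //   dst_pos = (uint32_t)dst_pos;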
1881 
1882   // This stub is only reached from a simple test routine; it will be
1883   // implemented properly once it is called from code that actually
1884   // does something.
1885   static void fake_arraycopy_stub(address src, address dst, int count) {
1886     assert(count == 0, "huh?");
1887   }
1888 
1889 
1890   //
1891   //  Generate 'unsafe' array copy stub
1892   //  Though just as safe as the other stubs, it takes an unscaled
1893   //  size_t argument instead of an element count.
1894   //
1895   //  Input:
1896   //    c_rarg0   - source array address
1897   //    c_rarg1   - destination array address
1898   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1899   //
1900   // Examines the alignment of the operands and dispatches
1901   // to a long, int, short, or byte copy loop.
1902   //
1903   address generate_unsafe_copy(const char *name,
1904                                address byte_copy_entry,
1905                                address short_copy_entry,
1906                                address int_copy_entry,
1907                                address long_copy_entry) {
1908     Label L_long_aligned, L_int_aligned, L_short_aligned;
1909     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1910 
1911     __ align(CodeEntryAlignment);
1912     StubCodeMark mark(this, "StubRoutines", name);
1913     address start = __ pc();
1914     __ enter(); // required for proper stackwalking of RuntimeStub frame
1915 
1916     // bump this on entry, not on exit:
1917     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1918 
1919     __ orr(rscratch1, s, d);
1920     __ orr(rscratch1, rscratch1, count);
1921 
1922     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1923     __ cbz(rscratch1, L_long_aligned);
1924     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1925     __ cbz(rscratch1, L_int_aligned);
1926     __ tbz(rscratch1, 0, L_short_aligned);
1927     __ b(RuntimeAddress(byte_copy_entry));
1928 
1929     __ BIND(L_short_aligned);
1930     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1931     __ b(RuntimeAddress(short_copy_entry));
1932     __ BIND(L_int_aligned);
1933     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1934     __ b(RuntimeAddress(int_copy_entry));
1935     __ BIND(L_long_aligned);
1936     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1937     __ b(RuntimeAddress(long_copy_entry));
1938 
1939     return start;
1940   }
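
  // The dispatch above reduces to testing the combined low bits of source,
  // destination and byte count; a hedged C sketch:
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) long_copy (s, d, count >> LogBytesPerLong);
  //   else if ((bits & (BytesPerInt  - 1)) == 0) int_copy  (s, d, count >> LogBytesPerInt);
  //   else if ((bits & 1) == 0)                  short_copy(s, d, count >> LogBytesPerShort);
  //   else                                       byte_copy (s, d, count);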
1941 
1942   //
1943   //  Generate generic array copy stubs
1944   //
1945   //  Input:
1946   //    c_rarg0    -  src oop
1947   //    c_rarg1    -  src_pos (32-bits)
1948   //    c_rarg2    -  dst oop
1949   //    c_rarg3    -  dst_pos (32-bits)
1950   //    c_rarg4    -  element count (32-bits)
1951   //
1952   //  Output:
1953   //    r0 ==  0  -  success
1954   //    r0 == -1^K - failure, where K is partial transfer count
1955   //
1956   address generate_generic_copy(const char *name,
1957                                 address byte_copy_entry, address short_copy_entry,
1958                                 address int_copy_entry, address oop_copy_entry,
1959                                 address long_copy_entry, address checkcast_copy_entry) {
1960 
1961     Label L_failed, L_objArray;
1962     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1963 
1964     // Input registers
1965     const Register src        = c_rarg0;  // source array oop
1966     const Register src_pos    = c_rarg1;  // source position
1967     const Register dst        = c_rarg2;  // destination array oop
1968     const Register dst_pos    = c_rarg3;  // destination position
1969     const Register length     = c_rarg4;
1970 
1971 
1972     // Registers used as temps
1973     const Register dst_klass  = c_rarg5;
1974 
1975     __ align(CodeEntryAlignment);
1976 
1977     StubCodeMark mark(this, "StubRoutines", name);
1978 
1979     address start = __ pc();
1980 
1981     __ enter(); // required for proper stackwalking of RuntimeStub frame
1982 
1983     // bump this on entry, not on exit:
1984     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1985 
1986     //-----------------------------------------------------------------------
1987     // Assembler stub will be used for this call to arraycopy
1988     // if the following conditions are met:
1989     //
1990     // (1) src and dst must not be null.
1991     // (2) src_pos must not be negative.
1992     // (3) dst_pos must not be negative.
1993     // (4) length  must not be negative.
1994     // (5) src klass and dst klass should be the same and not NULL.
1995     // (6) src and dst should be arrays.
1996     // (7) src_pos + length must not exceed length of src.
1997     // (8) dst_pos + length must not exceed length of dst.
1998     //
1999 
2000     //  if (src == NULL) return -1;
2001     __ cbz(src, L_failed);
2002 
2003     //  if (src_pos < 0) return -1;
2004     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2005 
2006     //  if (dst == NULL) return -1;
2007     __ cbz(dst, L_failed);
2008 
2009     //  if (dst_pos < 0) return -1;
2010     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2011 
2012     // registers used as temp
2013     const Register scratch_length    = r16; // elements count to copy
2014     const Register scratch_src_klass = r17; // array klass
2015     const Register lh                = r18; // layout helper
2016 
2017     //  if (length < 0) return -1;
2018     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2019     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2020 
2021     __ load_klass(scratch_src_klass, src);
2022 #ifdef ASSERT
2023     //  assert(src->klass() != NULL);
2024     {
2025       BLOCK_COMMENT("assert klasses not null {");
2026       Label L1, L2;
2027       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2028       __ bind(L1);
2029       __ stop("broken null klass");
2030       __ bind(L2);
2031       __ load_klass(rscratch1, dst);
2032       __ cbz(rscratch1, L1);     // this would be broken also
2033       BLOCK_COMMENT("} assert klasses not null done");
2034     }
2035 #endif
2036 
2037     // Load layout helper (32-bits)
2038     //
2039     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2040     // 32        30    24            16              8     2                 0
2041     //
2042     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2043     //
2044 
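    // A hedged sketch of how the layout helper fields are decoded (positions
    // follow the diagram above):
    //
    //   int log2_esize = lh & Klass::_lh_log2_element_size_mask;
    //   int array_hdr  = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int array_tag  = (juint)lh >> Klass::_lh_array_tag_shift;  // 0x3 = typeArray, 0x2 = objArray
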
2045     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2046 
2047     // Handle objArrays completely differently...
2048     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2049     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2050     __ movw(rscratch1, objArray_lh);
2051     __ eorw(rscratch2, lh, rscratch1);
2052     __ cbzw(rscratch2, L_objArray);
2053 
2054     //  if (src->klass() != dst->klass()) return -1;
2055     __ load_klass(rscratch2, dst);
2056     __ eor(rscratch2, rscratch2, scratch_src_klass);
2057     __ cbnz(rscratch2, L_failed);
2058 
2059     //  if (!src->is_Array()) return -1;
2060     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2061 
2062     // At this point, it is known to be a typeArray (array_tag 0x3).
2063 #ifdef ASSERT
2064     {
2065       BLOCK_COMMENT("assert primitive array {");
2066       Label L;
2067       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2068       __ cmpw(lh, rscratch2);
2069       __ br(Assembler::GE, L);
2070       __ stop("must be a primitive array");
2071       __ bind(L);
2072       BLOCK_COMMENT("} assert primitive array done");
2073     }
2074 #endif
2075 
2076     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2077                            rscratch2, L_failed);
2078 
2079     // TypeArrayKlass
2080     //
2081     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2082     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2083     //
2084 
2085     const Register rscratch1_offset = rscratch1;    // array offset
2086     const Register r18_elsize = lh; // element size
2087 
2088     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2089            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2090     __ add(src, src, rscratch1_offset);           // src array offset
2091     __ add(dst, dst, rscratch1_offset);           // dst array offset
2092     BLOCK_COMMENT("choose copy loop based on element size");
2093 
2094     // next registers should be set before the jump to corresponding stub
2095     const Register from     = c_rarg0;  // source array address
2096     const Register to       = c_rarg1;  // destination array address
2097     const Register count    = c_rarg2;  // elements count
2098 
2099     // The 'from', 'to' and 'count' registers must be set in this order,
2100     // because they alias 'src', 'src_pos' and 'dst' respectively.
2101 
2102     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2103 
2104     // The possible values of elsize are 0-3, i.e. exact_log2(element
2105     // size in bytes).  We do a simple bitwise binary search.
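    // Hedged sketch of the two-level dispatch below (test bit 1, then bit 0):
    //
    //   switch (elsize) {              // elsize = exact_log2(element size)
    //     case 0: byte_copy();  break; // bit1=0, bit0=0
    //     case 1: short_copy(); break; // bit1=0, bit0=1
    //     case 2: int_copy();   break; // bit1=1, bit0=0
    //     case 3: long_copy();  break; // bit1=1, bit0=1
    //   }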
2106   __ BIND(L_copy_bytes);
2107     __ tbnz(r18_elsize, 1, L_copy_ints);
2108     __ tbnz(r18_elsize, 0, L_copy_shorts);
2109     __ lea(from, Address(src, src_pos));// src_addr
2110     __ lea(to,   Address(dst, dst_pos));// dst_addr
2111     __ movw(count, scratch_length); // length
2112     __ b(RuntimeAddress(byte_copy_entry));
2113 
2114   __ BIND(L_copy_shorts);
2115     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2116     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2117     __ movw(count, scratch_length); // length
2118     __ b(RuntimeAddress(short_copy_entry));
2119 
2120   __ BIND(L_copy_ints);
2121     __ tbnz(r18_elsize, 0, L_copy_longs);
2122     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2123     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2124     __ movw(count, scratch_length); // length
2125     __ b(RuntimeAddress(int_copy_entry));
2126 
2127   __ BIND(L_copy_longs);
2128 #ifdef ASSERT
2129     {
2130       BLOCK_COMMENT("assert long copy {");
2131       Label L;
2132       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2133       __ cmpw(r18_elsize, LogBytesPerLong);
2134       __ br(Assembler::EQ, L);
2135       __ stop("must be long copy, but elsize is wrong");
2136       __ bind(L);
2137       BLOCK_COMMENT("} assert long copy done");
2138     }
2139 #endif
2140     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2141     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2142     __ movw(count, scratch_length); // length
2143     __ b(RuntimeAddress(long_copy_entry));
2144 
2145     // ObjArrayKlass
2146   __ BIND(L_objArray);
2147     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2148 
2149     Label L_plain_copy, L_checkcast_copy;
2150     //  test array classes for subtyping
2151     __ load_klass(r18, dst);
2152     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2153     __ br(Assembler::NE, L_checkcast_copy);
2154 
2155     // Identically typed arrays can be copied without element-wise checks.
2156     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2157                            rscratch2, L_failed);
2158 
2159     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2160     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2161     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2162     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2163     __ movw(count, scratch_length); // length
2164   __ BIND(L_plain_copy);
2165     __ b(RuntimeAddress(oop_copy_entry));
2166 
2167   __ BIND(L_checkcast_copy);
2168     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2169     {
2170       // Before looking at dst.length, make sure dst is also an objArray.
2171       __ ldrw(rscratch1, Address(r18, lh_offset));
2172       __ movw(rscratch2, objArray_lh);
2173       __ eorw(rscratch1, rscratch1, rscratch2);
2174       __ cbnzw(rscratch1, L_failed);
2175 
2176       // It is safe to examine both src.length and dst.length.
2177       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2178                              r18, L_failed);
2179 
2180       __ load_klass(dst_klass, dst); // reload
2181 
2182       // Marshal the base address arguments now, freeing registers.
2183       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2184       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2185       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2186       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2187       __ movw(count, length);           // length (reloaded)
2188       Register sco_temp = c_rarg3;      // this register is free now
2189       assert_different_registers(from, to, count, sco_temp,
2190                                  dst_klass, scratch_src_klass);
2191       // assert_clean_int(count, sco_temp);
2192 
2193       // Generate the type check.
2194       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2195       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2196 
2197       // Smashes rscratch1, rscratch2
2198       generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy);
2199 
2200       // Fetch destination element klass from the ObjArrayKlass header.
2201       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2202       __ ldr(dst_klass, Address(dst_klass, ek_offset));
2203       __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2204 
2205       // the checkcast_copy loop needs two extra arguments:
2206       assert(c_rarg3 == sco_temp, "#3 already in place");
2207       // Set up arguments for checkcast_copy_entry.
2208       __ mov(c_rarg4, dst_klass);  // dst.klass.element_klass
2209       __ b(RuntimeAddress(checkcast_copy_entry));
2210     }
2211 
2212   __ BIND(L_failed);
2213     __ mov(r0, -1);
2214     __ leave();   // required for proper stackwalking of RuntimeStub frame
2215     __ ret(lr);
2216 
2217     return start;
2218   }
2219 
2220   //
2221   // Generate stub for array fill. If "aligned" is true, the
2222   // "to" address is assumed to be heapword aligned.
2223   //
2224   // Arguments for generated stub:
2225   //   to:    c_rarg0
2226   //   value: c_rarg1
2227   //   count: c_rarg2 treated as signed
2228   //
2229   address generate_fill(BasicType t, bool aligned, const char *name) {
2230     __ align(CodeEntryAlignment);
2231     StubCodeMark mark(this, "StubRoutines", name);
2232     address start = __ pc();
2233 
2234     BLOCK_COMMENT("Entry:");
2235 
2236     const Register to        = c_rarg0;  // destination array address
2237     const Register value     = c_rarg1;  // value
2238     const Register count     = c_rarg2;  // elements count
2239 
2240     const Register bz_base = r10;        // base for block_zero routine
2241     const Register cnt_words = r11;      // temp register
2242 
2243     __ enter();
2244 
2245     Label L_fill_elements, L_exit1;
2246 
2247     int shift = -1;
2248     switch (t) {
2249       case T_BYTE:
2250         shift = 0;
2251         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2252         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2253         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2254         __ br(Assembler::LO, L_fill_elements);
2255         break;
2256       case T_SHORT:
2257         shift = 1;
2258         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2259         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2260         __ br(Assembler::LO, L_fill_elements);
2261         break;
2262       case T_INT:
2263         shift = 2;
2264         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2265         __ br(Assembler::LO, L_fill_elements);
2266         break;
2267       default: ShouldNotReachHere();
2268     }
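
    // After the switch, 'value' has been replicated across the low 32 bits; a
    // hedged C sketch of what the bfi sequence computes for T_BYTE (T_SHORT
    // and T_INT need fewer or no steps):
    //
    //   value |= value << 8;    //  8 -> 16 bits
    //   value |= value << 16;   // 16 -> 32 bits
    //   // the 32 -> 64 bit step (bfi ... 32, 32) happens just before the bulk fill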
2269 
2270     // Align the destination address to an 8-byte boundary.
2271     Label L_skip_align1, L_skip_align2, L_skip_align4;
2272     if (!aligned) {
2273       switch (t) {
2274         case T_BYTE:
2275           // One byte misalignment happens only for byte arrays.
2276           __ tbz(to, 0, L_skip_align1);
2277           __ strb(value, Address(__ post(to, 1)));
2278           __ subw(count, count, 1);
2279           __ bind(L_skip_align1);
2280           // Fallthrough
2281         case T_SHORT:
2282           // Two bytes misalignment happens only for byte and short (char) arrays.
2283           __ tbz(to, 1, L_skip_align2);
2284           __ strh(value, Address(__ post(to, 2)));
2285           __ subw(count, count, 2 >> shift);
2286           __ bind(L_skip_align2);
2287           // Fallthrough
2288         case T_INT:
2289           // Align to 8 bytes, we know we are 4 byte aligned to start.
2290           __ tbz(to, 2, L_skip_align4);
2291           __ strw(value, Address(__ post(to, 4)));
2292           __ subw(count, count, 4 >> shift);
2293           __ bind(L_skip_align4);
2294           break;
2295         default: ShouldNotReachHere();
2296       }
2297     }
2298 
2299     //
2300     //  Fill large chunks
2301     //
2302     __ lsrw(cnt_words, count, 3 - shift); // number of words
2303     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2304     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2305     if (UseBlockZeroing) {
2306       Label non_block_zeroing, rest;
2307       // If the fill value is zero we can use the fast zero_words().
2308       __ cbnz(value, non_block_zeroing);
2309       __ mov(bz_base, to);
2310       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2311       __ zero_words(bz_base, cnt_words);
2312       __ b(rest);
2313       __ bind(non_block_zeroing);
2314       __ fill_words(to, cnt_words, value);
2315       __ bind(rest);
2316     } else {
2317       __ fill_words(to, cnt_words, value);
2318     }
2319 
2320     // Remaining count is less than 8 bytes. Fill it by a single store.
2321     // Note that the total length is no less than 8 bytes.
2322     if (t == T_BYTE || t == T_SHORT) {
2323       Label L_exit1;
2324       __ cbzw(count, L_exit1);
2325       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2326       __ str(value, Address(to, -8));    // overwrite some elements
2327       __ bind(L_exit1);
2328       __ leave();
2329       __ ret(lr);
2330     }
2331 
2332     // Handle fills of less than 8 bytes.
2333     Label L_fill_2, L_fill_4, L_exit2;
2334     __ bind(L_fill_elements);
2335     switch (t) {
2336       case T_BYTE:
2337         __ tbz(count, 0, L_fill_2);
2338         __ strb(value, Address(__ post(to, 1)));
2339         __ bind(L_fill_2);
2340         __ tbz(count, 1, L_fill_4);
2341         __ strh(value, Address(__ post(to, 2)));
2342         __ bind(L_fill_4);
2343         __ tbz(count, 2, L_exit2);
2344         __ strw(value, Address(to));
2345         break;
2346       case T_SHORT:
2347         __ tbz(count, 0, L_fill_4);
2348         __ strh(value, Address(__ post(to, 2)));
2349         __ bind(L_fill_4);
2350         __ tbz(count, 1, L_exit2);
2351         __ strw(value, Address(to));
2352         break;
2353       case T_INT:
2354         __ cbzw(count, L_exit2);
2355         __ strw(value, Address(to));
2356         break;
2357       default: ShouldNotReachHere();
2358     }
2359     __ bind(L_exit2);
2360     __ leave();
2361     __ ret(lr);
2362     return start;
2363   }
2364 
2365   address generate_data_cache_writeback() {
2366     const Register line        = c_rarg0;  // address of line to write back
2367 
2368     __ align(CodeEntryAlignment);
2369 
2370     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2371 
2372     address start = __ pc();
2373     __ enter();
2374     __ cache_wb(Address(line, 0));
2375     __ leave();
2376     __ ret(lr);
2377 
2378     return start;
2379   }
2380 
2381   address generate_data_cache_writeback_sync() {
2382     const Register is_pre     = c_rarg0;  // pre or post sync
2383 
2384     __ align(CodeEntryAlignment);
2385 
2386     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2387 
2388     // pre wbsync is a no-op
2389     // post wbsync emits a store barrier (the AArch64 analogue of x86 sfence)
2390 
2391     Label skip;
2392     address start = __ pc();
2393     __ enter();
2394     __ cbnz(is_pre, skip);
2395     __ cache_wbsync(false);
2396     __ bind(skip);
2397     __ leave();
2398     __ ret(lr);
2399 
2400     return start;
2401   }
2402 
2403   void generate_arraycopy_stubs() {
2404     address entry;
2405     address entry_jbyte_arraycopy;
2406     address entry_jshort_arraycopy;
2407     address entry_jint_arraycopy;
2408     address entry_oop_arraycopy;
2409     address entry_jlong_arraycopy;
2410     address entry_checkcast_arraycopy;
2411 
2412     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2413     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2414 
2415     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2416 
2417     //*** jbyte
2418     // Always need aligned and unaligned versions
2419     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2420                                                                                   "jbyte_disjoint_arraycopy");
2421     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2422                                                                                   &entry_jbyte_arraycopy,
2423                                                                                   "jbyte_arraycopy");
2424     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2425                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2426     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2427                                                                                   "arrayof_jbyte_arraycopy");
2428 
2429     //*** jshort
2430     // Always need aligned and unaligned versions
2431     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2432                                                                                     "jshort_disjoint_arraycopy");
2433     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2434                                                                                     &entry_jshort_arraycopy,
2435                                                                                     "jshort_arraycopy");
2436     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2437                                                                                     "arrayof_jshort_disjoint_arraycopy");
2438     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2439                                                                                     "arrayof_jshort_arraycopy");
2440 
2441     //*** jint
2442     // Aligned versions
2443     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2444                                                                                 "arrayof_jint_disjoint_arraycopy");
2445     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2446                                                                                 "arrayof_jint_arraycopy");
2447     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2448     // entry_jint_arraycopy always points to the unaligned version
2449     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2450                                                                                 "jint_disjoint_arraycopy");
2451     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2452                                                                                 &entry_jint_arraycopy,
2453                                                                                 "jint_arraycopy");
2454 
2455     //*** jlong
2456     // It is always aligned
2457     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2458                                                                                   "arrayof_jlong_disjoint_arraycopy");
2459     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2460                                                                                   "arrayof_jlong_arraycopy");
2461     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2462     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2463 
2464     //*** oops
2465     {
2466       // With compressed oops we need unaligned versions; notice that
2467       // we overwrite entry_oop_arraycopy.
2468       bool aligned = !UseCompressedOops;
2469 
2470       StubRoutines::_arrayof_oop_disjoint_arraycopy
2471         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2472                                      /*dest_uninitialized*/false);
2473       StubRoutines::_arrayof_oop_arraycopy
2474         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2475                                      /*dest_uninitialized*/false);
2476       // Aligned versions without pre-barriers
2477       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2478         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2479                                      /*dest_uninitialized*/true);
2480       StubRoutines::_arrayof_oop_arraycopy_uninit
2481         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2482                                      /*dest_uninitialized*/true);
2483     }
2484 
2485     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2486     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2487     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2488     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2489 
2490     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2491     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2492                                                                         /*dest_uninitialized*/true);
2493 
2494     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2495                                                               entry_jbyte_arraycopy,
2496                                                               entry_jshort_arraycopy,
2497                                                               entry_jint_arraycopy,
2498                                                               entry_jlong_arraycopy);
2499 
2500     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2501                                                                entry_jbyte_arraycopy,
2502                                                                entry_jshort_arraycopy,
2503                                                                entry_jint_arraycopy,
2504                                                                entry_oop_arraycopy,
2505                                                                entry_jlong_arraycopy,
2506                                                                entry_checkcast_arraycopy);
2507 
2508     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2509     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2510     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2511     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2512     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2513     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2514   }
2515 
2516   void generate_math_stubs() { Unimplemented(); }
2517 
2518   // Arguments:
2519   //
2520   // Inputs:
2521   //   c_rarg0   - source byte array address
2522   //   c_rarg1   - destination byte array address
2523   //   c_rarg2   - K (key) in little endian int array
2524   //
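  //  The expanded key length in ints selects the round count: 44 ints for
  //  AES-128 (10 rounds), 52 for AES-192 (12), 60 for AES-256 (14). A hedged
  //  sketch of the relationship behind the cmpw(keylen, 44/52) checks below:
  //
  //    int rounds = keylen / 4 - 1;   // 44 -> 10, 52 -> 12, 60 -> 14
  //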
2525   address generate_aescrypt_encryptBlock() {
2526     __ align(CodeEntryAlignment);
2527     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2528 
2529     Label L_doLast;
2530 
2531     const Register from        = c_rarg0;  // source array address
2532     const Register to          = c_rarg1;  // destination array address
2533     const Register key         = c_rarg2;  // key array address
2534     const Register keylen      = rscratch1;
2535 
2536     address start = __ pc();
2537     __ enter();
2538 
2539     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2540 
2541     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2542 
2543     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2544     __ rev32(v1, __ T16B, v1);
2545     __ rev32(v2, __ T16B, v2);
2546     __ rev32(v3, __ T16B, v3);
2547     __ rev32(v4, __ T16B, v4);
2548     __ aese(v0, v1);
2549     __ aesmc(v0, v0);
2550     __ aese(v0, v2);
2551     __ aesmc(v0, v0);
2552     __ aese(v0, v3);
2553     __ aesmc(v0, v0);
2554     __ aese(v0, v4);
2555     __ aesmc(v0, v0);
2556 
2557     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2558     __ rev32(v1, __ T16B, v1);
2559     __ rev32(v2, __ T16B, v2);
2560     __ rev32(v3, __ T16B, v3);
2561     __ rev32(v4, __ T16B, v4);
2562     __ aese(v0, v1);
2563     __ aesmc(v0, v0);
2564     __ aese(v0, v2);
2565     __ aesmc(v0, v0);
2566     __ aese(v0, v3);
2567     __ aesmc(v0, v0);
2568     __ aese(v0, v4);
2569     __ aesmc(v0, v0);
2570 
2571     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2572     __ rev32(v1, __ T16B, v1);
2573     __ rev32(v2, __ T16B, v2);
2574 
2575     __ cmpw(keylen, 44);
2576     __ br(Assembler::EQ, L_doLast);
2577 
2578     __ aese(v0, v1);
2579     __ aesmc(v0, v0);
2580     __ aese(v0, v2);
2581     __ aesmc(v0, v0);
2582 
2583     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2584     __ rev32(v1, __ T16B, v1);
2585     __ rev32(v2, __ T16B, v2);
2586 
2587     __ cmpw(keylen, 52);
2588     __ br(Assembler::EQ, L_doLast);
2589 
2590     __ aese(v0, v1);
2591     __ aesmc(v0, v0);
2592     __ aese(v0, v2);
2593     __ aesmc(v0, v0);
2594 
2595     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2596     __ rev32(v1, __ T16B, v1);
2597     __ rev32(v2, __ T16B, v2);
2598 
2599     __ BIND(L_doLast);
2600 
2601     __ aese(v0, v1);
2602     __ aesmc(v0, v0);
2603     __ aese(v0, v2);
2604 
2605     __ ld1(v1, __ T16B, key);
2606     __ rev32(v1, __ T16B, v1);
2607     __ eor(v0, __ T16B, v0, v1);
2608 
2609     __ st1(v0, __ T16B, to);
2610 
2611     __ mov(r0, 0);
2612 
2613     __ leave();
2614     __ ret(lr);
2615 
2616     return start;
2617   }
2618 
2619   // Arguments:
2620   //
2621   // Inputs:
2622   //   c_rarg0   - source byte array address
2623   //   c_rarg1   - destination byte array address
2624   //   c_rarg2   - K (key) in little endian int array
2625   //
2626   address generate_aescrypt_decryptBlock() {
2627     assert(UseAES, "need AES instructions");
2628     __ align(CodeEntryAlignment);
2629     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2630     Label L_doLast;
2631 
2632     const Register from        = c_rarg0;  // source array address
2633     const Register to          = c_rarg1;  // destination array address
2634     const Register key         = c_rarg2;  // key array address
2635     const Register keylen      = rscratch1;
2636 
2637     address start = __ pc();
2638     __ enter(); // required for proper stackwalking of RuntimeStub frame
2639 
2640     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2641 
2642     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2643 
2644     __ ld1(v5, __ T16B, __ post(key, 16));
2645     __ rev32(v5, __ T16B, v5);
2646 
2647     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2648     __ rev32(v1, __ T16B, v1);
2649     __ rev32(v2, __ T16B, v2);
2650     __ rev32(v3, __ T16B, v3);
2651     __ rev32(v4, __ T16B, v4);
2652     __ aesd(v0, v1);
2653     __ aesimc(v0, v0);
2654     __ aesd(v0, v2);
2655     __ aesimc(v0, v0);
2656     __ aesd(v0, v3);
2657     __ aesimc(v0, v0);
2658     __ aesd(v0, v4);
2659     __ aesimc(v0, v0);
2660 
2661     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2662     __ rev32(v1, __ T16B, v1);
2663     __ rev32(v2, __ T16B, v2);
2664     __ rev32(v3, __ T16B, v3);
2665     __ rev32(v4, __ T16B, v4);
2666     __ aesd(v0, v1);
2667     __ aesimc(v0, v0);
2668     __ aesd(v0, v2);
2669     __ aesimc(v0, v0);
2670     __ aesd(v0, v3);
2671     __ aesimc(v0, v0);
2672     __ aesd(v0, v4);
2673     __ aesimc(v0, v0);
2674 
2675     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2676     __ rev32(v1, __ T16B, v1);
2677     __ rev32(v2, __ T16B, v2);
2678 
2679     __ cmpw(keylen, 44);
2680     __ br(Assembler::EQ, L_doLast);
2681 
2682     __ aesd(v0, v1);
2683     __ aesimc(v0, v0);
2684     __ aesd(v0, v2);
2685     __ aesimc(v0, v0);
2686 
2687     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2688     __ rev32(v1, __ T16B, v1);
2689     __ rev32(v2, __ T16B, v2);
2690 
2691     __ cmpw(keylen, 52);
2692     __ br(Assembler::EQ, L_doLast);
2693 
2694     __ aesd(v0, v1);
2695     __ aesimc(v0, v0);
2696     __ aesd(v0, v2);
2697     __ aesimc(v0, v0);
2698 
2699     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2700     __ rev32(v1, __ T16B, v1);
2701     __ rev32(v2, __ T16B, v2);
2702 
2703     __ BIND(L_doLast);
2704 
2705     __ aesd(v0, v1);
2706     __ aesimc(v0, v0);
2707     __ aesd(v0, v2);
2708 
2709     __ eor(v0, __ T16B, v0, v5);
2710 
2711     __ st1(v0, __ T16B, to);
2712 
2713     __ mov(r0, 0);
2714 
2715     __ leave();
2716     __ ret(lr);
2717 
2718     return start;
2719   }
2720 
2721   // Arguments:
2722   //
2723   // Inputs:
2724   //   c_rarg0   - source byte array address
2725   //   c_rarg1   - destination byte array address
2726   //   c_rarg2   - K (key) in little endian int array
2727   //   c_rarg3   - r vector byte array address
2728   //   c_rarg4   - input length
2729   //
2730   // Output:
2731   //   x0        - input length
2732   //
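  //  CBC encryption chains each block through the previous ciphertext block; a
  //  hedged sketch of the loop the stub implements ('r' starts as the IV):
  //
  //    r = load16(rvec);
  //    for (int i = 0; i < len; i += 16) {
  //      r = AES_encrypt(load16(from + i) ^ r, key);
  //      store16(to + i, r);
  //    }
  //    store16(rvec, r);
  //    return len;
  //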
2733   address generate_cipherBlockChaining_encryptAESCrypt() {
2734     assert(UseAES, "need AES instructions");
2735     __ align(CodeEntryAlignment);
2736     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2737 
2738     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2739 
2740     const Register from        = c_rarg0;  // source array address
2741     const Register to          = c_rarg1;  // destination array address
2742     const Register key         = c_rarg2;  // key array address
2743     const Register rvec        = c_rarg3;  // r vector byte array, initialized from the init vector and
2744                                            // left holding the last ciphertext block on return
2745     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2746     const Register keylen      = rscratch1;
2747 
2748     address start = __ pc();
2749 
2750       __ enter();
2751 
2752       __ movw(rscratch2, len_reg);
2753 
2754       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2755 
2756       __ ld1(v0, __ T16B, rvec);
2757 
2758       __ cmpw(keylen, 52);
2759       __ br(Assembler::CC, L_loadkeys_44);
2760       __ br(Assembler::EQ, L_loadkeys_52);
2761 
2762       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2763       __ rev32(v17, __ T16B, v17);
2764       __ rev32(v18, __ T16B, v18);
2765     __ BIND(L_loadkeys_52);
2766       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2767       __ rev32(v19, __ T16B, v19);
2768       __ rev32(v20, __ T16B, v20);
2769     __ BIND(L_loadkeys_44);
2770       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2771       __ rev32(v21, __ T16B, v21);
2772       __ rev32(v22, __ T16B, v22);
2773       __ rev32(v23, __ T16B, v23);
2774       __ rev32(v24, __ T16B, v24);
2775       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2776       __ rev32(v25, __ T16B, v25);
2777       __ rev32(v26, __ T16B, v26);
2778       __ rev32(v27, __ T16B, v27);
2779       __ rev32(v28, __ T16B, v28);
2780       __ ld1(v29, v30, v31, __ T16B, key);
2781       __ rev32(v29, __ T16B, v29);
2782       __ rev32(v30, __ T16B, v30);
2783       __ rev32(v31, __ T16B, v31);
2784 
2785     __ BIND(L_aes_loop);
2786       __ ld1(v1, __ T16B, __ post(from, 16));
2787       __ eor(v0, __ T16B, v0, v1);
2788 
2789       __ br(Assembler::CC, L_rounds_44);
2790       __ br(Assembler::EQ, L_rounds_52);
2791 
2792       __ aese(v0, v17); __ aesmc(v0, v0);
2793       __ aese(v0, v18); __ aesmc(v0, v0);
2794     __ BIND(L_rounds_52);
2795       __ aese(v0, v19); __ aesmc(v0, v0);
2796       __ aese(v0, v20); __ aesmc(v0, v0);
2797     __ BIND(L_rounds_44);
2798       __ aese(v0, v21); __ aesmc(v0, v0);
2799       __ aese(v0, v22); __ aesmc(v0, v0);
2800       __ aese(v0, v23); __ aesmc(v0, v0);
2801       __ aese(v0, v24); __ aesmc(v0, v0);
2802       __ aese(v0, v25); __ aesmc(v0, v0);
2803       __ aese(v0, v26); __ aesmc(v0, v0);
2804       __ aese(v0, v27); __ aesmc(v0, v0);
2805       __ aese(v0, v28); __ aesmc(v0, v0);
2806       __ aese(v0, v29); __ aesmc(v0, v0);
2807       __ aese(v0, v30);
2808       __ eor(v0, __ T16B, v0, v31);
2809 
2810       __ st1(v0, __ T16B, __ post(to, 16));
2811 
2812       __ subw(len_reg, len_reg, 16);
2813       __ cbnzw(len_reg, L_aes_loop);
2814 
2815       __ st1(v0, __ T16B, rvec);
2816 
2817       __ mov(r0, rscratch2);
2818 
2819       __ leave();
2820       __ ret(lr);
2821 
2822       return start;
2823   }
2824 
2825   // Arguments:
2826   //
2827   // Inputs:
2828   //   c_rarg0   - source byte array address
2829   //   c_rarg1   - destination byte array address
2830   //   c_rarg2   - K (key) in little endian int array
2831   //   c_rarg3   - r vector byte array address
2832   //   c_rarg4   - input length
2833   //
2834   // Output:
2835   //   r0        - input length
2836   //
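  // CBC decryption computes P[i] = D_K(C[i]) ^ C[i-1] with C[0] = IV, so each
  // ciphertext block has to be saved before it is decrypted in place; that is
  // what the v1/v2 copies in the loop below are for.
  //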
2837   address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
2839     __ align(CodeEntryAlignment);
2840     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2841 
2842     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2843 
2844     const Register from        = c_rarg0;  // source array address
2845     const Register to          = c_rarg1;  // destination array address
2846     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address
                                           // and left holding the last ciphertext block on exit
2849     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2850     const Register keylen      = rscratch1;
2851 
2852     address start = __ pc();
2853 
2854       __ enter();
2855 
2856       __ movw(rscratch2, len_reg);
2857 
2858       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2859 
2860       __ ld1(v2, __ T16B, rvec);
2861 
2862       __ ld1(v31, __ T16B, __ post(key, 16));
2863       __ rev32(v31, __ T16B, v31);
2864 
2865       __ cmpw(keylen, 52);
2866       __ br(Assembler::CC, L_loadkeys_44);
2867       __ br(Assembler::EQ, L_loadkeys_52);
2868 
2869       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2870       __ rev32(v17, __ T16B, v17);
2871       __ rev32(v18, __ T16B, v18);
2872     __ BIND(L_loadkeys_52);
2873       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2874       __ rev32(v19, __ T16B, v19);
2875       __ rev32(v20, __ T16B, v20);
2876     __ BIND(L_loadkeys_44);
2877       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2878       __ rev32(v21, __ T16B, v21);
2879       __ rev32(v22, __ T16B, v22);
2880       __ rev32(v23, __ T16B, v23);
2881       __ rev32(v24, __ T16B, v24);
2882       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2883       __ rev32(v25, __ T16B, v25);
2884       __ rev32(v26, __ T16B, v26);
2885       __ rev32(v27, __ T16B, v27);
2886       __ rev32(v28, __ T16B, v28);
2887       __ ld1(v29, v30, __ T16B, key);
2888       __ rev32(v29, __ T16B, v29);
2889       __ rev32(v30, __ T16B, v30);
2890 
2891     __ BIND(L_aes_loop);
2892       __ ld1(v0, __ T16B, __ post(from, 16));
2893       __ orr(v1, __ T16B, v0, v0);
2894 
2895       __ br(Assembler::CC, L_rounds_44);
2896       __ br(Assembler::EQ, L_rounds_52);
2897 
2898       __ aesd(v0, v17); __ aesimc(v0, v0);
2899       __ aesd(v0, v18); __ aesimc(v0, v0);
2900     __ BIND(L_rounds_52);
2901       __ aesd(v0, v19); __ aesimc(v0, v0);
2902       __ aesd(v0, v20); __ aesimc(v0, v0);
2903     __ BIND(L_rounds_44);
2904       __ aesd(v0, v21); __ aesimc(v0, v0);
2905       __ aesd(v0, v22); __ aesimc(v0, v0);
2906       __ aesd(v0, v23); __ aesimc(v0, v0);
2907       __ aesd(v0, v24); __ aesimc(v0, v0);
2908       __ aesd(v0, v25); __ aesimc(v0, v0);
2909       __ aesd(v0, v26); __ aesimc(v0, v0);
2910       __ aesd(v0, v27); __ aesimc(v0, v0);
2911       __ aesd(v0, v28); __ aesimc(v0, v0);
2912       __ aesd(v0, v29); __ aesimc(v0, v0);
2913       __ aesd(v0, v30);
2914       __ eor(v0, __ T16B, v0, v31);
2915       __ eor(v0, __ T16B, v0, v2);
2916 
2917       __ st1(v0, __ T16B, __ post(to, 16));
2918       __ orr(v2, __ T16B, v1, v1);
2919 
2920       __ subw(len_reg, len_reg, 16);
2921       __ cbnzw(len_reg, L_aes_loop);
2922 
2923       __ st1(v2, __ T16B, rvec);
2924 
2925       __ mov(r0, rscratch2);
2926 
2927       __ leave();
2928       __ ret(lr);
2929 
2930     return start;
2931   }
2932 
2933   // Arguments:
2934   //
2935   // Inputs:
2936   //   c_rarg0   - byte[]  source+offset
2937   //   c_rarg1   - int[]   SHA.state
2938   //   c_rarg2   - int     offset
2939   //   c_rarg3   - int     limit
2940   //
2941   address generate_sha1_implCompress(bool multi_block, const char *name) {
2942     __ align(CodeEntryAlignment);
2943     StubCodeMark mark(this, "StubRoutines", name);
2944     address start = __ pc();
2945 
2946     Register buf   = c_rarg0;
2947     Register state = c_rarg1;
2948     Register ofs   = c_rarg2;
2949     Register limit = c_rarg3;
2950 
2951     Label keys;
2952     Label sha1_loop;
2953 
2954     // load the keys into v0..v3
2955     __ adr(rscratch1, keys);
2956     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2957     // load 5 words state into v6, v7
2958     __ ldrq(v6, Address(state, 0));
2959     __ ldrs(v7, Address(state, 16));
2960 
2961 
2962     __ BIND(sha1_loop);
2963     // load 64 bytes of data into v16..v19
2964     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2965     __ rev32(v16, __ T16B, v16);
2966     __ rev32(v17, __ T16B, v17);
2967     __ rev32(v18, __ T16B, v18);
2968     __ rev32(v19, __ T16B, v19);
2969 
2970     // do the sha1
2971     __ addv(v4, __ T4S, v16, v0);
2972     __ orr(v20, __ T16B, v6, v6);
2973 
2974     FloatRegister d0 = v16;
2975     FloatRegister d1 = v17;
2976     FloatRegister d2 = v18;
2977     FloatRegister d3 = v19;
2978 
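    // Each iteration of this loop consumes 4 of SHA-1's 80 rounds: sha1c (Ch)
    // covers rounds 0-19, sha1m (Maj) rounds 40-59 and sha1p (Parity) the
    // remaining rounds, using the matching constant from the 'keys' table
    // emitted below.  The constant is pre-added to the next message-schedule
    // block one iteration ahead (hence the off-by-one thresholds for 'key'),
    // and sha1su0/sha1su1 expand the message schedule.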
2979     for (int round = 0; round < 20; round++) {
2980       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2981       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2982       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2983       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2984       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2985 
2986       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2987       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2988       __ sha1h(tmp2, __ T4S, v20);
2989       if (round < 5)
2990         __ sha1c(v20, __ T4S, tmp3, tmp4);
2991       else if (round < 10 || round >= 15)
2992         __ sha1p(v20, __ T4S, tmp3, tmp4);
2993       else
2994         __ sha1m(v20, __ T4S, tmp3, tmp4);
2995       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2996 
2997       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2998     }
2999 
3000     __ addv(v7, __ T2S, v7, v21);
3001     __ addv(v6, __ T4S, v6, v20);
3002 
3003     if (multi_block) {
3004       __ add(ofs, ofs, 64);
3005       __ cmp(ofs, limit);
3006       __ br(Assembler::LE, sha1_loop);
3007       __ mov(c_rarg0, ofs); // return ofs
3008     }
3009 
3010     __ strq(v6, Address(state, 0));
3011     __ strs(v7, Address(state, 16));
3012 
3013     __ ret(lr);
3014 
3015     __ bind(keys);
3016     __ emit_int32(0x5a827999);
3017     __ emit_int32(0x6ed9eba1);
3018     __ emit_int32(0x8f1bbcdc);
3019     __ emit_int32(0xca62c1d6);
3020 
3021     return start;
3022   }
3023 
3024 
3025   // Arguments:
3026   //
3027   // Inputs:
3028   //   c_rarg0   - byte[]  source+offset
3029   //   c_rarg1   - int[]   SHA.state
3030   //   c_rarg2   - int     offset
3031   //   c_rarg3   - int     limit
3032   //
3033   address generate_sha256_implCompress(bool multi_block, const char *name) {
3034     static const uint32_t round_consts[64] = {
3035       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3036       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3037       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3038       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3039       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3040       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3041       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3042       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3043       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3044       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3045       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3046       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3047       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3048       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3049       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3050       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3051     };
3052     __ align(CodeEntryAlignment);
3053     StubCodeMark mark(this, "StubRoutines", name);
3054     address start = __ pc();
3055 
3056     Register buf   = c_rarg0;
3057     Register state = c_rarg1;
3058     Register ofs   = c_rarg2;
3059     Register limit = c_rarg3;
3060 
3061     Label sha1_loop;
3062 
3063     __ stpd(v8, v9, __ pre(sp, -32));
3064     __ stpd(v10, v11, Address(sp, 16));
3065 
3066 // dga == v0
3067 // dgb == v1
3068 // dg0 == v2
3069 // dg1 == v3
3070 // dg2 == v4
3071 // t0 == v6
3072 // t1 == v7
3073 
3074     // load 16 keys to v16..v31
3075     __ lea(rscratch1, ExternalAddress((address)round_consts));
3076     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3077     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3078     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3079     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3080 
3081     // load 8 words (256 bits) state
3082     __ ldpq(v0, v1, state);
3083 
3084     __ BIND(sha1_loop);
3085     // load 64 bytes of data into v8..v11
3086     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3087     __ rev32(v8, __ T16B, v8);
3088     __ rev32(v9, __ T16B, v9);
3089     __ rev32(v10, __ T16B, v10);
3090     __ rev32(v11, __ T16B, v11);
3091 
3092     __ addv(v6, __ T4S, v8, v16);
3093     __ orr(v2, __ T16B, v0, v0);
3094     __ orr(v3, __ T16B, v1, v1);
3095 
3096     FloatRegister d0 = v8;
3097     FloatRegister d1 = v9;
3098     FloatRegister d2 = v10;
3099     FloatRegister d3 = v11;
3100 
3101 
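    // Each of the 16 loop iterations performs 4 of SHA-256's 64 rounds via
    // sha256h/sha256h2.  The schedule block plus round constant (v17..v31)
    // for the next iteration is prepared one step ahead with addv, and
    // sha256su0/sha256su1 expand the message schedule for rounds 16..63.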
3102     for (int round = 0; round < 16; round++) {
3103       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3104       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3105       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3106       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3107 
3108       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3109        __ orr(v4, __ T16B, v2, v2);
3110       if (round < 15)
3111         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3112       __ sha256h(v2, __ T4S, v3, tmp2);
3113       __ sha256h2(v3, __ T4S, v4, tmp2);
3114       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3115 
3116       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3117     }
3118 
3119     __ addv(v0, __ T4S, v0, v2);
3120     __ addv(v1, __ T4S, v1, v3);
3121 
3122     if (multi_block) {
3123       __ add(ofs, ofs, 64);
3124       __ cmp(ofs, limit);
3125       __ br(Assembler::LE, sha1_loop);
3126       __ mov(c_rarg0, ofs); // return ofs
3127     }
3128 
3129     __ ldpd(v10, v11, Address(sp, 16));
3130     __ ldpd(v8, v9, __ post(sp, 32));
3131 
3132     __ stpq(v0, v1, state);
3133 
3134     __ ret(lr);
3135 
3136     return start;
3137   }
3138 
3139   // Safefetch stubs.
3140   void generate_safefetch(const char* name, int size, address* entry,
3141                           address* fault_pc, address* continuation_pc) {
3142     // safefetch signatures:
3143     //   int      SafeFetch32(int*      adr, int      errValue);
3144     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3145     //
3146     // arguments:
3147     //   c_rarg0 = adr
3148     //   c_rarg1 = errValue
3149     //
3150     // result:
    //   r0       = *adr or errValue
3152 
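    // If the load at *fault_pc faults, the signal handler resumes execution at
    // *continuation_pc with errValue still in c_rarg1, so the caller receives
    // errValue instead of a crash.
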
3153     StubCodeMark mark(this, "StubRoutines", name);
3154 
3155     // Entry point, pc or function descriptor.
3156     *entry = __ pc();
3157 
3158     // Load *adr into c_rarg1, may fault.
3159     *fault_pc = __ pc();
3160     switch (size) {
3161       case 4:
3162         // int32_t
3163         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3164         break;
3165       case 8:
3166         // int64_t
3167         __ ldr(c_rarg1, Address(c_rarg0, 0));
3168         break;
3169       default:
3170         ShouldNotReachHere();
3171     }
3172 
3173     // return errValue or *adr
3174     *continuation_pc = __ pc();
3175     __ mov(r0, c_rarg1);
3176     __ ret(lr);
3177   }
3178 
3179   /**
3180    *  Arguments:
3181    *
3182    * Inputs:
3183    *   c_rarg0   - int crc
3184    *   c_rarg1   - byte* buf
3185    *   c_rarg2   - int length
3186    *
   * Output:
   *       r0   - int crc result
3189    */
3190   address generate_updateBytesCRC32() {
3191     assert(UseCRC32Intrinsics, "what are we doing here?");
3192 
3193     __ align(CodeEntryAlignment);
3194     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3195 
3196     address start = __ pc();
3197 
3198     const Register crc   = c_rarg0;  // crc
3199     const Register buf   = c_rarg1;  // source java byte array address
3200     const Register len   = c_rarg2;  // length
3201     const Register table0 = c_rarg3; // crc_table address
3202     const Register table1 = c_rarg4;
3203     const Register table2 = c_rarg5;
3204     const Register table3 = c_rarg6;
3205     const Register tmp3 = c_rarg7;
3206 
3207     BLOCK_COMMENT("Entry:");
3208     __ enter(); // required for proper stackwalking of RuntimeStub frame
3209 
3210     __ kernel_crc32(crc, buf, len,
3211               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3212 
3213     __ leave(); // required for proper stackwalking of RuntimeStub frame
3214     __ ret(lr);
3215 
3216     return start;
3217   }
3218 
3219   /**
3220    *  Arguments:
3221    *
3222    * Inputs:
3223    *   c_rarg0   - int crc
3224    *   c_rarg1   - byte* buf
3225    *   c_rarg2   - int length
3226    *   c_rarg3   - int* table
3227    *
   * Output:
3229    *       r0   - int crc result
3230    */
3231   address generate_updateBytesCRC32C() {
3232     assert(UseCRC32CIntrinsics, "what are we doing here?");
3233 
3234     __ align(CodeEntryAlignment);
3235     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3236 
3237     address start = __ pc();
3238 
3239     const Register crc   = c_rarg0;  // crc
3240     const Register buf   = c_rarg1;  // source java byte array address
3241     const Register len   = c_rarg2;  // length
3242     const Register table0 = c_rarg3; // crc_table address
3243     const Register table1 = c_rarg4;
3244     const Register table2 = c_rarg5;
3245     const Register table3 = c_rarg6;
3246     const Register tmp3 = c_rarg7;
3247 
3248     BLOCK_COMMENT("Entry:");
3249     __ enter(); // required for proper stackwalking of RuntimeStub frame
3250 
3251     __ kernel_crc32c(crc, buf, len,
3252               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3253 
3254     __ leave(); // required for proper stackwalking of RuntimeStub frame
3255     __ ret(lr);
3256 
3257     return start;
3258   }
3259 
3260   /***
3261    *  Arguments:
3262    *
3263    *  Inputs:
3264    *   c_rarg0   - int   adler
3265    *   c_rarg1   - byte* buff
3266    *   c_rarg2   - int   len
3267    *
3268    * Output:
3269    *   c_rarg0   - int adler result
3270    */
3271   address generate_updateBytesAdler32() {
3272     __ align(CodeEntryAlignment);
3273     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3274     address start = __ pc();
3275 
3276     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3277 
3278     // Aliases
3279     Register adler  = c_rarg0;
3280     Register s1     = c_rarg0;
3281     Register s2     = c_rarg3;
3282     Register buff   = c_rarg1;
3283     Register len    = c_rarg2;
3284     Register nmax  = r4;
3285     Register base  = r5;
3286     Register count = r6;
3287     Register temp0 = rscratch1;
3288     Register temp1 = rscratch2;
3289     FloatRegister vbytes = v0;
3290     FloatRegister vs1acc = v1;
3291     FloatRegister vs2acc = v2;
3292     FloatRegister vtable = v3;
3293 
3294     // Max number of bytes we can process before having to take the mod
3295     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3296     unsigned long BASE = 0xfff1;
3297     unsigned long NMAX = 0x15B0;
3298 
3299     __ mov(base, BASE);
3300     __ mov(nmax, NMAX);
3301 
3302     // Load accumulation coefficients for the upper 16 bits
3303     __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
3304     __ ld1(vtable, __ T16B, Address(temp0));
3305 
3306     // s1 is initialized to the lower 16 bits of adler
3307     // s2 is initialized to the upper 16 bits of adler
3308     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3309     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3310 
    // The pipelined loop needs at least 16 elements per iteration.
    // It checks this itself, but it is more efficient to skip straight to the
    // cleanup loop here.
3313     __ cmp(len, (u1)16);
3314     __ br(Assembler::HS, L_nmax);
3315     __ cbz(len, L_combine);
3316 
3317     __ bind(L_simple_by1_loop);
3318     __ ldrb(temp0, Address(__ post(buff, 1)));
3319     __ add(s1, s1, temp0);
3320     __ add(s2, s2, s1);
3321     __ subs(len, len, 1);
3322     __ br(Assembler::HI, L_simple_by1_loop);
3323 
3324     // s1 = s1 % BASE
3325     __ subs(temp0, s1, base);
3326     __ csel(s1, temp0, s1, Assembler::HS);
3327 
3328     // s2 = s2 % BASE
3329     __ lsr(temp0, s2, 16);
3330     __ lsl(temp1, temp0, 4);
3331     __ sub(temp1, temp1, temp0);
3332     __ add(s2, temp1, s2, ext::uxth);
3333 
3334     __ subs(temp0, s2, base);
3335     __ csel(s2, temp0, s2, Assembler::HS);
3336 
3337     __ b(L_combine);
3338 
3339     __ bind(L_nmax);
3340     __ subs(len, len, nmax);
3341     __ sub(count, nmax, 16);
3342     __ br(Assembler::LO, L_by16);
3343 
3344     __ bind(L_nmax_loop);
3345 
3346     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3347                                       vbytes, vs1acc, vs2acc, vtable);
3348 
3349     __ subs(count, count, 16);
3350     __ br(Assembler::HS, L_nmax_loop);
3351 
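    // The modular reductions below avoid division: since 2^16 == 15 (mod 65521),
    // x mod BASE can be computed by folding x = (x >> 16) * 15 + (x & 0xffff)
    // twice and then conditionally subtracting BASE once.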
3352     // s1 = s1 % BASE
3353     __ lsr(temp0, s1, 16);
3354     __ lsl(temp1, temp0, 4);
3355     __ sub(temp1, temp1, temp0);
3356     __ add(temp1, temp1, s1, ext::uxth);
3357 
3358     __ lsr(temp0, temp1, 16);
3359     __ lsl(s1, temp0, 4);
3360     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
3362 
3363     __ subs(temp0, s1, base);
3364     __ csel(s1, temp0, s1, Assembler::HS);
3365 
3366     // s2 = s2 % BASE
3367     __ lsr(temp0, s2, 16);
3368     __ lsl(temp1, temp0, 4);
3369     __ sub(temp1, temp1, temp0);
3370     __ add(temp1, temp1, s2, ext::uxth);
3371 
3372     __ lsr(temp0, temp1, 16);
3373     __ lsl(s2, temp0, 4);
3374     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
3376 
3377     __ subs(temp0, s2, base);
3378     __ csel(s2, temp0, s2, Assembler::HS);
3379 
3380     __ subs(len, len, nmax);
3381     __ sub(count, nmax, 16);
3382     __ br(Assembler::HS, L_nmax_loop);
3383 
3384     __ bind(L_by16);
3385     __ adds(len, len, count);
3386     __ br(Assembler::LO, L_by1);
3387 
3388     __ bind(L_by16_loop);
3389 
3390     generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
3391                                       vbytes, vs1acc, vs2acc, vtable);
3392 
3393     __ subs(len, len, 16);
3394     __ br(Assembler::HS, L_by16_loop);
3395 
3396     __ bind(L_by1);
3397     __ adds(len, len, 15);
3398     __ br(Assembler::LO, L_do_mod);
3399 
3400     __ bind(L_by1_loop);
3401     __ ldrb(temp0, Address(__ post(buff, 1)));
3402     __ add(s1, temp0, s1);
3403     __ add(s2, s2, s1);
3404     __ subs(len, len, 1);
3405     __ br(Assembler::HS, L_by1_loop);
3406 
3407     __ bind(L_do_mod);
3408     // s1 = s1 % BASE
3409     __ lsr(temp0, s1, 16);
3410     __ lsl(temp1, temp0, 4);
3411     __ sub(temp1, temp1, temp0);
3412     __ add(temp1, temp1, s1, ext::uxth);
3413 
3414     __ lsr(temp0, temp1, 16);
3415     __ lsl(s1, temp0, 4);
3416     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
3418 
3419     __ subs(temp0, s1, base);
3420     __ csel(s1, temp0, s1, Assembler::HS);
3421 
3422     // s2 = s2 % BASE
3423     __ lsr(temp0, s2, 16);
3424     __ lsl(temp1, temp0, 4);
3425     __ sub(temp1, temp1, temp0);
3426     __ add(temp1, temp1, s2, ext::uxth);
3427 
3428     __ lsr(temp0, temp1, 16);
3429     __ lsl(s2, temp0, 4);
3430     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
3432 
3433     __ subs(temp0, s2, base);
3434     __ csel(s2, temp0, s2, Assembler::HS);
3435 
3436     // Combine lower bits and higher bits
3437     __ bind(L_combine);
3438     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3439 
3440     __ ret(lr);
3441 
3442     return start;
3443   }
3444 
3445   void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3446           Register temp0, Register temp1, FloatRegister vbytes,
3447           FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3448     // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3449     // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3450     // In non-vectorized code, we update s1 and s2 as:
3451     //   s1 <- s1 + b1
3452     //   s2 <- s2 + s1
3453     //   s1 <- s1 + b2
    //   s2 <- s2 + s1
3455     //   ...
3456     //   s1 <- s1 + b16
3457     //   s2 <- s2 + s1
3458     // Putting above assignments together, we have:
3459     //   s1_new = s1 + b1 + b2 + ... + b16
3460     //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3461     //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3462     //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
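    // Equivalently, in scalar C (illustrative sketch only):
    //   uint32_t sum = 0, dot = 0;
    //   for (int i = 0; i < 16; i++) {
    //     sum += b[i];            // b1 + b2 + ... + b16
    //     dot += (16 - i) * b[i]; // b1*16 + b2*15 + ... + b16*1
    //   }
    //   s2 += 16 * s1 + dot;
    //   s1 += sum;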
3463     __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3464 
3465     // s2 = s2 + s1 * 16
3466     __ add(s2, s2, s1, Assembler::LSL, 4);
3467 
3468     // vs1acc = b1 + b2 + b3 + ... + b16
3469     // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
3470     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3471     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3472     __ uaddlv(vs1acc, __ T16B, vbytes);
3473     __ uaddlv(vs2acc, __ T8H, vs2acc);
3474 
3475     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3476     __ fmovd(temp0, vs1acc);
3477     __ fmovd(temp1, vs2acc);
3478     __ add(s1, s1, temp0);
3479     __ add(s2, s2, temp1);
3480   }
3481 
3482   /**
3483    *  Arguments:
3484    *
3485    *  Input:
3486    *    c_rarg0   - x address
3487    *    c_rarg1   - x length
3488    *    c_rarg2   - y address
   *    c_rarg3   - y length
3490    *    c_rarg4   - z address
3491    *    c_rarg5   - z length
3492    */
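  // Conceptually this computes z = x * y, where x, y and z are big-endian
  // arrays of 32-bit limbs, matching what BigInteger.multiplyToLen does in
  // Java.  A minimal scalar sketch (assuming z[] starts zeroed; the real code
  // needs no such assumption):
  //
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     unsigned long long carry = 0;
  //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
  //       unsigned long long p = (unsigned long long)x[i] * y[j] + z[k] + carry;
  //       z[k] = (unsigned int)p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (unsigned int)carry;
  //   }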
3493   address generate_multiplyToLen() {
3494     __ align(CodeEntryAlignment);
3495     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3496 
3497     address start = __ pc();
3498     const Register x     = r0;
3499     const Register xlen  = r1;
3500     const Register y     = r2;
3501     const Register ylen  = r3;
3502     const Register z     = r4;
3503     const Register zlen  = r5;
3504 
3505     const Register tmp1  = r10;
3506     const Register tmp2  = r11;
3507     const Register tmp3  = r12;
3508     const Register tmp4  = r13;
3509     const Register tmp5  = r14;
3510     const Register tmp6  = r15;
3511     const Register tmp7  = r16;
3512 
3513     BLOCK_COMMENT("Entry:");
3514     __ enter(); // required for proper stackwalking of RuntimeStub frame
3515     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3516     __ leave(); // required for proper stackwalking of RuntimeStub frame
3517     __ ret(lr);
3518 
3519     return start;
3520   }
3521 
3522   address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code is
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len gives slightly better results overall.
3526     __ align(CodeEntryAlignment);
3527     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3528     address start = __ pc();
3529 
3530     const Register x     = r0;
3531     const Register xlen  = r1;
3532     const Register z     = r2;
3533     const Register zlen  = r3;
3534     const Register y     = r4; // == x
3535     const Register ylen  = r5; // == xlen
3536 
3537     const Register tmp1  = r10;
3538     const Register tmp2  = r11;
3539     const Register tmp3  = r12;
3540     const Register tmp4  = r13;
3541     const Register tmp5  = r14;
3542     const Register tmp6  = r15;
3543     const Register tmp7  = r16;
3544 
3545     RegSet spilled_regs = RegSet::of(y, ylen);
3546     BLOCK_COMMENT("Entry:");
3547     __ enter();
3548     __ push(spilled_regs, sp);
3549     __ mov(y, x);
3550     __ mov(ylen, xlen);
3551     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3552     __ pop(spilled_regs, sp);
3553     __ leave();
3554     __ ret(lr);
3555     return start;
3556   }
3557 
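  // Arguments:
  //
  //  Input:
  //    c_rarg0   - out address
  //    c_rarg1   - in address
  //    c_rarg2   - offset
  //    c_rarg3   - len
  //    c_rarg4   - k
  //
  // Conceptually this is BigInteger's mulAdd primitive: it adds in[0..len) * k
  // into out[] at the given offset, propagating carries, and (as with the Java
  // implMulAdd) returns the final carry in r0.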
3558   address generate_mulAdd() {
3559     __ align(CodeEntryAlignment);
3560     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3561 
3562     address start = __ pc();
3563 
3564     const Register out     = r0;
3565     const Register in      = r1;
3566     const Register offset  = r2;
3567     const Register len     = r3;
3568     const Register k       = r4;
3569 
3570     BLOCK_COMMENT("Entry:");
3571     __ enter();
3572     __ mul_add(out, in, offset, len, k);
3573     __ leave();
3574     __ ret(lr);
3575 
3576     return start;
3577   }
3578 
3579   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3580                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3581                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3582     // Karatsuba multiplication performs a 128*128 -> 256-bit
3583     // multiplication in three 128-bit multiplications and a few
3584     // additions.
3585     //
3586     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3587     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3588     //
3589     // Inputs:
3590     //
3591     // A0 in a.d[0]     (subkey)
3592     // A1 in a.d[1]
3593     // (A1+A0) in a1_xor_a0.d[0]
3594     //
3595     // B0 in b.d[0]     (state)
3596     // B1 in b.d[1]
3597 
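    // Over GF(2) the additions are XORs, so E1:E0 ^ C1:C0 ^ D1:D0 recovers the
    // middle term A1*B0 + A0*B1; the eor sequence below folds that term
    // (together with C0 and D1, per the formula above) into the two inner
    // 64-bit words of the 256-bit result.
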
3598     __ ext(tmp1, __ T16B, b, b, 0x08);
3599     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3600     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3601     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3602     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3603 
3604     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3605     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3606     __ eor(tmp2, __ T16B, tmp2, tmp4);
3607     __ eor(tmp2, __ T16B, tmp2, tmp3);
3608 
3609     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3610     __ ins(result_hi, __ D, tmp2, 0, 1);
3611     __ ins(result_lo, __ D, tmp2, 1, 0);
3612   }
3613 
3614   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3615                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3616     const FloatRegister t0 = result;
3617 
3618     // The GCM field polynomial f is z^128 + p(z), where p =
3619     // z^7+z^2+z+1.
3620     //
3621     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3622     //
3623     // so, given that the product we're reducing is
3624     //    a == lo + hi * z^128
3625     // substituting,
3626     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3627     //
3628     // we reduce by multiplying hi by p(z) and subtracting the result
3629     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3630     // bits we can do this with two 64-bit multiplications, lo*p and
3631     // hi*p.
3632 
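    // hi * p(z) can be up to 192 bits wide, so the reduction is done in two
    // steps: the part of the product that still overflows z^128 is folded back
    // into hi (and thereby multiplied by p(z) once more by the final pmull),
    // while the remainder is XORed directly into lo.
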
3633     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3634     __ ext(t1, __ T16B, t0, z, 8);
3635     __ eor(hi, __ T16B, hi, t1);
3636     __ ext(t1, __ T16B, z, t0, 8);
3637     __ eor(lo, __ T16B, lo, t1);
3638     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3639     __ eor(result, __ T16B, lo, t0);
3640   }
3641 
3642   address generate_has_negatives(address &has_negatives_long) {
3643     const u1 large_loop_size = 64;
3644     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
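    // A byte is "negative" iff its top bit is set, so ORing groups of loaded
    // bytes together and testing the result against UPPER_BIT_MASK detects a
    // negative byte anywhere in the group with a single tst.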
3645     int dcache_line = VM_Version::dcache_line_size();
3646 
3647     Register ary1 = r1, len = r2, result = r0;
3648 
3649     __ align(CodeEntryAlignment);
3650 
3651     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3652 
3653     address entry = __ pc();
3654 
3655     __ enter();
3656 
3657   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3658         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3659 
3660   __ cmp(len, (u1)15);
3661   __ br(Assembler::GT, LEN_OVER_15);
  // Execution only falls into this code when the pointer is near the end of a
  // memory page and we have to avoid reading past it into the next page.
3664   __ add(ary1, ary1, len);
3665   __ subs(len, len, 8);
3666   __ br(Assembler::GT, LEN_OVER_8);
3667   __ ldr(rscratch2, Address(ary1, -8));
3668   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3669   __ lsrv(rscratch2, rscratch2, rscratch1);
3670   __ tst(rscratch2, UPPER_BIT_MASK);
3671   __ cset(result, Assembler::NE);
3672   __ leave();
3673   __ ret(lr);
3674   __ bind(LEN_OVER_8);
3675   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3676   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3677   __ tst(rscratch2, UPPER_BIT_MASK);
3678   __ br(Assembler::NE, RET_TRUE_NO_POP);
3679   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3680   __ lsrv(rscratch1, rscratch1, rscratch2);
3681   __ tst(rscratch1, UPPER_BIT_MASK);
3682   __ cset(result, Assembler::NE);
3683   __ leave();
3684   __ ret(lr);
3685 
3686   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3687   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3688 
3689   has_negatives_long = __ pc(); // 2nd entry point
3690 
3691   __ enter();
3692 
3693   __ bind(LEN_OVER_15);
3694     __ push(spilled_regs, sp);
3695     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3696     __ cbz(rscratch2, ALIGNED);
3697     __ ldp(tmp6, tmp1, Address(ary1));
3698     __ mov(tmp5, 16);
3699     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3700     __ add(ary1, ary1, rscratch1);
3701     __ sub(len, len, rscratch1);
3702     __ orr(tmp6, tmp6, tmp1);
3703     __ tst(tmp6, UPPER_BIT_MASK);
3704     __ br(Assembler::NE, RET_TRUE);
3705 
3706   __ bind(ALIGNED);
3707     __ cmp(len, large_loop_size);
3708     __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // situation when an initially aligned large array has negative values in
    // its starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1
    // in the worst case, which is slower. Cases with negative bytes further
    // ahead are barely affected; in fact they get faster thanks to the early
    // loads, fewer instructions and fewer branches in LARGE_LOOP.
3715     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3716     __ sub(len, len, 16);
3717     __ orr(tmp6, tmp6, tmp1);
3718     __ tst(tmp6, UPPER_BIT_MASK);
3719     __ br(Assembler::NE, RET_TRUE);
3720     __ cmp(len, large_loop_size);
3721     __ br(Assembler::LT, CHECK_16);
3722 
3723     if (SoftwarePrefetchHintDistance >= 0
3724         && SoftwarePrefetchHintDistance >= dcache_line) {
3725       // initial prefetch
3726       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3727     }
3728   __ bind(LARGE_LOOP);
3729     if (SoftwarePrefetchHintDistance >= 0) {
3730       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3731     }
    // Issue the load instructions first, since that can save a few CPU/MEM
    // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)"
    // (one per ldp), it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions and has fewer branches; the
    // downside is that early return is disabled, so all 64 bytes are loaded
    // and checked every time.
3737     __ ldp(tmp2, tmp3, Address(ary1));
3738     __ ldp(tmp4, tmp5, Address(ary1, 16));
3739     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3740     __ ldp(tmp6, tmp1, Address(ary1, 48));
3741     __ add(ary1, ary1, large_loop_size);
3742     __ sub(len, len, large_loop_size);
3743     __ orr(tmp2, tmp2, tmp3);
3744     __ orr(tmp4, tmp4, tmp5);
3745     __ orr(rscratch1, rscratch1, rscratch2);
3746     __ orr(tmp6, tmp6, tmp1);
3747     __ orr(tmp2, tmp2, tmp4);
3748     __ orr(rscratch1, rscratch1, tmp6);
3749     __ orr(tmp2, tmp2, rscratch1);
3750     __ tst(tmp2, UPPER_BIT_MASK);
3751     __ br(Assembler::NE, RET_TRUE);
3752     __ cmp(len, large_loop_size);
3753     __ br(Assembler::GE, LARGE_LOOP);
3754 
3755   __ bind(CHECK_16); // small 16-byte load pre-loop
3756     __ cmp(len, (u1)16);
3757     __ br(Assembler::LT, POST_LOOP16);
3758 
3759   __ bind(LOOP16); // small 16-byte load loop
3760     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3761     __ sub(len, len, 16);
3762     __ orr(tmp2, tmp2, tmp3);
3763     __ tst(tmp2, UPPER_BIT_MASK);
3764     __ br(Assembler::NE, RET_TRUE);
3765     __ cmp(len, (u1)16);
3766     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3767 
3768   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3769     __ cmp(len, (u1)8);
3770     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3771     __ ldr(tmp3, Address(__ post(ary1, 8)));
3772     __ sub(len, len, 8);
3773     __ tst(tmp3, UPPER_BIT_MASK);
3774     __ br(Assembler::NE, RET_TRUE);
3775 
3776   __ bind(POST_LOOP16_LOAD_TAIL);
3777     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3778     __ ldr(tmp1, Address(ary1));
3779     __ mov(tmp2, 64);
3780     __ sub(tmp4, tmp2, len, __ LSL, 3);
3781     __ lslv(tmp1, tmp1, tmp4);
3782     __ tst(tmp1, UPPER_BIT_MASK);
3783     __ br(Assembler::NE, RET_TRUE);
3784     // Fallthrough
3785 
3786   __ bind(RET_FALSE);
3787     __ pop(spilled_regs, sp);
3788     __ leave();
3789     __ mov(result, zr);
3790     __ ret(lr);
3791 
3792   __ bind(RET_TRUE);
3793     __ pop(spilled_regs, sp);
3794   __ bind(RET_TRUE_NO_POP);
3795     __ leave();
3796     __ mov(result, 1);
3797     __ ret(lr);
3798 
3799   __ bind(DONE);
3800     __ pop(spilled_regs, sp);
3801     __ leave();
3802     __ ret(lr);
3803     return entry;
3804   }
3805 
3806   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3807         bool usePrefetch, Label &NOT_EQUAL) {
3808     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3809         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3810         tmp7 = r12, tmp8 = r13;
3811     Label LOOP;
3812 
3813     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3814     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3815     __ bind(LOOP);
3816     if (usePrefetch) {
3817       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3818       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3819     }
3820     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3821     __ eor(tmp1, tmp1, tmp2);
3822     __ eor(tmp3, tmp3, tmp4);
3823     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3824     __ orr(tmp1, tmp1, tmp3);
3825     __ cbnz(tmp1, NOT_EQUAL);
3826     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3827     __ eor(tmp5, tmp5, tmp6);
3828     __ eor(tmp7, tmp7, tmp8);
3829     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3830     __ orr(tmp5, tmp5, tmp7);
3831     __ cbnz(tmp5, NOT_EQUAL);
3832     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3833     __ eor(tmp1, tmp1, tmp2);
3834     __ eor(tmp3, tmp3, tmp4);
3835     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3836     __ orr(tmp1, tmp1, tmp3);
3837     __ cbnz(tmp1, NOT_EQUAL);
3838     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3839     __ eor(tmp5, tmp5, tmp6);
3840     __ sub(cnt1, cnt1, 8 * wordSize);
3841     __ eor(tmp7, tmp7, tmp8);
3842     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
3845     __ subs(tmp6, cnt1, loopThreshold);
3846     __ orr(tmp5, tmp5, tmp7);
3847     __ cbnz(tmp5, NOT_EQUAL);
3848     __ br(__ GE, LOOP);
3849     // post-loop
3850     __ eor(tmp1, tmp1, tmp2);
3851     __ eor(tmp3, tmp3, tmp4);
3852     __ orr(tmp1, tmp1, tmp3);
3853     __ sub(cnt1, cnt1, 2 * wordSize);
3854     __ cbnz(tmp1, NOT_EQUAL);
3855   }
3856 
3857   void generate_large_array_equals_loop_simd(int loopThreshold,
3858         bool usePrefetch, Label &NOT_EQUAL) {
3859     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3860         tmp2 = rscratch2;
3861     Label LOOP;
3862 
3863     __ bind(LOOP);
3864     if (usePrefetch) {
3865       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3866       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3867     }
3868     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3869     __ sub(cnt1, cnt1, 8 * wordSize);
3870     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3871     __ subs(tmp1, cnt1, loopThreshold);
3872     __ eor(v0, __ T16B, v0, v4);
3873     __ eor(v1, __ T16B, v1, v5);
3874     __ eor(v2, __ T16B, v2, v6);
3875     __ eor(v3, __ T16B, v3, v7);
3876     __ orr(v0, __ T16B, v0, v1);
3877     __ orr(v1, __ T16B, v2, v3);
3878     __ orr(v0, __ T16B, v0, v1);
3879     __ umov(tmp1, v0, __ D, 0);
3880     __ umov(tmp2, v0, __ D, 1);
3881     __ orr(tmp1, tmp1, tmp2);
3882     __ cbnz(tmp1, NOT_EQUAL);
3883     __ br(__ GE, LOOP);
3884   }
3885 
3886   // a1 = r1 - array1 address
3887   // a2 = r2 - array2 address
3888   // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
3890   // r3-r5 are reserved temporary registers
3891   address generate_large_array_equals() {
3892     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3893         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3894         tmp7 = r12, tmp8 = r13;
3895     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3896         SMALL_LOOP, POST_LOOP;
3897     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3898     // calculate if at least 32 prefetched bytes are used
3899     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3900     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3901     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3902     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3903         tmp5, tmp6, tmp7, tmp8);
3904 
3905     __ align(CodeEntryAlignment);
3906 
3907     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3908 
3909     address entry = __ pc();
3910     __ enter();
3911     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3912     // also advance pointers to use post-increment instead of pre-increment
3913     __ add(a1, a1, wordSize);
3914     __ add(a2, a2, wordSize);
3915     if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD and non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently only 8-byte aligned, so if necessary we do one
      // additional 8-byte load for the first array to make its address
      // 16-byte aligned.
3921       Label ALIGNED16;
3922       __ tbz(a1, 3, ALIGNED16);
3923       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3924       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3925       __ sub(cnt1, cnt1, wordSize);
3926       __ eor(tmp1, tmp1, tmp2);
3927       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3928       __ bind(ALIGNED16);
3929     }
3930     if (UseSIMDForArrayEquals) {
3931       if (SoftwarePrefetchHintDistance >= 0) {
3932         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3933         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3934         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3935             /* prfm = */ true, NOT_EQUAL);
3936         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3937         __ br(__ LT, TAIL);
3938       }
3939       __ bind(NO_PREFETCH_LARGE_LOOP);
3940       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3941           /* prfm = */ false, NOT_EQUAL);
3942     } else {
3943       __ push(spilled_regs, sp);
3944       if (SoftwarePrefetchHintDistance >= 0) {
3945         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3946         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3947         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3948             /* prfm = */ true, NOT_EQUAL);
3949         __ subs(zr, cnt1, nonPrefetchLoopThreshold);
3950         __ br(__ LT, TAIL);
3951       }
3952       __ bind(NO_PREFETCH_LARGE_LOOP);
3953       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3954           /* prfm = */ false, NOT_EQUAL);
3955     }
3956     __ bind(TAIL);
3957       __ cbz(cnt1, EQUAL);
3958       __ subs(cnt1, cnt1, wordSize);
3959       __ br(__ LE, POST_LOOP);
3960     __ bind(SMALL_LOOP);
3961       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3962       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3963       __ subs(cnt1, cnt1, wordSize);
3964       __ eor(tmp1, tmp1, tmp2);
3965       __ cbnz(tmp1, NOT_EQUAL);
3966       __ br(__ GT, SMALL_LOOP);
3967     __ bind(POST_LOOP);
3968       __ ldr(tmp1, Address(a1, cnt1));
3969       __ ldr(tmp2, Address(a2, cnt1));
3970       __ eor(tmp1, tmp1, tmp2);
3971       __ cbnz(tmp1, NOT_EQUAL);
3972     __ bind(EQUAL);
3973       __ mov(result, true);
3974     __ bind(NOT_EQUAL);
3975       if (!UseSIMDForArrayEquals) {
3976         __ pop(spilled_regs, sp);
3977       }
3978     __ bind(NOT_EQUAL_NO_POP);
3979     __ leave();
3980     __ ret(lr);
3981     return entry;
3982   }
3983 
3984   address generate_dsin_dcos(bool isCos) {
3985     __ align(CodeEntryAlignment);
3986     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
3987     address start = __ pc();
3988     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
3989         (address)StubRoutines::aarch64::_two_over_pi,
3990         (address)StubRoutines::aarch64::_pio2,
3991         (address)StubRoutines::aarch64::_dsin_coef,
3992         (address)StubRoutines::aarch64::_dcos_coef);
3993     return start;
3994   }
3995 
3996   address generate_dlog() {
3997     __ align(CodeEntryAlignment);
3998     StubCodeMark mark(this, "StubRoutines", "dlog");
3999     address entry = __ pc();
4000     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4001         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4002     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4003     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4004         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4005     return entry;
4006   }
4007 
4008   // code for comparing 16 bytes of strings with same encoding
4009   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4010     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4011     __ ldr(rscratch1, Address(__ post(str1, 8)));
4012     __ eor(rscratch2, tmp1, tmp2);
4013     __ ldr(cnt1, Address(__ post(str2, 8)));
4014     __ cbnz(rscratch2, DIFF1);
4015     __ ldr(tmp1, Address(__ post(str1, 8)));
4016     __ eor(rscratch2, rscratch1, cnt1);
4017     __ ldr(tmp2, Address(__ post(str2, 8)));
4018     __ cbnz(rscratch2, DIFF2);
4019   }
4020 
4021   // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
4022   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4023       Label &DIFF2) {
4024     Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
4025     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4026 
4027     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4028     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4029     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4030     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4031 
4032     __ fmovd(tmpL, vtmp3);
4033     __ eor(rscratch2, tmp3, tmpL);
4034     __ cbnz(rscratch2, DIFF2);
4035 
4036     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4037     __ umov(tmpL, vtmp3, __ D, 1);
4038     __ eor(rscratch2, tmpU, tmpL);
4039     __ cbnz(rscratch2, DIFF1);
4040 
4041     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4042     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4043     __ fmovd(tmpL, vtmp);
4044     __ eor(rscratch2, tmp3, tmpL);
4045     __ cbnz(rscratch2, DIFF2);
4046 
4047     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4048     __ umov(tmpL, vtmp, __ D, 1);
4049     __ eor(rscratch2, tmpU, tmpL);
4050     __ cbnz(rscratch2, DIFF1);
4051   }
4052 
4053   // r0  = result
4054   // r1  = str1
4055   // r2  = cnt1
4056   // r3  = str2
4057   // r4  = cnt2
4058   // r10 = tmp1
4059   // r11 = tmp2
4060   address generate_compare_long_string_different_encoding(bool isLU) {
4061     __ align(CodeEntryAlignment);
4062     StubCodeMark mark(this, "StubRoutines", isLU
4063         ? "compare_long_string_different_encoding LU"
4064         : "compare_long_string_different_encoding UL");
4065     address entry = __ pc();
4066     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4067         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
4068         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4069     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4070         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4071     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4072     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4073 
4074     int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2);
4075 
4076     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4079     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4080     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4081     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4082     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4083     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4084     __ eor(rscratch2, tmp1, tmp2);
4085     __ mov(rscratch1, tmp2);
4086     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4087     Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4088              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4089     __ push(spilled_regs, sp);
4090     __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
4091     __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
4092 
4093     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4094 
4095     if (SoftwarePrefetchHintDistance >= 0) {
4096       __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4097       __ br(__ LT, NO_PREFETCH);
4098       __ bind(LARGE_LOOP_PREFETCH);
4099         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4100         __ mov(tmp4, 2);
4101         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4102         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4103           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4104           __ subs(tmp4, tmp4, 1);
4105           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4106           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4107           __ mov(tmp4, 2);
4108         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4109           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4110           __ subs(tmp4, tmp4, 1);
4111           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4112           __ sub(cnt2, cnt2, 64);
4113           __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
4114           __ br(__ GE, LARGE_LOOP_PREFETCH);
4115     }
4116     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4117     __ bind(NO_PREFETCH);
4118     __ subs(cnt2, cnt2, 16);
4119     __ br(__ LT, TAIL);
4120     __ align(OptoLoopAlignment);
4121     __ bind(SMALL_LOOP); // smaller loop
4122       __ subs(cnt2, cnt2, 16);
4123       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4124       __ br(__ GE, SMALL_LOOP);
4125       __ cmn(cnt2, (u1)16);
4126       __ br(__ EQ, LOAD_LAST);
4127     __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
4128       __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
4129       __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
4130       __ ldr(tmp3, Address(cnt1, -8));
4131       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
4132       __ b(LOAD_LAST);
4133     __ bind(DIFF2);
4134       __ mov(tmpU, tmp3);
4135     __ bind(DIFF1);
4136       __ pop(spilled_regs, sp);
4137       __ b(CALCULATE_DIFFERENCE);
4138     __ bind(LOAD_LAST);
4139       // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
      // No need to load them again.
4141       __ mov(tmpU, tmp3);
4142       __ pop(spilled_regs, sp);
4143 
4144       // tmp2 points to the address of the last 4 Latin1 characters right now
4145       __ ldrs(vtmp, Address(tmp2));
4146       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4147       __ fmovd(tmpL, vtmp);
4148 
4149       __ eor(rscratch2, tmpU, tmpL);
4150       __ cbz(rscratch2, DONE);
4151 
4152     // Find the first different characters in the longwords and
4153     // compute their difference.
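    // rscratch2 holds the XOR of the two longwords; rev + clz yield the bit
    // position of the first differing byte in memory order, and masking with
    // -16 rounds that down to a 16-bit character boundary before the two
    // characters are extracted and subtracted.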
4154     __ bind(CALCULATE_DIFFERENCE);
4155       __ rev(rscratch2, rscratch2);
4156       __ clz(rscratch2, rscratch2);
4157       __ andr(rscratch2, rscratch2, -16);
4158       __ lsrv(tmp1, tmp1, rscratch2);
4159       __ uxthw(tmp1, tmp1);
4160       __ lsrv(rscratch1, rscratch1, rscratch2);
4161       __ uxthw(rscratch1, rscratch1);
4162       __ subw(result, tmp1, rscratch1);
4163     __ bind(DONE);
4164       __ ret(lr);
4165     return entry;
4166   }
4167 
  address generate_method_entry_barrier() {
4169     __ align(CodeEntryAlignment);
4170     StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
4171 
4172     Label deoptimize_label;
4173 
4174     address start = __ pc();
4175 
4176     __ set_last_Java_frame(sp, rfp, lr, rscratch1);
4177 
4178     __ enter();
4179     __ add(rscratch2, sp, wordSize);  // rscratch2 points to the saved lr
4180 
4181     __ sub(sp, sp, 4 * wordSize);  // four words for the returned {sp, fp, lr, pc}
4182 
4183     __ push_call_clobbered_registers();
4184 
4185     __ mov(c_rarg0, rscratch2);
4186     __ call_VM_leaf
4187          (CAST_FROM_FN_PTR
4188           (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
4189 
4190     __ reset_last_Java_frame(true);
4191 
4192     __ mov(rscratch1, r0);
4193 
4194     __ pop_call_clobbered_registers();
4195 
4196     __ cbnz(rscratch1, deoptimize_label);
4197 
4198     __ leave();
4199     __ ret(lr);
4200 
4201     __ BIND(deoptimize_label);
4202 
4203     __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
4204     __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
4205 
4206     __ mov(sp, rscratch1);
4207     __ br(rscratch2);
4208 
4209     return start;
4210   }
4211 
4212   // r0  = result
4213   // r1  = str1
4214   // r2  = cnt1
4215   // r3  = str2
4216   // r4  = cnt2
4217   // r10 = tmp1
4218   // r11 = tmp2
4219   address generate_compare_long_string_same_encoding(bool isLL) {
4220     __ align(CodeEntryAlignment);
4221     StubCodeMark mark(this, "StubRoutines", isLL
4222         ? "compare_long_string_same_encoding LL"
4223         : "compare_long_string_same_encoding UU");
4224     address entry = __ pc();
4225     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4226         tmp1 = r10, tmp2 = r11;
4227     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4228         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4229         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
    // Exit the large loop when fewer than 64 bytes are left to read or when we
    // are about to prefetch memory beyond the array border.
4232     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
    // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
    // Update the cnt2 counter to account for the 8 bytes already loaded.
4235     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4236     // update pointers, because of previous read
4237     __ add(str1, str1, wordSize);
4238     __ add(str2, str2, wordSize);
4239     if (SoftwarePrefetchHintDistance >= 0) {
4240       __ bind(LARGE_LOOP_PREFETCH);
4241         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4242         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4243         compare_string_16_bytes_same(DIFF, DIFF2);
4244         compare_string_16_bytes_same(DIFF, DIFF2);
4245         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4246         compare_string_16_bytes_same(DIFF, DIFF2);
4247         __ subs(rscratch2, cnt2, largeLoopExitCondition);
4248         compare_string_16_bytes_same(DIFF, DIFF2);
4249         __ br(__ GT, LARGE_LOOP_PREFETCH);
4250         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4251     }
4252     // less than 16 bytes left?
4253     __ subs(cnt2, cnt2, isLL ? 16 : 8);
4254     __ br(__ LT, TAIL);
4255     __ align(OptoLoopAlignment);
4256     __ bind(SMALL_LOOP);
4257       compare_string_16_bytes_same(DIFF, DIFF2);
4258       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4259       __ br(__ GE, SMALL_LOOP);
4260     __ bind(TAIL);
4261       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4262       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4263       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4264       __ br(__ LE, CHECK_LAST);
4265       __ eor(rscratch2, tmp1, tmp2);
4266       __ cbnz(rscratch2, DIFF);
4267       __ ldr(tmp1, Address(__ post(str1, 8)));
4268       __ ldr(tmp2, Address(__ post(str2, 8)));
4269       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4270     __ bind(CHECK_LAST);
4271       if (!isLL) {
4272         __ add(cnt2, cnt2, cnt2); // now in bytes
4273       }
4274       __ eor(rscratch2, tmp1, tmp2);
4275       __ cbnz(rscratch2, DIFF);
4276       __ ldr(rscratch1, Address(str1, cnt2));
4277       __ ldr(cnt1, Address(str2, cnt2));
4278       __ eor(rscratch2, rscratch1, cnt1);
4279       __ cbz(rscratch2, LENGTH_DIFF);
4280       // Find the first different characters in the longwords and
4281       // compute their difference.
4282     __ bind(DIFF2);
4283       __ rev(rscratch2, rscratch2);
4284       __ clz(rscratch2, rscratch2);
4285       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4286       __ lsrv(rscratch1, rscratch1, rscratch2);
4287       if (isLL) {
4288         __ lsrv(cnt1, cnt1, rscratch2);
4289         __ uxtbw(rscratch1, rscratch1);
4290         __ uxtbw(cnt1, cnt1);
4291       } else {
4292         __ lsrv(cnt1, cnt1, rscratch2);
4293         __ uxthw(rscratch1, rscratch1);
4294         __ uxthw(cnt1, cnt1);
4295       }
4296       __ subw(result, rscratch1, cnt1);
4297       __ b(LENGTH_DIFF);
4298     __ bind(DIFF);
4299       __ rev(rscratch2, rscratch2);
4300       __ clz(rscratch2, rscratch2);
4301       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4302       __ lsrv(tmp1, tmp1, rscratch2);
4303       if (isLL) {
4304         __ lsrv(tmp2, tmp2, rscratch2);
4305         __ uxtbw(tmp1, tmp1);
4306         __ uxtbw(tmp2, tmp2);
4307       } else {
4308         __ lsrv(tmp2, tmp2, rscratch2);
4309         __ uxthw(tmp1, tmp1);
4310         __ uxthw(tmp2, tmp2);
4311       }
4312       __ subw(result, tmp1, tmp2);
4313       __ b(LENGTH_DIFF);
4314     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4315       __ eor(rscratch2, tmp1, tmp2);
4316       __ cbnz(rscratch2, DIFF);
4317     __ bind(LENGTH_DIFF);
4318       __ ret(lr);
4319     return entry;
4320   }
4321 
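       // Naming of the stubs below follows the string-intrinsic convention:
       // L = Latin-1, U = UTF-16; LL/UU compare two strings of the same encoding,
       // while LU/UL handle the two mixed-encoding cases.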
4322   void generate_compare_long_strings() {
4323       StubRoutines::aarch64::_compare_long_string_LL
4324           = generate_compare_long_string_same_encoding(true);
4325       StubRoutines::aarch64::_compare_long_string_UU
4326           = generate_compare_long_string_same_encoding(false);
4327       StubRoutines::aarch64::_compare_long_string_LU
4328           = generate_compare_long_string_different_encoding(true);
4329       StubRoutines::aarch64::_compare_long_string_UL
4330           = generate_compare_long_string_different_encoding(false);
4331   }
4332 
4333   // R0 = result
4334   // R1 = str2
4335   // R2 = cnt1
4336   // R3 = str1
4337   // R4 = cnt2
4338   // This generic linear code uses a few additional ideas that make it faster:
4339   // 1) we can safely keep at least the first register of the pattern (since
4340   // length >= 8) to skip the initial load (helps on systems with one ld pipeline)
4341   // 2) we can use a "fast" SWAR algorithm to find the first character, with
4342   // fewer branches (one branch per loaded register instead of one branch per
4343   // character); this is where constants like 0x0101...01, 0x00010001...0001,
4344   // 0x7f7f...7f and 0x7fff7fff...7fff come from (see the C sketch below)
4345   // 3) after loading and analyzing the first register of the source string, it
4346   // can be reused to search for every occurrence of the first character, saving
4347   // a few loads compared with a "simpler-but-slower" implementation
4348   // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
4349   // re-initializes and compresses register values, which makes the code larger
4350   // and a bit less readable; however, most of the extra operations are issued
4351   // during loads or branches, so the penalty is minimal
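       //
       // A minimal C sketch of the SWAR first-character test used below, for the
       // Latin-1 (LL/UL) case; chunk and first_char are illustrative names.  The
       // result is nonzero iff some byte lane of chunk equals first_char, and its
       // lowest set bit marks the first such lane (higher lanes may be falsely
       // flagged, which is harmless since every candidate is verified by a full
       // comparison).  For UTF-16 the constants become 0x0001000100010001 and
       // 0x7fff7fff7fff7fff, with 16-bit lanes:
       //
       //   uint64_t v = chunk ^ (first_char * 0x0101010101010101ULL);
       //   uint64_t candidates = (v - 0x0101010101010101ULL)
       //                         & ~v & 0x8080808080808080ULL;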
4352   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4353     const char* stubName = str1_isL
4354         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4355         : "indexof_linear_uu";
4356     __ align(CodeEntryAlignment);
4357     StubCodeMark mark(this, "StubRoutines", stubName);
4358     address entry = __ pc();
4359 
4360     int str1_chr_size = str1_isL ? 1 : 2;
4361     int str2_chr_size = str2_isL ? 1 : 2;
4362     int str1_chr_shift = str1_isL ? 0 : 1;
4363     int str2_chr_shift = str2_isL ? 0 : 1;
4364     bool isL = str1_isL && str2_isL;
4365     // parameters
4366     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4367     // temporary registers
4368     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4369     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4370     // redefinitions
4371     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4372 
4373     __ push(spilled_regs, sp);
4374     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4375         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4376         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4377         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4378         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4379         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4380     // Read whole register from str1. It is safe, because length >=8 here
4381     __ ldr(ch1, Address(str1));
4382     // Read whole register from str2. It is safe, because length >=8 here
4383     __ ldr(ch2, Address(str2));
4384     __ sub(cnt2, cnt2, cnt1);
4385     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4386     if (str1_isL != str2_isL) {
4387       __ eor(v0, __ T16B, v0, v0);
4388     }
4389     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4390     __ mul(first, first, tmp1);
4391     // check if we have less than 1 register to check
4392     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4393     if (str1_isL != str2_isL) {
4394       __ fmovd(v1, ch1);
4395     }
4396     __ br(__ LE, L_SMALL);
4397     __ eor(ch2, first, ch2);
4398     if (str1_isL != str2_isL) {
4399       __ zip1(v1, __ T16B, v1, v0);
4400     }
4401     __ sub(tmp2, ch2, tmp1);
4402     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4403     __ bics(tmp2, tmp2, ch2);
4404     if (str1_isL != str2_isL) {
4405       __ fmovd(ch1, v1);
4406     }
4407     __ br(__ NE, L_HAS_ZERO);
4408     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4409     __ add(result, result, wordSize/str2_chr_size);
4410     __ add(str2, str2, wordSize);
4411     __ br(__ LT, L_POST_LOOP);
4412     __ BIND(L_LOOP);
4413       __ ldr(ch2, Address(str2));
4414       __ eor(ch2, first, ch2);
4415       __ sub(tmp2, ch2, tmp1);
4416       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4417       __ bics(tmp2, tmp2, ch2);
4418       __ br(__ NE, L_HAS_ZERO);
4419     __ BIND(L_LOOP_PROCEED);
4420       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4421       __ add(str2, str2, wordSize);
4422       __ add(result, result, wordSize/str2_chr_size);
4423       __ br(__ GE, L_LOOP);
4424     __ BIND(L_POST_LOOP);
4425       __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4426       __ br(__ LE, NOMATCH);
4427       __ ldr(ch2, Address(str2));
4428       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4429       __ eor(ch2, first, ch2);
4430       __ sub(tmp2, ch2, tmp1);
4431       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4432       __ mov(tmp4, -1); // all bits set
4433       __ b(L_SMALL_PROCEED);
4434     __ align(OptoLoopAlignment);
4435     __ BIND(L_SMALL);
4436       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4437       __ eor(ch2, first, ch2);
4438       if (str1_isL != str2_isL) {
4439         __ zip1(v1, __ T16B, v1, v0);
4440       }
4441       __ sub(tmp2, ch2, tmp1);
4442       __ mov(tmp4, -1); // all bits set
4443       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4444       if (str1_isL != str2_isL) {
4445         __ fmovd(ch1, v1); // move converted 4 symbols
4446       }
4447     __ BIND(L_SMALL_PROCEED);
4448       __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
4449       __ bic(tmp2, tmp2, ch2);
4450       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4451       __ rbit(tmp2, tmp2);
4452       __ br(__ EQ, NOMATCH);
4453     __ BIND(L_SMALL_HAS_ZERO_LOOP);
4454       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4455       __ cmp(cnt1, u1(wordSize/str2_chr_size));
4456       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4457       if (str2_isL) { // LL
4458         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4459         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4460         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4461         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4462         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4463       } else {
4464         __ mov(ch2, 0xE); // all bits in byte set except last one
4465         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4466         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4467         __ lslv(tmp2, tmp2, tmp4);
4468         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4469         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4470         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4471         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4472       }
4473       __ cmp(ch1, ch2);
4474       __ mov(tmp4, wordSize/str2_chr_size);
4475       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4476     __ BIND(L_SMALL_CMP_LOOP);
4477       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4478                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4479       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4480                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4481       __ add(tmp4, tmp4, 1);
4482       __ cmp(tmp4, cnt1);
4483       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4484       __ cmp(first, ch2);
4485       __ br(__ EQ, L_SMALL_CMP_LOOP);
4486     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4487       __ cbz(tmp2, NOMATCH); // no more matches. exit
4488       __ clz(tmp4, tmp2);
4489       __ add(result, result, 1); // advance index
4490       __ add(str2, str2, str2_chr_size); // advance pointer
4491       __ b(L_SMALL_HAS_ZERO_LOOP);
4492     __ align(OptoLoopAlignment);
4493     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4494       __ cmp(first, ch2);
4495       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4496       __ b(DONE);
4497     __ align(OptoLoopAlignment);
4498     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4499       if (str2_isL) { // LL
4500         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4501         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4502         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4503         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4504         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4505       } else {
4506         __ mov(ch2, 0xE); // all bits in byte set except last one
4507         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4508         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4509         __ lslv(tmp2, tmp2, tmp4);
4510         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4511         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4512         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4513         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4514       }
4515       __ cmp(ch1, ch2);
4516       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4517       __ b(DONE);
4518     __ align(OptoLoopAlignment);
4519     __ BIND(L_HAS_ZERO);
4520       __ rbit(tmp2, tmp2);
4521       __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
4522       // Now compress the counters (cnt2 and cnt1) into one register. This is
4523       // fine because both counters are 32-bit and are not changed in this loop;
4524       // they are restored on exit. So, cnt1 can be re-used in this loop.
4525       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4526       __ sub(result, result, 1);
4527     __ BIND(L_HAS_ZERO_LOOP);
4528       __ mov(cnt1, wordSize/str2_chr_size);
4529       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4530       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4531       if (str2_isL) {
4532         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4533         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4534         __ lslv(tmp2, tmp2, tmp4);
4535         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4536         __ add(tmp4, tmp4, 1);
4537         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4538         __ lsl(tmp2, tmp2, 1);
4539         __ mov(tmp4, wordSize/str2_chr_size);
4540       } else {
4541         __ mov(ch2, 0xE);
4542         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4543         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4544         __ lslv(tmp2, tmp2, tmp4);
4545         __ add(tmp4, tmp4, 1);
4546         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4547         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4548         __ lsl(tmp2, tmp2, 1);
4549         __ mov(tmp4, wordSize/str2_chr_size);
4550         __ sub(str2, str2, str2_chr_size);
4551       }
4552       __ cmp(ch1, ch2);
4553       __ mov(tmp4, wordSize/str2_chr_size);
4554       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4555     __ BIND(L_CMP_LOOP);
4556       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4557                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4558       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4559                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4560       __ add(tmp4, tmp4, 1);
4561       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4562       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4563       __ cmp(cnt1, ch2);
4564       __ br(__ EQ, L_CMP_LOOP);
4565     __ BIND(L_CMP_LOOP_NOMATCH);
4566       // the current candidate position did not match
4567       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4568       __ clz(tmp4, tmp2);
4569       __ add(str2, str2, str2_chr_size); // advance pointer
4570       __ b(L_HAS_ZERO_LOOP);
4571     __ align(OptoLoopAlignment);
4572     __ BIND(L_CMP_LOOP_LAST_CMP);
4573       __ cmp(cnt1, ch2);
4574       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4575       __ b(DONE);
4576     __ align(OptoLoopAlignment);
4577     __ BIND(L_CMP_LOOP_LAST_CMP2);
4578       if (str2_isL) {
4579         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4580         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4581         __ lslv(tmp2, tmp2, tmp4);
4582         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4583         __ add(tmp4, tmp4, 1);
4584         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4585         __ lsl(tmp2, tmp2, 1);
4586       } else {
4587         __ mov(ch2, 0xE);
4588         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4589         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4590         __ lslv(tmp2, tmp2, tmp4);
4591         __ add(tmp4, tmp4, 1);
4592         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4593         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4594         __ lsl(tmp2, tmp2, 1);
4595         __ sub(str2, str2, str2_chr_size);
4596       }
4597       __ cmp(ch1, ch2);
4598       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4599       __ b(DONE);
4600     __ align(OptoLoopAlignment);
4601     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4602       // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4603       // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
4604       // so result was increased by at most wordSize/str2_chr_size - 1 and the
4605       // corresponding high bits were not changed. L_LOOP_PROCEED will increase
4606       // result by the number of analyzed characters, so we can just reset the lower
4607       // bits of result here: the 2 lower bits for UU/UL and the 3 lower bits for LL.
4608       // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4609       // 3) Advance str2 to the next str2 octet. result & 7 (LL) or & 3 (UU/UL) is
4610       // the index of the last analyzed substring inside the current octet, so str2
4611       // is at the corresponding start address; advance it to the next octet.
4612       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4613       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4614       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4615       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4616       __ movw(cnt2, cnt2);
4617       __ b(L_LOOP_PROCEED);
4618     __ align(OptoLoopAlignment);
4619     __ BIND(NOMATCH);
4620       __ mov(result, -1);
4621     __ BIND(DONE);
4622       __ pop(spilled_regs, sp);
4623       __ ret(lr);
4624     return entry;
4625   }
4626 
4627   void generate_string_indexof_stubs() {
4628     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4629     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4630     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4631   }
4632 
4633   void inflate_and_store_2_fp_registers(bool generatePrfm,
4634       FloatRegister src1, FloatRegister src2) {
4635     Register dst = r1;
4636     __ zip1(v1, __ T16B, src1, v0);
4637     __ zip2(v2, __ T16B, src1, v0);
4638     if (generatePrfm) {
4639       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4640     }
4641     __ zip1(v3, __ T16B, src2, v0);
4642     __ zip2(v4, __ T16B, src2, v0);
4643     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4644   }
4645 
4646   // R0 = src
4647   // R1 = dst
4648   // R2 = len
4649   // R3 = len >> 3
4650   // V0 = 0
4651   // V1 = loaded 8 bytes
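       //
       // In C, the overall operation below is approximately (an illustrative model
       // only; len counted in bytes):
       //
       //   for (int i = 0; i < len; i++)
       //     dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each Latin-1 byte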
4652   address generate_large_byte_array_inflate() {
4653     __ align(CodeEntryAlignment);
4654     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4655     address entry = __ pc();
4656     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4657     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4658     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4659 
4660     // Do one more 8-byte read so the address is 16-byte aligned in most cases;
4661     // this also lets us use a single store instruction below.
4662     __ ldrd(v2, __ post(src, 8));
4663     __ sub(octetCounter, octetCounter, 2);
4664     __ zip1(v1, __ T16B, v1, v0);
4665     __ zip1(v2, __ T16B, v2, v0);
4666     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4667     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4668     __ subs(rscratch1, octetCounter, large_loop_threshold);
4669     __ br(__ LE, LOOP_START);
4670     __ b(LOOP_PRFM_START);
4671     __ bind(LOOP_PRFM);
4672       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4673     __ bind(LOOP_PRFM_START);
4674       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4675       __ sub(octetCounter, octetCounter, 8);
4676       __ subs(rscratch1, octetCounter, large_loop_threshold);
4677       inflate_and_store_2_fp_registers(true, v3, v4);
4678       inflate_and_store_2_fp_registers(true, v5, v6);
4679       __ br(__ GT, LOOP_PRFM);
4680       __ cmp(octetCounter, (u1)8);
4681       __ br(__ LT, DONE);
4682     __ bind(LOOP);
4683       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4684       __ bind(LOOP_START);
4685       __ sub(octetCounter, octetCounter, 8);
4686       __ cmp(octetCounter, (u1)8);
4687       inflate_and_store_2_fp_registers(false, v3, v4);
4688       inflate_and_store_2_fp_registers(false, v5, v6);
4689       __ br(__ GE, LOOP);
4690     __ bind(DONE);
4691       __ ret(lr);
4692     return entry;
4693   }
4694 
4695   /**
4696    *  Arguments:
4697    *
4698    *  Input:
4699    *  c_rarg0   - current state address
4700    *  c_rarg1   - H key address
4701    *  c_rarg2   - data address
4702    *  c_rarg3   - number of blocks
4703    *
4704    *  Output:
4705    *  Updated state at c_rarg0
4706    */
4707   address generate_ghash_processBlocks() {
4708     // Bafflingly, GCM uses little-endian for the byte order, but
4709     // big-endian for the bit order.  For example, the polynomial 1 is
4710     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4711     //
4712     // So, we must either reverse the bytes in each word and do
4713     // everything big-endian or reverse the bits in each byte and do
4714     // it little-endian.  On AArch64 it's more idiomatic to reverse
4715     // the bits in each byte (we have an instruction, RBIT, to do
4716     // that) and keep the data in little-endian bit order throughout the
4717     // calculation, bit-reversing the inputs and outputs.
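         //
         // For reference, each iteration of the loop below computes the standard
         // GHASH update in GF(2^128):
         //
         //   state = (state ^ data_block) * H   mod   (x^128 + x^7 + x^2 + x + 1)
         //
         // The constant 0x87 emitted just below encodes the low-order terms
         // x^7 + x^2 + x + 1 of that reduction polynomial.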
4718 
4719     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4720     __ align(wordSize * 2);
4721     address p = __ pc();
4722     __ emit_int64(0x87);  // The low-order bits of the field
4723                           // polynomial (i.e. p = z^7+z^2+z+1)
4724                           // repeated in the low and high parts of a
4725                           // 128-bit vector
4726     __ emit_int64(0x87);
4727 
4728     __ align(CodeEntryAlignment);
4729     address start = __ pc();
4730 
4731     Register state   = c_rarg0;
4732     Register subkeyH = c_rarg1;
4733     Register data    = c_rarg2;
4734     Register blocks  = c_rarg3;
4735 
4736     FloatRegister vzr = v30;
4737     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4738 
4739     __ ldrq(v0, Address(state));
4740     __ ldrq(v1, Address(subkeyH));
4741 
4742     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4743     __ rbit(v0, __ T16B, v0);
4744     __ rev64(v1, __ T16B, v1);
4745     __ rbit(v1, __ T16B, v1);
4746 
4747     __ ldrq(v26, p);
4748 
4749     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4750     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
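         // ghash_multiply below uses the carry-less Karatsuba identity, with A and
         // B split into 64-bit halves A1:A0 and B1:B0 and every addition an XOR:
         //
         //   A * B = A1*B1 * x^128
         //         ^ ((A1^A0)*(B1^B0) ^ A1*B1 ^ A0*B0) * x^64
         //         ^ A0*B0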
4751 
4752     {
4753       Label L_ghash_loop;
4754       __ bind(L_ghash_loop);
4755 
4756       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4757                                                  // reversing each byte
4758       __ rbit(v2, __ T16B, v2);
4759       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4760 
4761       // Multiply state in v2 by subkey in v1
4762       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4763                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4764                      /*temps*/v6, v20, v18, v21);
4765       // Reduce v7:v5 by the field polynomial
4766       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4767 
4768       __ sub(blocks, blocks, 1);
4769       __ cbnz(blocks, L_ghash_loop);
4770     }
4771 
4772     // The bit-reversed result is at this point in v0
4773     __ rev64(v1, __ T16B, v0);
4774     __ rbit(v1, __ T16B, v1);
4775 
4776     __ st1(v1, __ T16B, state);
4777     __ ret(lr);
4778 
4779     return start;
4780   }
4781 
4782   // Continuation point for throwing of implicit exceptions that are
4783   // not handled in the current activation. Fabricates an exception
4784   // oop and initiates normal exception dispatching in this
4785   // frame. Since we need to preserve callee-saved values (currently
4786   // only for C2, but done for C1 as well) we need a callee-saved oop
4787   // map and therefore have to make these stubs into RuntimeStubs
4788   // rather than BufferBlobs.  If the compiler needs all registers to
4789   // be preserved between the fault point and the exception handler
4790   // then it must assume responsibility for that in
4791   // AbstractCompiler::continuation_for_implicit_null_exception or
4792   // continuation_for_implicit_division_by_zero_exception. All other
4793   // implicit exceptions (e.g., NullPointerException or
4794   // AbstractMethodError on entry) are either at call sites or
4795   // otherwise assume that stack unwinding will be initiated, so
4796   // caller saved registers were assumed volatile in the compiler.
4797 
4798 #undef __
4799 #define __ masm->
4800 
4801   address generate_throw_exception(const char* name,
4802                                    address runtime_entry,
4803                                    Register arg1 = noreg,
4804                                    Register arg2 = noreg) {
4805     // Information about frame layout at time of blocking runtime call.
4806     // Note that we only have to preserve callee-saved registers since
4807     // the compilers are responsible for supplying a continuation point
4808     // if they expect all registers to be preserved.
4809     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4810     enum layout {
4811       rfp_off = 0,
4812       rfp_off2,
4813       return_off,
4814       return_off2,
4815       framesize // inclusive of return address
4816     };
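         // (Slots are 32-bit: the frame holds only the {rfp, lr} pair pushed by
         // enter() below, so framesize is 4 slots, i.e. 2 words.)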
4817 
4818     int insts_size = 512;
4819     int locs_size  = 64;
4820 
4821     CodeBuffer code(name, insts_size, locs_size);
4822     OopMapSet* oop_maps  = new OopMapSet();
4823     MacroAssembler* masm = new MacroAssembler(&code);
4824 
4825     address start = __ pc();
4826 
4827     // This is an inlined and slightly modified version of call_VM
4828     // which has the ability to fetch the return PC out of
4829     // thread-local storage and also sets up last_Java_sp slightly
4830     // differently than the real call_VM
4831 
4832     __ enter(); // Save FP and LR before call
4833 
4834     assert(is_even(framesize/2), "sp not 16-byte aligned");
4835 
4836     // lr and fp are already in place
4837     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4838 
4839     int frame_complete = __ pc() - start;
4840 
4841     // Set up last_Java_sp and last_Java_fp
4842     address the_pc = __ pc();
4843     __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4844 
4845     // Call runtime
4846     if (arg1 != noreg) {
4847       assert(arg2 != c_rarg1, "clobbered");
4848       __ mov(c_rarg1, arg1);
4849     }
4850     if (arg2 != noreg) {
4851       __ mov(c_rarg2, arg2);
4852     }
4853     __ mov(c_rarg0, rthread);
4854     BLOCK_COMMENT("call runtime_entry");
4855     __ mov(rscratch1, runtime_entry);
4856     __ blr(rscratch1);
4857 
4858     // Generate oop map
4859     OopMap* map = new OopMap(framesize, 0);
4860 
4861     oop_maps->add_gc_map(the_pc - start, map);
4862 
4863     __ reset_last_Java_frame(true);
4864     __ maybe_isb();
4865 
4866     __ leave();
4867 
4868     // check for pending exceptions
4869 #ifdef ASSERT
4870     Label L;
4871     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4872     __ cbnz(rscratch1, L);
4873     __ should_not_reach_here();
4874     __ bind(L);
4875 #endif // ASSERT
4876     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4877 
4878 
4879     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4880     RuntimeStub* stub =
4881       RuntimeStub::new_runtime_stub(name,
4882                                     &code,
4883                                     frame_complete,
4884                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4885                                     oop_maps, false);
4886     return stub->entry_point();
4887   }
4888 
4889   class MontgomeryMultiplyGenerator : public MacroAssembler {
4890 
4891     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4892       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4893 
4894     RegSet _toSave;
4895     bool _squaring;
4896 
4897   public:
4898     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4899       : MacroAssembler(as->code()), _squaring(squaring) {
4900 
4901       // Register allocation
4902 
4903       Register reg = c_rarg0;
4904       Pa_base = reg;       // Argument registers
4905       if (squaring)
4906         Pb_base = Pa_base;
4907       else
4908         Pb_base = ++reg;
4909       Pn_base = ++reg;
4910       Rlen = ++reg;
4911       inv = ++reg;
4912       Pm_base = ++reg;
4913 
4914                           // Working registers:
4915       Ra =  ++reg;        // The current digit of a, b, n, and m.
4916       Rb =  ++reg;
4917       Rm =  ++reg;
4918       Rn =  ++reg;
4919 
4920       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4921       Pb =  ++reg;
4922       Pm =  ++reg;
4923       Pn =  ++reg;
4924 
4925       t0 =  ++reg;        // Three registers which form a
4926       t1 =  ++reg;        // triple-precision accumulator.
4927       t2 =  ++reg;
4928 
4929       Ri =  ++reg;        // Inner and outer loop indexes.
4930       Rj =  ++reg;
4931 
4932       Rhi_ab = ++reg;     // Product registers: low and high parts
4933       Rlo_ab = ++reg;     // of a*b and m*n.
4934       Rhi_mn = ++reg;
4935       Rlo_mn = ++reg;
4936 
4937       // r19 and up are callee-saved.
4938       _toSave = RegSet::range(r19, reg) + Pm_base;
4939     }
4940 
4941   private:
4942     void save_regs() {
4943       push(_toSave, sp);
4944     }
4945 
4946     void restore_regs() {
4947       pop(_toSave, sp);
4948     }
4949 
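         // Executes `block` exactly `count` times, two calls per unrolled
         // iteration; the tbnz/cbz prologue handles odd counts and a zero count.
         // The same shape is used by both overloads below.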
4950     template <typename T>
4951     void unroll_2(Register count, T block) {
4952       Label loop, end, odd;
4953       tbnz(count, 0, odd);
4954       cbz(count, end);
4955       align(16);
4956       bind(loop);
4957       (this->*block)();
4958       bind(odd);
4959       (this->*block)();
4960       subs(count, count, 2);
4961       br(Assembler::GT, loop);
4962       bind(end);
4963     }
4964 
4965     template <typename T>
4966     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4967       Label loop, end, odd;
4968       tbnz(count, 0, odd);
4969       cbz(count, end);
4970       align(16);
4971       bind(loop);
4972       (this->*block)(d, s, tmp);
4973       bind(odd);
4974       (this->*block)(d, s, tmp);
4975       subs(count, count, 2);
4976       br(Assembler::GT, loop);
4977       bind(end);
4978     }
4979 
4980     void pre1(RegisterOrConstant i) {
4981       block_comment("pre1");
4982       // Pa = Pa_base;
4983       // Pb = Pb_base + i;
4984       // Pm = Pm_base;
4985       // Pn = Pn_base + i;
4986       // Ra = *Pa;
4987       // Rb = *Pb;
4988       // Rm = *Pm;
4989       // Rn = *Pn;
4990       ldr(Ra, Address(Pa_base));
4991       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4992       ldr(Rm, Address(Pm_base));
4993       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4994       lea(Pa, Address(Pa_base));
4995       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4996       lea(Pm, Address(Pm_base));
4997       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4998 
4999       // Zero the m*n result.
5000       mov(Rhi_mn, zr);
5001       mov(Rlo_mn, zr);
5002     }
5003 
5004     // The core multiply-accumulate step of a Montgomery
5005     // multiplication.  The idea is to schedule operations as a
5006     // pipeline so that instructions with long latencies (loads and
5007     // multiplies) have time to complete before their results are
5008     // used.  This most benefits in-order implementations of the
5009     // architecture but out-of-order ones also benefit.
5010     void step() {
5011       block_comment("step");
5012       // MACC(Ra, Rb, t0, t1, t2);
5013       // Ra = *++Pa;
5014       // Rb = *--Pb;
5015       umulh(Rhi_ab, Ra, Rb);
5016       mul(Rlo_ab, Ra, Rb);
5017       ldr(Ra, pre(Pa, wordSize));
5018       ldr(Rb, pre(Pb, -wordSize));
5019       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5020                                        // previous iteration.
5021       // MACC(Rm, Rn, t0, t1, t2);
5022       // Rm = *++Pm;
5023       // Rn = *--Pn;
5024       umulh(Rhi_mn, Rm, Rn);
5025       mul(Rlo_mn, Rm, Rn);
5026       ldr(Rm, pre(Pm, wordSize));
5027       ldr(Rn, pre(Pn, -wordSize));
5028       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5029     }
5030 
5031     void post1() {
5032       block_comment("post1");
5033 
5034       // MACC(Ra, Rb, t0, t1, t2);
5035       // Ra = *++Pa;
5036       // Rb = *--Pb;
5037       umulh(Rhi_ab, Ra, Rb);
5038       mul(Rlo_ab, Ra, Rb);
5039       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5040       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5041 
5042       // *Pm = Rm = t0 * inv;
5043       mul(Rm, t0, inv);
5044       str(Rm, Address(Pm));
5045 
5046       // MACC(Rm, Rn, t0, t1, t2);
5047       // t0 = t1; t1 = t2; t2 = 0;
5048       umulh(Rhi_mn, Rm, Rn);
5049 
5050 #ifndef PRODUCT
5051       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5052       {
5053         mul(Rlo_mn, Rm, Rn);
5054         add(Rlo_mn, t0, Rlo_mn);
5055         Label ok;
5056         cbz(Rlo_mn, ok); {
5057           stop("broken Montgomery multiply");
5058         } bind(ok);
5059       }
5060 #endif
5061       // We have very carefully set things up so that
5062       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5063       // the lower half of Rm * Rn because we know the result already:
5064       // it must be -t0.  t0 + (-t0) must generate a carry iff
5065       // t0 != 0.  So, rather than do a mul and an adds we just set
5066       // the carry flag iff t0 is nonzero.
5067       //
5068       // mul(Rlo_mn, Rm, Rn);
5069       // adds(zr, t0, Rlo_mn);
5070       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5071       adcs(t0, t1, Rhi_mn);
5072       adc(t1, t2, zr);
5073       mov(t2, zr);
5074     }
5075 
5076     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5077       block_comment("pre2");
5078       // Pa = Pa_base + i-len;
5079       // Pb = Pb_base + len;
5080       // Pm = Pm_base + i-len;
5081       // Pn = Pn_base + len;
5082 
5083       if (i.is_register()) {
5084         sub(Rj, i.as_register(), len);
5085       } else {
5086         mov(Rj, i.as_constant());
5087         sub(Rj, Rj, len);
5088       }
5089       // Rj == i-len
5090 
5091       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5092       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5093       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5094       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5095 
5096       // Ra = *++Pa;
5097       // Rb = *--Pb;
5098       // Rm = *++Pm;
5099       // Rn = *--Pn;
5100       ldr(Ra, pre(Pa, wordSize));
5101       ldr(Rb, pre(Pb, -wordSize));
5102       ldr(Rm, pre(Pm, wordSize));
5103       ldr(Rn, pre(Pn, -wordSize));
5104 
5105       mov(Rhi_mn, zr);
5106       mov(Rlo_mn, zr);
5107     }
5108 
5109     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5110       block_comment("post2");
5111       if (i.is_constant()) {
5112         mov(Rj, i.as_constant()-len.as_constant());
5113       } else {
5114         sub(Rj, i.as_register(), len);
5115       }
5116 
5117       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5118 
5119       // As soon as we know the least significant digit of our result,
5120       // store it.
5121       // Pm_base[i-len] = t0;
5122       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5123 
5124       // t0 = t1; t1 = t2; t2 = 0;
5125       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5126       adc(t1, t2, zr);
5127       mov(t2, zr);
5128     }
5129 
5130     // A carry in t0 after Montgomery multiplication means that we
5131     // should subtract multiples of n from our result in m.  We'll
5132     // keep doing that until there is no carry.
5133     void normalize(RegisterOrConstant len) {
5134       block_comment("normalize");
5135       // while (t0)
5136       //   t0 = sub(Pm_base, Pn_base, t0, len);
5137       Label loop, post, again;
5138       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5139       cbz(t0, post); {
5140         bind(again); {
5141           mov(i, zr);
5142           mov(cnt, len);
5143           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5144           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5145           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5146           align(16);
5147           bind(loop); {
5148             sbcs(Rm, Rm, Rn);
5149             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5150             add(i, i, 1);
5151             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5152             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5153             sub(cnt, cnt, 1);
5154           } cbnz(cnt, loop);
5155           sbc(t0, t0, zr);
5156         } cbnz(t0, again);
5157       } bind(post);
5158     }
5159 
5160     // Move memory at s to d, reversing words.
5161     //    Increments d to end of copied memory
5162     //    Destroys tmp1, tmp2
5163     //    Preserves len
5164     //    Leaves s pointing to the address which was in d at start
5165     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5166       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5167 
5168       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5169       mov(tmp1, len);
5170       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5171       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5172     }
5173     // where
5174     void reverse1(Register d, Register s, Register tmp) {
5175       ldr(tmp, pre(s, -wordSize));
5176       ror(tmp, tmp, 32);
5177       str(tmp, post(d, wordSize));
5178     }
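         // In C, approximately (a sketch only; s and d viewed as arrays of 64-bit
         // words, len in longwords, ror64(x, 32) swapping the two 32-bit halves):
         //
         //   for (int i = 0; i < len; i++)
         //     d[i] = ror64(s[len - 1 - i], 32);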
5179 
5180     void step_squaring() {
5181       // An extra ACC
5182       step();
5183       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5184     }
5185 
5186     void last_squaring(RegisterOrConstant i) {
5187       Label dont;
5188       // if ((i & 1) == 0) {
5189       tbnz(i.as_register(), 0, dont); {
5190         // MACC(Ra, Rb, t0, t1, t2);
5191         // Ra = *++Pa;
5192         // Rb = *--Pb;
5193         umulh(Rhi_ab, Ra, Rb);
5194         mul(Rlo_ab, Ra, Rb);
5195         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5196       } bind(dont);
5197     }
5198 
5199     void extra_step_squaring() {
5200       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5201 
5202       // MACC(Rm, Rn, t0, t1, t2);
5203       // Rm = *++Pm;
5204       // Rn = *--Pn;
5205       umulh(Rhi_mn, Rm, Rn);
5206       mul(Rlo_mn, Rm, Rn);
5207       ldr(Rm, pre(Pm, wordSize));
5208       ldr(Rn, pre(Pn, -wordSize));
5209     }
5210 
5211     void post1_squaring() {
5212       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5213 
5214       // *Pm = Rm = t0 * inv;
5215       mul(Rm, t0, inv);
5216       str(Rm, Address(Pm));
5217 
5218       // MACC(Rm, Rn, t0, t1, t2);
5219       // t0 = t1; t1 = t2; t2 = 0;
5220       umulh(Rhi_mn, Rm, Rn);
5221 
5222 #ifndef PRODUCT
5223       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5224       {
5225         mul(Rlo_mn, Rm, Rn);
5226         add(Rlo_mn, t0, Rlo_mn);
5227         Label ok;
5228         cbz(Rlo_mn, ok); {
5229           stop("broken Montgomery multiply");
5230         } bind(ok);
5231       }
5232 #endif
5233       // We have very carefully set things up so that
5234       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5235       // the lower half of Rm * Rn because we know the result already:
5236       // it must be -t0.  t0 + (-t0) must generate a carry iff
5237       // t0 != 0.  So, rather than do a mul and an adds we just set
5238       // the carry flag iff t0 is nonzero.
5239       //
5240       // mul(Rlo_mn, Rm, Rn);
5241       // adds(zr, t0, Rlo_mn);
5242       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5243       adcs(t0, t1, Rhi_mn);
5244       adc(t1, t2, zr);
5245       mov(t2, zr);
5246     }
5247 
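         // Add the 128-bit value Rhi:Rlo into the triple-precision accumulator
         // t2:t1:t0.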
5248     void acc(Register Rhi, Register Rlo,
5249              Register t0, Register t1, Register t2) {
5250       adds(t0, t0, Rlo);
5251       adcs(t1, t1, Rhi);
5252       adc(t2, t2, zr);
5253     }
5254 
5255   public:
5256     /**
5257      * Fast Montgomery multiplication.  The derivation of the
5258      * algorithm is in A Cryptographic Library for the Motorola
5259      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5260      *
5261      * Arguments:
5262      *
5263      * Inputs for multiplication:
5264      *   c_rarg0   - int array elements a
5265      *   c_rarg1   - int array elements b
5266      *   c_rarg2   - int array elements n (the modulus)
5267      *   c_rarg3   - int length
5268      *   c_rarg4   - int inv
5269      *   c_rarg5   - int array elements m (the result)
5270      *
5271      * Inputs for squaring:
5272      *   c_rarg0   - int array elements a
5273      *   c_rarg1   - int array elements n (the modulus)
5274      *   c_rarg2   - int length
5275      *   c_rarg3   - int inv
5276      *   c_rarg4   - int array elements m (the result)
5277      *
5278      */
5279     address generate_multiply() {
5280       Label argh, nothing;
5281       bind(argh);
5282       stop("MontgomeryMultiply total_allocation must be <= 8192");
5283 
5284       align(CodeEntryAlignment);
5285       address entry = pc();
5286 
5287       cbzw(Rlen, nothing);
5288 
5289       enter();
5290 
5291       // Make room.
5292       cmpw(Rlen, 512);
5293       br(Assembler::HI, argh);
5294       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5295       andr(sp, Ra, -2 * wordSize);
5296 
5297       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5298 
5299       {
5300         // Copy input args, reversing as we go.  We use Ra as a
5301         // temporary variable.
5302         reverse(Ra, Pa_base, Rlen, t0, t1);
5303         if (!_squaring)
5304           reverse(Ra, Pb_base, Rlen, t0, t1);
5305         reverse(Ra, Pn_base, Rlen, t0, t1);
5306       }
5307 
5308       // Push all the callee-saved registers and also Pm_base, which we'll need
5309       // at the end.
5310       save_regs();
5311 
5312 #ifndef PRODUCT
5313       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5314       {
5315         ldr(Rn, Address(Pn_base, 0));
5316         mul(Rlo_mn, Rn, inv);
5317         subs(zr, Rlo_mn, -1);
5318         Label ok;
5319         br(EQ, ok); {
5320           stop("broken inverse in Montgomery multiply");
5321         } bind(ok);
5322       }
5323 #endif
5324 
5325       mov(Pm_base, Ra);
5326 
5327       mov(t0, zr);
5328       mov(t1, zr);
5329       mov(t2, zr);
5330 
5331       block_comment("for (int i = 0; i < len; i++) {");
5332       mov(Ri, zr); {
5333         Label loop, end;
5334         cmpw(Ri, Rlen);
5335         br(Assembler::GE, end);
5336 
5337         bind(loop);
5338         pre1(Ri);
5339 
5340         block_comment("  for (j = i; j; j--) {"); {
5341           movw(Rj, Ri);
5342           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5343         } block_comment("  } // j");
5344 
5345         post1();
5346         addw(Ri, Ri, 1);
5347         cmpw(Ri, Rlen);
5348         br(Assembler::LT, loop);
5349         bind(end);
5350         block_comment("} // i");
5351       }
5352 
5353       block_comment("for (int i = len; i < 2*len; i++) {");
5354       mov(Ri, Rlen); {
5355         Label loop, end;
5356         cmpw(Ri, Rlen, Assembler::LSL, 1);
5357         br(Assembler::GE, end);
5358 
5359         bind(loop);
5360         pre2(Ri, Rlen);
5361 
5362         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5363           lslw(Rj, Rlen, 1);
5364           subw(Rj, Rj, Ri);
5365           subw(Rj, Rj, 1);
5366           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5367         } block_comment("  } // j");
5368 
5369         post2(Ri, Rlen);
5370         addw(Ri, Ri, 1);
5371         cmpw(Ri, Rlen, Assembler::LSL, 1);
5372         br(Assembler::LT, loop);
5373         bind(end);
5374       }
5375       block_comment("} // i");
5376 
5377       normalize(Rlen);
5378 
5379       mov(Ra, Pm_base);  // Save Pm_base in Ra
5380       restore_regs();  // Restore caller's Pm_base
5381 
5382       // Copy our result into caller's Pm_base
5383       reverse(Pm_base, Ra, Rlen, t0, t1);
5384 
5385       leave();
5386       bind(nothing);
5387       ret(lr);
5388 
5389       return entry;
5390     }
5391     // In C, approximately:
5392 
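         // (In these comments MACC(A, B, t0, t1, t2) denotes a multiply-accumulate
         // of the full 128-bit product into the triple-precision accumulator,
         // approximately:
         //     unsigned __int128 p = (unsigned __int128)A * B;
         //     t2:t1:t0 += p;    // with carry propagation through t1 into t2
         // MACC2, used in the squaring code, adds 2*A*B in the same way.)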
5393     // void
5394     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5395     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5396     //                     unsigned long inv, int len) {
5397     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5398     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5399     //   unsigned long Ra, Rb, Rn, Rm;
5400 
5401     //   int i;
5402 
5403     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5404 
5405     //   for (i = 0; i < len; i++) {
5406     //     int j;
5407 
5408     //     Pa = Pa_base;
5409     //     Pb = Pb_base + i;
5410     //     Pm = Pm_base;
5411     //     Pn = Pn_base + i;
5412 
5413     //     Ra = *Pa;
5414     //     Rb = *Pb;
5415     //     Rm = *Pm;
5416     //     Rn = *Pn;
5417 
5418     //     int iters = i;
5419     //     for (j = 0; iters--; j++) {
5420     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5421     //       MACC(Ra, Rb, t0, t1, t2);
5422     //       Ra = *++Pa;
5423     //       Rb = *--Pb;
5424     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5425     //       MACC(Rm, Rn, t0, t1, t2);
5426     //       Rm = *++Pm;
5427     //       Rn = *--Pn;
5428     //     }
5429 
5430     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5431     //     MACC(Ra, Rb, t0, t1, t2);
5432     //     *Pm = Rm = t0 * inv;
5433     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5434     //     MACC(Rm, Rn, t0, t1, t2);
5435 
5436     //     assert(t0 == 0, "broken Montgomery multiply");
5437 
5438     //     t0 = t1; t1 = t2; t2 = 0;
5439     //   }
5440 
5441     //   for (i = len; i < 2*len; i++) {
5442     //     int j;
5443 
5444     //     Pa = Pa_base + i-len;
5445     //     Pb = Pb_base + len;
5446     //     Pm = Pm_base + i-len;
5447     //     Pn = Pn_base + len;
5448 
5449     //     Ra = *++Pa;
5450     //     Rb = *--Pb;
5451     //     Rm = *++Pm;
5452     //     Rn = *--Pn;
5453 
5454     //     int iters = len*2-i-1;
5455     //     for (j = i-len+1; iters--; j++) {
5456     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5457     //       MACC(Ra, Rb, t0, t1, t2);
5458     //       Ra = *++Pa;
5459     //       Rb = *--Pb;
5460     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5461     //       MACC(Rm, Rn, t0, t1, t2);
5462     //       Rm = *++Pm;
5463     //       Rn = *--Pn;
5464     //     }
5465 
5466     //     Pm_base[i-len] = t0;
5467     //     t0 = t1; t1 = t2; t2 = 0;
5468     //   }
5469 
5470     //   while (t0)
5471     //     t0 = sub(Pm_base, Pn_base, t0, len);
5472     // }
5473 
5474     /**
5475      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5476      * multiplies than Montgomery multiplication so it should be up to
5477      * 25% faster.  However, its loop control is more complex and it
5478      * may actually run slower on some machines.
5479      *
5480      * Arguments:
5481      *
5482      * Inputs:
5483      *   c_rarg0   - int array elements a
5484      *   c_rarg1   - int array elements n (the modulus)
5485      *   c_rarg2   - int length
5486      *   c_rarg3   - int inv
5487      *   c_rarg4   - int array elements m (the result)
5488      *
5489      */
5490     address generate_square() {
5491       Label argh;
5492       bind(argh);
5493       stop("MontgomeryMultiply total_allocation must be <= 8192");
5494 
5495       align(CodeEntryAlignment);
5496       address entry = pc();
5497 
5498       enter();
5499 
5500       // Make room.
5501       cmpw(Rlen, 512);
5502       br(Assembler::HI, argh);
5503       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5504       andr(sp, Ra, -2 * wordSize);
5505 
5506       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5507 
5508       {
5509         // Copy input args, reversing as we go.  We use Ra as a
5510         // temporary variable.
5511         reverse(Ra, Pa_base, Rlen, t0, t1);
5512         reverse(Ra, Pn_base, Rlen, t0, t1);
5513       }
5514 
5515       // Push all the callee-saved registers and also Pm_base, which we'll need
5516       // at the end.
5517       save_regs();
5518 
5519       mov(Pm_base, Ra);
5520 
5521       mov(t0, zr);
5522       mov(t1, zr);
5523       mov(t2, zr);
5524 
5525       block_comment("for (int i = 0; i < len; i++) {");
5526       mov(Ri, zr); {
5527         Label loop, end;
5528         bind(loop);
5529         cmp(Ri, Rlen);
5530         br(Assembler::GE, end);
5531 
5532         pre1(Ri);
5533 
5534         block_comment("for (j = (i+1)/2; j; j--) {"); {
5535           add(Rj, Ri, 1);
5536           lsr(Rj, Rj, 1);
5537           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5538         } block_comment("  } // j");
5539 
5540         last_squaring(Ri);
5541 
5542         block_comment("  for (j = i/2; j; j--) {"); {
5543           lsr(Rj, Ri, 1);
5544           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5545         } block_comment("  } // j");
5546 
5547         post1_squaring();
5548         add(Ri, Ri, 1);
5549         cmp(Ri, Rlen);
5550         br(Assembler::LT, loop);
5551 
5552         bind(end);
5553         block_comment("} // i");
5554       }
5555 
5556       block_comment("for (int i = len; i < 2*len; i++) {");
5557       mov(Ri, Rlen); {
5558         Label loop, end;
5559         bind(loop);
5560         cmp(Ri, Rlen, Assembler::LSL, 1);
5561         br(Assembler::GE, end);
5562 
5563         pre2(Ri, Rlen);
5564 
5565         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5566           lsl(Rj, Rlen, 1);
5567           sub(Rj, Rj, Ri);
5568           sub(Rj, Rj, 1);
5569           lsr(Rj, Rj, 1);
5570           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5571         } block_comment("  } // j");
5572 
5573         last_squaring(Ri);
5574 
5575         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5576           lsl(Rj, Rlen, 1);
5577           sub(Rj, Rj, Ri);
5578           lsr(Rj, Rj, 1);
5579           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5580         } block_comment("  } // j");
5581 
5582         post2(Ri, Rlen);
5583         add(Ri, Ri, 1);
5584         cmp(Ri, Rlen, Assembler::LSL, 1);
5585 
5586         br(Assembler::LT, loop);
5587         bind(end);
5588         block_comment("} // i");
5589       }
5590 
5591       normalize(Rlen);
5592 
5593       mov(Ra, Pm_base);  // Save Pm_base in Ra
5594       restore_regs();  // Restore caller's Pm_base
5595 
5596       // Copy our result into caller's Pm_base
5597       reverse(Pm_base, Ra, Rlen, t0, t1);
5598 
5599       leave();
5600       ret(lr);
5601 
5602       return entry;
5603     }
5604     // In C, approximately:
5605 
5606     // void
5607     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5608     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5609     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5610     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5611     //   unsigned long Ra, Rb, Rn, Rm;
5612 
5613     //   int i;
5614 
5615     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5616 
5617     //   for (i = 0; i < len; i++) {
5618     //     int j;
5619 
5620     //     Pa = Pa_base;
5621     //     Pb = Pa_base + i;
5622     //     Pm = Pm_base;
5623     //     Pn = Pn_base + i;
5624 
5625     //     Ra = *Pa;
5626     //     Rb = *Pb;
5627     //     Rm = *Pm;
5628     //     Rn = *Pn;
5629 
5630     //     int iters = (i+1)/2;
5631     //     for (j = 0; iters--; j++) {
5632     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5633     //       MACC2(Ra, Rb, t0, t1, t2);
5634     //       Ra = *++Pa;
5635     //       Rb = *--Pb;
5636     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5637     //       MACC(Rm, Rn, t0, t1, t2);
5638     //       Rm = *++Pm;
5639     //       Rn = *--Pn;
5640     //     }
5641     //     if ((i & 1) == 0) {
5642     //       assert(Ra == Pa_base[j], "must be");
5643     //       MACC(Ra, Ra, t0, t1, t2);
5644     //     }
5645     //     iters = i/2;
5646     //     assert(iters == i-j, "must be");
5647     //     for (; iters--; j++) {
5648     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5649     //       MACC(Rm, Rn, t0, t1, t2);
5650     //       Rm = *++Pm;
5651     //       Rn = *--Pn;
5652     //     }
5653 
5654     //     *Pm = Rm = t0 * inv;
5655     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5656     //     MACC(Rm, Rn, t0, t1, t2);
5657 
5658     //     assert(t0 == 0, "broken Montgomery multiply");
5659 
5660     //     t0 = t1; t1 = t2; t2 = 0;
5661     //   }
5662 
5663     //   for (i = len; i < 2*len; i++) {
5664     //     int start = i-len+1;
5665     //     int end = start + (len - start)/2;
5666     //     int j;
5667 
5668     //     Pa = Pa_base + i-len;
5669     //     Pb = Pa_base + len;
5670     //     Pm = Pm_base + i-len;
5671     //     Pn = Pn_base + len;
5672 
5673     //     Ra = *++Pa;
5674     //     Rb = *--Pb;
5675     //     Rm = *++Pm;
5676     //     Rn = *--Pn;
5677 
5678     //     int iters = (2*len-i-1)/2;
5679     //     assert(iters == end-start, "must be");
5680     //     for (j = start; iters--; j++) {
5681     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5682     //       MACC2(Ra, Rb, t0, t1, t2);
5683     //       Ra = *++Pa;
5684     //       Rb = *--Pb;
5685     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5686     //       MACC(Rm, Rn, t0, t1, t2);
5687     //       Rm = *++Pm;
5688     //       Rn = *--Pn;
5689     //     }
5690     //     if ((i & 1) == 0) {
5691     //       assert(Ra == Pa_base[j], "must be");
5692     //       MACC(Ra, Ra, t0, t1, t2);
5693     //     }
5694     //     iters =  (2*len-i)/2;
5695     //     assert(iters == len-j, "must be");
5696     //     for (; iters--; j++) {
5697     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5698     //       MACC(Rm, Rn, t0, t1, t2);
5699     //       Rm = *++Pm;
5700     //       Rn = *--Pn;
5701     //     }
5702     //     Pm_base[i-len] = t0;
5703     //     t0 = t1; t1 = t2; t2 = 0;
5704     //   }
5705 
5706     //   while (t0)
5707     //     t0 = sub(Pm_base, Pn_base, t0, len);
5708     // }
5709   };
5710 
5711 
5712   // Initialization
5713   void generate_initial() {
5714     // Generate the initial stubs and initialize the entry points.
5715 
5716     // Entry points that exist on all platforms. Note: this is code
5717     // that could be shared among different platforms - however the
5718     // benefit seems to be smaller than the disadvantage of having a
5719     // much more complicated generator structure. See also comment in
5720     // stubRoutines.hpp.
5721 
5722     StubRoutines::_forward_exception_entry = generate_forward_exception();
5723 
5724     StubRoutines::_call_stub_entry =
5725       generate_call_stub(StubRoutines::_call_stub_return_address);
5726 
5727     // is referenced by megamorphic call
5728     StubRoutines::_catch_exception_entry = generate_catch_exception();
5729 
5730     // Build this early so it's available for the interpreter.
5731     StubRoutines::_throw_StackOverflowError_entry =
5732       generate_throw_exception("StackOverflowError throw_exception",
5733                                CAST_FROM_FN_PTR(address,
5734                                                 SharedRuntime::throw_StackOverflowError));
5735     StubRoutines::_throw_delayed_StackOverflowError_entry =
5736       generate_throw_exception("delayed StackOverflowError throw_exception",
5737                                CAST_FROM_FN_PTR(address,
5738                                                 SharedRuntime::throw_delayed_StackOverflowError));
5739     if (UseCRC32Intrinsics) {
5740       // set the table address before generating the stub that uses it
5741       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5742       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5743     }
5744 
5745     if (UseCRC32CIntrinsics) {
5746       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5747     }
5748 
5749     // Disabled until JDK-8210858 is fixed
5750     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5751     //   StubRoutines::_dlog = generate_dlog();
5752     // }
5753 
5754     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5755       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5756     }
5757 
5758     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5759       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5760     }
5761 
5762     // Safefetch stubs.
5763     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5764                                                        &StubRoutines::_safefetch32_fault_pc,
5765                                                        &StubRoutines::_safefetch32_continuation_pc);
5766     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5767                                                        &StubRoutines::_safefetchN_fault_pc,
5768                                                        &StubRoutines::_safefetchN_continuation_pc);
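
         // A usage sketch (the SafeFetch32/SafeFetchN wrappers declared in
         // stubRoutines.hpp route through these entries); a typical call is
         //   int v = SafeFetch32(addr, -1);
         // which reads from a possibly unmapped addr: if the load faults, the
         // signal handler resumes at the continuation pc recorded above and
         // the stub returns the supplied default (-1 here) instead of crashing.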
5769   }
5770 
5771   void generate_all() {
5772     // support for verify_oop (must happen after universe_init)
5773     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5774     StubRoutines::_throw_AbstractMethodError_entry =
5775       generate_throw_exception("AbstractMethodError throw_exception",
5776                                CAST_FROM_FN_PTR(address,
5777                                                 SharedRuntime::
5778                                                 throw_AbstractMethodError));
5779 
5780     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5781       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5782                                CAST_FROM_FN_PTR(address,
5783                                                 SharedRuntime::
5784                                                 throw_IncompatibleClassChangeError));
5785 
5786     StubRoutines::_throw_NullPointerException_at_call_entry =
5787       generate_throw_exception("NullPointerException at call throw_exception",
5788                                CAST_FROM_FN_PTR(address,
5789                                                 SharedRuntime::
5790                                                 throw_NullPointerException_at_call));
5791 
5792     StubRoutines::aarch64::_vector_iota_indices    = generate_iota_indices("iota_indices");
5793 
5794     // arraycopy stubs used by compilers
5795     generate_arraycopy_stubs();
5796 
5797     // has negatives stub for large arrays.
5798     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5799 
5800     // array equals stub for large arrays.
5801     if (!UseSimpleArrayEquals) {
5802       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5803     }
5804 
5805     generate_compare_long_strings();
5806 
5807     generate_string_indexof_stubs();
5808 
5809     // byte_array_inflate stub for large arrays.
5810     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5811 
5812     BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
5813     if (bs_nm != NULL) {
5814       StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
5815     }
5816 #ifdef COMPILER2
5817     if (UseMultiplyToLenIntrinsic) {
5818       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5819     }
5820 
5821     if (UseSquareToLenIntrinsic) {
5822       StubRoutines::_squareToLen = generate_squareToLen();
5823     }
5824 
5825     if (UseMulAddIntrinsic) {
5826       StubRoutines::_mulAdd = generate_mulAdd();
5827     }
5828 
5829     if (UseMontgomeryMultiplyIntrinsic) {
5830       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5831       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5832       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5833     }
5834 
5835     if (UseMontgomerySquareIntrinsic) {
5836       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5837       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5838       // We use generate_multiply() rather than generate_square()
5839       // because it's faster for the sizes of modulus we care about.
5840       StubRoutines::_montgomerySquare = g.generate_multiply();
5841     }
5842 #endif // COMPILER2
5843 
5844     // generate GHASH intrinsics code
5845     if (UseGHASHIntrinsics) {
5846       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5847     }
5848 
5849     // data cache line writeback
5850     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5851     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5852 
5853     if (UseAESIntrinsics) {
5854       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5855       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5856       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5857       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5858     }
5859 
5860     if (UseSHA1Intrinsics) {
5861       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5862       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5863     }
5864     if (UseSHA256Intrinsics) {
5865       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5866       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5867     }
5868 
5869     // generate Adler32 intrinsics code
5870     if (UseAdler32Intrinsics) {
5871       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5872     }
5873 
5874     StubRoutines::aarch64::set_completed();
5875   }
5876 
5877  public:
5878   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5879     if (all) {
5880       generate_all();
5881     } else {
5882       generate_initial();
5883     }
5884   }
5885 }; // end class StubGenerator
5886 
5887 #define UCM_TABLE_MAX_ENTRIES 8
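     // Driven from stubRoutines.cpp (see StubRoutines::initialize1/initialize2):
     // called first with all == false for the initial stubs the interpreter
     // needs, then again after universe init with all == true for the rest.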
5888 void StubGenerator_generate(CodeBuffer* code, bool all) {
5889   if (UnsafeCopyMemory::_table == NULL) {
5890     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
5891   }
5892   StubGenerator g(code, all);
5893 }