1 /* 2 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "gc/shared/barrierSet.hpp" 30 #include "gc/shared/barrierSetAssembler.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "memory/universe.hpp" 33 #include "nativeInst_aarch64.hpp" 34 #include "oops/instanceOop.hpp" 35 #include "oops/method.hpp" 36 #include "oops/objArrayKlass.hpp" 37 #include "oops/oop.inline.hpp" 38 #include "prims/methodHandles.hpp" 39 #include "runtime/frame.inline.hpp" 40 #include "runtime/handles.inline.hpp" 41 #include "runtime/sharedRuntime.hpp" 42 #include "runtime/stubCodeGenerator.hpp" 43 #include "runtime/stubRoutines.hpp" 44 #include "runtime/thread.inline.hpp" 45 #include "utilities/align.hpp" 46 #include "utilities/powerOfTwo.hpp" 47 #ifdef COMPILER2 48 #include "opto/runtime.hpp" 49 #endif 50 #if INCLUDE_ZGC 51 #include "gc/z/zThreadLocalData.hpp" 52 #endif 53 54 // Declaration and definition of StubGenerator (no .hpp file). 55 // For a more detailed description of the stub routine structure 56 // see the comment in stubRoutines.hpp 57 58 #undef __ 59 #define __ _masm-> 60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 61 62 #ifdef PRODUCT 63 #define BLOCK_COMMENT(str) /* nothing */ 64 #else 65 #define BLOCK_COMMENT(str) __ block_comment(str) 66 #endif 67 68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 69 70 // Stub Code definitions 71 72 class StubGenerator: public StubCodeGenerator { 73 private: 74 75 #ifdef PRODUCT 76 #define inc_counter_np(counter) ((void)0) 77 #else 78 void inc_counter_np_(int& counter) { 79 __ lea(rscratch2, ExternalAddress((address)&counter)); 80 __ ldrw(rscratch1, Address(rscratch2)); 81 __ addw(rscratch1, rscratch1, 1); 82 __ strw(rscratch1, Address(rscratch2)); 83 } 84 #define inc_counter_np(counter) \ 85 BLOCK_COMMENT("inc_counter " #counter); \ 86 inc_counter_np_(counter); 87 #endif 88 89 // Call stubs are used to call Java from C 90 // 91 // Arguments: 92 // c_rarg0: call wrapper address address 93 // c_rarg1: result address 94 // c_rarg2: result type BasicType 95 // c_rarg3: method Method* 96 // c_rarg4: (interpreter) entry point address 97 // c_rarg5: parameters intptr_t* 98 // c_rarg6: parameter size (in words) int 99 // c_rarg7: thread Thread* 100 // 101 // There is no return from the stub itself as any Java result 102 // is written to result 103 // 104 // we save r30 (lr) as the return PC at the base of the frame and 105 // link r29 (fp) below it as the frame pointer installing sp (r31) 106 // into fp. 107 // 108 // we save r0-r7, which accounts for all the c arguments. 109 // 110 // TODO: strictly do we need to save them all? they are treated as 111 // volatile by C so could we omit saving the ones we are going to 112 // place in global registers (thread? method?) or those we only use 113 // during setup of the Java call? 114 // 115 // we don't need to save r8 which C uses as an indirect result location 116 // return register. 117 // 118 // we don't need to save r9-r15 which both C and Java treat as 119 // volatile 120 // 121 // we don't need to save r16-18 because Java does not use them 122 // 123 // we save r19-r28 which Java uses as scratch registers and C 124 // expects to be callee-save 125 // 126 // we save the bottom 64 bits of each value stored in v8-v15; it is 127 // the responsibility of the caller to preserve larger values. 128 // 129 // so the stub frame looks like this when we enter Java code 130 // 131 // [ return_from_Java ] <--- sp 132 // [ argument word n ] 133 // ... 
134 // -27 [ argument word 1 ] 135 // -26 [ saved v15 ] <--- sp_after_call 136 // -25 [ saved v14 ] 137 // -24 [ saved v13 ] 138 // -23 [ saved v12 ] 139 // -22 [ saved v11 ] 140 // -21 [ saved v10 ] 141 // -20 [ saved v9 ] 142 // -19 [ saved v8 ] 143 // -18 [ saved r28 ] 144 // -17 [ saved r27 ] 145 // -16 [ saved r26 ] 146 // -15 [ saved r25 ] 147 // -14 [ saved r24 ] 148 // -13 [ saved r23 ] 149 // -12 [ saved r22 ] 150 // -11 [ saved r21 ] 151 // -10 [ saved r20 ] 152 // -9 [ saved r19 ] 153 // -8 [ call wrapper (r0) ] 154 // -7 [ result (r1) ] 155 // -6 [ result type (r2) ] 156 // -5 [ method (r3) ] 157 // -4 [ entry point (r4) ] 158 // -3 [ parameters (r5) ] 159 // -2 [ parameter size (r6) ] 160 // -1 [ thread (r7) ] 161 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 162 // 1 [ saved lr (r30) ] 163 164 // Call stub stack layout word offsets from fp 165 enum call_stub_layout { 166 sp_after_call_off = -26, 167 168 d15_off = -26, 169 d13_off = -24, 170 d11_off = -22, 171 d9_off = -20, 172 173 r28_off = -18, 174 r26_off = -16, 175 r24_off = -14, 176 r22_off = -12, 177 r20_off = -10, 178 call_wrapper_off = -8, 179 result_off = -7, 180 result_type_off = -6, 181 method_off = -5, 182 entry_point_off = -4, 183 parameter_size_off = -2, 184 thread_off = -1, 185 fp_f = 0, 186 retaddr_off = 1, 187 }; 188 189 address generate_call_stub(address& return_address) { 190 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 191 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 192 "adjust this code"); 193 194 StubCodeMark mark(this, "StubRoutines", "call_stub"); 195 address start = __ pc(); 196 197 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 198 199 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 200 const Address result (rfp, result_off * wordSize); 201 const Address result_type (rfp, result_type_off * wordSize); 202 const Address method (rfp, method_off * wordSize); 203 const Address entry_point (rfp, entry_point_off * wordSize); 204 const Address parameter_size(rfp, parameter_size_off * wordSize); 205 206 const Address thread (rfp, thread_off * wordSize); 207 208 const Address d15_save (rfp, d15_off * wordSize); 209 const Address d13_save (rfp, d13_off * wordSize); 210 const Address d11_save (rfp, d11_off * wordSize); 211 const Address d9_save (rfp, d9_off * wordSize); 212 213 const Address r28_save (rfp, r28_off * wordSize); 214 const Address r26_save (rfp, r26_off * wordSize); 215 const Address r24_save (rfp, r24_off * wordSize); 216 const Address r22_save (rfp, r22_off * wordSize); 217 const Address r20_save (rfp, r20_off * wordSize); 218 219 // stub code 220 221 address aarch64_entry = __ pc(); 222 223 // set up frame and move sp to end of save area 224 __ enter(); 225 __ sub(sp, rfp, -sp_after_call_off * wordSize); 226 227 // save register parameters and Java scratch/global registers 228 // n.b. 
we save thread even though it gets installed in 229 // rthread because we want to sanity check rthread later 230 __ str(c_rarg7, thread); 231 __ strw(c_rarg6, parameter_size); 232 __ stp(c_rarg4, c_rarg5, entry_point); 233 __ stp(c_rarg2, c_rarg3, result_type); 234 __ stp(c_rarg0, c_rarg1, call_wrapper); 235 236 __ stp(r20, r19, r20_save); 237 __ stp(r22, r21, r22_save); 238 __ stp(r24, r23, r24_save); 239 __ stp(r26, r25, r26_save); 240 __ stp(r28, r27, r28_save); 241 242 __ stpd(v9, v8, d9_save); 243 __ stpd(v11, v10, d11_save); 244 __ stpd(v13, v12, d13_save); 245 __ stpd(v15, v14, d15_save); 246 247 // install Java thread in global register now we have saved 248 // whatever value it held 249 __ mov(rthread, c_rarg7); 250 // And method 251 __ mov(rmethod, c_rarg3); 252 253 // set up the heapbase register 254 __ reinit_heapbase(); 255 256 #ifdef ASSERT 257 // make sure we have no pending exceptions 258 { 259 Label L; 260 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 261 __ cmp(rscratch1, (u1)NULL_WORD); 262 __ br(Assembler::EQ, L); 263 __ stop("StubRoutines::call_stub: entered with pending exception"); 264 __ BIND(L); 265 } 266 #endif 267 // pass parameters if any 268 __ mov(esp, sp); 269 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 270 __ andr(sp, rscratch1, -2 * wordSize); 271 272 BLOCK_COMMENT("pass parameters if any"); 273 Label parameters_done; 274 // parameter count is still in c_rarg6 275 // and parameter pointer identifying param 1 is in c_rarg5 276 __ cbzw(c_rarg6, parameters_done); 277 278 address loop = __ pc(); 279 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 280 __ subsw(c_rarg6, c_rarg6, 1); 281 __ push(rscratch1); 282 __ br(Assembler::GT, loop); 283 284 __ BIND(parameters_done); 285 286 // call Java entry -- passing methdoOop, and current sp 287 // rmethod: Method* 288 // r13: sender sp 289 BLOCK_COMMENT("call Java function"); 290 __ mov(r13, sp); 291 __ blr(c_rarg4); 292 293 // we do this here because the notify will already have been done 294 // if we get to the next instruction via an exception 295 // 296 // n.b. adding this instruction here affects the calculation of 297 // whether or not a routine returns to the call stub (used when 298 // doing stack walks) since the normal test is to check the return 299 // pc against the address saved below. so we may need to allow for 300 // this extra instruction in the check. 301 302 // save current address for use by exception handling code 303 304 return_address = __ pc(); 305 306 // store result depending on type (everything that is not 307 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 308 // n.b. 
this assumes Java returns an integral result in r0 309 // and a floating result in j_farg0 310 __ ldr(j_rarg2, result); 311 Label is_long, is_float, is_double, exit; 312 __ ldr(j_rarg1, result_type); 313 __ cmp(j_rarg1, (u1)T_OBJECT); 314 __ br(Assembler::EQ, is_long); 315 __ cmp(j_rarg1, (u1)T_LONG); 316 __ br(Assembler::EQ, is_long); 317 __ cmp(j_rarg1, (u1)T_FLOAT); 318 __ br(Assembler::EQ, is_float); 319 __ cmp(j_rarg1, (u1)T_DOUBLE); 320 __ br(Assembler::EQ, is_double); 321 322 // handle T_INT case 323 __ strw(r0, Address(j_rarg2)); 324 325 __ BIND(exit); 326 327 // pop parameters 328 __ sub(esp, rfp, -sp_after_call_off * wordSize); 329 330 #ifdef ASSERT 331 // verify that threads correspond 332 { 333 Label L, S; 334 __ ldr(rscratch1, thread); 335 __ cmp(rthread, rscratch1); 336 __ br(Assembler::NE, S); 337 __ get_thread(rscratch1); 338 __ cmp(rthread, rscratch1); 339 __ br(Assembler::EQ, L); 340 __ BIND(S); 341 __ stop("StubRoutines::call_stub: threads must correspond"); 342 __ BIND(L); 343 } 344 #endif 345 346 // restore callee-save registers 347 __ ldpd(v15, v14, d15_save); 348 __ ldpd(v13, v12, d13_save); 349 __ ldpd(v11, v10, d11_save); 350 __ ldpd(v9, v8, d9_save); 351 352 __ ldp(r28, r27, r28_save); 353 __ ldp(r26, r25, r26_save); 354 __ ldp(r24, r23, r24_save); 355 __ ldp(r22, r21, r22_save); 356 __ ldp(r20, r19, r20_save); 357 358 __ ldp(c_rarg0, c_rarg1, call_wrapper); 359 __ ldrw(c_rarg2, result_type); 360 __ ldr(c_rarg3, method); 361 __ ldp(c_rarg4, c_rarg5, entry_point); 362 __ ldp(c_rarg6, c_rarg7, parameter_size); 363 364 // leave frame and return to caller 365 __ leave(); 366 __ ret(lr); 367 368 // handle return types different from T_INT 369 370 __ BIND(is_long); 371 __ str(r0, Address(j_rarg2, 0)); 372 __ br(Assembler::AL, exit); 373 374 __ BIND(is_float); 375 __ strs(j_farg0, Address(j_rarg2, 0)); 376 __ br(Assembler::AL, exit); 377 378 __ BIND(is_double); 379 __ strd(j_farg0, Address(j_rarg2, 0)); 380 __ br(Assembler::AL, exit); 381 382 return start; 383 } 384 385 // Return point for a Java call if there's an exception thrown in 386 // Java code. The exception is caught and transformed into a 387 // pending exception stored in JavaThread that can be tested from 388 // within the VM. 389 // 390 // Note: Usually the parameters are removed by the callee. In case 391 // of an exception crossing an activation frame boundary, that is 392 // not the case if the callee is compiled code => need to setup the 393 // rsp. 
394 // 395 // r0: exception oop 396 397 address generate_catch_exception() { 398 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 399 address start = __ pc(); 400 401 // same as in generate_call_stub(): 402 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 403 const Address thread (rfp, thread_off * wordSize); 404 405 #ifdef ASSERT 406 // verify that threads correspond 407 { 408 Label L, S; 409 __ ldr(rscratch1, thread); 410 __ cmp(rthread, rscratch1); 411 __ br(Assembler::NE, S); 412 __ get_thread(rscratch1); 413 __ cmp(rthread, rscratch1); 414 __ br(Assembler::EQ, L); 415 __ bind(S); 416 __ stop("StubRoutines::catch_exception: threads must correspond"); 417 __ bind(L); 418 } 419 #endif 420 421 // set pending exception 422 __ verify_oop(r0); 423 424 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 425 __ mov(rscratch1, (address)__FILE__); 426 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 427 __ movw(rscratch1, (int)__LINE__); 428 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 429 430 // complete return to VM 431 assert(StubRoutines::_call_stub_return_address != NULL, 432 "_call_stub_return_address must have been generated before"); 433 __ b(StubRoutines::_call_stub_return_address); 434 435 return start; 436 } 437 438 // Continuation point for runtime calls returning with a pending 439 // exception. The pending exception check happened in the runtime 440 // or native call stub. The pending exception in Thread is 441 // converted into a Java-level exception. 442 // 443 // Contract with Java-level exception handlers: 444 // r0: exception 445 // r3: throwing pc 446 // 447 // NOTE: At entry of this stub, exception-pc must be in LR !! 448 449 // NOTE: this is always used as a jump target within generated code 450 // so it just needs to be generated code wiht no x86 prolog 451 452 address generate_forward_exception() { 453 StubCodeMark mark(this, "StubRoutines", "forward exception"); 454 address start = __ pc(); 455 456 // Upon entry, LR points to the return address returning into 457 // Java (interpreted or compiled) code; i.e., the return address 458 // becomes the throwing pc. 459 // 460 // Arguments pushed before the runtime call are still on the stack 461 // but the exception handler will reset the stack pointer -> 462 // ignore them. A potential result in registers can be ignored as 463 // well. 464 465 #ifdef ASSERT 466 // make sure this code is only executed if there is a pending exception 467 { 468 Label L; 469 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 470 __ cbnz(rscratch1, L); 471 __ stop("StubRoutines::forward exception: no pending exception (1)"); 472 __ bind(L); 473 } 474 #endif 475 476 // compute exception handler into r19 477 478 // call the VM to find the handler address associated with the 479 // caller address. pass thread in r0 and caller pc (ret address) 480 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 481 // the stack. 482 __ mov(c_rarg1, lr); 483 // lr will be trashed by the VM call so we move it to R19 484 // (callee-saved) because we also need to pass it to the handler 485 // returned by this call. 486 __ mov(r19, lr); 487 BLOCK_COMMENT("call exception_handler_for_return_address"); 488 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 489 SharedRuntime::exception_handler_for_return_address), 490 rthread, c_rarg1); 491 // we should not really care that lr is no longer the callee 492 // address. 
we saved the value the handler needs in r19 so we can 493 // just copy it to r3. however, the C2 handler will push its own 494 // frame and then calls into the VM and the VM code asserts that 495 // the PC for the frame above the handler belongs to a compiled 496 // Java method. So, we restore lr here to satisfy that assert. 497 __ mov(lr, r19); 498 // setup r0 & r3 & clear pending exception 499 __ mov(r3, r19); 500 __ mov(r19, r0); 501 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 502 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 503 504 #ifdef ASSERT 505 // make sure exception is set 506 { 507 Label L; 508 __ cbnz(r0, L); 509 __ stop("StubRoutines::forward exception: no pending exception (2)"); 510 __ bind(L); 511 } 512 #endif 513 514 // continue at exception handler 515 // r0: exception 516 // r3: throwing pc 517 // r19: exception handler 518 __ verify_oop(r0); 519 __ br(r19); 520 521 return start; 522 } 523 524 // Non-destructive plausibility checks for oops 525 // 526 // Arguments: 527 // r0: oop to verify 528 // rscratch1: error message 529 // 530 // Stack after saving c_rarg3: 531 // [tos + 0]: saved c_rarg3 532 // [tos + 1]: saved c_rarg2 533 // [tos + 2]: saved lr 534 // [tos + 3]: saved rscratch2 535 // [tos + 4]: saved r0 536 // [tos + 5]: saved rscratch1 537 address generate_verify_oop() { 538 539 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 540 address start = __ pc(); 541 542 Label exit, error; 543 544 // save c_rarg2 and c_rarg3 545 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 546 547 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 548 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 549 __ ldr(c_rarg3, Address(c_rarg2)); 550 __ add(c_rarg3, c_rarg3, 1); 551 __ str(c_rarg3, Address(c_rarg2)); 552 553 // object is in r0 554 // make sure object is 'reasonable' 555 __ cbz(r0, exit); // if obj is NULL it is OK 556 557 #if INCLUDE_ZGC 558 if (UseZGC) { 559 // Check if mask is good. 560 // verifies that ZAddressBadMask & r0 == 0 561 __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset())); 562 __ andr(c_rarg2, r0, c_rarg3); 563 __ cbnz(c_rarg2, error); 564 } 565 #endif 566 567 // Check if the oop is in the right area of memory 568 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 569 __ andr(c_rarg2, r0, c_rarg3); 570 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 571 572 // Compare c_rarg2 and c_rarg3. We don't use a compare 573 // instruction here because the flags register is live. 574 __ eor(c_rarg2, c_rarg2, c_rarg3); 575 __ cbnz(c_rarg2, error); 576 577 // make sure klass is 'reasonable', which is not zero. 
578 __ load_klass(r0, r0); // get klass 579 __ cbz(r0, error); // if klass is NULL it is broken 580 581 // return if everything seems ok 582 __ bind(exit); 583 584 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 585 __ ret(lr); 586 587 // handle errors 588 __ bind(error); 589 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 590 591 __ push(RegSet::range(r0, r29), sp); 592 // debug(char* msg, int64_t pc, int64_t regs[]) 593 __ mov(c_rarg0, rscratch1); // pass address of error message 594 __ mov(c_rarg1, lr); // pass return address 595 __ mov(c_rarg2, sp); // pass address of regs on stack 596 #ifndef PRODUCT 597 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 598 #endif 599 BLOCK_COMMENT("call MacroAssembler::debug"); 600 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 601 __ blr(rscratch1); 602 __ hlt(0); 603 604 return start; 605 } 606 607 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 608 609 // The inner part of zero_words(). This is the bulk operation, 610 // zeroing words in blocks, possibly using DC ZVA to do it. The 611 // caller is responsible for zeroing the last few words. 612 // 613 // Inputs: 614 // r10: the HeapWord-aligned base address of an array to zero. 615 // r11: the count in HeapWords, r11 > 0. 616 // 617 // Returns r10 and r11, adjusted for the caller to clear. 618 // r10: the base address of the tail of words left to clear. 619 // r11: the number of words in the tail. 620 // r11 < MacroAssembler::zero_words_block_size. 621 622 address generate_zero_blocks() { 623 Label done; 624 Label base_aligned; 625 626 Register base = r10, cnt = r11; 627 628 __ align(CodeEntryAlignment); 629 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 630 address start = __ pc(); 631 632 if (UseBlockZeroing) { 633 int zva_length = VM_Version::zva_length(); 634 635 // Ensure ZVA length can be divided by 16. This is required by 636 // the subsequent operations. 637 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 638 639 __ tbz(base, 3, base_aligned); 640 __ str(zr, Address(__ post(base, 8))); 641 __ sub(cnt, cnt, 1); 642 __ bind(base_aligned); 643 644 // Ensure count >= zva_length * 2 so that it still deserves a zva after 645 // alignment. 646 Label small; 647 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 648 __ subs(rscratch1, cnt, low_limit >> 3); 649 __ br(Assembler::LT, small); 650 __ zero_dcache_blocks(base, cnt); 651 __ bind(small); 652 } 653 654 { 655 // Number of stp instructions we'll unroll 656 const int unroll = 657 MacroAssembler::zero_words_block_size / 2; 658 // Clear the remaining blocks. 659 Label loop; 660 __ subs(cnt, cnt, unroll * 2); 661 __ br(Assembler::LT, done); 662 __ bind(loop); 663 for (int i = 0; i < unroll; i++) 664 __ stp(zr, zr, __ post(base, 16)); 665 __ subs(cnt, cnt, unroll * 2); 666 __ br(Assembler::GE, loop); 667 __ bind(done); 668 __ add(cnt, cnt, unroll * 2); 669 } 670 671 __ ret(lr); 672 673 return start; 674 } 675 676 677 typedef enum { 678 copy_forwards = 1, 679 copy_backwards = -1 680 } copy_direction; 681 682 // Bulk copy of blocks of 8 words. 683 // 684 // count is a count of words. 685 // 686 // Precondition: count >= 8 687 // 688 // Postconditions: 689 // 690 // The least significant bit of count contains the remaining count 691 // of words to copy. The rest of count is trash. 
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
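      //
      // Working through the forwards case (this path is taken when bit 3
      // of d is set, i.e. for an 8-byte aligned d that is not 16-byte
      // aligned): with unit == 1 and d biased by -8, the stores at word
      // offsets {1, 2, 4, 6, 8} relative to the biased d land at byte
      // offsets {0, 8, 24, 40, 56} of the original destination, i.e.
      // one word, three pairs, then one word, as described above.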
849 850 if (direction == copy_forwards) { 851 __ sub(s, s, 16); 852 __ sub(d, d, 8); 853 } 854 855 // Fill 8 registers 856 // 857 // for forwards copy s was offset by -16 from the original input 858 // value of s so the register contents are at these offsets 859 // relative to the 64 bit block addressed by that original input 860 // and so on for each successive 64 byte block when s is updated 861 // 862 // t0 at offset 0, t1 at offset 8 863 // t2 at offset 16, t3 at offset 24 864 // t4 at offset 32, t5 at offset 40 865 // t6 at offset 48, t7 at offset 56 866 867 // for backwards copy s was not offset so the register contents 868 // are at these offsets into the preceding 64 byte block 869 // relative to that original input and so on for each successive 870 // preceding 64 byte block when s is updated. this explains the 871 // slightly counter-intuitive looking pattern of register usage 872 // in the stp instructions for backwards copy. 873 // 874 // t0 at offset -16, t1 at offset -8 875 // t2 at offset -32, t3 at offset -24 876 // t4 at offset -48, t5 at offset -40 877 // t6 at offset -64, t7 at offset -56 878 879 __ ldp(t0, t1, Address(s, 2 * unit)); 880 __ ldp(t2, t3, Address(s, 4 * unit)); 881 __ ldp(t4, t5, Address(s, 6 * unit)); 882 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 883 884 __ subs(count, count, 16); 885 __ br(Assembler::LO, drain); 886 887 int prefetch = PrefetchCopyIntervalInBytes; 888 bool use_stride = false; 889 if (direction == copy_backwards) { 890 use_stride = prefetch > 256; 891 prefetch = -prefetch; 892 if (use_stride) __ mov(stride, prefetch); 893 } 894 895 __ bind(again); 896 897 if (PrefetchCopyIntervalInBytes > 0) 898 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 899 900 if (direction == copy_forwards) { 901 // allowing for the offset of -8 the store instructions place 902 // registers into the target 64 bit block at the following 903 // offsets 904 // 905 // t0 at offset 0 906 // t1 at offset 8, t2 at offset 16 907 // t3 at offset 24, t4 at offset 32 908 // t5 at offset 40, t6 at offset 48 909 // t7 at offset 56 910 911 __ str(t0, Address(d, 1 * unit)); 912 __ stp(t1, t2, Address(d, 2 * unit)); 913 __ ldp(t0, t1, Address(s, 2 * unit)); 914 __ stp(t3, t4, Address(d, 4 * unit)); 915 __ ldp(t2, t3, Address(s, 4 * unit)); 916 __ stp(t5, t6, Address(d, 6 * unit)); 917 __ ldp(t4, t5, Address(s, 6 * unit)); 918 __ str(t7, Address(__ pre(d, 8 * unit))); 919 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 920 } else { 921 // d was not offset when we started so the registers are 922 // written into the 64 bit block preceding d with the following 923 // offsets 924 // 925 // t1 at offset -8 926 // t3 at offset -24, t0 at offset -16 927 // t5 at offset -48, t2 at offset -32 928 // t7 at offset -56, t4 at offset -48 929 // t6 at offset -64 930 // 931 // note that this matches the offsets previously noted for the 932 // loads 933 934 __ str(t1, Address(d, 1 * unit)); 935 __ stp(t3, t0, Address(d, 3 * unit)); 936 __ ldp(t0, t1, Address(s, 2 * unit)); 937 __ stp(t5, t2, Address(d, 5 * unit)); 938 __ ldp(t2, t3, Address(s, 4 * unit)); 939 __ stp(t7, t4, Address(d, 7 * unit)); 940 __ ldp(t4, t5, Address(s, 6 * unit)); 941 __ str(t6, Address(__ pre(d, 8 * unit))); 942 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 943 } 944 945 __ subs(count, count, 8); 946 __ br(Assembler::HS, again); 947 948 // Drain 949 // 950 // this uses the same pattern of offsets and register arguments 951 // as above 952 __ bind(drain); 953 if (direction == copy_forwards) { 
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
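    //
    // In C terms the dispatch is roughly (for byte granularity, i.e.
    // step == +/-1):
    //
    //   if (count & 8) copy 8 bytes;
    //   if (count & 4) copy 4 bytes;
    //   if (count & 2) copy 2 bytes;
    //   if (count & 1) copy 1 byte;
    //
    // so a residual count of 13 == 0b1101 moves 8 + 4 + 1 bytes; each
    // tbz below skips its copy when the corresponding bit of count is
    // clear. For larger granularities count is in elements, so the bit
    // indices shift down (3 - exact_log2(granularity) for the word step)
    // and the smaller steps are compiled out.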
1041 1042 __ tbz(count, 3 - exact_log2(granularity), Lword); 1043 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1044 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1045 __ bind(Lword); 1046 1047 if (granularity <= sizeof (jint)) { 1048 __ tbz(count, 2 - exact_log2(granularity), Lint); 1049 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1050 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1051 __ bind(Lint); 1052 } 1053 1054 if (granularity <= sizeof (jshort)) { 1055 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1056 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1057 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1058 __ bind(Lshort); 1059 } 1060 1061 if (granularity <= sizeof (jbyte)) { 1062 __ tbz(count, 0, Lbyte); 1063 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1064 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1065 __ bind(Lbyte); 1066 } 1067 } 1068 1069 Label copy_f, copy_b; 1070 1071 // All-singing all-dancing memory copy. 1072 // 1073 // Copy count units of memory from s to d. The size of a unit is 1074 // step, which can be positive or negative depending on the direction 1075 // of copy. If is_aligned is false, we align the source address. 1076 // 1077 1078 void copy_memory(bool is_aligned, Register s, Register d, 1079 Register count, Register tmp, int step) { 1080 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1081 bool is_backwards = step < 0; 1082 int granularity = uabs(step); 1083 const Register t0 = r3, t1 = r4; 1084 1085 // <= 96 bytes do inline. Direction doesn't matter because we always 1086 // load all the data before writing anything 1087 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1088 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1089 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1090 const Register send = r17, dend = r18; 1091 1092 if (PrefetchCopyIntervalInBytes > 0) 1093 __ prfm(Address(s, 0), PLDL1KEEP); 1094 __ cmp(count, u1((UseSIMDForMemoryOps ? 
                 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
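          //
          // Spelling the three cases out: for count == 3 the loads hit
          // s+0, s+2 (send-1) and s+1 (s+count/2); for count == 2 they
          // hit s+0, s+1 and s+1; for count == 1 all three hit s+0. In
          // every case the stores together cover exactly bytes
          // [0, count) of the destination.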
1194 __ lsr(count, count, 1); 1195 __ ldrb(t0, Address(s, 0)); 1196 __ ldrb(t1, Address(send, -1)); 1197 __ ldrb(t2, Address(s, count)); 1198 __ strb(t0, Address(d, 0)); 1199 __ strb(t1, Address(dend, -1)); 1200 __ strb(t2, Address(d, count)); 1201 } 1202 __ b(finish); 1203 } 1204 } 1205 1206 __ bind(copy_big); 1207 if (is_backwards) { 1208 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1209 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1210 } 1211 1212 // Now we've got the small case out of the way we can align the 1213 // source address on a 2-word boundary. 1214 1215 Label aligned; 1216 1217 if (is_aligned) { 1218 // We may have to adjust by 1 word to get s 2-word-aligned. 1219 __ tbz(s, exact_log2(wordSize), aligned); 1220 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1221 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1222 __ sub(count, count, wordSize/granularity); 1223 } else { 1224 if (is_backwards) { 1225 __ andr(rscratch2, s, 2 * wordSize - 1); 1226 } else { 1227 __ neg(rscratch2, s); 1228 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1229 } 1230 // rscratch2 is the byte adjustment needed to align s. 1231 __ cbz(rscratch2, aligned); 1232 int shift = exact_log2(granularity); 1233 if (shift) __ lsr(rscratch2, rscratch2, shift); 1234 __ sub(count, count, rscratch2); 1235 1236 #if 0 1237 // ?? This code is only correct for a disjoint copy. It may or 1238 // may not make sense to use it in that case. 1239 1240 // Copy the first pair; s and d may not be aligned. 1241 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1242 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1243 1244 // Align s and d, adjust count 1245 if (is_backwards) { 1246 __ sub(s, s, rscratch2); 1247 __ sub(d, d, rscratch2); 1248 } else { 1249 __ add(s, s, rscratch2); 1250 __ add(d, d, rscratch2); 1251 } 1252 #else 1253 copy_memory_small(s, d, rscratch2, rscratch1, step); 1254 #endif 1255 } 1256 1257 __ bind(aligned); 1258 1259 // s is now 2-word-aligned. 1260 1261 // We have a count of units and some trailing bytes. Adjust the 1262 // count and do a bulk copy of words. 1263 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1264 if (direction == copy_forwards) 1265 __ bl(copy_f); 1266 else 1267 __ bl(copy_b); 1268 1269 // And the tail. 1270 copy_memory_small(s, d, count, tmp, step); 1271 1272 if (granularity >= 8) __ bind(copy8); 1273 if (granularity >= 4) __ bind(copy4); 1274 __ bind(finish); 1275 } 1276 1277 1278 void clobber_registers() { 1279 #ifdef ASSERT 1280 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1281 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1282 for (Register r = r3; r <= r18; r++) 1283 if (r != rscratch1) __ mov(r, rscratch1); 1284 #endif 1285 } 1286 1287 // Scan over array at a for count oops, verifying each one. 1288 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1289 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1290 Label loop, end; 1291 __ mov(rscratch1, a); 1292 __ mov(rscratch2, zr); 1293 __ bind(loop); 1294 __ cmp(rscratch2, count); 1295 __ br(Assembler::HS, end); 1296 if (size == (size_t)wordSize) { 1297 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1298 __ verify_oop(temp); 1299 } else { 1300 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1301 __ decode_heap_oop(temp); // calls verify_oop 1302 } 1303 __ add(rscratch2, rscratch2, size); 1304 __ b(loop); 1305 __ bind(end); 1306 } 1307 1308 // Arguments: 1309 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1310 // ignored 1311 // is_oop - true => oop array, so generate store check code 1312 // name - stub name string 1313 // 1314 // Inputs: 1315 // c_rarg0 - source array address 1316 // c_rarg1 - destination array address 1317 // c_rarg2 - element count, treated as ssize_t, can be zero 1318 // 1319 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1320 // the hardware handle it. The two dwords within qwords that span 1321 // cache line boundaries will still be loaded and stored atomicly. 1322 // 1323 // Side Effects: 1324 // disjoint_int_copy_entry is set to the no-overlap entry point 1325 // used by generate_conjoint_int_oop_copy(). 1326 // 1327 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, 1328 const char *name, bool dest_uninitialized = false) { 1329 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1330 RegSet saved_reg = RegSet::of(s, d, count); 1331 __ align(CodeEntryAlignment); 1332 StubCodeMark mark(this, "StubRoutines", name); 1333 address start = __ pc(); 1334 __ enter(); 1335 1336 if (entry != NULL) { 1337 *entry = __ pc(); 1338 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1339 BLOCK_COMMENT("Entry:"); 1340 } 1341 1342 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1343 if (dest_uninitialized) { 1344 decorators |= IS_DEST_UNINITIALIZED; 1345 } 1346 if (aligned) { 1347 decorators |= ARRAYCOPY_ALIGNED; 1348 } 1349 1350 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1351 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1352 1353 if (is_oop) { 1354 // save regs before copy_memory 1355 __ push(RegSet::of(d, count), sp); 1356 } 1357 { 1358 // UnsafeCopyMemory page error: continue after ucm 1359 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1360 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1361 copy_memory(aligned, s, d, count, rscratch1, size); 1362 } 1363 1364 if (is_oop) { 1365 __ pop(RegSet::of(d, count), sp); 1366 if (VerifyOops) 1367 verify_oop_array(size, d, count, r16); 1368 } 1369 1370 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1371 1372 __ leave(); 1373 __ mov(r0, zr); // return 0 1374 __ ret(lr); 1375 return start; 1376 } 1377 1378 // Arguments: 1379 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1380 // ignored 1381 // is_oop - true => oop array, so generate store check code 1382 // name - stub name string 1383 // 1384 // Inputs: 1385 // c_rarg0 - source array address 1386 // c_rarg1 - destination array address 1387 // c_rarg2 - element count, treated as ssize_t, can be zero 1388 // 1389 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1390 // the hardware handle it. 
The two dwords within qwords that span 1391 // cache line boundaries will still be loaded and stored atomicly. 1392 // 1393 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1394 address *entry, const char *name, 1395 bool dest_uninitialized = false) { 1396 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1397 RegSet saved_regs = RegSet::of(s, d, count); 1398 StubCodeMark mark(this, "StubRoutines", name); 1399 address start = __ pc(); 1400 __ enter(); 1401 1402 if (entry != NULL) { 1403 *entry = __ pc(); 1404 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1405 BLOCK_COMMENT("Entry:"); 1406 } 1407 1408 // use fwd copy when (d-s) above_equal (count*size) 1409 __ sub(rscratch1, d, s); 1410 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1411 __ br(Assembler::HS, nooverlap_target); 1412 1413 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1414 if (dest_uninitialized) { 1415 decorators |= IS_DEST_UNINITIALIZED; 1416 } 1417 if (aligned) { 1418 decorators |= ARRAYCOPY_ALIGNED; 1419 } 1420 1421 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1422 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1423 1424 if (is_oop) { 1425 // save regs before copy_memory 1426 __ push(RegSet::of(d, count), sp); 1427 } 1428 { 1429 // UnsafeCopyMemory page error: continue after ucm 1430 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1431 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1432 copy_memory(aligned, s, d, count, rscratch1, -size); 1433 } 1434 if (is_oop) { 1435 __ pop(RegSet::of(d, count), sp); 1436 if (VerifyOops) 1437 verify_oop_array(size, d, count, r16); 1438 } 1439 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1440 __ leave(); 1441 __ mov(r0, zr); // return 0 1442 __ ret(lr); 1443 return start; 1444 } 1445 1446 // Arguments: 1447 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1448 // ignored 1449 // name - stub name string 1450 // 1451 // Inputs: 1452 // c_rarg0 - source array address 1453 // c_rarg1 - destination array address 1454 // c_rarg2 - element count, treated as ssize_t, can be zero 1455 // 1456 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1457 // we let the hardware handle it. The one to eight bytes within words, 1458 // dwords or qwords that span cache line boundaries will still be loaded 1459 // and stored atomically. 1460 // 1461 // Side Effects: 1462 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1463 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1464 // we let the hardware handle it. The one to eight bytes within words, 1465 // dwords or qwords that span cache line boundaries will still be loaded 1466 // and stored atomically. 1467 // 1468 // Side Effects: 1469 // disjoint_byte_copy_entry is set to the no-overlap entry point 1470 // used by generate_conjoint_byte_copy(). 
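  //   The conjoint stub jumps to that no-overlap entry when the unsigned
  //   difference (d - s) is at least count * size, i.e. when a forward
  //   copy cannot overwrite source bytes before they are read. For
  //   example, with jbyte elements, s == 0x1000, d == 0x1008 and
  //   count == 32: d - s == 8 is below 32, so the backwards path is
  //   used; if d were below s, the unsigned difference would be huge and
  //   the forward no-overlap entry would be taken instead.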
1471 // 1472 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1473 const bool not_oop = false; 1474 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1475 } 1476 1477 // Arguments: 1478 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1479 // ignored 1480 // name - stub name string 1481 // 1482 // Inputs: 1483 // c_rarg0 - source array address 1484 // c_rarg1 - destination array address 1485 // c_rarg2 - element count, treated as ssize_t, can be zero 1486 // 1487 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1488 // we let the hardware handle it. The one to eight bytes within words, 1489 // dwords or qwords that span cache line boundaries will still be loaded 1490 // and stored atomically. 1491 // 1492 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1493 address* entry, const char *name) { 1494 const bool not_oop = false; 1495 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1496 } 1497 1498 // Arguments: 1499 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1500 // ignored 1501 // name - stub name string 1502 // 1503 // Inputs: 1504 // c_rarg0 - source array address 1505 // c_rarg1 - destination array address 1506 // c_rarg2 - element count, treated as ssize_t, can be zero 1507 // 1508 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1509 // let the hardware handle it. The two or four words within dwords 1510 // or qwords that span cache line boundaries will still be loaded 1511 // and stored atomically. 1512 // 1513 // Side Effects: 1514 // disjoint_short_copy_entry is set to the no-overlap entry point 1515 // used by generate_conjoint_short_copy(). 1516 // 1517 address generate_disjoint_short_copy(bool aligned, 1518 address* entry, const char *name) { 1519 const bool not_oop = false; 1520 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1521 } 1522 1523 // Arguments: 1524 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1525 // ignored 1526 // name - stub name string 1527 // 1528 // Inputs: 1529 // c_rarg0 - source array address 1530 // c_rarg1 - destination array address 1531 // c_rarg2 - element count, treated as ssize_t, can be zero 1532 // 1533 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1534 // let the hardware handle it. The two or four words within dwords 1535 // or qwords that span cache line boundaries will still be loaded 1536 // and stored atomically. 1537 // 1538 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1539 address *entry, const char *name) { 1540 const bool not_oop = false; 1541 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1542 1543 } 1544 // Arguments: 1545 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1546 // ignored 1547 // name - stub name string 1548 // 1549 // Inputs: 1550 // c_rarg0 - source array address 1551 // c_rarg1 - destination array address 1552 // c_rarg2 - element count, treated as ssize_t, can be zero 1553 // 1554 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1555 // the hardware handle it. The two dwords within qwords that span 1556 // cache line boundaries will still be loaded and stored atomicly. 
1557 // 1558 // Side Effects: 1559 // disjoint_int_copy_entry is set to the no-overlap entry point 1560 // used by generate_conjoint_int_oop_copy(). 1561 // 1562 address generate_disjoint_int_copy(bool aligned, address *entry, 1563 const char *name, bool dest_uninitialized = false) { 1564 const bool not_oop = false; 1565 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1566 } 1567 1568 // Arguments: 1569 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1570 // ignored 1571 // name - stub name string 1572 // 1573 // Inputs: 1574 // c_rarg0 - source array address 1575 // c_rarg1 - destination array address 1576 // c_rarg2 - element count, treated as ssize_t, can be zero 1577 // 1578 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1579 // the hardware handle it. The two dwords within qwords that span 1580 // cache line boundaries will still be loaded and stored atomicly. 1581 // 1582 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1583 address *entry, const char *name, 1584 bool dest_uninitialized = false) { 1585 const bool not_oop = false; 1586 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1587 } 1588 1589 1590 // Arguments: 1591 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1592 // ignored 1593 // name - stub name string 1594 // 1595 // Inputs: 1596 // c_rarg0 - source array address 1597 // c_rarg1 - destination array address 1598 // c_rarg2 - element count, treated as size_t, can be zero 1599 // 1600 // Side Effects: 1601 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1602 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1603 // 1604 address generate_disjoint_long_copy(bool aligned, address *entry, 1605 const char *name, bool dest_uninitialized = false) { 1606 const bool not_oop = false; 1607 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1608 } 1609 1610 // Arguments: 1611 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1612 // ignored 1613 // name - stub name string 1614 // 1615 // Inputs: 1616 // c_rarg0 - source array address 1617 // c_rarg1 - destination array address 1618 // c_rarg2 - element count, treated as size_t, can be zero 1619 // 1620 address generate_conjoint_long_copy(bool aligned, 1621 address nooverlap_target, address *entry, 1622 const char *name, bool dest_uninitialized = false) { 1623 const bool not_oop = false; 1624 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1625 } 1626 1627 // Arguments: 1628 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1629 // ignored 1630 // name - stub name string 1631 // 1632 // Inputs: 1633 // c_rarg0 - source array address 1634 // c_rarg1 - destination array address 1635 // c_rarg2 - element count, treated as size_t, can be zero 1636 // 1637 // Side Effects: 1638 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1639 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1640 // 1641 address generate_disjoint_oop_copy(bool aligned, address *entry, 1642 const char *name, bool dest_uninitialized) { 1643 const bool is_oop = true; 1644 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1645 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1646 } 1647 1648 // Arguments: 1649 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1650 // ignored 1651 // name - stub name string 1652 // 1653 // Inputs: 1654 // c_rarg0 - source array address 1655 // c_rarg1 - destination array address 1656 // c_rarg2 - element count, treated as size_t, can be zero 1657 // 1658 address generate_conjoint_oop_copy(bool aligned, 1659 address nooverlap_target, address *entry, 1660 const char *name, bool dest_uninitialized) { 1661 const bool is_oop = true; 1662 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1663 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1664 name, dest_uninitialized); 1665 } 1666 1667 1668 // Helper for generating a dynamic type check. 1669 // Smashes rscratch1, rscratch2. 1670 void generate_type_check(Register sub_klass, 1671 Register super_check_offset, 1672 Register super_klass, 1673 Label& L_success) { 1674 assert_different_registers(sub_klass, super_check_offset, super_klass); 1675 1676 BLOCK_COMMENT("type_check:"); 1677 1678 Label L_miss; 1679 1680 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1681 super_check_offset); 1682 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1683 1684 // Fall through on failure! 1685 __ BIND(L_miss); 1686 } 1687 1688 // 1689 // Generate checkcasting array copy stub 1690 // 1691 // Input: 1692 // c_rarg0 - source array address 1693 // c_rarg1 - destination array address 1694 // c_rarg2 - element count, treated as ssize_t, can be zero 1695 // c_rarg3 - size_t ckoff (super_check_offset) 1696 // c_rarg4 - oop ckval (super_klass) 1697 // 1698 // Output: 1699 // r0 == 0 - success 1700 // r0 == -1^K - failure, where K is partial transfer count 1701 // 1702 address generate_checkcast_copy(const char *name, address *entry, 1703 bool dest_uninitialized = false) { 1704 1705 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1706 1707 // Input registers (after setup_arg_regs) 1708 const Register from = c_rarg0; // source array address 1709 const Register to = c_rarg1; // destination array address 1710 const Register count = c_rarg2; // elementscount 1711 const Register ckoff = c_rarg3; // super_check_offset 1712 const Register ckval = c_rarg4; // super_klass 1713 1714 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1715 RegSet wb_post_saved_regs = RegSet::of(count); 1716 1717 // Registers used as temps (r18, r19, r20 are save-on-entry) 1718 const Register count_save = r21; // orig elementscount 1719 const Register start_to = r20; // destination array start address 1720 const Register copied_oop = r18; // actual oop copied 1721 const Register r19_klass = r19; // oop._klass 1722 1723 //--------------------------------------------------------------- 1724 // Assembler stub will be used for this call to arraycopy 1725 // if the two arrays are subtypes of Object[] but the 1726 // destination array type is not equal to or a supertype 1727 // of the source type. Each element must be separately 1728 // checked. 
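    //
    // For example, copying the elements of an Object[] into a String[]:
    // the source elements may or may not all be Strings, so each oop is
    // checked against ckval (the destination element klass) as it is
    // copied, and the copy stops at the first element that fails the
    // check.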
1729 1730 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1731 copied_oop, r19_klass, count_save); 1732 1733 __ align(CodeEntryAlignment); 1734 StubCodeMark mark(this, "StubRoutines", name); 1735 address start = __ pc(); 1736 1737 __ enter(); // required for proper stackwalking of RuntimeStub frame 1738 1739 #ifdef ASSERT 1740 // caller guarantees that the arrays really are different 1741 // otherwise, we would have to make conjoint checks 1742 { Label L; 1743 array_overlap_test(L, TIMES_OOP); 1744 __ stop("checkcast_copy within a single array"); 1745 __ bind(L); 1746 } 1747 #endif //ASSERT 1748 1749 // Caller of this entry point must set up the argument registers. 1750 if (entry != NULL) { 1751 *entry = __ pc(); 1752 BLOCK_COMMENT("Entry:"); 1753 } 1754 1755 // Empty array: Nothing to do. 1756 __ cbz(count, L_done); 1757 1758 __ push(RegSet::of(r18, r19, r20, r21), sp); 1759 1760 #ifdef ASSERT 1761 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1762 // The ckoff and ckval must be mutually consistent, 1763 // even though caller generates both. 1764 { Label L; 1765 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1766 __ ldrw(start_to, Address(ckval, sco_offset)); 1767 __ cmpw(ckoff, start_to); 1768 __ br(Assembler::EQ, L); 1769 __ stop("super_check_offset inconsistent"); 1770 __ bind(L); 1771 } 1772 #endif //ASSERT 1773 1774 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1775 bool is_oop = true; 1776 if (dest_uninitialized) { 1777 decorators |= IS_DEST_UNINITIALIZED; 1778 } 1779 1780 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1781 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1782 1783 // save the original count 1784 __ mov(count_save, count); 1785 1786 // Copy from low to high addresses 1787 __ mov(start_to, to); // Save destination array start address 1788 __ b(L_load_element); 1789 1790 // ======== begin loop ======== 1791 // (Loop is rotated; its entry is L_load_element.) 1792 // Loop control: 1793 // for (; count != 0; count--) { 1794 // copied_oop = load_heap_oop(from++); 1795 // ... generate_type_check ...; 1796 // store_heap_oop(to++, copied_oop); 1797 // } 1798 __ align(OptoLoopAlignment); 1799 1800 __ BIND(L_store_element); 1801 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1802 __ sub(count, count, 1); 1803 __ cbz(count, L_do_card_marks); 1804 1805 // ======== loop entry is here ======== 1806 __ BIND(L_load_element); 1807 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1808 __ cbz(copied_oop, L_store_element); 1809 1810 __ load_klass(r19_klass, copied_oop);// query the object klass 1811 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1812 // ======== end loop ======== 1813 1814 // It was a real error; we must depend on the caller to finish the job. 1815 // Register count = remaining oops, count_orig = total oops. 1816 // Emit GC store barriers for the oops we have copied and report 1817 // their number to the caller. 
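    // In C terms, the two instructions below compute the failure value
    // (sketch; K is just a name for the number of oops already copied):
    //   K  = count_save - count;
    //   r0 = ~K;                  // == -1 ^ K, so the caller can recover K as ~r0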
1818 1819 __ subs(count, count_save, count); // K = partially copied oop count 1820 __ eon(count, count, zr); // report (-1^K) to caller 1821 __ br(Assembler::EQ, L_done_pop); 1822 1823 __ BIND(L_do_card_marks); 1824 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1825 1826 __ bind(L_done_pop); 1827 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1828 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1829 1830 __ bind(L_done); 1831 __ mov(r0, count); 1832 __ leave(); 1833 __ ret(lr); 1834 1835 return start; 1836 } 1837 1838 // Perform range checks on the proposed arraycopy. 1839 // Kills temp, but nothing else. 1840 // Also, clean the sign bits of src_pos and dst_pos. 1841 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1842 Register src_pos, // source position (c_rarg1) 1843 Register dst, // destination array oop (c_rarg2) 1844 Register dst_pos, // destination position (c_rarg3) 1845 Register length, 1846 Register temp, 1847 Label& L_failed) { 1848 BLOCK_COMMENT("arraycopy_range_checks:"); 1849 1850 assert_different_registers(rscratch1, temp); 1851 1852 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1853 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1854 __ addw(temp, length, src_pos); 1855 __ cmpw(temp, rscratch1); 1856 __ br(Assembler::HI, L_failed); 1857 1858 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1859 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1860 __ addw(temp, length, dst_pos); 1861 __ cmpw(temp, rscratch1); 1862 __ br(Assembler::HI, L_failed); 1863 1864 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1865 __ movw(src_pos, src_pos); 1866 __ movw(dst_pos, dst_pos); 1867 1868 BLOCK_COMMENT("arraycopy_range_checks done"); 1869 } 1870 1871 // These stubs get called from some dumb test routine. 1872 // I'll write them properly when they're called from 1873 // something that's actually doing something. 1874 static void fake_arraycopy_stub(address src, address dst, int count) { 1875 assert(count == 0, "huh?"); 1876 } 1877 1878 1879 // 1880 // Generate 'unsafe' array copy stub 1881 // Though just as safe as the other stubs, it takes an unscaled 1882 // size_t argument instead of an element count. 1883 // 1884 // Input: 1885 // c_rarg0 - source array address 1886 // c_rarg1 - destination array address 1887 // c_rarg2 - byte count, treated as ssize_t, can be zero 1888 // 1889 // Examines the alignment of the operands and dispatches 1890 // to a long, int, short, or byte copy loop.
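    // The alignment dispatch emitted below behaves roughly like this C
    // sketch (the actual code tests the or-ed address/count bits):
    //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
    //   if ((bits & (BytesPerLong - 1)) == 0) goto long_copy;   // all 8-byte aligned
    //   if ((bits & (BytesPerInt  - 1)) == 0) goto int_copy;    // all 4-byte aligned
    //   if ((bits & 1) == 0)                  goto short_copy;  // all 2-byte aligned
    //   goto byte_copy;
    // with the byte count scaled down to an element count before each jump.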
1891 // 1892 address generate_unsafe_copy(const char *name, 1893 address byte_copy_entry, 1894 address short_copy_entry, 1895 address int_copy_entry, 1896 address long_copy_entry) { 1897 Label L_long_aligned, L_int_aligned, L_short_aligned; 1898 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1899 1900 __ align(CodeEntryAlignment); 1901 StubCodeMark mark(this, "StubRoutines", name); 1902 address start = __ pc(); 1903 __ enter(); // required for proper stackwalking of RuntimeStub frame 1904 1905 // bump this on entry, not on exit: 1906 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1907 1908 __ orr(rscratch1, s, d); 1909 __ orr(rscratch1, rscratch1, count); 1910 1911 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1912 __ cbz(rscratch1, L_long_aligned); 1913 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1914 __ cbz(rscratch1, L_int_aligned); 1915 __ tbz(rscratch1, 0, L_short_aligned); 1916 __ b(RuntimeAddress(byte_copy_entry)); 1917 1918 __ BIND(L_short_aligned); 1919 __ lsr(count, count, LogBytesPerShort); // size => short_count 1920 __ b(RuntimeAddress(short_copy_entry)); 1921 __ BIND(L_int_aligned); 1922 __ lsr(count, count, LogBytesPerInt); // size => int_count 1923 __ b(RuntimeAddress(int_copy_entry)); 1924 __ BIND(L_long_aligned); 1925 __ lsr(count, count, LogBytesPerLong); // size => long_count 1926 __ b(RuntimeAddress(long_copy_entry)); 1927 1928 return start; 1929 } 1930 1931 // 1932 // Generate generic array copy stubs 1933 // 1934 // Input: 1935 // c_rarg0 - src oop 1936 // c_rarg1 - src_pos (32-bits) 1937 // c_rarg2 - dst oop 1938 // c_rarg3 - dst_pos (32-bits) 1939 // c_rarg4 - element count (32-bits) 1940 // 1941 // Output: 1942 // r0 == 0 - success 1943 // r0 == -1^K - failure, where K is partial transfer count 1944 // 1945 address generate_generic_copy(const char *name, 1946 address byte_copy_entry, address short_copy_entry, 1947 address int_copy_entry, address oop_copy_entry, 1948 address long_copy_entry, address checkcast_copy_entry) { 1949 1950 Label L_failed, L_objArray; 1951 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1952 1953 // Input registers 1954 const Register src = c_rarg0; // source array oop 1955 const Register src_pos = c_rarg1; // source position 1956 const Register dst = c_rarg2; // destination array oop 1957 const Register dst_pos = c_rarg3; // destination position 1958 const Register length = c_rarg4; 1959 1960 1961 // Registers used as temps 1962 const Register dst_klass = c_rarg5; 1963 1964 __ align(CodeEntryAlignment); 1965 1966 StubCodeMark mark(this, "StubRoutines", name); 1967 1968 address start = __ pc(); 1969 1970 __ enter(); // required for proper stackwalking of RuntimeStub frame 1971 1972 // bump this on entry, not on exit: 1973 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1974 1975 //----------------------------------------------------------------------- 1976 // Assembler stub will be used for this call to arraycopy 1977 // if the following conditions are met: 1978 // 1979 // (1) src and dst must not be null. 1980 // (2) src_pos must not be negative. 1981 // (3) dst_pos must not be negative. 1982 // (4) length must not be negative. 1983 // (5) src klass and dst klass should be the same and not NULL. 1984 // (6) src and dst should be arrays. 1985 // (7) src_pos + length must not exceed length of src. 1986 // (8) dst_pos + length must not exceed length of dst. 
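    // If any of conditions (1)..(8) fails, the code below returns -1 (that is,
    // -1^0: nothing copied) and the caller is expected to fall back to a slower
    // path that can raise the appropriate exception.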
1987 // 1988 1989 // if (src == NULL) return -1; 1990 __ cbz(src, L_failed); 1991 1992 // if (src_pos < 0) return -1; 1993 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 1994 1995 // if (dst == NULL) return -1; 1996 __ cbz(dst, L_failed); 1997 1998 // if (dst_pos < 0) return -1; 1999 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2000 2001 // registers used as temp 2002 const Register scratch_length = r16; // elements count to copy 2003 const Register scratch_src_klass = r17; // array klass 2004 const Register lh = r18; // layout helper 2005 2006 // if (length < 0) return -1; 2007 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2008 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2009 2010 __ load_klass(scratch_src_klass, src); 2011 #ifdef ASSERT 2012 // assert(src->klass() != NULL); 2013 { 2014 BLOCK_COMMENT("assert klasses not null {"); 2015 Label L1, L2; 2016 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2017 __ bind(L1); 2018 __ stop("broken null klass"); 2019 __ bind(L2); 2020 __ load_klass(rscratch1, dst); 2021 __ cbz(rscratch1, L1); // this would be broken also 2022 BLOCK_COMMENT("} assert klasses not null done"); 2023 } 2024 #endif 2025 2026 // Load layout helper (32-bits) 2027 // 2028 // |array_tag| | header_size | element_type | |log2_element_size| 2029 // 32 30 24 16 8 2 0 2030 // 2031 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2032 // 2033 2034 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2035 2036 // Handle objArrays completely differently... 2037 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2038 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2039 __ movw(rscratch1, objArray_lh); 2040 __ eorw(rscratch2, lh, rscratch1); 2041 __ cbzw(rscratch2, L_objArray); 2042 2043 // if (src->klass() != dst->klass()) return -1; 2044 __ load_klass(rscratch2, dst); 2045 __ eor(rscratch2, rscratch2, scratch_src_klass); 2046 __ cbnz(rscratch2, L_failed); 2047 2048 // if (!src->is_Array()) return -1; 2049 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2050 2051 // At this point, it is known to be a typeArray (array_tag 0x3). 
2052 #ifdef ASSERT 2053 { 2054 BLOCK_COMMENT("assert primitive array {"); 2055 Label L; 2056 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2057 __ cmpw(lh, rscratch2); 2058 __ br(Assembler::GE, L); 2059 __ stop("must be a primitive array"); 2060 __ bind(L); 2061 BLOCK_COMMENT("} assert primitive array done"); 2062 } 2063 #endif 2064 2065 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2066 rscratch2, L_failed); 2067 2068 // TypeArrayKlass 2069 // 2070 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2071 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2072 // 2073 2074 const Register rscratch1_offset = rscratch1; // array offset 2075 const Register r18_elsize = lh; // element size 2076 2077 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2078 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2079 __ add(src, src, rscratch1_offset); // src array offset 2080 __ add(dst, dst, rscratch1_offset); // dst array offset 2081 BLOCK_COMMENT("choose copy loop based on element size"); 2082 2083 // next registers should be set before the jump to corresponding stub 2084 const Register from = c_rarg0; // source array address 2085 const Register to = c_rarg1; // destination array address 2086 const Register count = c_rarg2; // elements count 2087 2088 // 'from', 'to', 'count' registers should be set in such order 2089 // since they are the same as 'src', 'src_pos', 'dst'. 2090 2091 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2092 2093 // The possible values of elsize are 0-3, i.e. exact_log2(element 2094 // size in bytes). We do a simple bitwise binary search. 2095 __ BIND(L_copy_bytes); 2096 __ tbnz(r18_elsize, 1, L_copy_ints); 2097 __ tbnz(r18_elsize, 0, L_copy_shorts); 2098 __ lea(from, Address(src, src_pos));// src_addr 2099 __ lea(to, Address(dst, dst_pos));// dst_addr 2100 __ movw(count, scratch_length); // length 2101 __ b(RuntimeAddress(byte_copy_entry)); 2102 2103 __ BIND(L_copy_shorts); 2104 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2105 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2106 __ movw(count, scratch_length); // length 2107 __ b(RuntimeAddress(short_copy_entry)); 2108 2109 __ BIND(L_copy_ints); 2110 __ tbnz(r18_elsize, 0, L_copy_longs); 2111 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2112 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2113 __ movw(count, scratch_length); // length 2114 __ b(RuntimeAddress(int_copy_entry)); 2115 2116 __ BIND(L_copy_longs); 2117 #ifdef ASSERT 2118 { 2119 BLOCK_COMMENT("assert long copy {"); 2120 Label L; 2121 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2122 __ cmpw(r18_elsize, LogBytesPerLong); 2123 __ br(Assembler::EQ, L); 2124 __ stop("must be long copy, but elsize is wrong"); 2125 __ bind(L); 2126 BLOCK_COMMENT("} assert long copy done"); 2127 } 2128 #endif 2129 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2130 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2131 __ movw(count, scratch_length); // length 2132 __ b(RuntimeAddress(long_copy_entry)); 2133 2134 // ObjArrayKlass 2135 __ BIND(L_objArray); 2136 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2137 2138 Label L_plain_copy, L_checkcast_copy; 2139 // test array classes for subtyping 2140 __ load_klass(r18, dst); 2141 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2142 __ br(Assembler::NE, L_checkcast_copy); 2143 2144 // Identically typed arrays can be copied without element-wise checks. 2145 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2146 rscratch2, L_failed); 2147 2148 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2149 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2150 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2151 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2152 __ movw(count, scratch_length); // length 2153 __ BIND(L_plain_copy); 2154 __ b(RuntimeAddress(oop_copy_entry)); 2155 2156 __ BIND(L_checkcast_copy); 2157 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2158 { 2159 // Before looking at dst.length, make sure dst is also an objArray. 2160 __ ldrw(rscratch1, Address(r18, lh_offset)); 2161 __ movw(rscratch2, objArray_lh); 2162 __ eorw(rscratch1, rscratch1, rscratch2); 2163 __ cbnzw(rscratch1, L_failed); 2164 2165 // It is safe to examine both src.length and dst.length. 2166 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2167 r18, L_failed); 2168 2169 __ load_klass(dst_klass, dst); // reload 2170 2171 // Marshal the base address arguments now, freeing registers. 2172 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2173 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2174 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2175 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2176 __ movw(count, length); // length (reloaded) 2177 Register sco_temp = c_rarg3; // this register is free now 2178 assert_different_registers(from, to, count, sco_temp, 2179 dst_klass, scratch_src_klass); 2180 // assert_clean_int(count, sco_temp); 2181 2182 // Generate the type check. 2183 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2184 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2185 2186 // Smashes rscratch1, rscratch2 2187 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2188 2189 // Fetch destination element klass from the ObjArrayKlass header. 2190 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2191 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2192 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2193 2194 // the checkcast_copy loop needs two extra arguments: 2195 assert(c_rarg3 == sco_temp, "#3 already in place"); 2196 // Set up arguments for checkcast_copy_entry. 2197 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2198 __ b(RuntimeAddress(checkcast_copy_entry)); 2199 } 2200 2201 __ BIND(L_failed); 2202 __ mov(r0, -1); 2203 __ leave(); // required for proper stackwalking of RuntimeStub frame 2204 __ ret(lr); 2205 2206 return start; 2207 } 2208 2209 // 2210 // Generate stub for array fill. If "aligned" is true, the 2211 // "to" address is assumed to be heapword aligned. 
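    // In C terms the stub implements roughly (sketch only):
    //   void fill(element* to, jint value, jint count) {
    //     for (int i = 0; i < count; i++) to[i] = (element)value;
    //   }
    // where 'element' is jbyte, jshort or jint depending on 't'. For the main
    // loop the value is first replicated up to a full 64-bit word (via bfi)
    // so that whole words can be stored at a time.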
2212 // 2213 // Arguments for generated stub: 2214 // to: c_rarg0 2215 // value: c_rarg1 2216 // count: c_rarg2 treated as signed 2217 // 2218 address generate_fill(BasicType t, bool aligned, const char *name) { 2219 __ align(CodeEntryAlignment); 2220 StubCodeMark mark(this, "StubRoutines", name); 2221 address start = __ pc(); 2222 2223 BLOCK_COMMENT("Entry:"); 2224 2225 const Register to = c_rarg0; // source array address 2226 const Register value = c_rarg1; // value 2227 const Register count = c_rarg2; // elements count 2228 2229 const Register bz_base = r10; // base for block_zero routine 2230 const Register cnt_words = r11; // temp register 2231 2232 __ enter(); 2233 2234 Label L_fill_elements, L_exit1; 2235 2236 int shift = -1; 2237 switch (t) { 2238 case T_BYTE: 2239 shift = 0; 2240 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2241 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2242 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2243 __ br(Assembler::LO, L_fill_elements); 2244 break; 2245 case T_SHORT: 2246 shift = 1; 2247 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2248 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2249 __ br(Assembler::LO, L_fill_elements); 2250 break; 2251 case T_INT: 2252 shift = 2; 2253 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2254 __ br(Assembler::LO, L_fill_elements); 2255 break; 2256 default: ShouldNotReachHere(); 2257 } 2258 2259 // Align source address at 8 bytes address boundary. 2260 Label L_skip_align1, L_skip_align2, L_skip_align4; 2261 if (!aligned) { 2262 switch (t) { 2263 case T_BYTE: 2264 // One byte misalignment happens only for byte arrays. 2265 __ tbz(to, 0, L_skip_align1); 2266 __ strb(value, Address(__ post(to, 1))); 2267 __ subw(count, count, 1); 2268 __ bind(L_skip_align1); 2269 // Fallthrough 2270 case T_SHORT: 2271 // Two bytes misalignment happens only for byte and short (char) arrays. 2272 __ tbz(to, 1, L_skip_align2); 2273 __ strh(value, Address(__ post(to, 2))); 2274 __ subw(count, count, 2 >> shift); 2275 __ bind(L_skip_align2); 2276 // Fallthrough 2277 case T_INT: 2278 // Align to 8 bytes, we know we are 4 byte aligned to start. 2279 __ tbz(to, 2, L_skip_align4); 2280 __ strw(value, Address(__ post(to, 4))); 2281 __ subw(count, count, 4 >> shift); 2282 __ bind(L_skip_align4); 2283 break; 2284 default: ShouldNotReachHere(); 2285 } 2286 } 2287 2288 // 2289 // Fill large chunks 2290 // 2291 __ lsrw(cnt_words, count, 3 - shift); // number of words 2292 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2293 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2294 if (UseBlockZeroing) { 2295 Label non_block_zeroing, rest; 2296 // If the fill value is zero we can use the fast zero_words(). 2297 __ cbnz(value, non_block_zeroing); 2298 __ mov(bz_base, to); 2299 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2300 __ zero_words(bz_base, cnt_words); 2301 __ b(rest); 2302 __ bind(non_block_zeroing); 2303 __ fill_words(to, cnt_words, value); 2304 __ bind(rest); 2305 } else { 2306 __ fill_words(to, cnt_words, value); 2307 } 2308 2309 // Remaining count is less than 8 bytes. Fill it by a single store. 2310 // Note that the total length is no less than 8 bytes. 
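    // Tail-store trick (sketch): at least one full 8-byte word has already been
    // written, so a final unaligned 8-byte store that ends exactly at the last
    // element may overlap bytes just filled with the same value, roughly:
    //   *(uint64_t*)((char*)to + (count << shift) - 8) = value;  // value already replicated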
2311 if (t == T_BYTE || t == T_SHORT) { 2312 Label L_exit1; 2313 __ cbzw(count, L_exit1); 2314 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2315 __ str(value, Address(to, -8)); // overwrite some elements 2316 __ bind(L_exit1); 2317 __ leave(); 2318 __ ret(lr); 2319 } 2320 2321 // Handle copies less than 8 bytes. 2322 Label L_fill_2, L_fill_4, L_exit2; 2323 __ bind(L_fill_elements); 2324 switch (t) { 2325 case T_BYTE: 2326 __ tbz(count, 0, L_fill_2); 2327 __ strb(value, Address(__ post(to, 1))); 2328 __ bind(L_fill_2); 2329 __ tbz(count, 1, L_fill_4); 2330 __ strh(value, Address(__ post(to, 2))); 2331 __ bind(L_fill_4); 2332 __ tbz(count, 2, L_exit2); 2333 __ strw(value, Address(to)); 2334 break; 2335 case T_SHORT: 2336 __ tbz(count, 0, L_fill_4); 2337 __ strh(value, Address(__ post(to, 2))); 2338 __ bind(L_fill_4); 2339 __ tbz(count, 1, L_exit2); 2340 __ strw(value, Address(to)); 2341 break; 2342 case T_INT: 2343 __ cbzw(count, L_exit2); 2344 __ strw(value, Address(to)); 2345 break; 2346 default: ShouldNotReachHere(); 2347 } 2348 __ bind(L_exit2); 2349 __ leave(); 2350 __ ret(lr); 2351 return start; 2352 } 2353 2354 address generate_data_cache_writeback() { 2355 const Register line = c_rarg0; // address of line to write back 2356 2357 __ align(CodeEntryAlignment); 2358 2359 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2360 2361 address start = __ pc(); 2362 __ enter(); 2363 __ cache_wb(Address(line, 0)); 2364 __ leave(); 2365 __ ret(lr); 2366 2367 return start; 2368 } 2369 2370 address generate_data_cache_writeback_sync() { 2371 const Register is_pre = c_rarg0; // pre or post sync 2372 2373 __ align(CodeEntryAlignment); 2374 2375 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2376 2377 // pre wbsync is a no-op 2378 // post wbsync translates to an sfence 2379 2380 Label skip; 2381 address start = __ pc(); 2382 __ enter(); 2383 __ cbnz(is_pre, skip); 2384 __ cache_wbsync(false); 2385 __ bind(skip); 2386 __ leave(); 2387 __ ret(lr); 2388 2389 return start; 2390 } 2391 2392 void generate_arraycopy_stubs() { 2393 address entry; 2394 address entry_jbyte_arraycopy; 2395 address entry_jshort_arraycopy; 2396 address entry_jint_arraycopy; 2397 address entry_oop_arraycopy; 2398 address entry_jlong_arraycopy; 2399 address entry_checkcast_arraycopy; 2400 2401 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2402 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2403 2404 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2405 2406 //*** jbyte 2407 // Always need aligned and unaligned versions 2408 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2409 "jbyte_disjoint_arraycopy"); 2410 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2411 &entry_jbyte_arraycopy, 2412 "jbyte_arraycopy"); 2413 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2414 "arrayof_jbyte_disjoint_arraycopy"); 2415 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2416 "arrayof_jbyte_arraycopy"); 2417 2418 //*** jshort 2419 // Always need aligned and unaligned versions 2420 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2421 "jshort_disjoint_arraycopy"); 2422 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2423 &entry_jshort_arraycopy, 2424 "jshort_arraycopy"); 2425 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2426 "arrayof_jshort_disjoint_arraycopy"); 2427 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2428 "arrayof_jshort_arraycopy"); 2429 2430 //*** jint 2431 // Aligned versions 2432 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2433 "arrayof_jint_disjoint_arraycopy"); 2434 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2435 "arrayof_jint_arraycopy"); 2436 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2437 // entry_jint_arraycopy always points to the unaligned version 2438 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2439 "jint_disjoint_arraycopy"); 2440 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2441 &entry_jint_arraycopy, 2442 "jint_arraycopy"); 2443 2444 //*** jlong 2445 // It is always aligned 2446 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2447 "arrayof_jlong_disjoint_arraycopy"); 2448 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2449 "arrayof_jlong_arraycopy"); 2450 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2451 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2452 2453 //*** oops 2454 { 2455 // With compressed oops we need unaligned versions; notice that 2456 // we overwrite entry_oop_arraycopy. 2457 bool aligned = !UseCompressedOops; 2458 2459 StubRoutines::_arrayof_oop_disjoint_arraycopy 2460 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2461 /*dest_uninitialized*/false); 2462 StubRoutines::_arrayof_oop_arraycopy 2463 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2464 /*dest_uninitialized*/false); 2465 // Aligned versions without pre-barriers 2466 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2467 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2468 /*dest_uninitialized*/true); 2469 StubRoutines::_arrayof_oop_arraycopy_uninit 2470 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2471 /*dest_uninitialized*/true); 2472 } 2473 2474 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2475 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2476 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2477 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2478 2479 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2480 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2481 /*dest_uninitialized*/true); 2482 2483 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2484 entry_jbyte_arraycopy, 2485 entry_jshort_arraycopy, 2486 entry_jint_arraycopy, 2487 entry_jlong_arraycopy); 2488 2489 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2490 entry_jbyte_arraycopy, 2491 entry_jshort_arraycopy, 2492 entry_jint_arraycopy, 2493 entry_oop_arraycopy, 2494 entry_jlong_arraycopy, 2495 entry_checkcast_arraycopy); 2496 2497 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2498 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2499 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2500 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2501 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2502 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2503 } 2504 2505 void generate_math_stubs() { Unimplemented(); } 2506 2507 // Arguments: 2508 // 2509 // Inputs: 2510 // c_rarg0 - source byte array address 2511 // c_rarg1 - destination byte array address 2512 // c_rarg2 - K (key) in little endian int array 2513 // 2514 address generate_aescrypt_encryptBlock() { 2515 __ align(CodeEntryAlignment); 2516 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2517 2518 Label L_doLast; 2519 2520 const Register from = c_rarg0; // source array address 2521 const Register to = c_rarg1; // destination array address 2522 const Register key = c_rarg2; // key array address 2523 const Register keylen = rscratch1; 2524 2525 address start = __ pc(); 2526 __ enter(); 2527 2528 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2529 2530 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2531 2532 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2533 __ rev32(v1, __ T16B, v1); 2534 __ rev32(v2, __ T16B, v2); 2535 __ rev32(v3, __ T16B, v3); 2536 __ rev32(v4, __ T16B, v4); 2537 __ aese(v0, v1); 2538 __ aesmc(v0, v0); 2539 __ aese(v0, v2); 2540 __ aesmc(v0, v0); 2541 __ aese(v0, v3); 2542 __ aesmc(v0, v0); 2543 __ aese(v0, v4); 2544 __ aesmc(v0, v0); 2545 2546 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2547 __ rev32(v1, __ T16B, v1); 2548 __ rev32(v2, __ T16B, v2); 2549 __ rev32(v3, __ T16B, v3); 2550 __ rev32(v4, __ T16B, v4); 2551 __ aese(v0, v1); 2552 __ aesmc(v0, v0); 2553 __ aese(v0, v2); 2554 __ aesmc(v0, v0); 2555 __ aese(v0, v3); 2556 __ aesmc(v0, v0); 2557 __ aese(v0, v4); 2558 __ aesmc(v0, v0); 2559 2560 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2561 __ rev32(v1, __ T16B, v1); 2562 __ rev32(v2, __ T16B, v2); 2563 2564 __ cmpw(keylen, 44); 2565 __ br(Assembler::EQ, L_doLast); 2566 2567 __ aese(v0, v1); 2568 __ aesmc(v0, v0); 2569 __ aese(v0, v2); 2570 __ aesmc(v0, v0); 2571 2572 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2573 __ rev32(v1, __ T16B, v1); 2574 __ rev32(v2, __ T16B, v2); 2575 2576 __ cmpw(keylen, 52); 2577 __ br(Assembler::EQ, L_doLast); 2578 2579 __ aese(v0, v1); 2580 __ aesmc(v0, v0); 2581 __ aese(v0, v2); 2582 __ aesmc(v0, v0); 2583 2584 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2585 __ rev32(v1, __ T16B, v1); 2586 __ rev32(v2, __ T16B, v2); 2587 2588 __ BIND(L_doLast); 2589 2590 __ aese(v0, v1); 2591 __ aesmc(v0, v0); 2592 __ aese(v0, v2); 2593 2594 __ ld1(v1, __ T16B, key); 2595 __ rev32(v1, __ T16B, v1); 2596 __ eor(v0, __ T16B, v0, v1); 2597 2598 __ st1(v0, __ T16B, to); 2599 2600 __ mov(r0, 0); 2601 2602 __ leave(); 2603 __ ret(lr); 2604 2605 return start; 2606 } 2607 2608 // Arguments: 2609 // 2610 // Inputs: 2611 // c_rarg0 - source byte array address 2612 // c_rarg1 - destination byte array address 2613 // c_rarg2 - K (key) in little endian int array 2614 // 2615 address generate_aescrypt_decryptBlock() { 2616 assert(UseAES, "need AES instructions and misaligned SSE support"); 2617 __ align(CodeEntryAlignment); 2618 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2619 Label L_doLast; 2620 2621 const 
Register from = c_rarg0; // source array address 2622 const Register to = c_rarg1; // destination array address 2623 const Register key = c_rarg2; // key array address 2624 const Register keylen = rscratch1; 2625 2626 address start = __ pc(); 2627 __ enter(); // required for proper stackwalking of RuntimeStub frame 2628 2629 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2630 2631 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2632 2633 __ ld1(v5, __ T16B, __ post(key, 16)); 2634 __ rev32(v5, __ T16B, v5); 2635 2636 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2637 __ rev32(v1, __ T16B, v1); 2638 __ rev32(v2, __ T16B, v2); 2639 __ rev32(v3, __ T16B, v3); 2640 __ rev32(v4, __ T16B, v4); 2641 __ aesd(v0, v1); 2642 __ aesimc(v0, v0); 2643 __ aesd(v0, v2); 2644 __ aesimc(v0, v0); 2645 __ aesd(v0, v3); 2646 __ aesimc(v0, v0); 2647 __ aesd(v0, v4); 2648 __ aesimc(v0, v0); 2649 2650 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2651 __ rev32(v1, __ T16B, v1); 2652 __ rev32(v2, __ T16B, v2); 2653 __ rev32(v3, __ T16B, v3); 2654 __ rev32(v4, __ T16B, v4); 2655 __ aesd(v0, v1); 2656 __ aesimc(v0, v0); 2657 __ aesd(v0, v2); 2658 __ aesimc(v0, v0); 2659 __ aesd(v0, v3); 2660 __ aesimc(v0, v0); 2661 __ aesd(v0, v4); 2662 __ aesimc(v0, v0); 2663 2664 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2665 __ rev32(v1, __ T16B, v1); 2666 __ rev32(v2, __ T16B, v2); 2667 2668 __ cmpw(keylen, 44); 2669 __ br(Assembler::EQ, L_doLast); 2670 2671 __ aesd(v0, v1); 2672 __ aesimc(v0, v0); 2673 __ aesd(v0, v2); 2674 __ aesimc(v0, v0); 2675 2676 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2677 __ rev32(v1, __ T16B, v1); 2678 __ rev32(v2, __ T16B, v2); 2679 2680 __ cmpw(keylen, 52); 2681 __ br(Assembler::EQ, L_doLast); 2682 2683 __ aesd(v0, v1); 2684 __ aesimc(v0, v0); 2685 __ aesd(v0, v2); 2686 __ aesimc(v0, v0); 2687 2688 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2689 __ rev32(v1, __ T16B, v1); 2690 __ rev32(v2, __ T16B, v2); 2691 2692 __ BIND(L_doLast); 2693 2694 __ aesd(v0, v1); 2695 __ aesimc(v0, v0); 2696 __ aesd(v0, v2); 2697 2698 __ eor(v0, __ T16B, v0, v5); 2699 2700 __ st1(v0, __ T16B, to); 2701 2702 __ mov(r0, 0); 2703 2704 __ leave(); 2705 __ ret(lr); 2706 2707 return start; 2708 } 2709 2710 // Arguments: 2711 // 2712 // Inputs: 2713 // c_rarg0 - source byte array address 2714 // c_rarg1 - destination byte array address 2715 // c_rarg2 - K (key) in little endian int array 2716 // c_rarg3 - r vector byte array address 2717 // c_rarg4 - input length 2718 // 2719 // Output: 2720 // x0 - input length 2721 // 2722 address generate_cipherBlockChaining_encryptAESCrypt() { 2723 assert(UseAES, "need AES instructions and misaligned SSE support"); 2724 __ align(CodeEntryAlignment); 2725 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2726 2727 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2728 2729 const Register from = c_rarg0; // source array address 2730 const Register to = c_rarg1; // destination array address 2731 const Register key = c_rarg2; // key array address 2732 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2733 // and left with the results of the last encryption block 2734 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2735 const Register keylen = rscratch1; 2736 2737 address start = __ pc(); 2738 2739 __ enter(); 2740 2741 __ movw(rscratch2, len_reg); 2742 2743 __ ldrw(keylen, Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2744 2745 __ ld1(v0, __ T16B, rvec); 2746 2747 __ cmpw(keylen, 52); 2748 __ br(Assembler::CC, L_loadkeys_44); 2749 __ br(Assembler::EQ, L_loadkeys_52); 2750 2751 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2752 __ rev32(v17, __ T16B, v17); 2753 __ rev32(v18, __ T16B, v18); 2754 __ BIND(L_loadkeys_52); 2755 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2756 __ rev32(v19, __ T16B, v19); 2757 __ rev32(v20, __ T16B, v20); 2758 __ BIND(L_loadkeys_44); 2759 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2760 __ rev32(v21, __ T16B, v21); 2761 __ rev32(v22, __ T16B, v22); 2762 __ rev32(v23, __ T16B, v23); 2763 __ rev32(v24, __ T16B, v24); 2764 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2765 __ rev32(v25, __ T16B, v25); 2766 __ rev32(v26, __ T16B, v26); 2767 __ rev32(v27, __ T16B, v27); 2768 __ rev32(v28, __ T16B, v28); 2769 __ ld1(v29, v30, v31, __ T16B, key); 2770 __ rev32(v29, __ T16B, v29); 2771 __ rev32(v30, __ T16B, v30); 2772 __ rev32(v31, __ T16B, v31); 2773 2774 __ BIND(L_aes_loop); 2775 __ ld1(v1, __ T16B, __ post(from, 16)); 2776 __ eor(v0, __ T16B, v0, v1); 2777 2778 __ br(Assembler::CC, L_rounds_44); 2779 __ br(Assembler::EQ, L_rounds_52); 2780 2781 __ aese(v0, v17); __ aesmc(v0, v0); 2782 __ aese(v0, v18); __ aesmc(v0, v0); 2783 __ BIND(L_rounds_52); 2784 __ aese(v0, v19); __ aesmc(v0, v0); 2785 __ aese(v0, v20); __ aesmc(v0, v0); 2786 __ BIND(L_rounds_44); 2787 __ aese(v0, v21); __ aesmc(v0, v0); 2788 __ aese(v0, v22); __ aesmc(v0, v0); 2789 __ aese(v0, v23); __ aesmc(v0, v0); 2790 __ aese(v0, v24); __ aesmc(v0, v0); 2791 __ aese(v0, v25); __ aesmc(v0, v0); 2792 __ aese(v0, v26); __ aesmc(v0, v0); 2793 __ aese(v0, v27); __ aesmc(v0, v0); 2794 __ aese(v0, v28); __ aesmc(v0, v0); 2795 __ aese(v0, v29); __ aesmc(v0, v0); 2796 __ aese(v0, v30); 2797 __ eor(v0, __ T16B, v0, v31); 2798 2799 __ st1(v0, __ T16B, __ post(to, 16)); 2800 2801 __ subw(len_reg, len_reg, 16); 2802 __ cbnzw(len_reg, L_aes_loop); 2803 2804 __ st1(v0, __ T16B, rvec); 2805 2806 __ mov(r0, rscratch2); 2807 2808 __ leave(); 2809 __ ret(lr); 2810 2811 return start; 2812 } 2813 2814 // Arguments: 2815 // 2816 // Inputs: 2817 // c_rarg0 - source byte array address 2818 // c_rarg1 - destination byte array address 2819 // c_rarg2 - K (key) in little endian int array 2820 // c_rarg3 - r vector byte array address 2821 // c_rarg4 - input length 2822 // 2823 // Output: 2824 // r0 - input length 2825 // 2826 address generate_cipherBlockChaining_decryptAESCrypt() { 2827 assert(UseAES, "need AES instructions and misaligned SSE support"); 2828 __ align(CodeEntryAlignment); 2829 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2830 2831 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2832 2833 const Register from = c_rarg0; // source array address 2834 const Register to = c_rarg1; // destination array address 2835 const Register key = c_rarg2; // key array address 2836 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2837 // and left with the results of the last encryption block 2838 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2839 const Register keylen = rscratch1; 2840 2841 address start = __ pc(); 2842 2843 __ enter(); 2844 2845 __ movw(rscratch2, len_reg); 2846 2847 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2848 2849 __ 
ld1(v2, __ T16B, rvec); 2850 2851 __ ld1(v31, __ T16B, __ post(key, 16)); 2852 __ rev32(v31, __ T16B, v31); 2853 2854 __ cmpw(keylen, 52); 2855 __ br(Assembler::CC, L_loadkeys_44); 2856 __ br(Assembler::EQ, L_loadkeys_52); 2857 2858 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2859 __ rev32(v17, __ T16B, v17); 2860 __ rev32(v18, __ T16B, v18); 2861 __ BIND(L_loadkeys_52); 2862 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2863 __ rev32(v19, __ T16B, v19); 2864 __ rev32(v20, __ T16B, v20); 2865 __ BIND(L_loadkeys_44); 2866 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2867 __ rev32(v21, __ T16B, v21); 2868 __ rev32(v22, __ T16B, v22); 2869 __ rev32(v23, __ T16B, v23); 2870 __ rev32(v24, __ T16B, v24); 2871 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2872 __ rev32(v25, __ T16B, v25); 2873 __ rev32(v26, __ T16B, v26); 2874 __ rev32(v27, __ T16B, v27); 2875 __ rev32(v28, __ T16B, v28); 2876 __ ld1(v29, v30, __ T16B, key); 2877 __ rev32(v29, __ T16B, v29); 2878 __ rev32(v30, __ T16B, v30); 2879 2880 __ BIND(L_aes_loop); 2881 __ ld1(v0, __ T16B, __ post(from, 16)); 2882 __ orr(v1, __ T16B, v0, v0); 2883 2884 __ br(Assembler::CC, L_rounds_44); 2885 __ br(Assembler::EQ, L_rounds_52); 2886 2887 __ aesd(v0, v17); __ aesimc(v0, v0); 2888 __ aesd(v0, v18); __ aesimc(v0, v0); 2889 __ BIND(L_rounds_52); 2890 __ aesd(v0, v19); __ aesimc(v0, v0); 2891 __ aesd(v0, v20); __ aesimc(v0, v0); 2892 __ BIND(L_rounds_44); 2893 __ aesd(v0, v21); __ aesimc(v0, v0); 2894 __ aesd(v0, v22); __ aesimc(v0, v0); 2895 __ aesd(v0, v23); __ aesimc(v0, v0); 2896 __ aesd(v0, v24); __ aesimc(v0, v0); 2897 __ aesd(v0, v25); __ aesimc(v0, v0); 2898 __ aesd(v0, v26); __ aesimc(v0, v0); 2899 __ aesd(v0, v27); __ aesimc(v0, v0); 2900 __ aesd(v0, v28); __ aesimc(v0, v0); 2901 __ aesd(v0, v29); __ aesimc(v0, v0); 2902 __ aesd(v0, v30); 2903 __ eor(v0, __ T16B, v0, v31); 2904 __ eor(v0, __ T16B, v0, v2); 2905 2906 __ st1(v0, __ T16B, __ post(to, 16)); 2907 __ orr(v2, __ T16B, v1, v1); 2908 2909 __ subw(len_reg, len_reg, 16); 2910 __ cbnzw(len_reg, L_aes_loop); 2911 2912 __ st1(v2, __ T16B, rvec); 2913 2914 __ mov(r0, rscratch2); 2915 2916 __ leave(); 2917 __ ret(lr); 2918 2919 return start; 2920 } 2921 2922 // Arguments: 2923 // 2924 // Inputs: 2925 // c_rarg0 - byte[] source+offset 2926 // c_rarg1 - int[] SHA.state 2927 // c_rarg2 - int offset 2928 // c_rarg3 - int limit 2929 // 2930 address generate_sha1_implCompress(bool multi_block, const char *name) { 2931 __ align(CodeEntryAlignment); 2932 StubCodeMark mark(this, "StubRoutines", name); 2933 address start = __ pc(); 2934 2935 Register buf = c_rarg0; 2936 Register state = c_rarg1; 2937 Register ofs = c_rarg2; 2938 Register limit = c_rarg3; 2939 2940 Label keys; 2941 Label sha1_loop; 2942 2943 // load the keys into v0..v3 2944 __ adr(rscratch1, keys); 2945 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2946 // load 5 words state into v6, v7 2947 __ ldrq(v6, Address(state, 0)); 2948 __ ldrs(v7, Address(state, 16)); 2949 2950 2951 __ BIND(sha1_loop); 2952 // load 64 bytes of data into v16..v19 2953 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 2954 __ rev32(v16, __ T16B, v16); 2955 __ rev32(v17, __ T16B, v17); 2956 __ rev32(v18, __ T16B, v18); 2957 __ rev32(v19, __ T16B, v19); 2958 2959 // do the sha1 2960 __ addv(v4, __ T4S, v16, v0); 2961 __ orr(v20, __ T16B, v6, v6); 2962 2963 FloatRegister d0 = v16; 2964 FloatRegister d1 = v17; 2965 FloatRegister d2 = v18; 2966 FloatRegister d3 = v19; 2967 2968 for (int round = 0; round < 20; round++) { 2969 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2970 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2971 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2972 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2973 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2974 2975 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2976 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2977 __ sha1h(tmp2, __ T4S, v20); 2978 if (round < 5) 2979 __ sha1c(v20, __ T4S, tmp3, tmp4); 2980 else if (round < 10 || round >= 15) 2981 __ sha1p(v20, __ T4S, tmp3, tmp4); 2982 else 2983 __ sha1m(v20, __ T4S, tmp3, tmp4); 2984 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2985 2986 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2987 } 2988 2989 __ addv(v7, __ T2S, v7, v21); 2990 __ addv(v6, __ T4S, v6, v20); 2991 2992 if (multi_block) { 2993 __ add(ofs, ofs, 64); 2994 __ cmp(ofs, limit); 2995 __ br(Assembler::LE, sha1_loop); 2996 __ mov(c_rarg0, ofs); // return ofs 2997 } 2998 2999 __ strq(v6, Address(state, 0)); 3000 __ strs(v7, Address(state, 16)); 3001 3002 __ ret(lr); 3003 3004 __ bind(keys); 3005 __ emit_int32(0x5a827999); 3006 __ emit_int32(0x6ed9eba1); 3007 __ emit_int32(0x8f1bbcdc); 3008 __ emit_int32(0xca62c1d6); 3009 3010 return start; 3011 } 3012 3013 3014 // Arguments: 3015 // 3016 // Inputs: 3017 // c_rarg0 - byte[] source+offset 3018 // c_rarg1 - int[] SHA.state 3019 // c_rarg2 - int offset 3020 // c_rarg3 - int limit 3021 // 3022 address generate_sha256_implCompress(bool multi_block, const char *name) { 3023 static const uint32_t round_consts[64] = { 3024 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3025 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3026 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3027 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3028 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3029 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3030 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3031 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3032 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3033 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3034 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3035 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3036 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3037 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3038 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3039 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3040 }; 3041 __ align(CodeEntryAlignment); 3042 StubCodeMark mark(this, "StubRoutines", name); 3043 address start = __ pc(); 3044 3045 Register buf = c_rarg0; 3046 Register state = c_rarg1; 3047 Register ofs = c_rarg2; 3048 Register limit = c_rarg3; 3049 3050 Label sha1_loop; 3051 3052 __ stpd(v8, v9, __ pre(sp, -32)); 3053 __ stpd(v10, v11, Address(sp, 16)); 3054 3055 // dga == v0 3056 // dgb == v1 3057 // dg0 == v2 3058 // dg1 == v3 3059 // dg2 == v4 3060 // t0 == v6 3061 // t1 == v7 3062 3063 // load 16 keys to v16..v31 3064 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3065 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3066 __ 
ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3067 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3068 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3069 3070 // load 8 words (256 bits) state 3071 __ ldpq(v0, v1, state); 3072 3073 __ BIND(sha1_loop); 3074 // load 64 bytes of data into v8..v11 3075 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3076 __ rev32(v8, __ T16B, v8); 3077 __ rev32(v9, __ T16B, v9); 3078 __ rev32(v10, __ T16B, v10); 3079 __ rev32(v11, __ T16B, v11); 3080 3081 __ addv(v6, __ T4S, v8, v16); 3082 __ orr(v2, __ T16B, v0, v0); 3083 __ orr(v3, __ T16B, v1, v1); 3084 3085 FloatRegister d0 = v8; 3086 FloatRegister d1 = v9; 3087 FloatRegister d2 = v10; 3088 FloatRegister d3 = v11; 3089 3090 3091 for (int round = 0; round < 16; round++) { 3092 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3093 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3094 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3095 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3096 3097 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3098 __ orr(v4, __ T16B, v2, v2); 3099 if (round < 15) 3100 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3101 __ sha256h(v2, __ T4S, v3, tmp2); 3102 __ sha256h2(v3, __ T4S, v4, tmp2); 3103 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3104 3105 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3106 } 3107 3108 __ addv(v0, __ T4S, v0, v2); 3109 __ addv(v1, __ T4S, v1, v3); 3110 3111 if (multi_block) { 3112 __ add(ofs, ofs, 64); 3113 __ cmp(ofs, limit); 3114 __ br(Assembler::LE, sha1_loop); 3115 __ mov(c_rarg0, ofs); // return ofs 3116 } 3117 3118 __ ldpd(v10, v11, Address(sp, 16)); 3119 __ ldpd(v8, v9, __ post(sp, 32)); 3120 3121 __ stpq(v0, v1, state); 3122 3123 __ ret(lr); 3124 3125 return start; 3126 } 3127 3128 // Arguments: 3129 // 3130 // Inputs: 3131 // c_rarg0 - byte[] source+offset 3132 // c_rarg1 - int[] SHA.state 3133 // c_rarg2 - int offset 3134 // c_rarg3 - int limit 3135 // 3136 address generate_sha512_implCompress(bool multi_block, const char *name) { 3137 static const uint64_t round_consts[80] = { 3138 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3139 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3140 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3141 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3142 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3143 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3144 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3145 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3146 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3147 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3148 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3149 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3150 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3151 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3152 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3153 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3154 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3155 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3156 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3157 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3158 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3159 
0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3160 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3161 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3162 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3163 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3164 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3165 }; 3166 3167 // Double rounds for sha512. 3168 #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \ 3169 if (dr < 36) \ 3170 __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16)); \ 3171 __ addv(v5, __ T2D, v##rc0, v##in0); \ 3172 __ ext(v6, __ T16B, v##i2, v##i3, 8); \ 3173 __ ext(v5, __ T16B, v5, v5, 8); \ 3174 __ ext(v7, __ T16B, v##i1, v##i2, 8); \ 3175 __ addv(v##i3, __ T2D, v##i3, v5); \ 3176 if (dr < 32) { \ 3177 __ ext(v5, __ T16B, v##in3, v##in4, 8); \ 3178 __ sha512su0(v##in0, __ T2D, v##in1); \ 3179 } \ 3180 __ sha512h(v##i3, __ T2D, v6, v7); \ 3181 if (dr < 32) \ 3182 __ sha512su1(v##in0, __ T2D, v##in2, v5); \ 3183 __ addv(v##i4, __ T2D, v##i1, v##i3); \ 3184 __ sha512h2(v##i3, __ T2D, v##i1, v##i0); \ 3185 3186 __ align(CodeEntryAlignment); 3187 StubCodeMark mark(this, "StubRoutines", name); 3188 address start = __ pc(); 3189 3190 Register buf = c_rarg0; 3191 Register state = c_rarg1; 3192 Register ofs = c_rarg2; 3193 Register limit = c_rarg3; 3194 3195 __ stpd(v8, v9, __ pre(sp, -64)); 3196 __ stpd(v10, v11, Address(sp, 16)); 3197 __ stpd(v12, v13, Address(sp, 32)); 3198 __ stpd(v14, v15, Address(sp, 48)); 3199 3200 Label sha512_loop; 3201 3202 // load state 3203 __ ld1(v8, v9, v10, v11, __ T2D, state); 3204 3205 // load first 4 round constants 3206 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3207 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3208 3209 __ BIND(sha512_loop); 3210 // load 128B of data into v12..v19 3211 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3212 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3213 __ rev64(v12, __ T16B, v12); 3214 __ rev64(v13, __ T16B, v13); 3215 __ rev64(v14, __ T16B, v14); 3216 __ rev64(v15, __ T16B, v15); 3217 __ rev64(v16, __ T16B, v16); 3218 __ rev64(v17, __ T16B, v17); 3219 __ rev64(v18, __ T16B, v18); 3220 __ rev64(v19, __ T16B, v19); 3221 3222 __ mov(rscratch2, rscratch1); 3223 3224 __ mov(v0, __ T16B, v8); 3225 __ mov(v1, __ T16B, v9); 3226 __ mov(v2, __ T16B, v10); 3227 __ mov(v3, __ T16B, v11); 3228 3229 sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17); 3230 sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18); 3231 sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19); 3232 sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12); 3233 sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13); 3234 sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14); 3235 sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15); 3236 sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16); 3237 sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17); 3238 sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18); 3239 sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19); 3240 sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12); 3241 sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13); 3242 sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14); 3243 sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15); 3244 sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16); 3245 sha512_dround(16, 3, 0, 
4, 2, 1, 28, 24, 12, 13, 19, 16, 17); 3246 sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18); 3247 sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19); 3248 sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12); 3249 sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13); 3250 sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14); 3251 sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15); 3252 sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16); 3253 sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17); 3254 sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18); 3255 sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19); 3256 sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12); 3257 sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13); 3258 sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14); 3259 sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15); 3260 sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16); 3261 sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12, 0, 0, 0, 0); 3262 sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13, 0, 0, 0, 0); 3263 sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14, 0, 0, 0, 0); 3264 sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15, 0, 0, 0, 0); 3265 sha512_dround(36, 3, 0, 4, 2, 1, 24, 0, 16, 0, 0, 0, 0); 3266 sha512_dround(37, 2, 3, 1, 4, 0, 25, 0, 17, 0, 0, 0, 0); 3267 sha512_dround(38, 4, 2, 0, 1, 3, 26, 0, 18, 0, 0, 0, 0); 3268 sha512_dround(39, 1, 4, 3, 0, 2, 27, 0, 19, 0, 0, 0, 0); 3269 3270 __ addv(v8, __ T2D, v8, v0); 3271 __ addv(v9, __ T2D, v9, v1); 3272 __ addv(v10, __ T2D, v10, v2); 3273 __ addv(v11, __ T2D, v11, v3); 3274 3275 if (multi_block) { 3276 __ add(ofs, ofs, 128); 3277 __ cmp(ofs, limit); 3278 __ br(Assembler::LE, sha512_loop); 3279 __ mov(c_rarg0, ofs); // return ofs 3280 } 3281 3282 __ st1(v8, v9, v10, v11, __ T2D, state); 3283 3284 __ ldpd(v14, v15, Address(sp, 48)); 3285 __ ldpd(v12, v13, Address(sp, 32)); 3286 __ ldpd(v10, v11, Address(sp, 16)); 3287 __ ldpd(v8, v9, __ post(sp, 64)); 3288 3289 __ ret(lr); 3290 3291 return start; 3292 } 3293 3294 // Safefetch stubs. 3295 void generate_safefetch(const char* name, int size, address* entry, 3296 address* fault_pc, address* continuation_pc) { 3297 // safefetch signatures: 3298 // int SafeFetch32(int* adr, int errValue); 3299 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3300 // 3301 // arguments: 3302 // c_rarg0 = adr 3303 // c_rarg1 = errValue 3304 // 3305 // result: 3306 // PPC_RET = *adr or errValue 3307 3308 StubCodeMark mark(this, "StubRoutines", name); 3309 3310 // Entry point, pc or function descriptor. 3311 *entry = __ pc(); 3312 3313 // Load *adr into c_rarg1, may fault. 
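    // SafeFetch contract (sketch): *fault_pc and *continuation_pc are recorded
    // so that, if the load below faults, the signal handler can resume at the
    // continuation with c_rarg1 still holding errValue. The net effect is
    // roughly:
    //   int SafeFetch32(int* adr, int errValue) {
    //     return can_read(adr) ? *adr : errValue;   // can_read() is illustrative
    //   }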
3314 *fault_pc = __ pc(); 3315 switch (size) { 3316 case 4: 3317 // int32_t 3318 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3319 break; 3320 case 8: 3321 // int64_t 3322 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3323 break; 3324 default: 3325 ShouldNotReachHere(); 3326 } 3327 3328 // return errValue or *adr 3329 *continuation_pc = __ pc(); 3330 __ mov(r0, c_rarg1); 3331 __ ret(lr); 3332 } 3333 3334 /** 3335 * Arguments: 3336 * 3337 * Inputs: 3338 * c_rarg0 - int crc 3339 * c_rarg1 - byte* buf 3340 * c_rarg2 - int length 3341 * 3342 * Output: 3343 * r0 - int crc result 3344 */ 3345 address generate_updateBytesCRC32() { 3346 assert(UseCRC32Intrinsics, "what are we doing here?"); 3347 3348 __ align(CodeEntryAlignment); 3349 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3350 3351 address start = __ pc(); 3352 3353 const Register crc = c_rarg0; // crc 3354 const Register buf = c_rarg1; // source java byte array address 3355 const Register len = c_rarg2; // length 3356 const Register table0 = c_rarg3; // crc_table address 3357 const Register table1 = c_rarg4; 3358 const Register table2 = c_rarg5; 3359 const Register table3 = c_rarg6; 3360 const Register tmp3 = c_rarg7; 3361 3362 BLOCK_COMMENT("Entry:"); 3363 __ enter(); // required for proper stackwalking of RuntimeStub frame 3364 3365 __ kernel_crc32(crc, buf, len, 3366 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3367 3368 __ leave(); // required for proper stackwalking of RuntimeStub frame 3369 __ ret(lr); 3370 3371 return start; 3372 } 3373 3374 /** 3375 * Arguments: 3376 * 3377 * Inputs: 3378 * c_rarg0 - int crc 3379 * c_rarg1 - byte* buf 3380 * c_rarg2 - int length 3381 * c_rarg3 - int* table 3382 * 3383 * Output: 3384 * r0 - int crc result 3385 */ 3386 address generate_updateBytesCRC32C() { 3387 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3388 3389 __ align(CodeEntryAlignment); 3390 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3391 3392 address start = __ pc(); 3393 3394 const Register crc = c_rarg0; // crc 3395 const Register buf = c_rarg1; // source java byte array address 3396 const Register len = c_rarg2; // length 3397 const Register table0 = c_rarg3; // crc_table address 3398 const Register table1 = c_rarg4; 3399 const Register table2 = c_rarg5; 3400 const Register table3 = c_rarg6; 3401 const Register tmp3 = c_rarg7; 3402 3403 BLOCK_COMMENT("Entry:"); 3404 __ enter(); // required for proper stackwalking of RuntimeStub frame 3405 3406 __ kernel_crc32c(crc, buf, len, 3407 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3408 3409 __ leave(); // required for proper stackwalking of RuntimeStub frame 3410 __ ret(lr); 3411 3412 return start; 3413 } 3414 3415 /*** 3416 * Arguments: 3417 * 3418 * Inputs: 3419 * c_rarg0 - int adler 3420 * c_rarg1 - byte* buff 3421 * c_rarg2 - int len 3422 * 3423 * Output: 3424 * c_rarg0 - int adler result 3425 */ 3426 address generate_updateBytesAdler32() { 3427 __ align(CodeEntryAlignment); 3428 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3429 address start = __ pc(); 3430 3431 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3432 3433 // Aliases 3434 Register adler = c_rarg0; 3435 Register s1 = c_rarg0; 3436 Register s2 = c_rarg3; 3437 Register buff = c_rarg1; 3438 Register len = c_rarg2; 3439 Register nmax = r4; 3440 Register base = r5; 3441 Register count = r6; 3442 Register temp0 = rscratch1; 3443 Register temp1 = rscratch2; 3444 FloatRegister vbytes = v0; 3445
FloatRegister vs1acc = v1; 3446 FloatRegister vs2acc = v2; 3447 FloatRegister vtable = v3; 3448 3449 // Max number of bytes we can process before having to take the mod 3450 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3451 uint64_t BASE = 0xfff1; 3452 uint64_t NMAX = 0x15B0; 3453 3454 __ mov(base, BASE); 3455 __ mov(nmax, NMAX); 3456 3457 // Load accumulation coefficients for the upper 16 bits 3458 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3459 __ ld1(vtable, __ T16B, Address(temp0)); 3460 3461 // s1 is initialized to the lower 16 bits of adler 3462 // s2 is initialized to the upper 16 bits of adler 3463 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3464 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3465 3466 // The pipelined loop needs at least 16 elements for 1 iteration 3467 // It does check this, but it is more effective to skip to the cleanup loop 3468 __ cmp(len, (u1)16); 3469 __ br(Assembler::HS, L_nmax); 3470 __ cbz(len, L_combine); 3471 3472 __ bind(L_simple_by1_loop); 3473 __ ldrb(temp0, Address(__ post(buff, 1))); 3474 __ add(s1, s1, temp0); 3475 __ add(s2, s2, s1); 3476 __ subs(len, len, 1); 3477 __ br(Assembler::HI, L_simple_by1_loop); 3478 3479 // s1 = s1 % BASE 3480 __ subs(temp0, s1, base); 3481 __ csel(s1, temp0, s1, Assembler::HS); 3482 3483 // s2 = s2 % BASE 3484 __ lsr(temp0, s2, 16); 3485 __ lsl(temp1, temp0, 4); 3486 __ sub(temp1, temp1, temp0); 3487 __ add(s2, temp1, s2, ext::uxth); 3488 3489 __ subs(temp0, s2, base); 3490 __ csel(s2, temp0, s2, Assembler::HS); 3491 3492 __ b(L_combine); 3493 3494 __ bind(L_nmax); 3495 __ subs(len, len, nmax); 3496 __ sub(count, nmax, 16); 3497 __ br(Assembler::LO, L_by16); 3498 3499 __ bind(L_nmax_loop); 3500 3501 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3502 vbytes, vs1acc, vs2acc, vtable); 3503 3504 __ subs(count, count, 16); 3505 __ br(Assembler::HS, L_nmax_loop); 3506 3507 // s1 = s1 % BASE 3508 __ lsr(temp0, s1, 16); 3509 __ lsl(temp1, temp0, 4); 3510 __ sub(temp1, temp1, temp0); 3511 __ add(temp1, temp1, s1, ext::uxth); 3512 3513 __ lsr(temp0, temp1, 16); 3514 __ lsl(s1, temp0, 4); 3515 __ sub(s1, s1, temp0); 3516 __ add(s1, s1, temp1, ext:: uxth); 3517 3518 __ subs(temp0, s1, base); 3519 __ csel(s1, temp0, s1, Assembler::HS); 3520 3521 // s2 = s2 % BASE 3522 __ lsr(temp0, s2, 16); 3523 __ lsl(temp1, temp0, 4); 3524 __ sub(temp1, temp1, temp0); 3525 __ add(temp1, temp1, s2, ext::uxth); 3526 3527 __ lsr(temp0, temp1, 16); 3528 __ lsl(s2, temp0, 4); 3529 __ sub(s2, s2, temp0); 3530 __ add(s2, s2, temp1, ext:: uxth); 3531 3532 __ subs(temp0, s2, base); 3533 __ csel(s2, temp0, s2, Assembler::HS); 3534 3535 __ subs(len, len, nmax); 3536 __ sub(count, nmax, 16); 3537 __ br(Assembler::HS, L_nmax_loop); 3538 3539 __ bind(L_by16); 3540 __ adds(len, len, count); 3541 __ br(Assembler::LO, L_by1); 3542 3543 __ bind(L_by16_loop); 3544 3545 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3546 vbytes, vs1acc, vs2acc, vtable); 3547 3548 __ subs(len, len, 16); 3549 __ br(Assembler::HS, L_by16_loop); 3550 3551 __ bind(L_by1); 3552 __ adds(len, len, 15); 3553 __ br(Assembler::LO, L_do_mod); 3554 3555 __ bind(L_by1_loop); 3556 __ ldrb(temp0, Address(__ post(buff, 1))); 3557 __ add(s1, temp0, s1); 3558 __ add(s2, s2, s1); 3559 __ subs(len, len, 1); 3560 __ br(Assembler::HS, L_by1_loop); 3561 3562 __ bind(L_do_mod); 3563 // s1 = s1 % BASE 3564 __ lsr(temp0, s1, 16); 3565 __ lsl(temp1, temp0, 4); 3566 __ sub(temp1, 
temp1, temp0); 3567 __ add(temp1, temp1, s1, ext::uxth); 3568 3569 __ lsr(temp0, temp1, 16); 3570 __ lsl(s1, temp0, 4); 3571 __ sub(s1, s1, temp0); 3572 __ add(s1, s1, temp1, ext:: uxth); 3573 3574 __ subs(temp0, s1, base); 3575 __ csel(s1, temp0, s1, Assembler::HS); 3576 3577 // s2 = s2 % BASE 3578 __ lsr(temp0, s2, 16); 3579 __ lsl(temp1, temp0, 4); 3580 __ sub(temp1, temp1, temp0); 3581 __ add(temp1, temp1, s2, ext::uxth); 3582 3583 __ lsr(temp0, temp1, 16); 3584 __ lsl(s2, temp0, 4); 3585 __ sub(s2, s2, temp0); 3586 __ add(s2, s2, temp1, ext:: uxth); 3587 3588 __ subs(temp0, s2, base); 3589 __ csel(s2, temp0, s2, Assembler::HS); 3590 3591 // Combine lower bits and higher bits 3592 __ bind(L_combine); 3593 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3594 3595 __ ret(lr); 3596 3597 return start; 3598 } 3599 3600 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3601 Register temp0, Register temp1, FloatRegister vbytes, 3602 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3603 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3604 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3605 // In non-vectorized code, we update s1 and s2 as: 3606 // s1 <- s1 + b1 3607 // s2 <- s2 + s1 3608 // s1 <- s1 + b2 3609 // s2 <- s2 + b1 3610 // ... 3611 // s1 <- s1 + b16 3612 // s2 <- s2 + s1 3613 // Putting above assignments together, we have: 3614 // s1_new = s1 + b1 + b2 + ... + b16 3615 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3616 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3617 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3618 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3619 3620 // s2 = s2 + s1 * 16 3621 __ add(s2, s2, s1, Assembler::LSL, 4); 3622 3623 // vs1acc = b1 + b2 + b3 + ... + b16 3624 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 3625 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3626 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3627 __ uaddlv(vs1acc, __ T16B, vbytes); 3628 __ uaddlv(vs2acc, __ T8H, vs2acc); 3629 3630 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3631 __ fmovd(temp0, vs1acc); 3632 __ fmovd(temp1, vs2acc); 3633 __ add(s1, s1, temp0); 3634 __ add(s2, s2, temp1); 3635 } 3636 3637 /** 3638 * Arguments: 3639 * 3640 * Input: 3641 * c_rarg0 - x address 3642 * c_rarg1 - x length 3643 * c_rarg2 - y address 3644 * c_rarg3 - y lenth 3645 * c_rarg4 - z address 3646 * c_rarg5 - z length 3647 */ 3648 address generate_multiplyToLen() { 3649 __ align(CodeEntryAlignment); 3650 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3651 3652 address start = __ pc(); 3653 const Register x = r0; 3654 const Register xlen = r1; 3655 const Register y = r2; 3656 const Register ylen = r3; 3657 const Register z = r4; 3658 const Register zlen = r5; 3659 3660 const Register tmp1 = r10; 3661 const Register tmp2 = r11; 3662 const Register tmp3 = r12; 3663 const Register tmp4 = r13; 3664 const Register tmp5 = r14; 3665 const Register tmp6 = r15; 3666 const Register tmp7 = r16; 3667 3668 BLOCK_COMMENT("Entry:"); 3669 __ enter(); // required for proper stackwalking of RuntimeStub frame 3670 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3671 __ leave(); // required for proper stackwalking of RuntimeStub frame 3672 __ ret(lr); 3673 3674 return start; 3675 } 3676 3677 address generate_squareToLen() { 3678 // squareToLen algorithm for sizes 1..127 described in java code works 3679 // faster than multiply_to_len on some CPUs and slower on others, but 3680 // multiply_to_len shows a bit better overall results 3681 __ align(CodeEntryAlignment); 3682 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3683 address start = __ pc(); 3684 3685 const Register x = r0; 3686 const Register xlen = r1; 3687 const Register z = r2; 3688 const Register zlen = r3; 3689 const Register y = r4; // == x 3690 const Register ylen = r5; // == xlen 3691 3692 const Register tmp1 = r10; 3693 const Register tmp2 = r11; 3694 const Register tmp3 = r12; 3695 const Register tmp4 = r13; 3696 const Register tmp5 = r14; 3697 const Register tmp6 = r15; 3698 const Register tmp7 = r16; 3699 3700 RegSet spilled_regs = RegSet::of(y, ylen); 3701 BLOCK_COMMENT("Entry:"); 3702 __ enter(); 3703 __ push(spilled_regs, sp); 3704 __ mov(y, x); 3705 __ mov(ylen, xlen); 3706 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3707 __ pop(spilled_regs, sp); 3708 __ leave(); 3709 __ ret(lr); 3710 return start; 3711 } 3712 3713 address generate_mulAdd() { 3714 __ align(CodeEntryAlignment); 3715 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3716 3717 address start = __ pc(); 3718 3719 const Register out = r0; 3720 const Register in = r1; 3721 const Register offset = r2; 3722 const Register len = r3; 3723 const Register k = r4; 3724 3725 BLOCK_COMMENT("Entry:"); 3726 __ enter(); 3727 __ mul_add(out, in, offset, len, k); 3728 __ leave(); 3729 __ ret(lr); 3730 3731 return start; 3732 } 3733 3734 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3735 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3736 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3737 // Karatsuba multiplication performs a 128*128 -> 256-bit 3738 // multiplication in three 128-bit multiplications and a few 3739 // additions. 
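  // In GF(2) the "+" below is just XOR, and only three carry-less multiplies
  // are needed because the middle terms satisfy (a sketch of the identity,
  // using the names from the equations that follow):
  //   A1*B0 + A0*B1 = (A1+A0)*(B1+B0) + A1*B1 + A0*B0
  //                 = E + C + D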
3740 // 3741 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3742 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3743 // 3744 // Inputs: 3745 // 3746 // A0 in a.d[0] (subkey) 3747 // A1 in a.d[1] 3748 // (A1+A0) in a1_xor_a0.d[0] 3749 // 3750 // B0 in b.d[0] (state) 3751 // B1 in b.d[1] 3752 3753 __ ext(tmp1, __ T16B, b, b, 0x08); 3754 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3755 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3756 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3757 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3758 3759 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3760 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3761 __ eor(tmp2, __ T16B, tmp2, tmp4); 3762 __ eor(tmp2, __ T16B, tmp2, tmp3); 3763 3764 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3765 __ ins(result_hi, __ D, tmp2, 0, 1); 3766 __ ins(result_lo, __ D, tmp2, 1, 0); 3767 } 3768 3769 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3770 FloatRegister p, FloatRegister z, FloatRegister t1) { 3771 const FloatRegister t0 = result; 3772 3773 // The GCM field polynomial f is z^128 + p(z), where p = 3774 // z^7+z^2+z+1. 3775 // 3776 // z^128 === -p(z) (mod (z^128 + p(z))) 3777 // 3778 // so, given that the product we're reducing is 3779 // a == lo + hi * z^128 3780 // substituting, 3781 // === lo - hi * p(z) (mod (z^128 + p(z))) 3782 // 3783 // we reduce by multiplying hi by p(z) and subtracting the result 3784 // from (i.e. XORing it with) lo. Because p has no nonzero high 3785 // bits we can do this with two 64-bit multiplications, lo*p and 3786 // hi*p. 3787 3788 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3789 __ ext(t1, __ T16B, t0, z, 8); 3790 __ eor(hi, __ T16B, hi, t1); 3791 __ ext(t1, __ T16B, z, t0, 8); 3792 __ eor(lo, __ T16B, lo, t1); 3793 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3794 __ eor(result, __ T16B, lo, t0); 3795 } 3796 3797 address generate_has_negatives(address &has_negatives_long) { 3798 const u1 large_loop_size = 64; 3799 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3800 int dcache_line = VM_Version::dcache_line_size(); 3801 3802 Register ary1 = r1, len = r2, result = r0; 3803 3804 __ align(CodeEntryAlignment); 3805 3806 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3807 3808 address entry = __ pc(); 3809 3810 __ enter(); 3811 3812 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3813 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3814 3815 __ cmp(len, (u1)15); 3816 __ br(Assembler::GT, LEN_OVER_15); 3817 // The only case when execution falls into this code is when pointer is near 3818 // the end of memory page and we have to avoid reading next page 3819 __ add(ary1, ary1, len); 3820 __ subs(len, len, 8); 3821 __ br(Assembler::GT, LEN_OVER_8); 3822 __ ldr(rscratch2, Address(ary1, -8)); 3823 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
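    // The 8-byte word just loaded ends at the last array byte; on little-endian
    // its low (8 - length) bytes lie before the array, so they are shifted out
    // before the sign bits are tested. Scalar sketch of the same check
    // (names are illustrative):
    //   uint64_t w = last8 >> (64 - 8*length);      // keep only the array bytes
    //   return (w & 0x8080808080808080ULL) != 0;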
3824 __ lsrv(rscratch2, rscratch2, rscratch1); 3825 __ tst(rscratch2, UPPER_BIT_MASK); 3826 __ cset(result, Assembler::NE); 3827 __ leave(); 3828 __ ret(lr); 3829 __ bind(LEN_OVER_8); 3830 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3831 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3832 __ tst(rscratch2, UPPER_BIT_MASK); 3833 __ br(Assembler::NE, RET_TRUE_NO_POP); 3834 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3835 __ lsrv(rscratch1, rscratch1, rscratch2); 3836 __ tst(rscratch1, UPPER_BIT_MASK); 3837 __ cset(result, Assembler::NE); 3838 __ leave(); 3839 __ ret(lr); 3840 3841 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3842 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3843 3844 has_negatives_long = __ pc(); // 2nd entry point 3845 3846 __ enter(); 3847 3848 __ bind(LEN_OVER_15); 3849 __ push(spilled_regs, sp); 3850 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3851 __ cbz(rscratch2, ALIGNED); 3852 __ ldp(tmp6, tmp1, Address(ary1)); 3853 __ mov(tmp5, 16); 3854 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3855 __ add(ary1, ary1, rscratch1); 3856 __ sub(len, len, rscratch1); 3857 __ orr(tmp6, tmp6, tmp1); 3858 __ tst(tmp6, UPPER_BIT_MASK); 3859 __ br(Assembler::NE, RET_TRUE); 3860 3861 __ bind(ALIGNED); 3862 __ cmp(len, large_loop_size); 3863 __ br(Assembler::LT, CHECK_16); 3864 // Perform 16-byte load as early return in pre-loop to handle situation 3865 // when initially aligned large array has negative values at starting bytes, 3866 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3867 // slower. Cases with negative bytes further ahead won't be affected that 3868 // much. In fact, it'll be faster due to early loads, less instructions and 3869 // less branches in LARGE_LOOP. 3870 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3871 __ sub(len, len, 16); 3872 __ orr(tmp6, tmp6, tmp1); 3873 __ tst(tmp6, UPPER_BIT_MASK); 3874 __ br(Assembler::NE, RET_TRUE); 3875 __ cmp(len, large_loop_size); 3876 __ br(Assembler::LT, CHECK_16); 3877 3878 if (SoftwarePrefetchHintDistance >= 0 3879 && SoftwarePrefetchHintDistance >= dcache_line) { 3880 // initial prefetch 3881 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3882 } 3883 __ bind(LARGE_LOOP); 3884 if (SoftwarePrefetchHintDistance >= 0) { 3885 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3886 } 3887 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3888 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3889 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3890 // instructions per cycle and have less branches, but this approach disables 3891 // early return, thus, all 64 bytes are loaded and checked every time. 
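    // Scalar sketch of the 64-byte check done below (illustrative only):
    //   uint64_t m = w0 | w1 | w2 | w3 | w4 | w5 | w6 | w7;  // the 8 words loaded
    //   if (m & 0x8080808080808080ULL) return true;          // some byte is negative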
3892 __ ldp(tmp2, tmp3, Address(ary1)); 3893 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3894 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3895 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3896 __ add(ary1, ary1, large_loop_size); 3897 __ sub(len, len, large_loop_size); 3898 __ orr(tmp2, tmp2, tmp3); 3899 __ orr(tmp4, tmp4, tmp5); 3900 __ orr(rscratch1, rscratch1, rscratch2); 3901 __ orr(tmp6, tmp6, tmp1); 3902 __ orr(tmp2, tmp2, tmp4); 3903 __ orr(rscratch1, rscratch1, tmp6); 3904 __ orr(tmp2, tmp2, rscratch1); 3905 __ tst(tmp2, UPPER_BIT_MASK); 3906 __ br(Assembler::NE, RET_TRUE); 3907 __ cmp(len, large_loop_size); 3908 __ br(Assembler::GE, LARGE_LOOP); 3909 3910 __ bind(CHECK_16); // small 16-byte load pre-loop 3911 __ cmp(len, (u1)16); 3912 __ br(Assembler::LT, POST_LOOP16); 3913 3914 __ bind(LOOP16); // small 16-byte load loop 3915 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3916 __ sub(len, len, 16); 3917 __ orr(tmp2, tmp2, tmp3); 3918 __ tst(tmp2, UPPER_BIT_MASK); 3919 __ br(Assembler::NE, RET_TRUE); 3920 __ cmp(len, (u1)16); 3921 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3922 3923 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3924 __ cmp(len, (u1)8); 3925 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3926 __ ldr(tmp3, Address(__ post(ary1, 8))); 3927 __ sub(len, len, 8); 3928 __ tst(tmp3, UPPER_BIT_MASK); 3929 __ br(Assembler::NE, RET_TRUE); 3930 3931 __ bind(POST_LOOP16_LOAD_TAIL); 3932 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3933 __ ldr(tmp1, Address(ary1)); 3934 __ mov(tmp2, 64); 3935 __ sub(tmp4, tmp2, len, __ LSL, 3); 3936 __ lslv(tmp1, tmp1, tmp4); 3937 __ tst(tmp1, UPPER_BIT_MASK); 3938 __ br(Assembler::NE, RET_TRUE); 3939 // Fallthrough 3940 3941 __ bind(RET_FALSE); 3942 __ pop(spilled_regs, sp); 3943 __ leave(); 3944 __ mov(result, zr); 3945 __ ret(lr); 3946 3947 __ bind(RET_TRUE); 3948 __ pop(spilled_regs, sp); 3949 __ bind(RET_TRUE_NO_POP); 3950 __ leave(); 3951 __ mov(result, 1); 3952 __ ret(lr); 3953 3954 __ bind(DONE); 3955 __ pop(spilled_regs, sp); 3956 __ leave(); 3957 __ ret(lr); 3958 return entry; 3959 } 3960 3961 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3962 bool usePrefetch, Label &NOT_EQUAL) { 3963 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3964 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3965 tmp7 = r12, tmp8 = r13; 3966 Label LOOP; 3967 3968 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3969 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3970 __ bind(LOOP); 3971 if (usePrefetch) { 3972 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3973 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3974 } 3975 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3976 __ eor(tmp1, tmp1, tmp2); 3977 __ eor(tmp3, tmp3, tmp4); 3978 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3979 __ orr(tmp1, tmp1, tmp3); 3980 __ cbnz(tmp1, NOT_EQUAL); 3981 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3982 __ eor(tmp5, tmp5, tmp6); 3983 __ eor(tmp7, tmp7, tmp8); 3984 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3985 __ orr(tmp5, tmp5, tmp7); 3986 __ cbnz(tmp5, NOT_EQUAL); 3987 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3988 __ eor(tmp1, tmp1, tmp2); 3989 __ eor(tmp3, tmp3, tmp4); 3990 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3991 __ orr(tmp1, tmp1, tmp3); 3992 __ cbnz(tmp1, NOT_EQUAL); 3993 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3994 __ eor(tmp5, tmp5, tmp6); 
3995 __ sub(cnt1, cnt1, 8 * wordSize); 3996 __ eor(tmp7, tmp7, tmp8); 3997 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3998 // tmp6 is not used. MacroAssembler::subs is used here (rather than 3999 // cmp) because subs allows an unlimited range of immediate operand. 4000 __ subs(tmp6, cnt1, loopThreshold); 4001 __ orr(tmp5, tmp5, tmp7); 4002 __ cbnz(tmp5, NOT_EQUAL); 4003 __ br(__ GE, LOOP); 4004 // post-loop 4005 __ eor(tmp1, tmp1, tmp2); 4006 __ eor(tmp3, tmp3, tmp4); 4007 __ orr(tmp1, tmp1, tmp3); 4008 __ sub(cnt1, cnt1, 2 * wordSize); 4009 __ cbnz(tmp1, NOT_EQUAL); 4010 } 4011 4012 void generate_large_array_equals_loop_simd(int loopThreshold, 4013 bool usePrefetch, Label &NOT_EQUAL) { 4014 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4015 tmp2 = rscratch2; 4016 Label LOOP; 4017 4018 __ bind(LOOP); 4019 if (usePrefetch) { 4020 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4021 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4022 } 4023 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 4024 __ sub(cnt1, cnt1, 8 * wordSize); 4025 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 4026 __ subs(tmp1, cnt1, loopThreshold); 4027 __ eor(v0, __ T16B, v0, v4); 4028 __ eor(v1, __ T16B, v1, v5); 4029 __ eor(v2, __ T16B, v2, v6); 4030 __ eor(v3, __ T16B, v3, v7); 4031 __ orr(v0, __ T16B, v0, v1); 4032 __ orr(v1, __ T16B, v2, v3); 4033 __ orr(v0, __ T16B, v0, v1); 4034 __ umov(tmp1, v0, __ D, 0); 4035 __ umov(tmp2, v0, __ D, 1); 4036 __ orr(tmp1, tmp1, tmp2); 4037 __ cbnz(tmp1, NOT_EQUAL); 4038 __ br(__ GE, LOOP); 4039 } 4040 4041 // a1 = r1 - array1 address 4042 // a2 = r2 - array2 address 4043 // result = r0 - return value. Already contains "false" 4044 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 4045 // r3-r5 are reserved temporary registers 4046 address generate_large_array_equals() { 4047 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4048 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4049 tmp7 = r12, tmp8 = r13; 4050 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 4051 SMALL_LOOP, POST_LOOP; 4052 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 4053 // calculate if at least 32 prefetched bytes are used 4054 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 4055 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 4056 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 4057 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 4058 tmp5, tmp6, tmp7, tmp8); 4059 4060 __ align(CodeEntryAlignment); 4061 4062 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 4063 4064 address entry = __ pc(); 4065 __ enter(); 4066 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 4067 // also advance pointers to use post-increment instead of pre-increment 4068 __ add(a1, a1, wordSize); 4069 __ add(a2, a2, wordSize); 4070 if (AvoidUnalignedAccesses) { 4071 // both implementations (SIMD/nonSIMD) use relatively large load 4072 // instructions (ld1/ldp), which carry a big penalty (up to 2x execution time) 4073 // on some CPUs when the address is not at least 16-byte aligned. 4074 // Arrays are currently 8-byte aligned, so, if needed, do one extra 8-byte 4075 // load to make at least the first address 16-byte aligned.
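      // Sketch of this fixup (illustrative; a1/a2 are already one word past the
      // start): consume one extra word when a1 is only 8-byte aligned so that
      // the wide loads that follow start at a 16-byte boundary:
      //   if ((uintptr_t)a1 & 8) {
      //     if (*a1++ != *a2++) return false;
      //     cnt1 -= 8;
      //   }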
4076 Label ALIGNED16; 4077 __ tbz(a1, 3, ALIGNED16); 4078 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4079 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4080 __ sub(cnt1, cnt1, wordSize); 4081 __ eor(tmp1, tmp1, tmp2); 4082 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 4083 __ bind(ALIGNED16); 4084 } 4085 if (UseSIMDForArrayEquals) { 4086 if (SoftwarePrefetchHintDistance >= 0) { 4087 __ subs(tmp1, cnt1, prefetchLoopThreshold); 4088 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4089 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 4090 /* prfm = */ true, NOT_EQUAL); 4091 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 4092 __ br(__ LT, TAIL); 4093 } 4094 __ bind(NO_PREFETCH_LARGE_LOOP); 4095 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 4096 /* prfm = */ false, NOT_EQUAL); 4097 } else { 4098 __ push(spilled_regs, sp); 4099 if (SoftwarePrefetchHintDistance >= 0) { 4100 __ subs(tmp1, cnt1, prefetchLoopThreshold); 4101 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4102 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 4103 /* prfm = */ true, NOT_EQUAL); 4104 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 4105 __ br(__ LT, TAIL); 4106 } 4107 __ bind(NO_PREFETCH_LARGE_LOOP); 4108 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 4109 /* prfm = */ false, NOT_EQUAL); 4110 } 4111 __ bind(TAIL); 4112 __ cbz(cnt1, EQUAL); 4113 __ subs(cnt1, cnt1, wordSize); 4114 __ br(__ LE, POST_LOOP); 4115 __ bind(SMALL_LOOP); 4116 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4117 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4118 __ subs(cnt1, cnt1, wordSize); 4119 __ eor(tmp1, tmp1, tmp2); 4120 __ cbnz(tmp1, NOT_EQUAL); 4121 __ br(__ GT, SMALL_LOOP); 4122 __ bind(POST_LOOP); 4123 __ ldr(tmp1, Address(a1, cnt1)); 4124 __ ldr(tmp2, Address(a2, cnt1)); 4125 __ eor(tmp1, tmp1, tmp2); 4126 __ cbnz(tmp1, NOT_EQUAL); 4127 __ bind(EQUAL); 4128 __ mov(result, true); 4129 __ bind(NOT_EQUAL); 4130 if (!UseSIMDForArrayEquals) { 4131 __ pop(spilled_regs, sp); 4132 } 4133 __ bind(NOT_EQUAL_NO_POP); 4134 __ leave(); 4135 __ ret(lr); 4136 return entry; 4137 } 4138 4139 address generate_dsin_dcos(bool isCos) { 4140 __ align(CodeEntryAlignment); 4141 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 4142 address start = __ pc(); 4143 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 4144 (address)StubRoutines::aarch64::_two_over_pi, 4145 (address)StubRoutines::aarch64::_pio2, 4146 (address)StubRoutines::aarch64::_dsin_coef, 4147 (address)StubRoutines::aarch64::_dcos_coef); 4148 return start; 4149 } 4150 4151 address generate_dlog() { 4152 __ align(CodeEntryAlignment); 4153 StubCodeMark mark(this, "StubRoutines", "dlog"); 4154 address entry = __ pc(); 4155 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4156 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4157 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4158 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4159 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4160 return entry; 4161 } 4162 4163 // code for comparing 16 bytes of strings with same encoding 4164 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4165 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4166 __ ldr(rscratch1, Address(__ post(str1, 8))); 4167 __ eor(rscratch2, tmp1, tmp2); 4168 __ ldr(cnt1, Address(__ post(str2, 8))); 4169 __ cbnz(rscratch2, DIFF1); 4170 __ ldr(tmp1, Address(__ post(str1, 8))); 4171 __ eor(rscratch2, rscratch1, cnt1); 4172 __ ldr(tmp2, Address(__ post(str2, 8))); 4173 __ cbnz(rscratch2, DIFF2); 4174 } 4175 4176 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4177 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4178 Label &DIFF2) { 4179 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 4180 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4181 4182 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4183 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4184 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4185 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4186 4187 __ fmovd(tmpL, vtmp3); 4188 __ eor(rscratch2, tmp3, tmpL); 4189 __ cbnz(rscratch2, DIFF2); 4190 4191 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4192 __ umov(tmpL, vtmp3, __ D, 1); 4193 __ eor(rscratch2, tmpU, tmpL); 4194 __ cbnz(rscratch2, DIFF1); 4195 4196 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4197 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4198 __ fmovd(tmpL, vtmp); 4199 __ eor(rscratch2, tmp3, tmpL); 4200 __ cbnz(rscratch2, DIFF2); 4201 4202 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4203 __ umov(tmpL, vtmp, __ D, 1); 4204 __ eor(rscratch2, tmpU, tmpL); 4205 __ cbnz(rscratch2, DIFF1); 4206 } 4207 4208 // r0 = result 4209 // r1 = str1 4210 // r2 = cnt1 4211 // r3 = str2 4212 // r4 = cnt2 4213 // r10 = tmp1 4214 // r11 = tmp2 4215 address generate_compare_long_string_different_encoding(bool isLU) { 4216 __ align(CodeEntryAlignment); 4217 StubCodeMark mark(this, "StubRoutines", isLU 4218 ? 
"compare_long_string_different_encoding LU" 4219 : "compare_long_string_different_encoding UL"); 4220 address entry = __ pc(); 4221 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4222 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4223 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4224 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4225 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4226 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4227 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4228 4229 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 4230 4231 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4232 // cnt2 == amount of characters left to compare 4233 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4234 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4235 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4236 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4237 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4238 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4239 __ eor(rscratch2, tmp1, tmp2); 4240 __ mov(rscratch1, tmp2); 4241 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4242 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4243 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4244 __ push(spilled_regs, sp); 4245 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 4246 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 4247 4248 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4249 4250 if (SoftwarePrefetchHintDistance >= 0) { 4251 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4252 __ br(__ LT, NO_PREFETCH); 4253 __ bind(LARGE_LOOP_PREFETCH); 4254 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4255 __ mov(tmp4, 2); 4256 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4257 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4258 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4259 __ subs(tmp4, tmp4, 1); 4260 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4261 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4262 __ mov(tmp4, 2); 4263 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4264 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4265 __ subs(tmp4, tmp4, 1); 4266 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4267 __ sub(cnt2, cnt2, 64); 4268 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4269 __ br(__ GE, LARGE_LOOP_PREFETCH); 4270 } 4271 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4272 __ bind(NO_PREFETCH); 4273 __ subs(cnt2, cnt2, 16); 4274 __ br(__ LT, TAIL); 4275 __ align(OptoLoopAlignment); 4276 __ bind(SMALL_LOOP); // smaller loop 4277 __ subs(cnt2, cnt2, 16); 4278 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4279 __ br(__ GE, SMALL_LOOP); 4280 __ cmn(cnt2, (u1)16); 4281 __ br(__ EQ, LOAD_LAST); 4282 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4283 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 4284 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4285 __ ldr(tmp3, Address(cnt1, -8)); 4286 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4287 __ b(LOAD_LAST); 4288 __ bind(DIFF2); 4289 __ mov(tmpU, tmp3); 4290 __ bind(DIFF1); 4291 __ pop(spilled_regs, sp); 4292 __ b(CALCULATE_DIFFERENCE); 4293 __ bind(LOAD_LAST); 4294 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 
4295 // No need to load it again 4296 __ mov(tmpU, tmp3); 4297 __ pop(spilled_regs, sp); 4298 4299 // tmp2 points to the address of the last 4 Latin1 characters right now 4300 __ ldrs(vtmp, Address(tmp2)); 4301 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4302 __ fmovd(tmpL, vtmp); 4303 4304 __ eor(rscratch2, tmpU, tmpL); 4305 __ cbz(rscratch2, DONE); 4306 4307 // Find the first different characters in the longwords and 4308 // compute their difference. 4309 __ bind(CALCULATE_DIFFERENCE); 4310 __ rev(rscratch2, rscratch2); 4311 __ clz(rscratch2, rscratch2); 4312 __ andr(rscratch2, rscratch2, -16); 4313 __ lsrv(tmp1, tmp1, rscratch2); 4314 __ uxthw(tmp1, tmp1); 4315 __ lsrv(rscratch1, rscratch1, rscratch2); 4316 __ uxthw(rscratch1, rscratch1); 4317 __ subw(result, tmp1, rscratch1); 4318 __ bind(DONE); 4319 __ ret(lr); 4320 return entry; 4321 } 4322 4323 address generate_method_entry_barrier() { 4324 __ align(CodeEntryAlignment); 4325 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 4326 4327 Label deoptimize_label; 4328 4329 address start = __ pc(); 4330 4331 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 4332 4333 __ enter(); 4334 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 4335 4336 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 4337 4338 __ push_call_clobbered_registers(); 4339 4340 __ mov(c_rarg0, rscratch2); 4341 __ call_VM_leaf 4342 (CAST_FROM_FN_PTR 4343 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 4344 4345 __ reset_last_Java_frame(true); 4346 4347 __ mov(rscratch1, r0); 4348 4349 __ pop_call_clobbered_registers(); 4350 4351 __ cbnz(rscratch1, deoptimize_label); 4352 4353 __ leave(); 4354 __ ret(lr); 4355 4356 __ BIND(deoptimize_label); 4357 4358 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 4359 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 4360 4361 __ mov(sp, rscratch1); 4362 __ br(rscratch2); 4363 4364 return start; 4365 } 4366 4367 // r0 = result 4368 // r1 = str1 4369 // r2 = cnt1 4370 // r3 = str2 4371 // r4 = cnt2 4372 // r10 = tmp1 4373 // r11 = tmp2 4374 address generate_compare_long_string_same_encoding(bool isLL) { 4375 __ align(CodeEntryAlignment); 4376 StubCodeMark mark(this, "StubRoutines", isLL 4377 ? "compare_long_string_same_encoding LL" 4378 : "compare_long_string_same_encoding UU"); 4379 address entry = __ pc(); 4380 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4381 tmp1 = r10, tmp2 = r11; 4382 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4383 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4384 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4385 // exit from large loop when less than 64 bytes left to read or we're about 4386 // to prefetch memory behind array border 4387 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4388 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4389 // update cnt2 counter with already loaded 8 bytes 4390 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4391 // update pointers, because of previous read 4392 __ add(str1, str1, wordSize); 4393 __ add(str2, str2, wordSize); 4394 if (SoftwarePrefetchHintDistance >= 0) { 4395 __ bind(LARGE_LOOP_PREFETCH); 4396 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4397 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4398 compare_string_16_bytes_same(DIFF, DIFF2); 4399 compare_string_16_bytes_same(DIFF, DIFF2); 4400 __ sub(cnt2, cnt2, isLL ? 
64 : 32); 4401 compare_string_16_bytes_same(DIFF, DIFF2); 4402 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4403 compare_string_16_bytes_same(DIFF, DIFF2); 4404 __ br(__ GT, LARGE_LOOP_PREFETCH); 4405 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4406 } 4407 // less than 16 bytes left? 4408 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4409 __ br(__ LT, TAIL); 4410 __ align(OptoLoopAlignment); 4411 __ bind(SMALL_LOOP); 4412 compare_string_16_bytes_same(DIFF, DIFF2); 4413 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4414 __ br(__ GE, SMALL_LOOP); 4415 __ bind(TAIL); 4416 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4417 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4418 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4419 __ br(__ LE, CHECK_LAST); 4420 __ eor(rscratch2, tmp1, tmp2); 4421 __ cbnz(rscratch2, DIFF); 4422 __ ldr(tmp1, Address(__ post(str1, 8))); 4423 __ ldr(tmp2, Address(__ post(str2, 8))); 4424 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4425 __ bind(CHECK_LAST); 4426 if (!isLL) { 4427 __ add(cnt2, cnt2, cnt2); // now in bytes 4428 } 4429 __ eor(rscratch2, tmp1, tmp2); 4430 __ cbnz(rscratch2, DIFF); 4431 __ ldr(rscratch1, Address(str1, cnt2)); 4432 __ ldr(cnt1, Address(str2, cnt2)); 4433 __ eor(rscratch2, rscratch1, cnt1); 4434 __ cbz(rscratch2, LENGTH_DIFF); 4435 // Find the first different characters in the longwords and 4436 // compute their difference. 4437 __ bind(DIFF2); 4438 __ rev(rscratch2, rscratch2); 4439 __ clz(rscratch2, rscratch2); 4440 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4441 __ lsrv(rscratch1, rscratch1, rscratch2); 4442 if (isLL) { 4443 __ lsrv(cnt1, cnt1, rscratch2); 4444 __ uxtbw(rscratch1, rscratch1); 4445 __ uxtbw(cnt1, cnt1); 4446 } else { 4447 __ lsrv(cnt1, cnt1, rscratch2); 4448 __ uxthw(rscratch1, rscratch1); 4449 __ uxthw(cnt1, cnt1); 4450 } 4451 __ subw(result, rscratch1, cnt1); 4452 __ b(LENGTH_DIFF); 4453 __ bind(DIFF); 4454 __ rev(rscratch2, rscratch2); 4455 __ clz(rscratch2, rscratch2); 4456 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 4457 __ lsrv(tmp1, tmp1, rscratch2); 4458 if (isLL) { 4459 __ lsrv(tmp2, tmp2, rscratch2); 4460 __ uxtbw(tmp1, tmp1); 4461 __ uxtbw(tmp2, tmp2); 4462 } else { 4463 __ lsrv(tmp2, tmp2, rscratch2); 4464 __ uxthw(tmp1, tmp1); 4465 __ uxthw(tmp2, tmp2); 4466 } 4467 __ subw(result, tmp1, tmp2); 4468 __ b(LENGTH_DIFF); 4469 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4470 __ eor(rscratch2, tmp1, tmp2); 4471 __ cbnz(rscratch2, DIFF); 4472 __ bind(LENGTH_DIFF); 4473 __ ret(lr); 4474 return entry; 4475 } 4476 4477 void generate_compare_long_strings() { 4478 StubRoutines::aarch64::_compare_long_string_LL 4479 = generate_compare_long_string_same_encoding(true); 4480 StubRoutines::aarch64::_compare_long_string_UU 4481 = generate_compare_long_string_same_encoding(false); 4482 StubRoutines::aarch64::_compare_long_string_LU 4483 = generate_compare_long_string_different_encoding(true); 4484 StubRoutines::aarch64::_compare_long_string_UL 4485 = generate_compare_long_string_different_encoding(false); 4486 } 4487 4488 // R0 = result 4489 // R1 = str2 4490 // R2 = cnt1 4491 // R3 = str1 4492 // R4 = cnt2 4493 // This generic linear code uses a few additional ideas which make it faster: 4494 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 4495 // in order to skip the initial load (helps on systems with a single load pipeline) 4496 // 2) we can use a "fast" algorithm for finding the first character, with fewer 4497 // branches (1 branch per loaded register instead of a branch per symbol); 4498 // this is where constants like 4499 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 4500 // 3) after loading and analyzing the 1st register of the source string, it can be 4501 // used to search for every occurrence of the 1st character, saving a few loads 4502 // compared with a "simpler-but-slower" implementation 4503 // 4) in order to avoid lots of push/pop operations, the code below heavily 4504 // re-uses/re-initializes/compresses register values, which makes the code 4505 // larger and a bit less readable; however, most of the extra operations are 4506 // issued during loads or branches, so the penalty is minimal 4507 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4508 const char* stubName = str1_isL 4509 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4510 : "indexof_linear_uu"; 4511 __ align(CodeEntryAlignment); 4512 StubCodeMark mark(this, "StubRoutines", stubName); 4513 address entry = __ pc(); 4514 4515 int str1_chr_size = str1_isL ? 1 : 2; 4516 int str2_chr_size = str2_isL ? 1 : 2; 4517 int str1_chr_shift = str1_isL ? 0 : 1; 4518 int str2_chr_shift = str2_isL ? 0 : 1; 4519 bool isL = str1_isL && str2_isL; 4520 // parameters 4521 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4522 // temporary registers 4523 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4524 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4525 // redefinitions 4526 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4527 4528 __ push(spilled_regs, sp); 4529 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4530 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4531 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4532 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4533 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4534 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4535 // Read whole register from str1.
It is safe, because length >=8 here 4536 __ ldr(ch1, Address(str1)); 4537 // Read whole register from str2. It is safe, because length >=8 here 4538 __ ldr(ch2, Address(str2)); 4539 __ sub(cnt2, cnt2, cnt1); 4540 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4541 if (str1_isL != str2_isL) { 4542 __ eor(v0, __ T16B, v0, v0); 4543 } 4544 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4545 __ mul(first, first, tmp1); 4546 // check if we have less than 1 register to check 4547 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4548 if (str1_isL != str2_isL) { 4549 __ fmovd(v1, ch1); 4550 } 4551 __ br(__ LE, L_SMALL); 4552 __ eor(ch2, first, ch2); 4553 if (str1_isL != str2_isL) { 4554 __ zip1(v1, __ T16B, v1, v0); 4555 } 4556 __ sub(tmp2, ch2, tmp1); 4557 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4558 __ bics(tmp2, tmp2, ch2); 4559 if (str1_isL != str2_isL) { 4560 __ fmovd(ch1, v1); 4561 } 4562 __ br(__ NE, L_HAS_ZERO); 4563 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4564 __ add(result, result, wordSize/str2_chr_size); 4565 __ add(str2, str2, wordSize); 4566 __ br(__ LT, L_POST_LOOP); 4567 __ BIND(L_LOOP); 4568 __ ldr(ch2, Address(str2)); 4569 __ eor(ch2, first, ch2); 4570 __ sub(tmp2, ch2, tmp1); 4571 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4572 __ bics(tmp2, tmp2, ch2); 4573 __ br(__ NE, L_HAS_ZERO); 4574 __ BIND(L_LOOP_PROCEED); 4575 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4576 __ add(str2, str2, wordSize); 4577 __ add(result, result, wordSize/str2_chr_size); 4578 __ br(__ GE, L_LOOP); 4579 __ BIND(L_POST_LOOP); 4580 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4581 __ br(__ LE, NOMATCH); 4582 __ ldr(ch2, Address(str2)); 4583 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4584 __ eor(ch2, first, ch2); 4585 __ sub(tmp2, ch2, tmp1); 4586 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4587 __ mov(tmp4, -1); // all bits set 4588 __ b(L_SMALL_PROCEED); 4589 __ align(OptoLoopAlignment); 4590 __ BIND(L_SMALL); 4591 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4592 __ eor(ch2, first, ch2); 4593 if (str1_isL != str2_isL) { 4594 __ zip1(v1, __ T16B, v1, v0); 4595 } 4596 __ sub(tmp2, ch2, tmp1); 4597 __ mov(tmp4, -1); // all bits set 4598 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4599 if (str1_isL != str2_isL) { 4600 __ fmovd(ch1, v1); // move converted 4 symbols 4601 } 4602 __ BIND(L_SMALL_PROCEED); 4603 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4604 __ bic(tmp2, tmp2, ch2); 4605 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4606 __ rbit(tmp2, tmp2); 4607 __ br(__ EQ, NOMATCH); 4608 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4609 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4610 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4611 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4612 if (str2_isL) { // LL 4613 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4614 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4615 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4616 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4617 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4618 } else { 4619 __ mov(ch2, 0xE); // all bits in byte set except last one 4620 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4621 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. 
Safe. 4622 __ lslv(tmp2, tmp2, tmp4); 4623 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4624 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4625 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4626 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4627 } 4628 __ cmp(ch1, ch2); 4629 __ mov(tmp4, wordSize/str2_chr_size); 4630 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4631 __ BIND(L_SMALL_CMP_LOOP); 4632 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4633 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4634 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4635 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4636 __ add(tmp4, tmp4, 1); 4637 __ cmp(tmp4, cnt1); 4638 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4639 __ cmp(first, ch2); 4640 __ br(__ EQ, L_SMALL_CMP_LOOP); 4641 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4642 __ cbz(tmp2, NOMATCH); // no more matches. exit 4643 __ clz(tmp4, tmp2); 4644 __ add(result, result, 1); // advance index 4645 __ add(str2, str2, str2_chr_size); // advance pointer 4646 __ b(L_SMALL_HAS_ZERO_LOOP); 4647 __ align(OptoLoopAlignment); 4648 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4649 __ cmp(first, ch2); 4650 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4651 __ b(DONE); 4652 __ align(OptoLoopAlignment); 4653 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4654 if (str2_isL) { // LL 4655 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4656 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4657 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4658 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4659 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4660 } else { 4661 __ mov(ch2, 0xE); // all bits in byte set except last one 4662 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4663 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4664 __ lslv(tmp2, tmp2, tmp4); 4665 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4666 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4667 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4668 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4669 } 4670 __ cmp(ch1, ch2); 4671 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4672 __ b(DONE); 4673 __ align(OptoLoopAlignment); 4674 __ BIND(L_HAS_ZERO); 4675 __ rbit(tmp2, tmp2); 4676 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4677 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4678 // It's fine because both counters are 32bit and are not changed in this 4679 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4680 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4681 __ sub(result, result, 1); 4682 __ BIND(L_HAS_ZERO_LOOP); 4683 __ mov(cnt1, wordSize/str2_chr_size); 4684 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4685 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4686 if (str2_isL) { 4687 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4688 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4689 __ lslv(tmp2, tmp2, tmp4); 4690 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4691 __ add(tmp4, tmp4, 1); 4692 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4693 __ lsl(tmp2, tmp2, 1); 4694 __ mov(tmp4, wordSize/str2_chr_size); 4695 } else { 4696 __ mov(ch2, 0xE); 4697 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4698 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4699 __ lslv(tmp2, tmp2, tmp4); 4700 __ add(tmp4, tmp4, 1); 4701 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4702 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4703 __ lsl(tmp2, tmp2, 1); 4704 __ mov(tmp4, wordSize/str2_chr_size); 4705 __ sub(str2, str2, str2_chr_size); 4706 } 4707 __ cmp(ch1, ch2); 4708 __ mov(tmp4, wordSize/str2_chr_size); 4709 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4710 __ BIND(L_CMP_LOOP); 4711 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4712 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4713 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4714 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4715 __ add(tmp4, tmp4, 1); 4716 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4717 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4718 __ cmp(cnt1, ch2); 4719 __ br(__ EQ, L_CMP_LOOP); 4720 __ BIND(L_CMP_LOOP_NOMATCH); 4721 // here we're not matched 4722 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4723 __ clz(tmp4, tmp2); 4724 __ add(str2, str2, str2_chr_size); // advance pointer 4725 __ b(L_HAS_ZERO_LOOP); 4726 __ align(OptoLoopAlignment); 4727 __ BIND(L_CMP_LOOP_LAST_CMP); 4728 __ cmp(cnt1, ch2); 4729 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4730 __ b(DONE); 4731 __ align(OptoLoopAlignment); 4732 __ BIND(L_CMP_LOOP_LAST_CMP2); 4733 if (str2_isL) { 4734 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4735 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4736 __ lslv(tmp2, tmp2, tmp4); 4737 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4738 __ add(tmp4, tmp4, 1); 4739 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4740 __ lsl(tmp2, tmp2, 1); 4741 } else { 4742 __ mov(ch2, 0xE); 4743 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4744 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4745 __ lslv(tmp2, tmp2, tmp4); 4746 __ add(tmp4, tmp4, 1); 4747 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4748 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4749 __ lsl(tmp2, tmp2, 1); 4750 __ sub(str2, str2, str2_chr_size); 4751 } 4752 __ cmp(ch1, ch2); 4753 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4754 __ b(DONE); 4755 __ align(OptoLoopAlignment); 4756 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4757 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4758 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4759 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4760 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4761 // result by analyzed characters value, so, we can just reset lower bits 4762 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4763 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4764 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4765 // index of last analyzed substring inside current octet. 
So, str2 in at 4766 // respective start address. We need to advance it to next octet 4767 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4768 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4769 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4770 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4771 __ movw(cnt2, cnt2); 4772 __ b(L_LOOP_PROCEED); 4773 __ align(OptoLoopAlignment); 4774 __ BIND(NOMATCH); 4775 __ mov(result, -1); 4776 __ BIND(DONE); 4777 __ pop(spilled_regs, sp); 4778 __ ret(lr); 4779 return entry; 4780 } 4781 4782 void generate_string_indexof_stubs() { 4783 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4784 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4785 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4786 } 4787 4788 void inflate_and_store_2_fp_registers(bool generatePrfm, 4789 FloatRegister src1, FloatRegister src2) { 4790 Register dst = r1; 4791 __ zip1(v1, __ T16B, src1, v0); 4792 __ zip2(v2, __ T16B, src1, v0); 4793 if (generatePrfm) { 4794 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4795 } 4796 __ zip1(v3, __ T16B, src2, v0); 4797 __ zip2(v4, __ T16B, src2, v0); 4798 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4799 } 4800 4801 // R0 = src 4802 // R1 = dst 4803 // R2 = len 4804 // R3 = len >> 3 4805 // V0 = 0 4806 // v1 = loaded 8 bytes 4807 address generate_large_byte_array_inflate() { 4808 __ align(CodeEntryAlignment); 4809 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4810 address entry = __ pc(); 4811 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4812 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4813 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 4814 4815 // do one more 8-byte read to have address 16-byte aligned in most cases 4816 // also use single store instruction 4817 __ ldrd(v2, __ post(src, 8)); 4818 __ sub(octetCounter, octetCounter, 2); 4819 __ zip1(v1, __ T16B, v1, v0); 4820 __ zip1(v2, __ T16B, v2, v0); 4821 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4822 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4823 __ subs(rscratch1, octetCounter, large_loop_threshold); 4824 __ br(__ LE, LOOP_START); 4825 __ b(LOOP_PRFM_START); 4826 __ bind(LOOP_PRFM); 4827 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4828 __ bind(LOOP_PRFM_START); 4829 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4830 __ sub(octetCounter, octetCounter, 8); 4831 __ subs(rscratch1, octetCounter, large_loop_threshold); 4832 inflate_and_store_2_fp_registers(true, v3, v4); 4833 inflate_and_store_2_fp_registers(true, v5, v6); 4834 __ br(__ GT, LOOP_PRFM); 4835 __ cmp(octetCounter, (u1)8); 4836 __ br(__ LT, DONE); 4837 __ bind(LOOP); 4838 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4839 __ bind(LOOP_START); 4840 __ sub(octetCounter, octetCounter, 8); 4841 __ cmp(octetCounter, (u1)8); 4842 inflate_and_store_2_fp_registers(false, v3, v4); 4843 inflate_and_store_2_fp_registers(false, v5, v6); 4844 __ br(__ GE, LOOP); 4845 __ bind(DONE); 4846 __ ret(lr); 4847 return entry; 4848 } 4849 4850 /** 4851 * Arguments: 4852 * 4853 * Input: 4854 * c_rarg0 - current state address 4855 * c_rarg1 - H key address 4856 * c_rarg2 - data address 4857 * c_rarg3 - number of blocks 4858 * 4859 * Output: 4860 * Updated state at c_rarg0 4861 */ 4862 address 
generate_ghash_processBlocks() { 4863 // Bafflingly, GCM uses little-endian for the byte order, but 4864 // big-endian for the bit order. For example, the polynomial 1 is 4865 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4866 // 4867 // So, we must either reverse the bytes in each word and do 4868 // everything big-endian or reverse the bits in each byte and do 4869 // it little-endian. On AArch64 it's more idiomatic to reverse 4870 // the bits in each byte (we have an instruction, RBIT, to do 4871 // that) and keep the data in little-endian bit order throught the 4872 // calculation, bit-reversing the inputs and outputs. 4873 4874 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4875 __ align(wordSize * 2); 4876 address p = __ pc(); 4877 __ emit_int64(0x87); // The low-order bits of the field 4878 // polynomial (i.e. p = z^7+z^2+z+1) 4879 // repeated in the low and high parts of a 4880 // 128-bit vector 4881 __ emit_int64(0x87); 4882 4883 __ align(CodeEntryAlignment); 4884 address start = __ pc(); 4885 4886 Register state = c_rarg0; 4887 Register subkeyH = c_rarg1; 4888 Register data = c_rarg2; 4889 Register blocks = c_rarg3; 4890 4891 FloatRegister vzr = v30; 4892 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4893 4894 __ ldrq(v0, Address(state)); 4895 __ ldrq(v1, Address(subkeyH)); 4896 4897 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4898 __ rbit(v0, __ T16B, v0); 4899 __ rev64(v1, __ T16B, v1); 4900 __ rbit(v1, __ T16B, v1); 4901 4902 __ ldrq(v26, p); 4903 4904 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4905 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4906 4907 { 4908 Label L_ghash_loop; 4909 __ bind(L_ghash_loop); 4910 4911 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4912 // reversing each byte 4913 __ rbit(v2, __ T16B, v2); 4914 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4915 4916 // Multiply state in v2 by subkey in v1 4917 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4918 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4919 /*temps*/v6, v20, v18, v21); 4920 // Reduce v7:v5 by the field polynomial 4921 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4922 4923 __ sub(blocks, blocks, 1); 4924 __ cbnz(blocks, L_ghash_loop); 4925 } 4926 4927 // The bit-reversed result is at this point in v0 4928 __ rev64(v1, __ T16B, v0); 4929 __ rbit(v1, __ T16B, v1); 4930 4931 __ st1(v1, __ T16B, state); 4932 __ ret(lr); 4933 4934 return start; 4935 } 4936 4937 // Continuation point for throwing of implicit exceptions that are 4938 // not handled in the current activation. Fabricates an exception 4939 // oop and initiates normal exception dispatching in this 4940 // frame. Since we need to preserve callee-saved values (currently 4941 // only for C2, but done for C1 as well) we need a callee-saved oop 4942 // map and therefore have to make these stubs into RuntimeStubs 4943 // rather than BufferBlobs. If the compiler needs all registers to 4944 // be preserved between the fault point and the exception handler 4945 // then it must assume responsibility for that in 4946 // AbstractCompiler::continuation_for_implicit_null_exception or 4947 // continuation_for_implicit_division_by_zero_exception. 
All other 4948 // implicit exceptions (e.g., NullPointerException or 4949 // AbstractMethodError on entry) are either at call sites or 4950 // otherwise assume that stack unwinding will be initiated, so 4951 // caller saved registers were assumed volatile in the compiler. 4952 4953 #undef __ 4954 #define __ masm-> 4955 4956 address generate_throw_exception(const char* name, 4957 address runtime_entry, 4958 Register arg1 = noreg, 4959 Register arg2 = noreg) { 4960 // Information about frame layout at time of blocking runtime call. 4961 // Note that we only have to preserve callee-saved registers since 4962 // the compilers are responsible for supplying a continuation point 4963 // if they expect all registers to be preserved. 4964 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4965 enum layout { 4966 rfp_off = 0, 4967 rfp_off2, 4968 return_off, 4969 return_off2, 4970 framesize // inclusive of return address 4971 }; 4972 4973 int insts_size = 512; 4974 int locs_size = 64; 4975 4976 CodeBuffer code(name, insts_size, locs_size); 4977 OopMapSet* oop_maps = new OopMapSet(); 4978 MacroAssembler* masm = new MacroAssembler(&code); 4979 4980 address start = __ pc(); 4981 4982 // This is an inlined and slightly modified version of call_VM 4983 // which has the ability to fetch the return PC out of 4984 // thread-local storage and also sets up last_Java_sp slightly 4985 // differently than the real call_VM 4986 4987 __ enter(); // Save FP and LR before call 4988 4989 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4990 4991 // lr and fp are already in place 4992 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4993 4994 int frame_complete = __ pc() - start; 4995 4996 // Set up last_Java_sp and last_Java_fp 4997 address the_pc = __ pc(); 4998 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4999 5000 // Call runtime 5001 if (arg1 != noreg) { 5002 assert(arg2 != c_rarg1, "clobbered"); 5003 __ mov(c_rarg1, arg1); 5004 } 5005 if (arg2 != noreg) { 5006 __ mov(c_rarg2, arg2); 5007 } 5008 __ mov(c_rarg0, rthread); 5009 BLOCK_COMMENT("call runtime_entry"); 5010 __ mov(rscratch1, runtime_entry); 5011 __ blr(rscratch1); 5012 5013 // Generate oop map 5014 OopMap* map = new OopMap(framesize, 0); 5015 5016 oop_maps->add_gc_map(the_pc - start, map); 5017 5018 __ reset_last_Java_frame(true); 5019 __ maybe_isb(); 5020 5021 __ leave(); 5022 5023 // check for pending exceptions 5024 #ifdef ASSERT 5025 Label L; 5026 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 5027 __ cbnz(rscratch1, L); 5028 __ should_not_reach_here(); 5029 __ bind(L); 5030 #endif // ASSERT 5031 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 5032 5033 5034 // codeBlob framesize is in words (not VMRegImpl::slot_size) 5035 RuntimeStub* stub = 5036 RuntimeStub::new_runtime_stub(name, 5037 &code, 5038 frame_complete, 5039 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 5040 oop_maps, false); 5041 return stub->entry_point(); 5042 } 5043 5044 class MontgomeryMultiplyGenerator : public MacroAssembler { 5045 5046 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 5047 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 5048 5049 RegSet _toSave; 5050 bool _squaring; 5051 5052 public: 5053 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 5054 : MacroAssembler(as->code()), _squaring(squaring) { 5055 5056 // Register allocation 5057 5058 Register reg = c_rarg0; 5059 Pa_base = reg; // Argument registers 5060 if 
(squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen = ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra = ++reg;       // The current digit of a, b, n, and m.
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;       // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;       // Three registers which form a
      t1 = ++reg;       // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;       // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg;   // Product registers: low and high parts
      Rlo_ab = ++reg;   // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication. The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used. This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
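      // (acc() below folds the 128-bit product Rhi:Rlo into the
      // triple-precision accumulator t2:t1:t0 with carry propagation;
      // in C, roughly: t0 += Rlo; t1 += Rhi + carry; t2 += carry.)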
5176 // MACC(Rm, Rn, t0, t1, t2); 5177 // Rm = *++Pm; 5178 // Rn = *--Pn; 5179 umulh(Rhi_mn, Rm, Rn); 5180 mul(Rlo_mn, Rm, Rn); 5181 ldr(Rm, pre(Pm, wordSize)); 5182 ldr(Rn, pre(Pn, -wordSize)); 5183 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5184 } 5185 5186 void post1() { 5187 block_comment("post1"); 5188 5189 // MACC(Ra, Rb, t0, t1, t2); 5190 // Ra = *++Pa; 5191 // Rb = *--Pb; 5192 umulh(Rhi_ab, Ra, Rb); 5193 mul(Rlo_ab, Ra, Rb); 5194 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5195 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5196 5197 // *Pm = Rm = t0 * inv; 5198 mul(Rm, t0, inv); 5199 str(Rm, Address(Pm)); 5200 5201 // MACC(Rm, Rn, t0, t1, t2); 5202 // t0 = t1; t1 = t2; t2 = 0; 5203 umulh(Rhi_mn, Rm, Rn); 5204 5205 #ifndef PRODUCT 5206 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5207 { 5208 mul(Rlo_mn, Rm, Rn); 5209 add(Rlo_mn, t0, Rlo_mn); 5210 Label ok; 5211 cbz(Rlo_mn, ok); { 5212 stop("broken Montgomery multiply"); 5213 } bind(ok); 5214 } 5215 #endif 5216 // We have very carefully set things up so that 5217 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5218 // the lower half of Rm * Rn because we know the result already: 5219 // it must be -t0. t0 + (-t0) must generate a carry iff 5220 // t0 != 0. So, rather than do a mul and an adds we just set 5221 // the carry flag iff t0 is nonzero. 5222 // 5223 // mul(Rlo_mn, Rm, Rn); 5224 // adds(zr, t0, Rlo_mn); 5225 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5226 adcs(t0, t1, Rhi_mn); 5227 adc(t1, t2, zr); 5228 mov(t2, zr); 5229 } 5230 5231 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5232 block_comment("pre2"); 5233 // Pa = Pa_base + i-len; 5234 // Pb = Pb_base + len; 5235 // Pm = Pm_base + i-len; 5236 // Pn = Pn_base + len; 5237 5238 if (i.is_register()) { 5239 sub(Rj, i.as_register(), len); 5240 } else { 5241 mov(Rj, i.as_constant()); 5242 sub(Rj, Rj, len); 5243 } 5244 // Rj == i-len 5245 5246 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5247 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5248 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5249 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5250 5251 // Ra = *++Pa; 5252 // Rb = *--Pb; 5253 // Rm = *++Pm; 5254 // Rn = *--Pn; 5255 ldr(Ra, pre(Pa, wordSize)); 5256 ldr(Rb, pre(Pb, -wordSize)); 5257 ldr(Rm, pre(Pm, wordSize)); 5258 ldr(Rn, pre(Pn, -wordSize)); 5259 5260 mov(Rhi_mn, zr); 5261 mov(Rlo_mn, zr); 5262 } 5263 5264 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5265 block_comment("post2"); 5266 if (i.is_constant()) { 5267 mov(Rj, i.as_constant()-len.as_constant()); 5268 } else { 5269 sub(Rj, i.as_register(), len); 5270 } 5271 5272 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5273 5274 // As soon as we know the least significant digit of our result, 5275 // store it. 5276 // Pm_base[i-len] = t0; 5277 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5278 5279 // t0 = t1; t1 = t2; t2 = 0; 5280 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5281 adc(t1, t2, zr); 5282 mov(t2, zr); 5283 } 5284 5285 // A carry in t0 after Montgomery multiplication means that we 5286 // should subtract multiples of n from our result in m. We'll 5287 // keep doing that until there is no carry. 
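    //
    // In C, approximately (an illustrative sketch only: the sub()
    // helper below is hypothetical and merely spells out the
    // conditional subtraction that normalize() performs; it is not a
    // routine in this file):
    //
    //   static julong sub(julong Pm_base[], julong Pn_base[],
    //                     julong t0, int len) {
    //     julong borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned __int128 d = (unsigned __int128)Pm_base[i]
    //         - Pn_base[i] - borrow;
    //       Pm_base[i] = (julong)d;
    //       borrow = (julong)(d >> 64) & 1;   // 1 iff the subtract borrowed
    //     }
    //     return t0 - borrow;                 // cf. sbc(t0, t0, zr) below
    //   }
    //
    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);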
5288 void normalize(RegisterOrConstant len) { 5289 block_comment("normalize"); 5290 // while (t0) 5291 // t0 = sub(Pm_base, Pn_base, t0, len); 5292 Label loop, post, again; 5293 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5294 cbz(t0, post); { 5295 bind(again); { 5296 mov(i, zr); 5297 mov(cnt, len); 5298 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5299 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5300 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5301 align(16); 5302 bind(loop); { 5303 sbcs(Rm, Rm, Rn); 5304 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5305 add(i, i, 1); 5306 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5307 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5308 sub(cnt, cnt, 1); 5309 } cbnz(cnt, loop); 5310 sbc(t0, t0, zr); 5311 } cbnz(t0, again); 5312 } bind(post); 5313 } 5314 5315 // Move memory at s to d, reversing words. 5316 // Increments d to end of copied memory 5317 // Destroys tmp1, tmp2 5318 // Preserves len 5319 // Leaves s pointing to the address which was in d at start 5320 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5321 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5322 5323 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5324 mov(tmp1, len); 5325 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5326 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5327 } 5328 // where 5329 void reverse1(Register d, Register s, Register tmp) { 5330 ldr(tmp, pre(s, -wordSize)); 5331 ror(tmp, tmp, 32); 5332 str(tmp, post(d, wordSize)); 5333 } 5334 5335 void step_squaring() { 5336 // An extra ACC 5337 step(); 5338 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5339 } 5340 5341 void last_squaring(RegisterOrConstant i) { 5342 Label dont; 5343 // if ((i & 1) == 0) { 5344 tbnz(i.as_register(), 0, dont); { 5345 // MACC(Ra, Rb, t0, t1, t2); 5346 // Ra = *++Pa; 5347 // Rb = *--Pb; 5348 umulh(Rhi_ab, Ra, Rb); 5349 mul(Rlo_ab, Ra, Rb); 5350 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5351 } bind(dont); 5352 } 5353 5354 void extra_step_squaring() { 5355 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5356 5357 // MACC(Rm, Rn, t0, t1, t2); 5358 // Rm = *++Pm; 5359 // Rn = *--Pn; 5360 umulh(Rhi_mn, Rm, Rn); 5361 mul(Rlo_mn, Rm, Rn); 5362 ldr(Rm, pre(Pm, wordSize)); 5363 ldr(Rn, pre(Pn, -wordSize)); 5364 } 5365 5366 void post1_squaring() { 5367 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5368 5369 // *Pm = Rm = t0 * inv; 5370 mul(Rm, t0, inv); 5371 str(Rm, Address(Pm)); 5372 5373 // MACC(Rm, Rn, t0, t1, t2); 5374 // t0 = t1; t1 = t2; t2 = 0; 5375 umulh(Rhi_mn, Rm, Rn); 5376 5377 #ifndef PRODUCT 5378 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5379 { 5380 mul(Rlo_mn, Rm, Rn); 5381 add(Rlo_mn, t0, Rlo_mn); 5382 Label ok; 5383 cbz(Rlo_mn, ok); { 5384 stop("broken Montgomery multiply"); 5385 } bind(ok); 5386 } 5387 #endif 5388 // We have very carefully set things up so that 5389 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5390 // the lower half of Rm * Rn because we know the result already: 5391 // it must be -t0. t0 + (-t0) must generate a carry iff 5392 // t0 != 0. So, rather than do a mul and an adds we just set 5393 // the carry flag iff t0 is nonzero. 
5394 // 5395 // mul(Rlo_mn, Rm, Rn); 5396 // adds(zr, t0, Rlo_mn); 5397 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5398 adcs(t0, t1, Rhi_mn); 5399 adc(t1, t2, zr); 5400 mov(t2, zr); 5401 } 5402 5403 void acc(Register Rhi, Register Rlo, 5404 Register t0, Register t1, Register t2) { 5405 adds(t0, t0, Rlo); 5406 adcs(t1, t1, Rhi); 5407 adc(t2, t2, zr); 5408 } 5409 5410 public: 5411 /** 5412 * Fast Montgomery multiplication. The derivation of the 5413 * algorithm is in A Cryptographic Library for the Motorola 5414 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5415 * 5416 * Arguments: 5417 * 5418 * Inputs for multiplication: 5419 * c_rarg0 - int array elements a 5420 * c_rarg1 - int array elements b 5421 * c_rarg2 - int array elements n (the modulus) 5422 * c_rarg3 - int length 5423 * c_rarg4 - int inv 5424 * c_rarg5 - int array elements m (the result) 5425 * 5426 * Inputs for squaring: 5427 * c_rarg0 - int array elements a 5428 * c_rarg1 - int array elements n (the modulus) 5429 * c_rarg2 - int length 5430 * c_rarg3 - int inv 5431 * c_rarg4 - int array elements m (the result) 5432 * 5433 */ 5434 address generate_multiply() { 5435 Label argh, nothing; 5436 bind(argh); 5437 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5438 5439 align(CodeEntryAlignment); 5440 address entry = pc(); 5441 5442 cbzw(Rlen, nothing); 5443 5444 enter(); 5445 5446 // Make room. 5447 cmpw(Rlen, 512); 5448 br(Assembler::HI, argh); 5449 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5450 andr(sp, Ra, -2 * wordSize); 5451 5452 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5453 5454 { 5455 // Copy input args, reversing as we go. We use Ra as a 5456 // temporary variable. 5457 reverse(Ra, Pa_base, Rlen, t0, t1); 5458 if (!_squaring) 5459 reverse(Ra, Pb_base, Rlen, t0, t1); 5460 reverse(Ra, Pn_base, Rlen, t0, t1); 5461 } 5462 5463 // Push all call-saved registers and also Pm_base which we'll need 5464 // at the end. 
5465 save_regs(); 5466 5467 #ifndef PRODUCT 5468 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5469 { 5470 ldr(Rn, Address(Pn_base, 0)); 5471 mul(Rlo_mn, Rn, inv); 5472 subs(zr, Rlo_mn, -1); 5473 Label ok; 5474 br(EQ, ok); { 5475 stop("broken inverse in Montgomery multiply"); 5476 } bind(ok); 5477 } 5478 #endif 5479 5480 mov(Pm_base, Ra); 5481 5482 mov(t0, zr); 5483 mov(t1, zr); 5484 mov(t2, zr); 5485 5486 block_comment("for (int i = 0; i < len; i++) {"); 5487 mov(Ri, zr); { 5488 Label loop, end; 5489 cmpw(Ri, Rlen); 5490 br(Assembler::GE, end); 5491 5492 bind(loop); 5493 pre1(Ri); 5494 5495 block_comment(" for (j = i; j; j--) {"); { 5496 movw(Rj, Ri); 5497 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5498 } block_comment(" } // j"); 5499 5500 post1(); 5501 addw(Ri, Ri, 1); 5502 cmpw(Ri, Rlen); 5503 br(Assembler::LT, loop); 5504 bind(end); 5505 block_comment("} // i"); 5506 } 5507 5508 block_comment("for (int i = len; i < 2*len; i++) {"); 5509 mov(Ri, Rlen); { 5510 Label loop, end; 5511 cmpw(Ri, Rlen, Assembler::LSL, 1); 5512 br(Assembler::GE, end); 5513 5514 bind(loop); 5515 pre2(Ri, Rlen); 5516 5517 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5518 lslw(Rj, Rlen, 1); 5519 subw(Rj, Rj, Ri); 5520 subw(Rj, Rj, 1); 5521 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5522 } block_comment(" } // j"); 5523 5524 post2(Ri, Rlen); 5525 addw(Ri, Ri, 1); 5526 cmpw(Ri, Rlen, Assembler::LSL, 1); 5527 br(Assembler::LT, loop); 5528 bind(end); 5529 } 5530 block_comment("} // i"); 5531 5532 normalize(Rlen); 5533 5534 mov(Ra, Pm_base); // Save Pm_base in Ra 5535 restore_regs(); // Restore caller's Pm_base 5536 5537 // Copy our result into caller's Pm_base 5538 reverse(Pm_base, Ra, Rlen, t0, t1); 5539 5540 leave(); 5541 bind(nothing); 5542 ret(lr); 5543 5544 return entry; 5545 } 5546 // In C, approximately: 5547 5548 // void 5549 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 5550 // julong Pn_base[], julong Pm_base[], 5551 // julong inv, int len) { 5552 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5553 // julong *Pa, *Pb, *Pn, *Pm; 5554 // julong Ra, Rb, Rn, Rm; 5555 5556 // int i; 5557 5558 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5559 5560 // for (i = 0; i < len; i++) { 5561 // int j; 5562 5563 // Pa = Pa_base; 5564 // Pb = Pb_base + i; 5565 // Pm = Pm_base; 5566 // Pn = Pn_base + i; 5567 5568 // Ra = *Pa; 5569 // Rb = *Pb; 5570 // Rm = *Pm; 5571 // Rn = *Pn; 5572 5573 // int iters = i; 5574 // for (j = 0; iters--; j++) { 5575 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5576 // MACC(Ra, Rb, t0, t1, t2); 5577 // Ra = *++Pa; 5578 // Rb = *--Pb; 5579 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5580 // MACC(Rm, Rn, t0, t1, t2); 5581 // Rm = *++Pm; 5582 // Rn = *--Pn; 5583 // } 5584 5585 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5586 // MACC(Ra, Rb, t0, t1, t2); 5587 // *Pm = Rm = t0 * inv; 5588 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5589 // MACC(Rm, Rn, t0, t1, t2); 5590 5591 // assert(t0 == 0, "broken Montgomery multiply"); 5592 5593 // t0 = t1; t1 = t2; t2 = 0; 5594 // } 5595 5596 // for (i = len; i < 2*len; i++) { 5597 // int j; 5598 5599 // Pa = Pa_base + i-len; 5600 // Pb = Pb_base + len; 5601 // Pm = Pm_base + i-len; 5602 // Pn = Pn_base + len; 5603 5604 // Ra = *++Pa; 5605 // Rb = *--Pb; 5606 // Rm = *++Pm; 5607 // Rn = *--Pn; 5608 5609 // int iters = len*2-i-1; 5610 // for (j = i-len+1; iters--; j++) { 5611 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5612 // MACC(Ra, Rb, t0, t1, t2); 5613 // Ra = *++Pa; 5614 // Rb = *--Pb; 5615 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5616 // MACC(Rm, Rn, t0, t1, t2); 5617 // Rm = *++Pm; 5618 // Rn = *--Pn; 5619 // } 5620 5621 // Pm_base[i-len] = t0; 5622 // t0 = t1; t1 = t2; t2 = 0; 5623 // } 5624 5625 // while (t0) 5626 // t0 = sub(Pm_base, Pn_base, t0, len); 5627 // } 5628 5629 /** 5630 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5631 * multiplies than Montgomery multiplication so it should be up to 5632 * 25% faster. However, its loop control is more complex and it 5633 * may actually run slower on some machines. 5634 * 5635 * Arguments: 5636 * 5637 * Inputs: 5638 * c_rarg0 - int array elements a 5639 * c_rarg1 - int array elements n (the modulus) 5640 * c_rarg2 - int length 5641 * c_rarg3 - int inv 5642 * c_rarg4 - int array elements m (the result) 5643 * 5644 */ 5645 address generate_square() { 5646 Label argh; 5647 bind(argh); 5648 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5649 5650 align(CodeEntryAlignment); 5651 address entry = pc(); 5652 5653 enter(); 5654 5655 // Make room. 5656 cmpw(Rlen, 512); 5657 br(Assembler::HI, argh); 5658 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5659 andr(sp, Ra, -2 * wordSize); 5660 5661 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5662 5663 { 5664 // Copy input args, reversing as we go. We use Ra as a 5665 // temporary variable. 5666 reverse(Ra, Pa_base, Rlen, t0, t1); 5667 reverse(Ra, Pn_base, Rlen, t0, t1); 5668 } 5669 5670 // Push all call-saved registers and also Pm_base which we'll need 5671 // at the end. 5672 save_regs(); 5673 5674 mov(Pm_base, Ra); 5675 5676 mov(t0, zr); 5677 mov(t1, zr); 5678 mov(t2, zr); 5679 5680 block_comment("for (int i = 0; i < len; i++) {"); 5681 mov(Ri, zr); { 5682 Label loop, end; 5683 bind(loop); 5684 cmp(Ri, Rlen); 5685 br(Assembler::GE, end); 5686 5687 pre1(Ri); 5688 5689 block_comment("for (j = (i+1)/2; j; j--) {"); { 5690 add(Rj, Ri, 1); 5691 lsr(Rj, Rj, 1); 5692 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5693 } block_comment(" } // j"); 5694 5695 last_squaring(Ri); 5696 5697 block_comment(" for (j = i/2; j; j--) {"); { 5698 lsr(Rj, Ri, 1); 5699 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5700 } block_comment(" } // j"); 5701 5702 post1_squaring(); 5703 add(Ri, Ri, 1); 5704 cmp(Ri, Rlen); 5705 br(Assembler::LT, loop); 5706 5707 bind(end); 5708 block_comment("} // i"); 5709 } 5710 5711 block_comment("for (int i = len; i < 2*len; i++) {"); 5712 mov(Ri, Rlen); { 5713 Label loop, end; 5714 bind(loop); 5715 cmp(Ri, Rlen, Assembler::LSL, 1); 5716 br(Assembler::GE, end); 5717 5718 pre2(Ri, Rlen); 5719 5720 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5721 lsl(Rj, Rlen, 1); 5722 sub(Rj, Rj, Ri); 5723 sub(Rj, Rj, 1); 5724 lsr(Rj, Rj, 1); 5725 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5726 } block_comment(" } // j"); 5727 5728 last_squaring(Ri); 5729 5730 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5731 lsl(Rj, Rlen, 1); 5732 sub(Rj, Rj, Ri); 5733 lsr(Rj, Rj, 1); 5734 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5735 } block_comment(" } // j"); 5736 5737 post2(Ri, Rlen); 5738 add(Ri, Ri, 1); 5739 cmp(Ri, Rlen, Assembler::LSL, 1); 5740 5741 br(Assembler::LT, loop); 5742 bind(end); 5743 block_comment("} // i"); 5744 } 5745 5746 normalize(Rlen); 5747 5748 mov(Ra, Pm_base); // Save Pm_base in Ra 5749 
restore_regs(); // Restore caller's Pm_base 5750 5751 // Copy our result into caller's Pm_base 5752 reverse(Pm_base, Ra, Rlen, t0, t1); 5753 5754 leave(); 5755 ret(lr); 5756 5757 return entry; 5758 } 5759 // In C, approximately: 5760 5761 // void 5762 // montgomery_square(julong Pa_base[], julong Pn_base[], 5763 // julong Pm_base[], julong inv, int len) { 5764 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5765 // julong *Pa, *Pb, *Pn, *Pm; 5766 // julong Ra, Rb, Rn, Rm; 5767 5768 // int i; 5769 5770 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5771 5772 // for (i = 0; i < len; i++) { 5773 // int j; 5774 5775 // Pa = Pa_base; 5776 // Pb = Pa_base + i; 5777 // Pm = Pm_base; 5778 // Pn = Pn_base + i; 5779 5780 // Ra = *Pa; 5781 // Rb = *Pb; 5782 // Rm = *Pm; 5783 // Rn = *Pn; 5784 5785 // int iters = (i+1)/2; 5786 // for (j = 0; iters--; j++) { 5787 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5788 // MACC2(Ra, Rb, t0, t1, t2); 5789 // Ra = *++Pa; 5790 // Rb = *--Pb; 5791 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5792 // MACC(Rm, Rn, t0, t1, t2); 5793 // Rm = *++Pm; 5794 // Rn = *--Pn; 5795 // } 5796 // if ((i & 1) == 0) { 5797 // assert(Ra == Pa_base[j], "must be"); 5798 // MACC(Ra, Ra, t0, t1, t2); 5799 // } 5800 // iters = i/2; 5801 // assert(iters == i-j, "must be"); 5802 // for (; iters--; j++) { 5803 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5804 // MACC(Rm, Rn, t0, t1, t2); 5805 // Rm = *++Pm; 5806 // Rn = *--Pn; 5807 // } 5808 5809 // *Pm = Rm = t0 * inv; 5810 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5811 // MACC(Rm, Rn, t0, t1, t2); 5812 5813 // assert(t0 == 0, "broken Montgomery multiply"); 5814 5815 // t0 = t1; t1 = t2; t2 = 0; 5816 // } 5817 5818 // for (i = len; i < 2*len; i++) { 5819 // int start = i-len+1; 5820 // int end = start + (len - start)/2; 5821 // int j; 5822 5823 // Pa = Pa_base + i-len; 5824 // Pb = Pa_base + len; 5825 // Pm = Pm_base + i-len; 5826 // Pn = Pn_base + len; 5827 5828 // Ra = *++Pa; 5829 // Rb = *--Pb; 5830 // Rm = *++Pm; 5831 // Rn = *--Pn; 5832 5833 // int iters = (2*len-i-1)/2; 5834 // assert(iters == end-start, "must be"); 5835 // for (j = start; iters--; j++) { 5836 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5837 // MACC2(Ra, Rb, t0, t1, t2); 5838 // Ra = *++Pa; 5839 // Rb = *--Pb; 5840 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5841 // MACC(Rm, Rn, t0, t1, t2); 5842 // Rm = *++Pm; 5843 // Rn = *--Pn; 5844 // } 5845 // if ((i & 1) == 0) { 5846 // assert(Ra == Pa_base[j], "must be"); 5847 // MACC(Ra, Ra, t0, t1, t2); 5848 // } 5849 // iters = (2*len-i)/2; 5850 // assert(iters == len-j, "must be"); 5851 // for (; iters--; j++) { 5852 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5853 // MACC(Rm, Rn, t0, t1, t2); 5854 // Rm = *++Pm; 5855 // Rn = *--Pn; 5856 // } 5857 // Pm_base[i-len] = t0; 5858 // t0 = t1; t1 = t2; t2 = 0; 5859 // } 5860 5861 // while (t0) 5862 // t0 = sub(Pm_base, Pn_base, t0, len); 5863 // } 5864 }; 5865 5866 5867 // Initialization 5868 void generate_initial() { 5869 // Generate initial stubs and initializes the entry points 5870 5871 // entry points that exist in all platforms Note: This is code 5872 // that could be shared among different platforms - however the 5873 // benefit seems to be smaller than the disadvantage of having a 5874 // much more complicated generator structure. See also comment in 5875 // stubRoutines.hpp. 
5876 5877 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5878 5879 StubRoutines::_call_stub_entry = 5880 generate_call_stub(StubRoutines::_call_stub_return_address); 5881 5882 // is referenced by megamorphic call 5883 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5884 5885 // Build this early so it's available for the interpreter. 5886 StubRoutines::_throw_StackOverflowError_entry = 5887 generate_throw_exception("StackOverflowError throw_exception", 5888 CAST_FROM_FN_PTR(address, 5889 SharedRuntime::throw_StackOverflowError)); 5890 StubRoutines::_throw_delayed_StackOverflowError_entry = 5891 generate_throw_exception("delayed StackOverflowError throw_exception", 5892 CAST_FROM_FN_PTR(address, 5893 SharedRuntime::throw_delayed_StackOverflowError)); 5894 if (UseCRC32Intrinsics) { 5895 // set table address before stub generation which use it 5896 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5897 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5898 } 5899 5900 if (UseCRC32CIntrinsics) { 5901 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5902 } 5903 5904 // Disabled until JDK-8210858 is fixed 5905 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5906 // StubRoutines::_dlog = generate_dlog(); 5907 // } 5908 5909 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5910 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 5911 } 5912 5913 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5914 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 5915 } 5916 5917 // Safefetch stubs. 5918 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5919 &StubRoutines::_safefetch32_fault_pc, 5920 &StubRoutines::_safefetch32_continuation_pc); 5921 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5922 &StubRoutines::_safefetchN_fault_pc, 5923 &StubRoutines::_safefetchN_continuation_pc); 5924 } 5925 5926 void generate_all() { 5927 // support for verify_oop (must happen after universe_init) 5928 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5929 StubRoutines::_throw_AbstractMethodError_entry = 5930 generate_throw_exception("AbstractMethodError throw_exception", 5931 CAST_FROM_FN_PTR(address, 5932 SharedRuntime:: 5933 throw_AbstractMethodError)); 5934 5935 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5936 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5937 CAST_FROM_FN_PTR(address, 5938 SharedRuntime:: 5939 throw_IncompatibleClassChangeError)); 5940 5941 StubRoutines::_throw_NullPointerException_at_call_entry = 5942 generate_throw_exception("NullPointerException at call throw_exception", 5943 CAST_FROM_FN_PTR(address, 5944 SharedRuntime:: 5945 throw_NullPointerException_at_call)); 5946 5947 // arraycopy stubs used by compilers 5948 generate_arraycopy_stubs(); 5949 5950 // has negatives stub for large arrays. 5951 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5952 5953 // array equals stub for large arrays. 5954 if (!UseSimpleArrayEquals) { 5955 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5956 } 5957 5958 generate_compare_long_strings(); 5959 5960 generate_string_indexof_stubs(); 5961 5962 // byte_array_inflate stub for large arrays. 
5963 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5964 5965 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5966 if (bs_nm != NULL) { 5967 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 5968 } 5969 #ifdef COMPILER2 5970 if (UseMultiplyToLenIntrinsic) { 5971 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5972 } 5973 5974 if (UseSquareToLenIntrinsic) { 5975 StubRoutines::_squareToLen = generate_squareToLen(); 5976 } 5977 5978 if (UseMulAddIntrinsic) { 5979 StubRoutines::_mulAdd = generate_mulAdd(); 5980 } 5981 5982 if (UseMontgomeryMultiplyIntrinsic) { 5983 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5984 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5985 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5986 } 5987 5988 if (UseMontgomerySquareIntrinsic) { 5989 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5990 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5991 // We use generate_multiply() rather than generate_square() 5992 // because it's faster for the sizes of modulus we care about. 5993 StubRoutines::_montgomerySquare = g.generate_multiply(); 5994 } 5995 #endif // COMPILER2 5996 5997 // generate GHASH intrinsics code 5998 if (UseGHASHIntrinsics) { 5999 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 6000 } 6001 6002 // data cache line writeback 6003 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 6004 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 6005 6006 if (UseAESIntrinsics) { 6007 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 6008 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 6009 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 6010 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 6011 } 6012 6013 if (UseSHA1Intrinsics) { 6014 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 6015 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 6016 } 6017 if (UseSHA256Intrinsics) { 6018 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 6019 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 6020 } 6021 if (UseSHA512Intrinsics) { 6022 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 6023 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 6024 } 6025 6026 // generate Adler32 intrinsics code 6027 if (UseAdler32Intrinsics) { 6028 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6029 } 6030 6031 StubRoutines::aarch64::set_completed(); 6032 } 6033 6034 public: 6035 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 6036 if (all) { 6037 generate_all(); 6038 } else { 6039 generate_initial(); 6040 } 6041 } 6042 }; // end class declaration 6043 6044 #define UCM_TABLE_MAX_ENTRIES 8 6045 void StubGenerator_generate(CodeBuffer* code, bool all) { 6046 if (UnsafeCopyMemory::_table == NULL) { 6047 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 6048 } 6049 StubGenerator g(code, all); 6050 }