1 /* 2 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "gc/shared/barrierSet.hpp" 30 #include "gc/shared/barrierSetAssembler.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "memory/universe.hpp" 33 #include "nativeInst_aarch64.hpp" 34 #include "oops/instanceOop.hpp" 35 #include "oops/method.hpp" 36 #include "oops/objArrayKlass.hpp" 37 #include "oops/oop.inline.hpp" 38 #include "prims/methodHandles.hpp" 39 #include "runtime/frame.inline.hpp" 40 #include "runtime/handles.inline.hpp" 41 #include "runtime/sharedRuntime.hpp" 42 #include "runtime/stubCodeGenerator.hpp" 43 #include "runtime/stubRoutines.hpp" 44 #include "runtime/thread.inline.hpp" 45 #include "utilities/align.hpp" 46 #include "utilities/powerOfTwo.hpp" 47 #ifdef COMPILER2 48 #include "opto/runtime.hpp" 49 #endif 50 #if INCLUDE_ZGC 51 #include "gc/z/zThreadLocalData.hpp" 52 #endif 53 54 // Declaration and definition of StubGenerator (no .hpp file). 55 // For a more detailed description of the stub routine structure 56 // see the comment in stubRoutines.hpp 57 58 #undef __ 59 #define __ _masm-> 60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 61 62 #ifdef PRODUCT 63 #define BLOCK_COMMENT(str) /* nothing */ 64 #else 65 #define BLOCK_COMMENT(str) __ block_comment(str) 66 #endif 67 68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 69 70 // Stub Code definitions 71 72 class StubGenerator: public StubCodeGenerator { 73 private: 74 75 #ifdef PRODUCT 76 #define inc_counter_np(counter) ((void)0) 77 #else 78 void inc_counter_np_(int& counter) { 79 __ lea(rscratch2, ExternalAddress((address)&counter)); 80 __ ldrw(rscratch1, Address(rscratch2)); 81 __ addw(rscratch1, rscratch1, 1); 82 __ strw(rscratch1, Address(rscratch2)); 83 } 84 #define inc_counter_np(counter) \ 85 BLOCK_COMMENT("inc_counter " #counter); \ 86 inc_counter_np_(counter); 87 #endif 88 89 // Call stubs are used to call Java from C 90 // 91 // Arguments: 92 // c_rarg0: call wrapper address address 93 // c_rarg1: result address 94 // c_rarg2: result type BasicType 95 // c_rarg3: method Method* 96 // c_rarg4: (interpreter) entry point address 97 // c_rarg5: parameters intptr_t* 98 // c_rarg6: parameter size (in words) int 99 // c_rarg7: thread Thread* 100 // 101 // There is no return from the stub itself as any Java result 102 // is written to result 103 // 104 // we save r30 (lr) as the return PC at the base of the frame and 105 // link r29 (fp) below it as the frame pointer installing sp (r31) 106 // into fp. 107 // 108 // we save r0-r7, which accounts for all the c arguments. 109 // 110 // TODO: strictly do we need to save them all? they are treated as 111 // volatile by C so could we omit saving the ones we are going to 112 // place in global registers (thread? method?) or those we only use 113 // during setup of the Java call? 114 // 115 // we don't need to save r8 which C uses as an indirect result location 116 // return register. 117 // 118 // we don't need to save r9-r15 which both C and Java treat as 119 // volatile 120 // 121 // we don't need to save r16-18 because Java does not use them 122 // 123 // we save r19-r28 which Java uses as scratch registers and C 124 // expects to be callee-save 125 // 126 // we save the bottom 64 bits of each value stored in v8-v15; it is 127 // the responsibility of the caller to preserve larger values. 128 // 129 // so the stub frame looks like this when we enter Java code 130 // 131 // [ return_from_Java ] <--- sp 132 // [ argument word n ] 133 // ... 
134 // -27 [ argument word 1 ] 135 // -26 [ saved v15 ] <--- sp_after_call 136 // -25 [ saved v14 ] 137 // -24 [ saved v13 ] 138 // -23 [ saved v12 ] 139 // -22 [ saved v11 ] 140 // -21 [ saved v10 ] 141 // -20 [ saved v9 ] 142 // -19 [ saved v8 ] 143 // -18 [ saved r28 ] 144 // -17 [ saved r27 ] 145 // -16 [ saved r26 ] 146 // -15 [ saved r25 ] 147 // -14 [ saved r24 ] 148 // -13 [ saved r23 ] 149 // -12 [ saved r22 ] 150 // -11 [ saved r21 ] 151 // -10 [ saved r20 ] 152 // -9 [ saved r19 ] 153 // -8 [ call wrapper (r0) ] 154 // -7 [ result (r1) ] 155 // -6 [ result type (r2) ] 156 // -5 [ method (r3) ] 157 // -4 [ entry point (r4) ] 158 // -3 [ parameters (r5) ] 159 // -2 [ parameter size (r6) ] 160 // -1 [ thread (r7) ] 161 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 162 // 1 [ saved lr (r30) ] 163 164 // Call stub stack layout word offsets from fp 165 enum call_stub_layout { 166 sp_after_call_off = -26, 167 168 d15_off = -26, 169 d13_off = -24, 170 d11_off = -22, 171 d9_off = -20, 172 173 r28_off = -18, 174 r26_off = -16, 175 r24_off = -14, 176 r22_off = -12, 177 r20_off = -10, 178 call_wrapper_off = -8, 179 result_off = -7, 180 result_type_off = -6, 181 method_off = -5, 182 entry_point_off = -4, 183 parameter_size_off = -2, 184 thread_off = -1, 185 fp_f = 0, 186 retaddr_off = 1, 187 }; 188 189 address generate_call_stub(address& return_address) { 190 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 191 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 192 "adjust this code"); 193 194 StubCodeMark mark(this, "StubRoutines", "call_stub"); 195 address start = __ pc(); 196 197 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 198 199 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 200 const Address result (rfp, result_off * wordSize); 201 const Address result_type (rfp, result_type_off * wordSize); 202 const Address method (rfp, method_off * wordSize); 203 const Address entry_point (rfp, entry_point_off * wordSize); 204 const Address parameter_size(rfp, parameter_size_off * wordSize); 205 206 const Address thread (rfp, thread_off * wordSize); 207 208 const Address d15_save (rfp, d15_off * wordSize); 209 const Address d13_save (rfp, d13_off * wordSize); 210 const Address d11_save (rfp, d11_off * wordSize); 211 const Address d9_save (rfp, d9_off * wordSize); 212 213 const Address r28_save (rfp, r28_off * wordSize); 214 const Address r26_save (rfp, r26_off * wordSize); 215 const Address r24_save (rfp, r24_off * wordSize); 216 const Address r22_save (rfp, r22_off * wordSize); 217 const Address r20_save (rfp, r20_off * wordSize); 218 219 // stub code 220 221 address aarch64_entry = __ pc(); 222 223 // set up frame and move sp to end of save area 224 __ enter(); 225 __ sub(sp, rfp, -sp_after_call_off * wordSize); 226 227 // save register parameters and Java scratch/global registers 228 // n.b. 
we save thread even though it gets installed in 229 // rthread because we want to sanity check rthread later 230 __ str(c_rarg7, thread); 231 __ strw(c_rarg6, parameter_size); 232 __ stp(c_rarg4, c_rarg5, entry_point); 233 __ stp(c_rarg2, c_rarg3, result_type); 234 __ stp(c_rarg0, c_rarg1, call_wrapper); 235 236 __ stp(r20, r19, r20_save); 237 __ stp(r22, r21, r22_save); 238 __ stp(r24, r23, r24_save); 239 __ stp(r26, r25, r26_save); 240 __ stp(r28, r27, r28_save); 241 242 __ stpd(v9, v8, d9_save); 243 __ stpd(v11, v10, d11_save); 244 __ stpd(v13, v12, d13_save); 245 __ stpd(v15, v14, d15_save); 246 247 // install Java thread in global register now we have saved 248 // whatever value it held 249 __ mov(rthread, c_rarg7); 250 // And method 251 __ mov(rmethod, c_rarg3); 252 253 // set up the heapbase register 254 __ reinit_heapbase(); 255 256 #ifdef ASSERT 257 // make sure we have no pending exceptions 258 { 259 Label L; 260 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 261 __ cmp(rscratch1, (u1)NULL_WORD); 262 __ br(Assembler::EQ, L); 263 __ stop("StubRoutines::call_stub: entered with pending exception"); 264 __ BIND(L); 265 } 266 #endif 267 // pass parameters if any 268 __ mov(esp, sp); 269 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 270 __ andr(sp, rscratch1, -2 * wordSize); 271 272 BLOCK_COMMENT("pass parameters if any"); 273 Label parameters_done; 274 // parameter count is still in c_rarg6 275 // and parameter pointer identifying param 1 is in c_rarg5 276 __ cbzw(c_rarg6, parameters_done); 277 278 address loop = __ pc(); 279 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 280 __ subsw(c_rarg6, c_rarg6, 1); 281 __ push(rscratch1); 282 __ br(Assembler::GT, loop); 283 284 __ BIND(parameters_done); 285 286 // call Java entry -- passing methdoOop, and current sp 287 // rmethod: Method* 288 // r13: sender sp 289 BLOCK_COMMENT("call Java function"); 290 __ mov(r13, sp); 291 __ blr(c_rarg4); 292 293 // we do this here because the notify will already have been done 294 // if we get to the next instruction via an exception 295 // 296 // n.b. adding this instruction here affects the calculation of 297 // whether or not a routine returns to the call stub (used when 298 // doing stack walks) since the normal test is to check the return 299 // pc against the address saved below. so we may need to allow for 300 // this extra instruction in the check. 301 302 // save current address for use by exception handling code 303 304 return_address = __ pc(); 305 306 // store result depending on type (everything that is not 307 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 308 // n.b. 
this assumes Java returns an integral result in r0 309 // and a floating result in j_farg0 310 __ ldr(j_rarg2, result); 311 Label is_long, is_float, is_double, exit; 312 __ ldr(j_rarg1, result_type); 313 __ cmp(j_rarg1, (u1)T_OBJECT); 314 __ br(Assembler::EQ, is_long); 315 __ cmp(j_rarg1, (u1)T_LONG); 316 __ br(Assembler::EQ, is_long); 317 __ cmp(j_rarg1, (u1)T_FLOAT); 318 __ br(Assembler::EQ, is_float); 319 __ cmp(j_rarg1, (u1)T_DOUBLE); 320 __ br(Assembler::EQ, is_double); 321 322 // handle T_INT case 323 __ strw(r0, Address(j_rarg2)); 324 325 __ BIND(exit); 326 327 // pop parameters 328 __ sub(esp, rfp, -sp_after_call_off * wordSize); 329 330 #ifdef ASSERT 331 // verify that threads correspond 332 { 333 Label L, S; 334 __ ldr(rscratch1, thread); 335 __ cmp(rthread, rscratch1); 336 __ br(Assembler::NE, S); 337 __ get_thread(rscratch1); 338 __ cmp(rthread, rscratch1); 339 __ br(Assembler::EQ, L); 340 __ BIND(S); 341 __ stop("StubRoutines::call_stub: threads must correspond"); 342 __ BIND(L); 343 } 344 #endif 345 346 // restore callee-save registers 347 __ ldpd(v15, v14, d15_save); 348 __ ldpd(v13, v12, d13_save); 349 __ ldpd(v11, v10, d11_save); 350 __ ldpd(v9, v8, d9_save); 351 352 __ ldp(r28, r27, r28_save); 353 __ ldp(r26, r25, r26_save); 354 __ ldp(r24, r23, r24_save); 355 __ ldp(r22, r21, r22_save); 356 __ ldp(r20, r19, r20_save); 357 358 __ ldp(c_rarg0, c_rarg1, call_wrapper); 359 __ ldrw(c_rarg2, result_type); 360 __ ldr(c_rarg3, method); 361 __ ldp(c_rarg4, c_rarg5, entry_point); 362 __ ldp(c_rarg6, c_rarg7, parameter_size); 363 364 // leave frame and return to caller 365 __ leave(); 366 __ ret(lr); 367 368 // handle return types different from T_INT 369 370 __ BIND(is_long); 371 __ str(r0, Address(j_rarg2, 0)); 372 __ br(Assembler::AL, exit); 373 374 __ BIND(is_float); 375 __ strs(j_farg0, Address(j_rarg2, 0)); 376 __ br(Assembler::AL, exit); 377 378 __ BIND(is_double); 379 __ strd(j_farg0, Address(j_rarg2, 0)); 380 __ br(Assembler::AL, exit); 381 382 return start; 383 } 384 385 // Return point for a Java call if there's an exception thrown in 386 // Java code. The exception is caught and transformed into a 387 // pending exception stored in JavaThread that can be tested from 388 // within the VM. 389 // 390 // Note: Usually the parameters are removed by the callee. In case 391 // of an exception crossing an activation frame boundary, that is 392 // not the case if the callee is compiled code => need to setup the 393 // rsp. 
394 // 395 // r0: exception oop 396 397 address generate_catch_exception() { 398 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 399 address start = __ pc(); 400 401 // same as in generate_call_stub(): 402 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 403 const Address thread (rfp, thread_off * wordSize); 404 405 #ifdef ASSERT 406 // verify that threads correspond 407 { 408 Label L, S; 409 __ ldr(rscratch1, thread); 410 __ cmp(rthread, rscratch1); 411 __ br(Assembler::NE, S); 412 __ get_thread(rscratch1); 413 __ cmp(rthread, rscratch1); 414 __ br(Assembler::EQ, L); 415 __ bind(S); 416 __ stop("StubRoutines::catch_exception: threads must correspond"); 417 __ bind(L); 418 } 419 #endif 420 421 // set pending exception 422 __ verify_oop(r0); 423 424 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 425 __ mov(rscratch1, (address)__FILE__); 426 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 427 __ movw(rscratch1, (int)__LINE__); 428 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 429 430 // complete return to VM 431 assert(StubRoutines::_call_stub_return_address != NULL, 432 "_call_stub_return_address must have been generated before"); 433 __ b(StubRoutines::_call_stub_return_address); 434 435 return start; 436 } 437 438 // Continuation point for runtime calls returning with a pending 439 // exception. The pending exception check happened in the runtime 440 // or native call stub. The pending exception in Thread is 441 // converted into a Java-level exception. 442 // 443 // Contract with Java-level exception handlers: 444 // r0: exception 445 // r3: throwing pc 446 // 447 // NOTE: At entry of this stub, exception-pc must be in LR !! 448 449 // NOTE: this is always used as a jump target within generated code 450 // so it just needs to be generated code wiht no x86 prolog 451 452 address generate_forward_exception() { 453 StubCodeMark mark(this, "StubRoutines", "forward exception"); 454 address start = __ pc(); 455 456 // Upon entry, LR points to the return address returning into 457 // Java (interpreted or compiled) code; i.e., the return address 458 // becomes the throwing pc. 459 // 460 // Arguments pushed before the runtime call are still on the stack 461 // but the exception handler will reset the stack pointer -> 462 // ignore them. A potential result in registers can be ignored as 463 // well. 464 465 #ifdef ASSERT 466 // make sure this code is only executed if there is a pending exception 467 { 468 Label L; 469 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 470 __ cbnz(rscratch1, L); 471 __ stop("StubRoutines::forward exception: no pending exception (1)"); 472 __ bind(L); 473 } 474 #endif 475 476 // compute exception handler into r19 477 478 // call the VM to find the handler address associated with the 479 // caller address. pass thread in r0 and caller pc (ret address) 480 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 481 // the stack. 482 __ mov(c_rarg1, lr); 483 // lr will be trashed by the VM call so we move it to R19 484 // (callee-saved) because we also need to pass it to the handler 485 // returned by this call. 486 __ mov(r19, lr); 487 BLOCK_COMMENT("call exception_handler_for_return_address"); 488 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 489 SharedRuntime::exception_handler_for_return_address), 490 rthread, c_rarg1); 491 // we should not really care that lr is no longer the callee 492 // address. 
we saved the value the handler needs in r19 so we can 493 // just copy it to r3. however, the C2 handler will push its own 494 // frame and then calls into the VM and the VM code asserts that 495 // the PC for the frame above the handler belongs to a compiled 496 // Java method. So, we restore lr here to satisfy that assert. 497 __ mov(lr, r19); 498 // setup r0 & r3 & clear pending exception 499 __ mov(r3, r19); 500 __ mov(r19, r0); 501 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 502 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 503 504 #ifdef ASSERT 505 // make sure exception is set 506 { 507 Label L; 508 __ cbnz(r0, L); 509 __ stop("StubRoutines::forward exception: no pending exception (2)"); 510 __ bind(L); 511 } 512 #endif 513 514 // continue at exception handler 515 // r0: exception 516 // r3: throwing pc 517 // r19: exception handler 518 __ verify_oop(r0); 519 __ br(r19); 520 521 return start; 522 } 523 524 // Non-destructive plausibility checks for oops 525 // 526 // Arguments: 527 // r0: oop to verify 528 // rscratch1: error message 529 // 530 // Stack after saving c_rarg3: 531 // [tos + 0]: saved c_rarg3 532 // [tos + 1]: saved c_rarg2 533 // [tos + 2]: saved lr 534 // [tos + 3]: saved rscratch2 535 // [tos + 4]: saved r0 536 // [tos + 5]: saved rscratch1 537 address generate_verify_oop() { 538 539 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 540 address start = __ pc(); 541 542 Label exit, error; 543 544 // save c_rarg2 and c_rarg3 545 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 546 547 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 548 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 549 __ ldr(c_rarg3, Address(c_rarg2)); 550 __ add(c_rarg3, c_rarg3, 1); 551 __ str(c_rarg3, Address(c_rarg2)); 552 553 // object is in r0 554 // make sure object is 'reasonable' 555 __ cbz(r0, exit); // if obj is NULL it is OK 556 557 #if INCLUDE_ZGC 558 if (UseZGC) { 559 // Check if mask is good. 560 // verifies that ZAddressBadMask & r0 == 0 561 __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset())); 562 __ andr(c_rarg2, r0, c_rarg3); 563 __ cbnz(c_rarg2, error); 564 } 565 #endif 566 567 // Check if the oop is in the right area of memory 568 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 569 __ andr(c_rarg2, r0, c_rarg3); 570 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 571 572 // Compare c_rarg2 and c_rarg3. We don't use a compare 573 // instruction here because the flags register is live. 574 __ eor(c_rarg2, c_rarg2, c_rarg3); 575 __ cbnz(c_rarg2, error); 576 577 // make sure klass is 'reasonable', which is not zero. 
578 __ load_klass(r0, r0); // get klass 579 __ cbz(r0, error); // if klass is NULL it is broken 580 581 // return if everything seems ok 582 __ bind(exit); 583 584 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 585 __ ret(lr); 586 587 // handle errors 588 __ bind(error); 589 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 590 591 __ push(RegSet::range(r0, r29), sp); 592 // debug(char* msg, int64_t pc, int64_t regs[]) 593 __ mov(c_rarg0, rscratch1); // pass address of error message 594 __ mov(c_rarg1, lr); // pass return address 595 __ mov(c_rarg2, sp); // pass address of regs on stack 596 #ifndef PRODUCT 597 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 598 #endif 599 BLOCK_COMMENT("call MacroAssembler::debug"); 600 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 601 __ blr(rscratch1); 602 __ hlt(0); 603 604 return start; 605 } 606 607 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 608 609 // Generate indices for iota vector. 610 address generate_iota_indices(const char *stub_name) { 611 __ align(CodeEntryAlignment); 612 StubCodeMark mark(this, "StubRoutines", stub_name); 613 address start = __ pc(); 614 __ emit_data64(0x0706050403020100, relocInfo::none); 615 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none); 616 return start; 617 } 618 619 // The inner part of zero_words(). This is the bulk operation, 620 // zeroing words in blocks, possibly using DC ZVA to do it. The 621 // caller is responsible for zeroing the last few words. 622 // 623 // Inputs: 624 // r10: the HeapWord-aligned base address of an array to zero. 625 // r11: the count in HeapWords, r11 > 0. 626 // 627 // Returns r10 and r11, adjusted for the caller to clear. 628 // r10: the base address of the tail of words left to clear. 629 // r11: the number of words in the tail. 630 // r11 < MacroAssembler::zero_words_block_size. 631 632 address generate_zero_blocks() { 633 Label done; 634 Label base_aligned; 635 636 Register base = r10, cnt = r11; 637 638 __ align(CodeEntryAlignment); 639 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 640 address start = __ pc(); 641 642 if (UseBlockZeroing) { 643 int zva_length = VM_Version::zva_length(); 644 645 // Ensure ZVA length can be divided by 16. This is required by 646 // the subsequent operations. 647 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 648 649 __ tbz(base, 3, base_aligned); 650 __ str(zr, Address(__ post(base, 8))); 651 __ sub(cnt, cnt, 1); 652 __ bind(base_aligned); 653 654 // Ensure count >= zva_length * 2 so that it still deserves a zva after 655 // alignment. 656 Label small; 657 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 658 __ subs(rscratch1, cnt, low_limit >> 3); 659 __ br(Assembler::LT, small); 660 __ zero_dcache_blocks(base, cnt); 661 __ bind(small); 662 } 663 664 { 665 // Number of stp instructions we'll unroll 666 const int unroll = 667 MacroAssembler::zero_words_block_size / 2; 668 // Clear the remaining blocks. 669 Label loop; 670 __ subs(cnt, cnt, unroll * 2); 671 __ br(Assembler::LT, done); 672 __ bind(loop); 673 for (int i = 0; i < unroll; i++) 674 __ stp(zr, zr, __ post(base, 16)); 675 __ subs(cnt, cnt, unroll * 2); 676 __ br(Assembler::GE, loop); 677 __ bind(done); 678 __ add(cnt, cnt, unroll * 2); 679 } 680 681 __ ret(lr); 682 683 return start; 684 } 685 686 687 typedef enum { 688 copy_forwards = 1, 689 copy_backwards = -1 690 } copy_direction; 691 692 // Bulk copy of blocks of 8 words. 
693 // 694 // count is a count of words. 695 // 696 // Precondition: count >= 8 697 // 698 // Postconditions: 699 // 700 // The least significant bit of count contains the remaining count 701 // of words to copy. The rest of count is trash. 702 // 703 // s and d are adjusted to point to the remaining words to copy 704 // 705 void generate_copy_longs(Label &start, Register s, Register d, Register count, 706 copy_direction direction) { 707 int unit = wordSize * direction; 708 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 709 710 int offset; 711 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 712 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 713 const Register stride = r13; 714 715 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 716 assert_different_registers(s, d, count, rscratch1); 717 718 Label again, drain; 719 const char *stub_name; 720 if (direction == copy_forwards) 721 stub_name = "forward_copy_longs"; 722 else 723 stub_name = "backward_copy_longs"; 724 725 __ align(CodeEntryAlignment); 726 727 StubCodeMark mark(this, "StubRoutines", stub_name); 728 729 __ bind(start); 730 731 Label unaligned_copy_long; 732 if (AvoidUnalignedAccesses) { 733 __ tbnz(d, 3, unaligned_copy_long); 734 } 735 736 if (direction == copy_forwards) { 737 __ sub(s, s, bias); 738 __ sub(d, d, bias); 739 } 740 741 #ifdef ASSERT 742 // Make sure we are never given < 8 words 743 { 744 Label L; 745 __ cmp(count, (u1)8); 746 __ br(Assembler::GE, L); 747 __ stop("genrate_copy_longs called with < 8 words"); 748 __ bind(L); 749 } 750 #endif 751 752 // Fill 8 registers 753 if (UseSIMDForMemoryOps) { 754 __ ldpq(v0, v1, Address(s, 4 * unit)); 755 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 756 } else { 757 __ ldp(t0, t1, Address(s, 2 * unit)); 758 __ ldp(t2, t3, Address(s, 4 * unit)); 759 __ ldp(t4, t5, Address(s, 6 * unit)); 760 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 761 } 762 763 __ subs(count, count, 16); 764 __ br(Assembler::LO, drain); 765 766 int prefetch = PrefetchCopyIntervalInBytes; 767 bool use_stride = false; 768 if (direction == copy_backwards) { 769 use_stride = prefetch > 256; 770 prefetch = -prefetch; 771 if (use_stride) __ mov(stride, prefetch); 772 } 773 774 __ bind(again); 775 776 if (PrefetchCopyIntervalInBytes > 0) 777 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 778 779 if (UseSIMDForMemoryOps) { 780 __ stpq(v0, v1, Address(d, 4 * unit)); 781 __ ldpq(v0, v1, Address(s, 4 * unit)); 782 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 783 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 784 } else { 785 __ stp(t0, t1, Address(d, 2 * unit)); 786 __ ldp(t0, t1, Address(s, 2 * unit)); 787 __ stp(t2, t3, Address(d, 4 * unit)); 788 __ ldp(t2, t3, Address(s, 4 * unit)); 789 __ stp(t4, t5, Address(d, 6 * unit)); 790 __ ldp(t4, t5, Address(s, 6 * unit)); 791 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 792 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 793 } 794 795 __ subs(count, count, 8); 796 __ br(Assembler::HS, again); 797 798 // Drain 799 __ bind(drain); 800 if (UseSIMDForMemoryOps) { 801 __ stpq(v0, v1, Address(d, 4 * unit)); 802 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 803 } else { 804 __ stp(t0, t1, Address(d, 2 * unit)); 805 __ stp(t2, t3, Address(d, 4 * unit)); 806 __ stp(t4, t5, Address(d, 6 * unit)); 807 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 808 } 809 810 { 811 Label L1, L2; 812 __ tbz(count, exact_log2(4), L1); 813 if (UseSIMDForMemoryOps) { 814 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 815 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 816 } else { 817 __ ldp(t0, t1, Address(s, 2 * unit)); 818 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 819 __ stp(t0, t1, Address(d, 2 * unit)); 820 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 821 } 822 __ bind(L1); 823 824 if (direction == copy_forwards) { 825 __ add(s, s, bias); 826 __ add(d, d, bias); 827 } 828 829 __ tbz(count, 1, L2); 830 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 831 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 832 __ bind(L2); 833 } 834 835 __ ret(lr); 836 837 if (AvoidUnalignedAccesses) { 838 Label drain, again; 839 // Register order for storing. Order is different for backward copy. 840 841 __ bind(unaligned_copy_long); 842 843 // source address is even aligned, target odd aligned 844 // 845 // when forward copying word pairs we read long pairs at offsets 846 // {0, 2, 4, 6} (in long words). when backwards copying we read 847 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 848 // address by -2 in the forwards case so we can compute the 849 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 850 // or -1. 851 // 852 // when forward copying we need to store 1 word, 3 pairs and 853 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a 854 // zero offset We adjust the destination by -1 which means we 855 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 856 // 857 // When backwards copyng we need to store 1 word, 3 pairs and 858 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 859 // offsets {1, 3, 5, 7, 8} * unit. 
860 861 if (direction == copy_forwards) { 862 __ sub(s, s, 16); 863 __ sub(d, d, 8); 864 } 865 866 // Fill 8 registers 867 // 868 // for forwards copy s was offset by -16 from the original input 869 // value of s so the register contents are at these offsets 870 // relative to the 64 bit block addressed by that original input 871 // and so on for each successive 64 byte block when s is updated 872 // 873 // t0 at offset 0, t1 at offset 8 874 // t2 at offset 16, t3 at offset 24 875 // t4 at offset 32, t5 at offset 40 876 // t6 at offset 48, t7 at offset 56 877 878 // for backwards copy s was not offset so the register contents 879 // are at these offsets into the preceding 64 byte block 880 // relative to that original input and so on for each successive 881 // preceding 64 byte block when s is updated. this explains the 882 // slightly counter-intuitive looking pattern of register usage 883 // in the stp instructions for backwards copy. 884 // 885 // t0 at offset -16, t1 at offset -8 886 // t2 at offset -32, t3 at offset -24 887 // t4 at offset -48, t5 at offset -40 888 // t6 at offset -64, t7 at offset -56 889 890 __ ldp(t0, t1, Address(s, 2 * unit)); 891 __ ldp(t2, t3, Address(s, 4 * unit)); 892 __ ldp(t4, t5, Address(s, 6 * unit)); 893 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 894 895 __ subs(count, count, 16); 896 __ br(Assembler::LO, drain); 897 898 int prefetch = PrefetchCopyIntervalInBytes; 899 bool use_stride = false; 900 if (direction == copy_backwards) { 901 use_stride = prefetch > 256; 902 prefetch = -prefetch; 903 if (use_stride) __ mov(stride, prefetch); 904 } 905 906 __ bind(again); 907 908 if (PrefetchCopyIntervalInBytes > 0) 909 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 910 911 if (direction == copy_forwards) { 912 // allowing for the offset of -8 the store instructions place 913 // registers into the target 64 bit block at the following 914 // offsets 915 // 916 // t0 at offset 0 917 // t1 at offset 8, t2 at offset 16 918 // t3 at offset 24, t4 at offset 32 919 // t5 at offset 40, t6 at offset 48 920 // t7 at offset 56 921 922 __ str(t0, Address(d, 1 * unit)); 923 __ stp(t1, t2, Address(d, 2 * unit)); 924 __ ldp(t0, t1, Address(s, 2 * unit)); 925 __ stp(t3, t4, Address(d, 4 * unit)); 926 __ ldp(t2, t3, Address(s, 4 * unit)); 927 __ stp(t5, t6, Address(d, 6 * unit)); 928 __ ldp(t4, t5, Address(s, 6 * unit)); 929 __ str(t7, Address(__ pre(d, 8 * unit))); 930 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 931 } else { 932 // d was not offset when we started so the registers are 933 // written into the 64 bit block preceding d with the following 934 // offsets 935 // 936 // t1 at offset -8 937 // t3 at offset -24, t0 at offset -16 938 // t5 at offset -48, t2 at offset -32 939 // t7 at offset -56, t4 at offset -48 940 // t6 at offset -64 941 // 942 // note that this matches the offsets previously noted for the 943 // loads 944 945 __ str(t1, Address(d, 1 * unit)); 946 __ stp(t3, t0, Address(d, 3 * unit)); 947 __ ldp(t0, t1, Address(s, 2 * unit)); 948 __ stp(t5, t2, Address(d, 5 * unit)); 949 __ ldp(t2, t3, Address(s, 4 * unit)); 950 __ stp(t7, t4, Address(d, 7 * unit)); 951 __ ldp(t4, t5, Address(s, 6 * unit)); 952 __ str(t6, Address(__ pre(d, 8 * unit))); 953 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 954 } 955 956 __ subs(count, count, 8); 957 __ br(Assembler::HS, again); 958 959 // Drain 960 // 961 // this uses the same pattern of offsets and register arguments 962 // as above 963 __ bind(drain); 964 if (direction == copy_forwards) { 
965 __ str(t0, Address(d, 1 * unit)); 966 __ stp(t1, t2, Address(d, 2 * unit)); 967 __ stp(t3, t4, Address(d, 4 * unit)); 968 __ stp(t5, t6, Address(d, 6 * unit)); 969 __ str(t7, Address(__ pre(d, 8 * unit))); 970 } else { 971 __ str(t1, Address(d, 1 * unit)); 972 __ stp(t3, t0, Address(d, 3 * unit)); 973 __ stp(t5, t2, Address(d, 5 * unit)); 974 __ stp(t7, t4, Address(d, 7 * unit)); 975 __ str(t6, Address(__ pre(d, 8 * unit))); 976 } 977 // now we need to copy any remaining part block which may 978 // include a 4 word block subblock and/or a 2 word subblock. 979 // bits 2 and 1 in the count are the tell-tale for whetehr we 980 // have each such subblock 981 { 982 Label L1, L2; 983 __ tbz(count, exact_log2(4), L1); 984 // this is the same as above but copying only 4 longs hence 985 // with ony one intervening stp between the str instructions 986 // but note that the offsets and registers still follow the 987 // same pattern 988 __ ldp(t0, t1, Address(s, 2 * unit)); 989 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 990 if (direction == copy_forwards) { 991 __ str(t0, Address(d, 1 * unit)); 992 __ stp(t1, t2, Address(d, 2 * unit)); 993 __ str(t3, Address(__ pre(d, 4 * unit))); 994 } else { 995 __ str(t1, Address(d, 1 * unit)); 996 __ stp(t3, t0, Address(d, 3 * unit)); 997 __ str(t2, Address(__ pre(d, 4 * unit))); 998 } 999 __ bind(L1); 1000 1001 __ tbz(count, 1, L2); 1002 // this is the same as above but copying only 2 longs hence 1003 // there is no intervening stp between the str instructions 1004 // but note that the offset and register patterns are still 1005 // the same 1006 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1007 if (direction == copy_forwards) { 1008 __ str(t0, Address(d, 1 * unit)); 1009 __ str(t1, Address(__ pre(d, 2 * unit))); 1010 } else { 1011 __ str(t1, Address(d, 1 * unit)); 1012 __ str(t0, Address(__ pre(d, 2 * unit))); 1013 } 1014 __ bind(L2); 1015 1016 // for forwards copy we need to re-adjust the offsets we 1017 // applied so that s and d are follow the last words written 1018 1019 if (direction == copy_forwards) { 1020 __ add(s, s, 16); 1021 __ add(d, d, 8); 1022 } 1023 1024 } 1025 1026 __ ret(lr); 1027 } 1028 } 1029 1030 // Small copy: less than 16 bytes. 1031 // 1032 // NB: Ignores all of the bits of count which represent more than 15 1033 // bytes, so a caller doesn't have to mask them. 1034 1035 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) { 1036 bool is_backwards = step < 0; 1037 size_t granularity = uabs(step); 1038 int direction = is_backwards ? -1 : 1; 1039 int unit = wordSize * direction; 1040 1041 Label Lword, Lint, Lshort, Lbyte; 1042 1043 assert(granularity 1044 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1045 1046 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6; 1047 1048 // ??? I don't know if this bit-test-and-branch is the right thing 1049 // to do. It does a lot of jumping, resulting in several 1050 // mispredicted branches. It might make more sense to do this 1051 // with something like Duff's device with a single computed branch. 
1052 1053 __ tbz(count, 3 - exact_log2(granularity), Lword); 1054 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1055 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1056 __ bind(Lword); 1057 1058 if (granularity <= sizeof (jint)) { 1059 __ tbz(count, 2 - exact_log2(granularity), Lint); 1060 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1061 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1062 __ bind(Lint); 1063 } 1064 1065 if (granularity <= sizeof (jshort)) { 1066 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1067 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1068 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1069 __ bind(Lshort); 1070 } 1071 1072 if (granularity <= sizeof (jbyte)) { 1073 __ tbz(count, 0, Lbyte); 1074 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1075 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1076 __ bind(Lbyte); 1077 } 1078 } 1079 1080 Label copy_f, copy_b; 1081 1082 // All-singing all-dancing memory copy. 1083 // 1084 // Copy count units of memory from s to d. The size of a unit is 1085 // step, which can be positive or negative depending on the direction 1086 // of copy. If is_aligned is false, we align the source address. 1087 // 1088 1089 void copy_memory(bool is_aligned, Register s, Register d, 1090 Register count, Register tmp, int step) { 1091 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1092 bool is_backwards = step < 0; 1093 int granularity = uabs(step); 1094 const Register t0 = r3, t1 = r4; 1095 1096 // <= 96 bytes do inline. Direction doesn't matter because we always 1097 // load all the data before writing anything 1098 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1099 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1100 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1101 const Register send = r17, dend = r18; 1102 1103 if (PrefetchCopyIntervalInBytes > 0) 1104 __ prfm(Address(s, 0), PLDL1KEEP); 1105 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1106 __ br(Assembler::HI, copy_big); 1107 1108 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1109 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1110 1111 __ cmp(count, u1(16/granularity)); 1112 __ br(Assembler::LS, copy16); 1113 1114 __ cmp(count, u1(64/granularity)); 1115 __ br(Assembler::HI, copy80); 1116 1117 __ cmp(count, u1(32/granularity)); 1118 __ br(Assembler::LS, copy32); 1119 1120 // 33..64 bytes 1121 if (UseSIMDForMemoryOps) { 1122 __ ldpq(v0, v1, Address(s, 0)); 1123 __ ldpq(v2, v3, Address(send, -32)); 1124 __ stpq(v0, v1, Address(d, 0)); 1125 __ stpq(v2, v3, Address(dend, -32)); 1126 } else { 1127 __ ldp(t0, t1, Address(s, 0)); 1128 __ ldp(t2, t3, Address(s, 16)); 1129 __ ldp(t4, t5, Address(send, -32)); 1130 __ ldp(t6, t7, Address(send, -16)); 1131 1132 __ stp(t0, t1, Address(d, 0)); 1133 __ stp(t2, t3, Address(d, 16)); 1134 __ stp(t4, t5, Address(dend, -32)); 1135 __ stp(t6, t7, Address(dend, -16)); 1136 } 1137 __ b(finish); 1138 1139 // 17..32 bytes 1140 __ bind(copy32); 1141 __ ldp(t0, t1, Address(s, 0)); 1142 __ ldp(t2, t3, Address(send, -16)); 1143 __ stp(t0, t1, Address(d, 0)); 1144 __ stp(t2, t3, Address(dend, -16)); 1145 __ b(finish); 1146 1147 // 65..80/96 bytes 1148 // (96 bytes if SIMD because we do 32 byes per instruction) 1149 __ bind(copy80); 1150 if (UseSIMDForMemoryOps) { 1151 __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0)); 1152 __ ldpq(v4, v5, Address(send, -32)); 1153 __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0)); 1154 __ stpq(v4, v5, Address(dend, -32)); 1155 } else { 1156 __ ldp(t0, t1, Address(s, 0)); 1157 __ ldp(t2, t3, Address(s, 16)); 1158 __ ldp(t4, t5, Address(s, 32)); 1159 __ ldp(t6, t7, Address(s, 48)); 1160 __ ldp(t8, t9, Address(send, -16)); 1161 1162 __ stp(t0, t1, Address(d, 0)); 1163 __ stp(t2, t3, Address(d, 16)); 1164 __ stp(t4, t5, Address(d, 32)); 1165 __ stp(t6, t7, Address(d, 48)); 1166 __ stp(t8, t9, Address(dend, -16)); 1167 } 1168 __ b(finish); 1169 1170 // 0..16 bytes 1171 __ bind(copy16); 1172 __ cmp(count, u1(8/granularity)); 1173 __ br(Assembler::LO, copy8); 1174 1175 // 8..16 bytes 1176 __ ldr(t0, Address(s, 0)); 1177 __ ldr(t1, Address(send, -8)); 1178 __ str(t0, Address(d, 0)); 1179 __ str(t1, Address(dend, -8)); 1180 __ b(finish); 1181 1182 if (granularity < 8) { 1183 // 4..7 bytes 1184 __ bind(copy8); 1185 __ tbz(count, 2 - exact_log2(granularity), copy4); 1186 __ ldrw(t0, Address(s, 0)); 1187 __ ldrw(t1, Address(send, -4)); 1188 __ strw(t0, Address(d, 0)); 1189 __ strw(t1, Address(dend, -4)); 1190 __ b(finish); 1191 if (granularity < 4) { 1192 // 0..3 bytes 1193 __ bind(copy4); 1194 __ cbz(count, finish); // get rid of 0 case 1195 if (granularity == 2) { 1196 __ ldrh(t0, Address(s, 0)); 1197 __ strh(t0, Address(d, 0)); 1198 } else { // granularity == 1 1199 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1200 // the first and last byte. 1201 // Handle the 3 byte case by loading and storing base + count/2 1202 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1203 // This does means in the 1 byte case we load/store the same 1204 // byte 3 times. 
1205 __ lsr(count, count, 1); 1206 __ ldrb(t0, Address(s, 0)); 1207 __ ldrb(t1, Address(send, -1)); 1208 __ ldrb(t2, Address(s, count)); 1209 __ strb(t0, Address(d, 0)); 1210 __ strb(t1, Address(dend, -1)); 1211 __ strb(t2, Address(d, count)); 1212 } 1213 __ b(finish); 1214 } 1215 } 1216 1217 __ bind(copy_big); 1218 if (is_backwards) { 1219 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1220 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1221 } 1222 1223 // Now we've got the small case out of the way we can align the 1224 // source address on a 2-word boundary. 1225 1226 Label aligned; 1227 1228 if (is_aligned) { 1229 // We may have to adjust by 1 word to get s 2-word-aligned. 1230 __ tbz(s, exact_log2(wordSize), aligned); 1231 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1232 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1233 __ sub(count, count, wordSize/granularity); 1234 } else { 1235 if (is_backwards) { 1236 __ andr(rscratch2, s, 2 * wordSize - 1); 1237 } else { 1238 __ neg(rscratch2, s); 1239 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1240 } 1241 // rscratch2 is the byte adjustment needed to align s. 1242 __ cbz(rscratch2, aligned); 1243 int shift = exact_log2(granularity); 1244 if (shift) __ lsr(rscratch2, rscratch2, shift); 1245 __ sub(count, count, rscratch2); 1246 1247 #if 0 1248 // ?? This code is only correct for a disjoint copy. It may or 1249 // may not make sense to use it in that case. 1250 1251 // Copy the first pair; s and d may not be aligned. 1252 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1253 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1254 1255 // Align s and d, adjust count 1256 if (is_backwards) { 1257 __ sub(s, s, rscratch2); 1258 __ sub(d, d, rscratch2); 1259 } else { 1260 __ add(s, s, rscratch2); 1261 __ add(d, d, rscratch2); 1262 } 1263 #else 1264 copy_memory_small(s, d, rscratch2, rscratch1, step); 1265 #endif 1266 } 1267 1268 __ bind(aligned); 1269 1270 // s is now 2-word-aligned. 1271 1272 // We have a count of units and some trailing bytes. Adjust the 1273 // count and do a bulk copy of words. 1274 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1275 if (direction == copy_forwards) 1276 __ bl(copy_f); 1277 else 1278 __ bl(copy_b); 1279 1280 // And the tail. 1281 copy_memory_small(s, d, count, tmp, step); 1282 1283 if (granularity >= 8) __ bind(copy8); 1284 if (granularity >= 4) __ bind(copy4); 1285 __ bind(finish); 1286 } 1287 1288 1289 void clobber_registers() { 1290 #ifdef ASSERT 1291 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1292 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1293 for (Register r = r3; r <= r18; r++) 1294 if (r != rscratch1) __ mov(r, rscratch1); 1295 #endif 1296 } 1297 1298 // Scan over array at a for count oops, verifying each one. 1299 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1300 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1301 Label loop, end; 1302 __ mov(rscratch1, a); 1303 __ mov(rscratch2, zr); 1304 __ bind(loop); 1305 __ cmp(rscratch2, count); 1306 __ br(Assembler::HS, end); 1307 if (size == (size_t)wordSize) { 1308 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1309 __ verify_oop(temp); 1310 } else { 1311 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1312 __ decode_heap_oop(temp); // calls verify_oop 1313 } 1314 __ add(rscratch2, rscratch2, size); 1315 __ b(loop); 1316 __ bind(end); 1317 } 1318 1319 // Arguments: 1320 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1321 // ignored 1322 // is_oop - true => oop array, so generate store check code 1323 // name - stub name string 1324 // 1325 // Inputs: 1326 // c_rarg0 - source array address 1327 // c_rarg1 - destination array address 1328 // c_rarg2 - element count, treated as ssize_t, can be zero 1329 // 1330 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1331 // the hardware handle it. The two dwords within qwords that span 1332 // cache line boundaries will still be loaded and stored atomicly. 1333 // 1334 // Side Effects: 1335 // disjoint_int_copy_entry is set to the no-overlap entry point 1336 // used by generate_conjoint_int_oop_copy(). 1337 // 1338 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, 1339 const char *name, bool dest_uninitialized = false) { 1340 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1341 RegSet saved_reg = RegSet::of(s, d, count); 1342 __ align(CodeEntryAlignment); 1343 StubCodeMark mark(this, "StubRoutines", name); 1344 address start = __ pc(); 1345 __ enter(); 1346 1347 if (entry != NULL) { 1348 *entry = __ pc(); 1349 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1350 BLOCK_COMMENT("Entry:"); 1351 } 1352 1353 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1354 if (dest_uninitialized) { 1355 decorators |= IS_DEST_UNINITIALIZED; 1356 } 1357 if (aligned) { 1358 decorators |= ARRAYCOPY_ALIGNED; 1359 } 1360 1361 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1362 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1363 1364 if (is_oop) { 1365 // save regs before copy_memory 1366 __ push(RegSet::of(d, count), sp); 1367 } 1368 { 1369 // UnsafeCopyMemory page error: continue after ucm 1370 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1371 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1372 copy_memory(aligned, s, d, count, rscratch1, size); 1373 } 1374 1375 if (is_oop) { 1376 __ pop(RegSet::of(d, count), sp); 1377 if (VerifyOops) 1378 verify_oop_array(size, d, count, r16); 1379 } 1380 1381 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1382 1383 __ leave(); 1384 __ mov(r0, zr); // return 0 1385 __ ret(lr); 1386 return start; 1387 } 1388 1389 // Arguments: 1390 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1391 // ignored 1392 // is_oop - true => oop array, so generate store check code 1393 // name - stub name string 1394 // 1395 // Inputs: 1396 // c_rarg0 - source array address 1397 // c_rarg1 - destination array address 1398 // c_rarg2 - element count, treated as ssize_t, can be zero 1399 // 1400 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1401 // the hardware handle it. 
The two dwords within qwords that span 1402 // cache line boundaries will still be loaded and stored atomicly. 1403 // 1404 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1405 address *entry, const char *name, 1406 bool dest_uninitialized = false) { 1407 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1408 RegSet saved_regs = RegSet::of(s, d, count); 1409 StubCodeMark mark(this, "StubRoutines", name); 1410 address start = __ pc(); 1411 __ enter(); 1412 1413 if (entry != NULL) { 1414 *entry = __ pc(); 1415 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1416 BLOCK_COMMENT("Entry:"); 1417 } 1418 1419 // use fwd copy when (d-s) above_equal (count*size) 1420 __ sub(rscratch1, d, s); 1421 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1422 __ br(Assembler::HS, nooverlap_target); 1423 1424 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1425 if (dest_uninitialized) { 1426 decorators |= IS_DEST_UNINITIALIZED; 1427 } 1428 if (aligned) { 1429 decorators |= ARRAYCOPY_ALIGNED; 1430 } 1431 1432 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1433 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs); 1434 1435 if (is_oop) { 1436 // save regs before copy_memory 1437 __ push(RegSet::of(d, count), sp); 1438 } 1439 { 1440 // UnsafeCopyMemory page error: continue after ucm 1441 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1442 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1443 copy_memory(aligned, s, d, count, rscratch1, -size); 1444 } 1445 if (is_oop) { 1446 __ pop(RegSet::of(d, count), sp); 1447 if (VerifyOops) 1448 verify_oop_array(size, d, count, r16); 1449 } 1450 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1451 __ leave(); 1452 __ mov(r0, zr); // return 0 1453 __ ret(lr); 1454 return start; 1455 } 1456 1457 // Arguments: 1458 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1459 // ignored 1460 // name - stub name string 1461 // 1462 // Inputs: 1463 // c_rarg0 - source array address 1464 // c_rarg1 - destination array address 1465 // c_rarg2 - element count, treated as ssize_t, can be zero 1466 // 1467 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1468 // we let the hardware handle it. The one to eight bytes within words, 1469 // dwords or qwords that span cache line boundaries will still be loaded 1470 // and stored atomically. 1471 // 1472 // Side Effects: 1473 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1474 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1475 // we let the hardware handle it. The one to eight bytes within words, 1476 // dwords or qwords that span cache line boundaries will still be loaded 1477 // and stored atomically. 1478 // 1479 // Side Effects: 1480 // disjoint_byte_copy_entry is set to the no-overlap entry point 1481 // used by generate_conjoint_byte_copy(). 
1482 // 1483 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1484 const bool not_oop = false; 1485 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1486 } 1487 1488 // Arguments: 1489 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1490 // ignored 1491 // name - stub name string 1492 // 1493 // Inputs: 1494 // c_rarg0 - source array address 1495 // c_rarg1 - destination array address 1496 // c_rarg2 - element count, treated as ssize_t, can be zero 1497 // 1498 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1499 // we let the hardware handle it. The one to eight bytes within words, 1500 // dwords or qwords that span cache line boundaries will still be loaded 1501 // and stored atomically. 1502 // 1503 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1504 address* entry, const char *name) { 1505 const bool not_oop = false; 1506 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1507 } 1508 1509 // Arguments: 1510 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1511 // ignored 1512 // name - stub name string 1513 // 1514 // Inputs: 1515 // c_rarg0 - source array address 1516 // c_rarg1 - destination array address 1517 // c_rarg2 - element count, treated as ssize_t, can be zero 1518 // 1519 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1520 // let the hardware handle it. The two or four words within dwords 1521 // or qwords that span cache line boundaries will still be loaded 1522 // and stored atomically. 1523 // 1524 // Side Effects: 1525 // disjoint_short_copy_entry is set to the no-overlap entry point 1526 // used by generate_conjoint_short_copy(). 1527 // 1528 address generate_disjoint_short_copy(bool aligned, 1529 address* entry, const char *name) { 1530 const bool not_oop = false; 1531 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1532 } 1533 1534 // Arguments: 1535 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1536 // ignored 1537 // name - stub name string 1538 // 1539 // Inputs: 1540 // c_rarg0 - source array address 1541 // c_rarg1 - destination array address 1542 // c_rarg2 - element count, treated as ssize_t, can be zero 1543 // 1544 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1545 // let the hardware handle it. The two or four words within dwords 1546 // or qwords that span cache line boundaries will still be loaded 1547 // and stored atomically. 1548 // 1549 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1550 address *entry, const char *name) { 1551 const bool not_oop = false; 1552 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1553 1554 } 1555 // Arguments: 1556 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1557 // ignored 1558 // name - stub name string 1559 // 1560 // Inputs: 1561 // c_rarg0 - source array address 1562 // c_rarg1 - destination array address 1563 // c_rarg2 - element count, treated as ssize_t, can be zero 1564 // 1565 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1566 // the hardware handle it. The two dwords within qwords that span 1567 // cache line boundaries will still be loaded and stored atomicly. 
1568 // 1569 // Side Effects: 1570 // disjoint_int_copy_entry is set to the no-overlap entry point 1571 // used by generate_conjoint_int_oop_copy(). 1572 // 1573 address generate_disjoint_int_copy(bool aligned, address *entry, 1574 const char *name, bool dest_uninitialized = false) { 1575 const bool not_oop = false; 1576 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1577 } 1578 1579 // Arguments: 1580 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1581 // ignored 1582 // name - stub name string 1583 // 1584 // Inputs: 1585 // c_rarg0 - source array address 1586 // c_rarg1 - destination array address 1587 // c_rarg2 - element count, treated as ssize_t, can be zero 1588 // 1589 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1590 // the hardware handle it. The two dwords within qwords that span 1591 // cache line boundaries will still be loaded and stored atomicly. 1592 // 1593 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1594 address *entry, const char *name, 1595 bool dest_uninitialized = false) { 1596 const bool not_oop = false; 1597 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1598 } 1599 1600 1601 // Arguments: 1602 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1603 // ignored 1604 // name - stub name string 1605 // 1606 // Inputs: 1607 // c_rarg0 - source array address 1608 // c_rarg1 - destination array address 1609 // c_rarg2 - element count, treated as size_t, can be zero 1610 // 1611 // Side Effects: 1612 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1613 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1614 // 1615 address generate_disjoint_long_copy(bool aligned, address *entry, 1616 const char *name, bool dest_uninitialized = false) { 1617 const bool not_oop = false; 1618 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1619 } 1620 1621 // Arguments: 1622 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1623 // ignored 1624 // name - stub name string 1625 // 1626 // Inputs: 1627 // c_rarg0 - source array address 1628 // c_rarg1 - destination array address 1629 // c_rarg2 - element count, treated as size_t, can be zero 1630 // 1631 address generate_conjoint_long_copy(bool aligned, 1632 address nooverlap_target, address *entry, 1633 const char *name, bool dest_uninitialized = false) { 1634 const bool not_oop = false; 1635 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1636 } 1637 1638 // Arguments: 1639 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1640 // ignored 1641 // name - stub name string 1642 // 1643 // Inputs: 1644 // c_rarg0 - source array address 1645 // c_rarg1 - destination array address 1646 // c_rarg2 - element count, treated as size_t, can be zero 1647 // 1648 // Side Effects: 1649 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1650 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1651 // 1652 address generate_disjoint_oop_copy(bool aligned, address *entry, 1653 const char *name, bool dest_uninitialized) { 1654 const bool is_oop = true; 1655 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1656 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1657 } 1658 1659 // Arguments: 1660 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1661 // ignored 1662 // name - stub name string 1663 // 1664 // Inputs: 1665 // c_rarg0 - source array address 1666 // c_rarg1 - destination array address 1667 // c_rarg2 - element count, treated as size_t, can be zero 1668 // 1669 address generate_conjoint_oop_copy(bool aligned, 1670 address nooverlap_target, address *entry, 1671 const char *name, bool dest_uninitialized) { 1672 const bool is_oop = true; 1673 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1674 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1675 name, dest_uninitialized); 1676 } 1677 1678 1679 // Helper for generating a dynamic type check. 1680 // Smashes rscratch1, rscratch2. 1681 void generate_type_check(Register sub_klass, 1682 Register super_check_offset, 1683 Register super_klass, 1684 Label& L_success) { 1685 assert_different_registers(sub_klass, super_check_offset, super_klass); 1686 1687 BLOCK_COMMENT("type_check:"); 1688 1689 Label L_miss; 1690 1691 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1692 super_check_offset); 1693 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1694 1695 // Fall through on failure! 1696 __ BIND(L_miss); 1697 } 1698 1699 // 1700 // Generate checkcasting array copy stub 1701 // 1702 // Input: 1703 // c_rarg0 - source array address 1704 // c_rarg1 - destination array address 1705 // c_rarg2 - element count, treated as ssize_t, can be zero 1706 // c_rarg3 - size_t ckoff (super_check_offset) 1707 // c_rarg4 - oop ckval (super_klass) 1708 // 1709 // Output: 1710 // r0 == 0 - success 1711 // r0 == -1^K - failure, where K is partial transfer count 1712 // 1713 address generate_checkcast_copy(const char *name, address *entry, 1714 bool dest_uninitialized = false) { 1715 1716 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1717 1718 // Input registers (after setup_arg_regs) 1719 const Register from = c_rarg0; // source array address 1720 const Register to = c_rarg1; // destination array address 1721 const Register count = c_rarg2; // elementscount 1722 const Register ckoff = c_rarg3; // super_check_offset 1723 const Register ckval = c_rarg4; // super_klass 1724 1725 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1726 RegSet wb_post_saved_regs = RegSet::of(count); 1727 1728 // Registers used as temps (r18, r19, r20 are save-on-entry) 1729 const Register count_save = r21; // orig elementscount 1730 const Register start_to = r20; // destination array start address 1731 const Register copied_oop = r18; // actual oop copied 1732 const Register r19_klass = r19; // oop._klass 1733 1734 //--------------------------------------------------------------- 1735 // Assembler stub will be used for this call to arraycopy 1736 // if the two arrays are subtypes of Object[] but the 1737 // destination array type is not equal to or a supertype 1738 // of the source type. Each element must be separately 1739 // checked. 
1740 1741 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1742 copied_oop, r19_klass, count_save); 1743 1744 __ align(CodeEntryAlignment); 1745 StubCodeMark mark(this, "StubRoutines", name); 1746 address start = __ pc(); 1747 1748 __ enter(); // required for proper stackwalking of RuntimeStub frame 1749 1750 #ifdef ASSERT 1751 // caller guarantees that the arrays really are different 1752 // otherwise, we would have to make conjoint checks 1753 { Label L; 1754 array_overlap_test(L, TIMES_OOP); 1755 __ stop("checkcast_copy within a single array"); 1756 __ bind(L); 1757 } 1758 #endif //ASSERT 1759 1760 // Caller of this entry point must set up the argument registers. 1761 if (entry != NULL) { 1762 *entry = __ pc(); 1763 BLOCK_COMMENT("Entry:"); 1764 } 1765 1766 // Empty array: Nothing to do. 1767 __ cbz(count, L_done); 1768 1769 __ push(RegSet::of(r18, r19, r20, r21), sp); 1770 1771 #ifdef ASSERT 1772 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1773 // The ckoff and ckval must be mutually consistent, 1774 // even though caller generates both. 1775 { Label L; 1776 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1777 __ ldrw(start_to, Address(ckval, sco_offset)); 1778 __ cmpw(ckoff, start_to); 1779 __ br(Assembler::EQ, L); 1780 __ stop("super_check_offset inconsistent"); 1781 __ bind(L); 1782 } 1783 #endif //ASSERT 1784 1785 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1786 bool is_oop = true; 1787 if (dest_uninitialized) { 1788 decorators |= IS_DEST_UNINITIALIZED; 1789 } 1790 1791 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1792 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1793 1794 // save the original count 1795 __ mov(count_save, count); 1796 1797 // Copy from low to high addresses 1798 __ mov(start_to, to); // Save destination array start address 1799 __ b(L_load_element); 1800 1801 // ======== begin loop ======== 1802 // (Loop is rotated; its entry is L_load_element.) 1803 // Loop control: 1804 // for (; count != 0; count--) { 1805 // copied_oop = load_heap_oop(from++); 1806 // ... generate_type_check ...; 1807 // store_heap_oop(to++, copied_oop); 1808 // } 1809 __ align(OptoLoopAlignment); 1810 1811 __ BIND(L_store_element); 1812 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1813 __ sub(count, count, 1); 1814 __ cbz(count, L_do_card_marks); 1815 1816 // ======== loop entry is here ======== 1817 __ BIND(L_load_element); 1818 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1819 __ cbz(copied_oop, L_store_element); 1820 1821 __ load_klass(r19_klass, copied_oop);// query the object klass 1822 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1823 // ======== end loop ======== 1824 1825 // It was a real error; we must depend on the caller to finish the job. 1826 // Register count = remaining oops, count_orig = total oops. 1827 // Emit GC store barriers for the oops we have copied and report 1828 // their number to the caller. 
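  // For reference: with K = oops successfully copied before the failing
  // element, the code below computes
  //
  //   count = count_save - count;   // K
  //   r0    = ~K;                   // == -1 ^ K, so a caller recovers K as ~r0
  //
  // and K == 0 therefore reports r0 == -1.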
1829 1830 __ subs(count, count_save, count); // K = partially copied oop count 1831 __ eon(count, count, zr); // report (-1^K) to caller 1832 __ br(Assembler::EQ, L_done_pop); 1833 1834 __ BIND(L_do_card_marks); 1835 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1836 1837 __ bind(L_done_pop); 1838 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1839 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1840 1841 __ bind(L_done); 1842 __ mov(r0, count); 1843 __ leave(); 1844 __ ret(lr); 1845 1846 return start; 1847 } 1848 1849 // Perform range checks on the proposed arraycopy. 1850 // Kills temp, but nothing else. 1851 // Also, clean the sign bits of src_pos and dst_pos. 1852 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1853 Register src_pos, // source position (c_rarg1) 1854 Register dst, // destination array oop (c_rarg2) 1855 Register dst_pos, // destination position (c_rarg3) 1856 Register length, 1857 Register temp, 1858 Label& L_failed) { 1859 BLOCK_COMMENT("arraycopy_range_checks:"); 1860 1861 assert_different_registers(rscratch1, temp); 1862 1863 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1864 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1865 __ addw(temp, length, src_pos); 1866 __ cmpw(temp, rscratch1); 1867 __ br(Assembler::HI, L_failed); 1868 1869 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1870 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1871 __ addw(temp, length, dst_pos); 1872 __ cmpw(temp, rscratch1); 1873 __ br(Assembler::HI, L_failed); 1874 1875 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1876 __ movw(src_pos, src_pos); 1877 __ movw(dst_pos, dst_pos); 1878 1879 BLOCK_COMMENT("arraycopy_range_checks done"); 1880 } 1881 1882 // These stubs get called from some dumb test routine. 1883 // I'll write them properly when they're called from 1884 // something that's actually doing something. 1885 static void fake_arraycopy_stub(address src, address dst, int count) { 1886 assert(count == 0, "huh?"); 1887 } 1888 1889 1890 // 1891 // Generate 'unsafe' array copy stub 1892 // Though just as safe as the other stubs, it takes an unscaled 1893 // size_t argument instead of an element count. 1894 // 1895 // Input: 1896 // c_rarg0 - source array address 1897 // c_rarg1 - destination array address 1898 // c_rarg2 - byte count, treated as ssize_t, can be zero 1899 // 1900 // Examines the alignment of the operands and dispatches 1901 // to a long, int, short, or byte copy loop.
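  // Illustrative sketch of that dispatch (s, d, count as defined below):
  //
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0)        { /* long copy  */ }
  //   else if ((bits & (BytesPerInt - 1)) == 0)    { /* int copy   */ }
  //   else if ((bits & (BytesPerShort - 1)) == 0)  { /* short copy */ }
  //   else                                         { /* byte copy  */ }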
1902 // 1903 address generate_unsafe_copy(const char *name, 1904 address byte_copy_entry, 1905 address short_copy_entry, 1906 address int_copy_entry, 1907 address long_copy_entry) { 1908 Label L_long_aligned, L_int_aligned, L_short_aligned; 1909 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1910 1911 __ align(CodeEntryAlignment); 1912 StubCodeMark mark(this, "StubRoutines", name); 1913 address start = __ pc(); 1914 __ enter(); // required for proper stackwalking of RuntimeStub frame 1915 1916 // bump this on entry, not on exit: 1917 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1918 1919 __ orr(rscratch1, s, d); 1920 __ orr(rscratch1, rscratch1, count); 1921 1922 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1923 __ cbz(rscratch1, L_long_aligned); 1924 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1925 __ cbz(rscratch1, L_int_aligned); 1926 __ tbz(rscratch1, 0, L_short_aligned); 1927 __ b(RuntimeAddress(byte_copy_entry)); 1928 1929 __ BIND(L_short_aligned); 1930 __ lsr(count, count, LogBytesPerShort); // size => short_count 1931 __ b(RuntimeAddress(short_copy_entry)); 1932 __ BIND(L_int_aligned); 1933 __ lsr(count, count, LogBytesPerInt); // size => int_count 1934 __ b(RuntimeAddress(int_copy_entry)); 1935 __ BIND(L_long_aligned); 1936 __ lsr(count, count, LogBytesPerLong); // size => long_count 1937 __ b(RuntimeAddress(long_copy_entry)); 1938 1939 return start; 1940 } 1941 1942 // 1943 // Generate generic array copy stubs 1944 // 1945 // Input: 1946 // c_rarg0 - src oop 1947 // c_rarg1 - src_pos (32-bits) 1948 // c_rarg2 - dst oop 1949 // c_rarg3 - dst_pos (32-bits) 1950 // c_rarg4 - element count (32-bits) 1951 // 1952 // Output: 1953 // r0 == 0 - success 1954 // r0 == -1^K - failure, where K is partial transfer count 1955 // 1956 address generate_generic_copy(const char *name, 1957 address byte_copy_entry, address short_copy_entry, 1958 address int_copy_entry, address oop_copy_entry, 1959 address long_copy_entry, address checkcast_copy_entry) { 1960 1961 Label L_failed, L_objArray; 1962 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1963 1964 // Input registers 1965 const Register src = c_rarg0; // source array oop 1966 const Register src_pos = c_rarg1; // source position 1967 const Register dst = c_rarg2; // destination array oop 1968 const Register dst_pos = c_rarg3; // destination position 1969 const Register length = c_rarg4; 1970 1971 1972 // Registers used as temps 1973 const Register dst_klass = c_rarg5; 1974 1975 __ align(CodeEntryAlignment); 1976 1977 StubCodeMark mark(this, "StubRoutines", name); 1978 1979 address start = __ pc(); 1980 1981 __ enter(); // required for proper stackwalking of RuntimeStub frame 1982 1983 // bump this on entry, not on exit: 1984 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1985 1986 //----------------------------------------------------------------------- 1987 // Assembler stub will be used for this call to arraycopy 1988 // if the following conditions are met: 1989 // 1990 // (1) src and dst must not be null. 1991 // (2) src_pos must not be negative. 1992 // (3) dst_pos must not be negative. 1993 // (4) length must not be negative. 1994 // (5) src klass and dst klass should be the same and not NULL. 1995 // (6) src and dst should be arrays. 1996 // (7) src_pos + length must not exceed length of src. 1997 // (8) dst_pos + length must not exceed length of dst. 
1998 // 1999 2000 // if (src == NULL) return -1; 2001 __ cbz(src, L_failed); 2002 2003 // if (src_pos < 0) return -1; 2004 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2005 2006 // if (dst == NULL) return -1; 2007 __ cbz(dst, L_failed); 2008 2009 // if (dst_pos < 0) return -1; 2010 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2011 2012 // registers used as temp 2013 const Register scratch_length = r16; // elements count to copy 2014 const Register scratch_src_klass = r17; // array klass 2015 const Register lh = r18; // layout helper 2016 2017 // if (length < 0) return -1; 2018 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2019 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2020 2021 __ load_klass(scratch_src_klass, src); 2022 #ifdef ASSERT 2023 // assert(src->klass() != NULL); 2024 { 2025 BLOCK_COMMENT("assert klasses not null {"); 2026 Label L1, L2; 2027 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2028 __ bind(L1); 2029 __ stop("broken null klass"); 2030 __ bind(L2); 2031 __ load_klass(rscratch1, dst); 2032 __ cbz(rscratch1, L1); // this would be broken also 2033 BLOCK_COMMENT("} assert klasses not null done"); 2034 } 2035 #endif 2036 2037 // Load layout helper (32-bits) 2038 // 2039 // |array_tag| | header_size | element_type | |log2_element_size| 2040 // 32 30 24 16 8 2 0 2041 // 2042 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2043 // 2044 2045 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2046 2047 // Handle objArrays completely differently... 2048 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2049 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2050 __ movw(rscratch1, objArray_lh); 2051 __ eorw(rscratch2, lh, rscratch1); 2052 __ cbzw(rscratch2, L_objArray); 2053 2054 // if (src->klass() != dst->klass()) return -1; 2055 __ load_klass(rscratch2, dst); 2056 __ eor(rscratch2, rscratch2, scratch_src_klass); 2057 __ cbnz(rscratch2, L_failed); 2058 2059 // if (!src->is_Array()) return -1; 2060 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2061 2062 // At this point, it is known to be a typeArray (array_tag 0x3). 
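  // For reference, the fields of lh used below decode as (illustrative C,
  // using the Klass::_lh_* constants referenced later in this function):
  //
  //   int header_size_in_bytes = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
  //   int log2_element_size    = (lh >> Klass::_lh_log2_element_size_shift) & Klass::_lh_log2_element_size_mask;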
2063 #ifdef ASSERT 2064 { 2065 BLOCK_COMMENT("assert primitive array {"); 2066 Label L; 2067 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2068 __ cmpw(lh, rscratch2); 2069 __ br(Assembler::GE, L); 2070 __ stop("must be a primitive array"); 2071 __ bind(L); 2072 BLOCK_COMMENT("} assert primitive array done"); 2073 } 2074 #endif 2075 2076 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2077 rscratch2, L_failed); 2078 2079 // TypeArrayKlass 2080 // 2081 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2082 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2083 // 2084 2085 const Register rscratch1_offset = rscratch1; // array offset 2086 const Register r18_elsize = lh; // element size 2087 2088 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2089 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2090 __ add(src, src, rscratch1_offset); // src array offset 2091 __ add(dst, dst, rscratch1_offset); // dst array offset 2092 BLOCK_COMMENT("choose copy loop based on element size"); 2093 2094 // next registers should be set before the jump to corresponding stub 2095 const Register from = c_rarg0; // source array address 2096 const Register to = c_rarg1; // destination array address 2097 const Register count = c_rarg2; // elements count 2098 2099 // 'from', 'to', 'count' registers should be set in such order 2100 // since they are the same as 'src', 'src_pos', 'dst'. 2101 2102 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2103 2104 // The possible values of elsize are 0-3, i.e. exact_log2(element 2105 // size in bytes). We do a simple bitwise binary search. 2106 __ BIND(L_copy_bytes); 2107 __ tbnz(r18_elsize, 1, L_copy_ints); 2108 __ tbnz(r18_elsize, 0, L_copy_shorts); 2109 __ lea(from, Address(src, src_pos));// src_addr 2110 __ lea(to, Address(dst, dst_pos));// dst_addr 2111 __ movw(count, scratch_length); // length 2112 __ b(RuntimeAddress(byte_copy_entry)); 2113 2114 __ BIND(L_copy_shorts); 2115 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2116 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2117 __ movw(count, scratch_length); // length 2118 __ b(RuntimeAddress(short_copy_entry)); 2119 2120 __ BIND(L_copy_ints); 2121 __ tbnz(r18_elsize, 0, L_copy_longs); 2122 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2123 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2124 __ movw(count, scratch_length); // length 2125 __ b(RuntimeAddress(int_copy_entry)); 2126 2127 __ BIND(L_copy_longs); 2128 #ifdef ASSERT 2129 { 2130 BLOCK_COMMENT("assert long copy {"); 2131 Label L; 2132 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2133 __ cmpw(r18_elsize, LogBytesPerLong); 2134 __ br(Assembler::EQ, L); 2135 __ stop("must be long copy, but elsize is wrong"); 2136 __ bind(L); 2137 BLOCK_COMMENT("} assert long copy done"); 2138 } 2139 #endif 2140 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2141 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2142 __ movw(count, scratch_length); // length 2143 __ b(RuntimeAddress(long_copy_entry)); 2144 2145 // ObjArrayKlass 2146 __ BIND(L_objArray); 2147 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2148 2149 Label L_plain_copy, L_checkcast_copy; 2150 // test array classes for subtyping 2151 __ load_klass(r18, dst); 2152 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2153 __ br(Assembler::NE, L_checkcast_copy); 2154 2155 // Identically typed arrays can be copied without element-wise checks. 2156 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2157 rscratch2, L_failed); 2158 2159 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2160 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2161 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2162 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2163 __ movw(count, scratch_length); // length 2164 __ BIND(L_plain_copy); 2165 __ b(RuntimeAddress(oop_copy_entry)); 2166 2167 __ BIND(L_checkcast_copy); 2168 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2169 { 2170 // Before looking at dst.length, make sure dst is also an objArray. 2171 __ ldrw(rscratch1, Address(r18, lh_offset)); 2172 __ movw(rscratch2, objArray_lh); 2173 __ eorw(rscratch1, rscratch1, rscratch2); 2174 __ cbnzw(rscratch1, L_failed); 2175 2176 // It is safe to examine both src.length and dst.length. 2177 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2178 r18, L_failed); 2179 2180 __ load_klass(dst_klass, dst); // reload 2181 2182 // Marshal the base address arguments now, freeing registers. 2183 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2184 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2185 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2186 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2187 __ movw(count, length); // length (reloaded) 2188 Register sco_temp = c_rarg3; // this register is free now 2189 assert_different_registers(from, to, count, sco_temp, 2190 dst_klass, scratch_src_klass); 2191 // assert_clean_int(count, sco_temp); 2192 2193 // Generate the type check. 2194 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2195 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2196 2197 // Smashes rscratch1, rscratch2 2198 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2199 2200 // Fetch destination element klass from the ObjArrayKlass header. 2201 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2202 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2203 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2204 2205 // the checkcast_copy loop needs two extra arguments: 2206 assert(c_rarg3 == sco_temp, "#3 already in place"); 2207 // Set up arguments for checkcast_copy_entry. 2208 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2209 __ b(RuntimeAddress(checkcast_copy_entry)); 2210 } 2211 2212 __ BIND(L_failed); 2213 __ mov(r0, -1); 2214 __ leave(); // required for proper stackwalking of RuntimeStub frame 2215 __ ret(lr); 2216 2217 return start; 2218 } 2219 2220 // 2221 // Generate stub for array fill. If "aligned" is true, the 2222 // "to" address is assumed to be heapword aligned. 
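  // Before the bulk fill the value is replicated to 64 bits; for a byte
  // fill this is roughly (illustrative; fewer steps apply to short/int):
  //
  //   uint64_t v = value & 0xff;
  //   v |= v << 8;  v |= v << 16;  v |= v << 32;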
2223 // 2224 // Arguments for generated stub: 2225 // to: c_rarg0 2226 // value: c_rarg1 2227 // count: c_rarg2 treated as signed 2228 // 2229 address generate_fill(BasicType t, bool aligned, const char *name) { 2230 __ align(CodeEntryAlignment); 2231 StubCodeMark mark(this, "StubRoutines", name); 2232 address start = __ pc(); 2233 2234 BLOCK_COMMENT("Entry:"); 2235 2236 const Register to = c_rarg0; // source array address 2237 const Register value = c_rarg1; // value 2238 const Register count = c_rarg2; // elements count 2239 2240 const Register bz_base = r10; // base for block_zero routine 2241 const Register cnt_words = r11; // temp register 2242 2243 __ enter(); 2244 2245 Label L_fill_elements, L_exit1; 2246 2247 int shift = -1; 2248 switch (t) { 2249 case T_BYTE: 2250 shift = 0; 2251 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2252 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2253 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2254 __ br(Assembler::LO, L_fill_elements); 2255 break; 2256 case T_SHORT: 2257 shift = 1; 2258 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2259 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2260 __ br(Assembler::LO, L_fill_elements); 2261 break; 2262 case T_INT: 2263 shift = 2; 2264 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2265 __ br(Assembler::LO, L_fill_elements); 2266 break; 2267 default: ShouldNotReachHere(); 2268 } 2269 2270 // Align source address at 8 bytes address boundary. 2271 Label L_skip_align1, L_skip_align2, L_skip_align4; 2272 if (!aligned) { 2273 switch (t) { 2274 case T_BYTE: 2275 // One byte misalignment happens only for byte arrays. 2276 __ tbz(to, 0, L_skip_align1); 2277 __ strb(value, Address(__ post(to, 1))); 2278 __ subw(count, count, 1); 2279 __ bind(L_skip_align1); 2280 // Fallthrough 2281 case T_SHORT: 2282 // Two bytes misalignment happens only for byte and short (char) arrays. 2283 __ tbz(to, 1, L_skip_align2); 2284 __ strh(value, Address(__ post(to, 2))); 2285 __ subw(count, count, 2 >> shift); 2286 __ bind(L_skip_align2); 2287 // Fallthrough 2288 case T_INT: 2289 // Align to 8 bytes, we know we are 4 byte aligned to start. 2290 __ tbz(to, 2, L_skip_align4); 2291 __ strw(value, Address(__ post(to, 4))); 2292 __ subw(count, count, 4 >> shift); 2293 __ bind(L_skip_align4); 2294 break; 2295 default: ShouldNotReachHere(); 2296 } 2297 } 2298 2299 // 2300 // Fill large chunks 2301 // 2302 __ lsrw(cnt_words, count, 3 - shift); // number of words 2303 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2304 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2305 if (UseBlockZeroing) { 2306 Label non_block_zeroing, rest; 2307 // If the fill value is zero we can use the fast zero_words(). 2308 __ cbnz(value, non_block_zeroing); 2309 __ mov(bz_base, to); 2310 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2311 __ zero_words(bz_base, cnt_words); 2312 __ b(rest); 2313 __ bind(non_block_zeroing); 2314 __ fill_words(to, cnt_words, value); 2315 __ bind(rest); 2316 } else { 2317 __ fill_words(to, cnt_words, value); 2318 } 2319 2320 // Remaining count is less than 8 bytes. Fill it by a single store. 2321 // Note that the total length is no less than 8 bytes. 
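  // Illustratively, the tail handling below is equivalent to one 64-bit
  // store that ends exactly at the end of the array and may overlap
  // already-filled elements (safe because the value is replicated and the
  // total length is no less than 8 bytes):
  //
  //   if (count != 0) {
  //     to += count << shift;            // end of the array
  //     *(uint64_t*)(to - 8) = value;
  //   }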
2322 if (t == T_BYTE || t == T_SHORT) { 2323 Label L_exit1; 2324 __ cbzw(count, L_exit1); 2325 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2326 __ str(value, Address(to, -8)); // overwrite some elements 2327 __ bind(L_exit1); 2328 __ leave(); 2329 __ ret(lr); 2330 } 2331 2332 // Handle copies less than 8 bytes. 2333 Label L_fill_2, L_fill_4, L_exit2; 2334 __ bind(L_fill_elements); 2335 switch (t) { 2336 case T_BYTE: 2337 __ tbz(count, 0, L_fill_2); 2338 __ strb(value, Address(__ post(to, 1))); 2339 __ bind(L_fill_2); 2340 __ tbz(count, 1, L_fill_4); 2341 __ strh(value, Address(__ post(to, 2))); 2342 __ bind(L_fill_4); 2343 __ tbz(count, 2, L_exit2); 2344 __ strw(value, Address(to)); 2345 break; 2346 case T_SHORT: 2347 __ tbz(count, 0, L_fill_4); 2348 __ strh(value, Address(__ post(to, 2))); 2349 __ bind(L_fill_4); 2350 __ tbz(count, 1, L_exit2); 2351 __ strw(value, Address(to)); 2352 break; 2353 case T_INT: 2354 __ cbzw(count, L_exit2); 2355 __ strw(value, Address(to)); 2356 break; 2357 default: ShouldNotReachHere(); 2358 } 2359 __ bind(L_exit2); 2360 __ leave(); 2361 __ ret(lr); 2362 return start; 2363 } 2364 2365 address generate_data_cache_writeback() { 2366 const Register line = c_rarg0; // address of line to write back 2367 2368 __ align(CodeEntryAlignment); 2369 2370 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2371 2372 address start = __ pc(); 2373 __ enter(); 2374 __ cache_wb(Address(line, 0)); 2375 __ leave(); 2376 __ ret(lr); 2377 2378 return start; 2379 } 2380 2381 address generate_data_cache_writeback_sync() { 2382 const Register is_pre = c_rarg0; // pre or post sync 2383 2384 __ align(CodeEntryAlignment); 2385 2386 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2387 2388 // pre wbsync is a no-op 2389 // post wbsync translates to an sfence 2390 2391 Label skip; 2392 address start = __ pc(); 2393 __ enter(); 2394 __ cbnz(is_pre, skip); 2395 __ cache_wbsync(false); 2396 __ bind(skip); 2397 __ leave(); 2398 __ ret(lr); 2399 2400 return start; 2401 } 2402 2403 void generate_arraycopy_stubs() { 2404 address entry; 2405 address entry_jbyte_arraycopy; 2406 address entry_jshort_arraycopy; 2407 address entry_jint_arraycopy; 2408 address entry_oop_arraycopy; 2409 address entry_jlong_arraycopy; 2410 address entry_checkcast_arraycopy; 2411 2412 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2413 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2414 2415 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2416 2417 //*** jbyte 2418 // Always need aligned and unaligned versions 2419 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2420 "jbyte_disjoint_arraycopy"); 2421 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2422 &entry_jbyte_arraycopy, 2423 "jbyte_arraycopy"); 2424 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2425 "arrayof_jbyte_disjoint_arraycopy"); 2426 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2427 "arrayof_jbyte_arraycopy"); 2428 2429 //*** jshort 2430 // Always need aligned and unaligned versions 2431 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2432 "jshort_disjoint_arraycopy"); 2433 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2434 &entry_jshort_arraycopy, 2435 "jshort_arraycopy"); 2436 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2437 "arrayof_jshort_disjoint_arraycopy"); 2438 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2439 "arrayof_jshort_arraycopy"); 2440 2441 //*** jint 2442 // Aligned versions 2443 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2444 "arrayof_jint_disjoint_arraycopy"); 2445 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2446 "arrayof_jint_arraycopy"); 2447 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2448 // entry_jint_arraycopy always points to the unaligned version 2449 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2450 "jint_disjoint_arraycopy"); 2451 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2452 &entry_jint_arraycopy, 2453 "jint_arraycopy"); 2454 2455 //*** jlong 2456 // It is always aligned 2457 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2458 "arrayof_jlong_disjoint_arraycopy"); 2459 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2460 "arrayof_jlong_arraycopy"); 2461 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2462 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2463 2464 //*** oops 2465 { 2466 // With compressed oops we need unaligned versions; notice that 2467 // we overwrite entry_oop_arraycopy. 2468 bool aligned = !UseCompressedOops; 2469 2470 StubRoutines::_arrayof_oop_disjoint_arraycopy 2471 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2472 /*dest_uninitialized*/false); 2473 StubRoutines::_arrayof_oop_arraycopy 2474 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2475 /*dest_uninitialized*/false); 2476 // Aligned versions without pre-barriers 2477 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2478 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2479 /*dest_uninitialized*/true); 2480 StubRoutines::_arrayof_oop_arraycopy_uninit 2481 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2482 /*dest_uninitialized*/true); 2483 } 2484 2485 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2486 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2487 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2488 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2489 2490 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2491 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2492 /*dest_uninitialized*/true); 2493 2494 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2495 entry_jbyte_arraycopy, 2496 entry_jshort_arraycopy, 2497 entry_jint_arraycopy, 2498 entry_jlong_arraycopy); 2499 2500 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2501 entry_jbyte_arraycopy, 2502 entry_jshort_arraycopy, 2503 entry_jint_arraycopy, 2504 entry_oop_arraycopy, 2505 entry_jlong_arraycopy, 2506 entry_checkcast_arraycopy); 2507 2508 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2509 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2510 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2511 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2512 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2513 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2514 } 2515 2516 void generate_math_stubs() { Unimplemented(); } 2517 2518 // Arguments: 2519 // 2520 // Inputs: 2521 // c_rarg0 - source byte array address 2522 // c_rarg1 - destination byte array address 2523 // c_rarg2 - K (key) in little endian int array 2524 // 2525 address generate_aescrypt_encryptBlock() { 2526 __ align(CodeEntryAlignment); 2527 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2528 2529 Label L_doLast; 2530 2531 const Register from = c_rarg0; // source array address 2532 const Register to = c_rarg1; // destination array address 2533 const Register key = c_rarg2; // key array address 2534 const Register keylen = rscratch1; 2535 2536 address start = __ pc(); 2537 __ enter(); 2538 2539 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2540 2541 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2542 2543 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2544 __ rev32(v1, __ T16B, v1); 2545 __ rev32(v2, __ T16B, v2); 2546 __ rev32(v3, __ T16B, v3); 2547 __ rev32(v4, __ T16B, v4); 2548 __ aese(v0, v1); 2549 __ aesmc(v0, v0); 2550 __ aese(v0, v2); 2551 __ aesmc(v0, v0); 2552 __ aese(v0, v3); 2553 __ aesmc(v0, v0); 2554 __ aese(v0, v4); 2555 __ aesmc(v0, v0); 2556 2557 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2558 __ rev32(v1, __ T16B, v1); 2559 __ rev32(v2, __ T16B, v2); 2560 __ rev32(v3, __ T16B, v3); 2561 __ rev32(v4, __ T16B, v4); 2562 __ aese(v0, v1); 2563 __ aesmc(v0, v0); 2564 __ aese(v0, v2); 2565 __ aesmc(v0, v0); 2566 __ aese(v0, v3); 2567 __ aesmc(v0, v0); 2568 __ aese(v0, v4); 2569 __ aesmc(v0, v0); 2570 2571 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2572 __ rev32(v1, __ T16B, v1); 2573 __ rev32(v2, __ T16B, v2); 2574 2575 __ cmpw(keylen, 44); 2576 __ br(Assembler::EQ, L_doLast); 2577 2578 __ aese(v0, v1); 2579 __ aesmc(v0, v0); 2580 __ aese(v0, v2); 2581 __ aesmc(v0, v0); 2582 2583 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2584 __ rev32(v1, __ T16B, v1); 2585 __ rev32(v2, __ T16B, v2); 2586 2587 __ cmpw(keylen, 52); 2588 __ br(Assembler::EQ, L_doLast); 2589 2590 __ aese(v0, v1); 2591 __ aesmc(v0, v0); 2592 __ aese(v0, v2); 2593 __ aesmc(v0, v0); 2594 2595 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2596 __ rev32(v1, __ T16B, v1); 2597 __ rev32(v2, __ T16B, v2); 2598 2599 __ BIND(L_doLast); 2600 2601 __ aese(v0, v1); 2602 __ aesmc(v0, v0); 2603 __ aese(v0, v2); 2604 2605 __ ld1(v1, __ T16B, key); 2606 __ rev32(v1, __ T16B, v1); 2607 __ eor(v0, __ T16B, v0, v1); 2608 2609 __ st1(v0, __ T16B, to); 2610 2611 __ mov(r0, 0); 2612 2613 __ leave(); 2614 __ ret(lr); 2615 2616 return start; 2617 } 2618 2619 // Arguments: 2620 // 2621 // Inputs: 2622 // c_rarg0 - source byte array address 2623 // c_rarg1 - destination byte array address 2624 // c_rarg2 - K (key) in little endian int array 2625 // 2626 address generate_aescrypt_decryptBlock() { 2627 assert(UseAES, "need AES instructions and misaligned SSE support"); 2628 __ align(CodeEntryAlignment); 2629 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2630 Label L_doLast; 2631 2632 const 
Register from = c_rarg0; // source array address 2633 const Register to = c_rarg1; // destination array address 2634 const Register key = c_rarg2; // key array address 2635 const Register keylen = rscratch1; 2636 2637 address start = __ pc(); 2638 __ enter(); // required for proper stackwalking of RuntimeStub frame 2639 2640 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2641 2642 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2643 2644 __ ld1(v5, __ T16B, __ post(key, 16)); 2645 __ rev32(v5, __ T16B, v5); 2646 2647 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2648 __ rev32(v1, __ T16B, v1); 2649 __ rev32(v2, __ T16B, v2); 2650 __ rev32(v3, __ T16B, v3); 2651 __ rev32(v4, __ T16B, v4); 2652 __ aesd(v0, v1); 2653 __ aesimc(v0, v0); 2654 __ aesd(v0, v2); 2655 __ aesimc(v0, v0); 2656 __ aesd(v0, v3); 2657 __ aesimc(v0, v0); 2658 __ aesd(v0, v4); 2659 __ aesimc(v0, v0); 2660 2661 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2662 __ rev32(v1, __ T16B, v1); 2663 __ rev32(v2, __ T16B, v2); 2664 __ rev32(v3, __ T16B, v3); 2665 __ rev32(v4, __ T16B, v4); 2666 __ aesd(v0, v1); 2667 __ aesimc(v0, v0); 2668 __ aesd(v0, v2); 2669 __ aesimc(v0, v0); 2670 __ aesd(v0, v3); 2671 __ aesimc(v0, v0); 2672 __ aesd(v0, v4); 2673 __ aesimc(v0, v0); 2674 2675 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2676 __ rev32(v1, __ T16B, v1); 2677 __ rev32(v2, __ T16B, v2); 2678 2679 __ cmpw(keylen, 44); 2680 __ br(Assembler::EQ, L_doLast); 2681 2682 __ aesd(v0, v1); 2683 __ aesimc(v0, v0); 2684 __ aesd(v0, v2); 2685 __ aesimc(v0, v0); 2686 2687 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2688 __ rev32(v1, __ T16B, v1); 2689 __ rev32(v2, __ T16B, v2); 2690 2691 __ cmpw(keylen, 52); 2692 __ br(Assembler::EQ, L_doLast); 2693 2694 __ aesd(v0, v1); 2695 __ aesimc(v0, v0); 2696 __ aesd(v0, v2); 2697 __ aesimc(v0, v0); 2698 2699 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2700 __ rev32(v1, __ T16B, v1); 2701 __ rev32(v2, __ T16B, v2); 2702 2703 __ BIND(L_doLast); 2704 2705 __ aesd(v0, v1); 2706 __ aesimc(v0, v0); 2707 __ aesd(v0, v2); 2708 2709 __ eor(v0, __ T16B, v0, v5); 2710 2711 __ st1(v0, __ T16B, to); 2712 2713 __ mov(r0, 0); 2714 2715 __ leave(); 2716 __ ret(lr); 2717 2718 return start; 2719 } 2720 2721 // Arguments: 2722 // 2723 // Inputs: 2724 // c_rarg0 - source byte array address 2725 // c_rarg1 - destination byte array address 2726 // c_rarg2 - K (key) in little endian int array 2727 // c_rarg3 - r vector byte array address 2728 // c_rarg4 - input length 2729 // 2730 // Output: 2731 // x0 - input length 2732 // 2733 address generate_cipherBlockChaining_encryptAESCrypt() { 2734 assert(UseAES, "need AES instructions and misaligned SSE support"); 2735 __ align(CodeEntryAlignment); 2736 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2737 2738 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2739 2740 const Register from = c_rarg0; // source array address 2741 const Register to = c_rarg1; // destination array address 2742 const Register key = c_rarg2; // key array address 2743 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2744 // and left with the results of the last encryption block 2745 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2746 const Register keylen = rscratch1; 2747 2748 address start = __ pc(); 2749 2750 __ enter(); 2751 2752 __ movw(rscratch2, len_reg); 2753 2754 __ ldrw(keylen, Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2755 2756 __ ld1(v0, __ T16B, rvec); 2757 2758 __ cmpw(keylen, 52); 2759 __ br(Assembler::CC, L_loadkeys_44); 2760 __ br(Assembler::EQ, L_loadkeys_52); 2761 2762 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2763 __ rev32(v17, __ T16B, v17); 2764 __ rev32(v18, __ T16B, v18); 2765 __ BIND(L_loadkeys_52); 2766 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2767 __ rev32(v19, __ T16B, v19); 2768 __ rev32(v20, __ T16B, v20); 2769 __ BIND(L_loadkeys_44); 2770 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2771 __ rev32(v21, __ T16B, v21); 2772 __ rev32(v22, __ T16B, v22); 2773 __ rev32(v23, __ T16B, v23); 2774 __ rev32(v24, __ T16B, v24); 2775 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2776 __ rev32(v25, __ T16B, v25); 2777 __ rev32(v26, __ T16B, v26); 2778 __ rev32(v27, __ T16B, v27); 2779 __ rev32(v28, __ T16B, v28); 2780 __ ld1(v29, v30, v31, __ T16B, key); 2781 __ rev32(v29, __ T16B, v29); 2782 __ rev32(v30, __ T16B, v30); 2783 __ rev32(v31, __ T16B, v31); 2784 2785 __ BIND(L_aes_loop); 2786 __ ld1(v1, __ T16B, __ post(from, 16)); 2787 __ eor(v0, __ T16B, v0, v1); 2788 2789 __ br(Assembler::CC, L_rounds_44); 2790 __ br(Assembler::EQ, L_rounds_52); 2791 2792 __ aese(v0, v17); __ aesmc(v0, v0); 2793 __ aese(v0, v18); __ aesmc(v0, v0); 2794 __ BIND(L_rounds_52); 2795 __ aese(v0, v19); __ aesmc(v0, v0); 2796 __ aese(v0, v20); __ aesmc(v0, v0); 2797 __ BIND(L_rounds_44); 2798 __ aese(v0, v21); __ aesmc(v0, v0); 2799 __ aese(v0, v22); __ aesmc(v0, v0); 2800 __ aese(v0, v23); __ aesmc(v0, v0); 2801 __ aese(v0, v24); __ aesmc(v0, v0); 2802 __ aese(v0, v25); __ aesmc(v0, v0); 2803 __ aese(v0, v26); __ aesmc(v0, v0); 2804 __ aese(v0, v27); __ aesmc(v0, v0); 2805 __ aese(v0, v28); __ aesmc(v0, v0); 2806 __ aese(v0, v29); __ aesmc(v0, v0); 2807 __ aese(v0, v30); 2808 __ eor(v0, __ T16B, v0, v31); 2809 2810 __ st1(v0, __ T16B, __ post(to, 16)); 2811 2812 __ subw(len_reg, len_reg, 16); 2813 __ cbnzw(len_reg, L_aes_loop); 2814 2815 __ st1(v0, __ T16B, rvec); 2816 2817 __ mov(r0, rscratch2); 2818 2819 __ leave(); 2820 __ ret(lr); 2821 2822 return start; 2823 } 2824 2825 // Arguments: 2826 // 2827 // Inputs: 2828 // c_rarg0 - source byte array address 2829 // c_rarg1 - destination byte array address 2830 // c_rarg2 - K (key) in little endian int array 2831 // c_rarg3 - r vector byte array address 2832 // c_rarg4 - input length 2833 // 2834 // Output: 2835 // r0 - input length 2836 // 2837 address generate_cipherBlockChaining_decryptAESCrypt() { 2838 assert(UseAES, "need AES instructions and misaligned SSE support"); 2839 __ align(CodeEntryAlignment); 2840 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2841 2842 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2843 2844 const Register from = c_rarg0; // source array address 2845 const Register to = c_rarg1; // destination array address 2846 const Register key = c_rarg2; // key array address 2847 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2848 // and left with the results of the last encryption block 2849 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2850 const Register keylen = rscratch1; 2851 2852 address start = __ pc(); 2853 2854 __ enter(); 2855 2856 __ movw(rscratch2, len_reg); 2857 2858 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2859 2860 __ 
ld1(v2, __ T16B, rvec); 2861 2862 __ ld1(v31, __ T16B, __ post(key, 16)); 2863 __ rev32(v31, __ T16B, v31); 2864 2865 __ cmpw(keylen, 52); 2866 __ br(Assembler::CC, L_loadkeys_44); 2867 __ br(Assembler::EQ, L_loadkeys_52); 2868 2869 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2870 __ rev32(v17, __ T16B, v17); 2871 __ rev32(v18, __ T16B, v18); 2872 __ BIND(L_loadkeys_52); 2873 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2874 __ rev32(v19, __ T16B, v19); 2875 __ rev32(v20, __ T16B, v20); 2876 __ BIND(L_loadkeys_44); 2877 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2878 __ rev32(v21, __ T16B, v21); 2879 __ rev32(v22, __ T16B, v22); 2880 __ rev32(v23, __ T16B, v23); 2881 __ rev32(v24, __ T16B, v24); 2882 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2883 __ rev32(v25, __ T16B, v25); 2884 __ rev32(v26, __ T16B, v26); 2885 __ rev32(v27, __ T16B, v27); 2886 __ rev32(v28, __ T16B, v28); 2887 __ ld1(v29, v30, __ T16B, key); 2888 __ rev32(v29, __ T16B, v29); 2889 __ rev32(v30, __ T16B, v30); 2890 2891 __ BIND(L_aes_loop); 2892 __ ld1(v0, __ T16B, __ post(from, 16)); 2893 __ orr(v1, __ T16B, v0, v0); 2894 2895 __ br(Assembler::CC, L_rounds_44); 2896 __ br(Assembler::EQ, L_rounds_52); 2897 2898 __ aesd(v0, v17); __ aesimc(v0, v0); 2899 __ aesd(v0, v18); __ aesimc(v0, v0); 2900 __ BIND(L_rounds_52); 2901 __ aesd(v0, v19); __ aesimc(v0, v0); 2902 __ aesd(v0, v20); __ aesimc(v0, v0); 2903 __ BIND(L_rounds_44); 2904 __ aesd(v0, v21); __ aesimc(v0, v0); 2905 __ aesd(v0, v22); __ aesimc(v0, v0); 2906 __ aesd(v0, v23); __ aesimc(v0, v0); 2907 __ aesd(v0, v24); __ aesimc(v0, v0); 2908 __ aesd(v0, v25); __ aesimc(v0, v0); 2909 __ aesd(v0, v26); __ aesimc(v0, v0); 2910 __ aesd(v0, v27); __ aesimc(v0, v0); 2911 __ aesd(v0, v28); __ aesimc(v0, v0); 2912 __ aesd(v0, v29); __ aesimc(v0, v0); 2913 __ aesd(v0, v30); 2914 __ eor(v0, __ T16B, v0, v31); 2915 __ eor(v0, __ T16B, v0, v2); 2916 2917 __ st1(v0, __ T16B, __ post(to, 16)); 2918 __ orr(v2, __ T16B, v1, v1); 2919 2920 __ subw(len_reg, len_reg, 16); 2921 __ cbnzw(len_reg, L_aes_loop); 2922 2923 __ st1(v2, __ T16B, rvec); 2924 2925 __ mov(r0, rscratch2); 2926 2927 __ leave(); 2928 __ ret(lr); 2929 2930 return start; 2931 } 2932 2933 // Arguments: 2934 // 2935 // Inputs: 2936 // c_rarg0 - byte[] source+offset 2937 // c_rarg1 - int[] SHA.state 2938 // c_rarg2 - int offset 2939 // c_rarg3 - int limit 2940 // 2941 address generate_sha1_implCompress(bool multi_block, const char *name) { 2942 __ align(CodeEntryAlignment); 2943 StubCodeMark mark(this, "StubRoutines", name); 2944 address start = __ pc(); 2945 2946 Register buf = c_rarg0; 2947 Register state = c_rarg1; 2948 Register ofs = c_rarg2; 2949 Register limit = c_rarg3; 2950 2951 Label keys; 2952 Label sha1_loop; 2953 2954 // load the keys into v0..v3 2955 __ adr(rscratch1, keys); 2956 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2957 // load 5 words state into v6, v7 2958 __ ldrq(v6, Address(state, 0)); 2959 __ ldrs(v7, Address(state, 16)); 2960 2961 2962 __ BIND(sha1_loop); 2963 // load 64 bytes of data into v16..v19 2964 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 2965 __ rev32(v16, __ T16B, v16); 2966 __ rev32(v17, __ T16B, v17); 2967 __ rev32(v18, __ T16B, v18); 2968 __ rev32(v19, __ T16B, v19); 2969 2970 // do the sha1 2971 __ addv(v4, __ T4S, v16, v0); 2972 __ orr(v20, __ T16B, v6, v6); 2973 2974 FloatRegister d0 = v16; 2975 FloatRegister d1 = v17; 2976 FloatRegister d2 = v18; 2977 FloatRegister d3 = v19; 2978 2979 for (int round = 0; round < 20; round++) { 2980 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2981 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2982 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2983 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2984 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2985 2986 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2987 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2988 __ sha1h(tmp2, __ T4S, v20); 2989 if (round < 5) 2990 __ sha1c(v20, __ T4S, tmp3, tmp4); 2991 else if (round < 10 || round >= 15) 2992 __ sha1p(v20, __ T4S, tmp3, tmp4); 2993 else 2994 __ sha1m(v20, __ T4S, tmp3, tmp4); 2995 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2996 2997 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2998 } 2999 3000 __ addv(v7, __ T2S, v7, v21); 3001 __ addv(v6, __ T4S, v6, v20); 3002 3003 if (multi_block) { 3004 __ add(ofs, ofs, 64); 3005 __ cmp(ofs, limit); 3006 __ br(Assembler::LE, sha1_loop); 3007 __ mov(c_rarg0, ofs); // return ofs 3008 } 3009 3010 __ strq(v6, Address(state, 0)); 3011 __ strs(v7, Address(state, 16)); 3012 3013 __ ret(lr); 3014 3015 __ bind(keys); 3016 __ emit_int32(0x5a827999); 3017 __ emit_int32(0x6ed9eba1); 3018 __ emit_int32(0x8f1bbcdc); 3019 __ emit_int32(0xca62c1d6); 3020 3021 return start; 3022 } 3023 3024 3025 // Arguments: 3026 // 3027 // Inputs: 3028 // c_rarg0 - byte[] source+offset 3029 // c_rarg1 - int[] SHA.state 3030 // c_rarg2 - int offset 3031 // c_rarg3 - int limit 3032 // 3033 address generate_sha256_implCompress(bool multi_block, const char *name) { 3034 static const uint32_t round_consts[64] = { 3035 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3036 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3037 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3038 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3039 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3040 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3041 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3042 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3043 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3044 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3045 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3046 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3047 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3048 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3049 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3050 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3051 }; 3052 __ align(CodeEntryAlignment); 3053 StubCodeMark mark(this, "StubRoutines", name); 3054 address start = __ pc(); 3055 3056 Register buf = c_rarg0; 3057 Register state = c_rarg1; 3058 Register ofs = c_rarg2; 3059 Register limit = c_rarg3; 3060 3061 Label sha1_loop; 3062 3063 __ stpd(v8, v9, __ pre(sp, -32)); 3064 __ stpd(v10, v11, Address(sp, 16)); 3065 3066 // dga == v0 3067 // dgb == v1 3068 // dg0 == v2 3069 // dg1 == v3 3070 // dg2 == v4 3071 // t0 == v6 3072 // t1 == v7 3073 3074 // load 16 keys to v16..v31 3075 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3076 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3077 __ 
ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3078 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3079 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3080 3081 // load 8 words (256 bits) state 3082 __ ldpq(v0, v1, state); 3083 3084 __ BIND(sha1_loop); 3085 // load 64 bytes of data into v8..v11 3086 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3087 __ rev32(v8, __ T16B, v8); 3088 __ rev32(v9, __ T16B, v9); 3089 __ rev32(v10, __ T16B, v10); 3090 __ rev32(v11, __ T16B, v11); 3091 3092 __ addv(v6, __ T4S, v8, v16); 3093 __ orr(v2, __ T16B, v0, v0); 3094 __ orr(v3, __ T16B, v1, v1); 3095 3096 FloatRegister d0 = v8; 3097 FloatRegister d1 = v9; 3098 FloatRegister d2 = v10; 3099 FloatRegister d3 = v11; 3100 3101 3102 for (int round = 0; round < 16; round++) { 3103 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3104 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3105 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3106 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3107 3108 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3109 __ orr(v4, __ T16B, v2, v2); 3110 if (round < 15) 3111 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3112 __ sha256h(v2, __ T4S, v3, tmp2); 3113 __ sha256h2(v3, __ T4S, v4, tmp2); 3114 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3115 3116 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3117 } 3118 3119 __ addv(v0, __ T4S, v0, v2); 3120 __ addv(v1, __ T4S, v1, v3); 3121 3122 if (multi_block) { 3123 __ add(ofs, ofs, 64); 3124 __ cmp(ofs, limit); 3125 __ br(Assembler::LE, sha1_loop); 3126 __ mov(c_rarg0, ofs); // return ofs 3127 } 3128 3129 __ ldpd(v10, v11, Address(sp, 16)); 3130 __ ldpd(v8, v9, __ post(sp, 32)); 3131 3132 __ stpq(v0, v1, state); 3133 3134 __ ret(lr); 3135 3136 return start; 3137 } 3138 3139 // Safefetch stubs. 3140 void generate_safefetch(const char* name, int size, address* entry, 3141 address* fault_pc, address* continuation_pc) { 3142 // safefetch signatures: 3143 // int SafeFetch32(int* adr, int errValue); 3144 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3145 // 3146 // arguments: 3147 // c_rarg0 = adr 3148 // c_rarg1 = errValue 3149 // 3150 // result: 3151 // r0 = *adr or errValue 3152 3153 StubCodeMark mark(this, "StubRoutines", name); 3154 3155 // Entry point, pc or function descriptor. 3156 *entry = __ pc(); 3157 3158 // Load *adr into c_rarg1, may fault.
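  // If this load faults, the VM's signal handler recognizes *fault_pc and
  // resumes execution at *continuation_pc; c_rarg1 then still holds
  // errValue, which is what gets returned below.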
3159 *fault_pc = __ pc(); 3160 switch (size) { 3161 case 4: 3162 // int32_t 3163 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3164 break; 3165 case 8: 3166 // int64_t 3167 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3168 break; 3169 default: 3170 ShouldNotReachHere(); 3171 } 3172 3173 // return errValue or *adr 3174 *continuation_pc = __ pc(); 3175 __ mov(r0, c_rarg1); 3176 __ ret(lr); 3177 } 3178 3179 /** 3180 * Arguments: 3181 * 3182 * Inputs: 3183 * c_rarg0 - int crc 3184 * c_rarg1 - byte* buf 3185 * c_rarg2 - int length 3186 * 3187 * Output: 3188 * r0 - int crc result 3189 */ 3190 address generate_updateBytesCRC32() { 3191 assert(UseCRC32Intrinsics, "what are we doing here?"); 3192 3193 __ align(CodeEntryAlignment); 3194 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3195 3196 address start = __ pc(); 3197 3198 const Register crc = c_rarg0; // crc 3199 const Register buf = c_rarg1; // source java byte array address 3200 const Register len = c_rarg2; // length 3201 const Register table0 = c_rarg3; // crc_table address 3202 const Register table1 = c_rarg4; 3203 const Register table2 = c_rarg5; 3204 const Register table3 = c_rarg6; 3205 const Register tmp3 = c_rarg7; 3206 3207 BLOCK_COMMENT("Entry:"); 3208 __ enter(); // required for proper stackwalking of RuntimeStub frame 3209 3210 __ kernel_crc32(crc, buf, len, 3211 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3212 3213 __ leave(); // required for proper stackwalking of RuntimeStub frame 3214 __ ret(lr); 3215 3216 return start; 3217 } 3218 3219 /** 3220 * Arguments: 3221 * 3222 * Inputs: 3223 * c_rarg0 - int crc 3224 * c_rarg1 - byte* buf 3225 * c_rarg2 - int length 3226 * c_rarg3 - int* table 3227 * 3228 * Output: 3229 * r0 - int crc result 3230 */ 3231 address generate_updateBytesCRC32C() { 3232 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3233 3234 __ align(CodeEntryAlignment); 3235 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3236 3237 address start = __ pc(); 3238 3239 const Register crc = c_rarg0; // crc 3240 const Register buf = c_rarg1; // source java byte array address 3241 const Register len = c_rarg2; // length 3242 const Register table0 = c_rarg3; // crc_table address 3243 const Register table1 = c_rarg4; 3244 const Register table2 = c_rarg5; 3245 const Register table3 = c_rarg6; 3246 const Register tmp3 = c_rarg7; 3247 3248 BLOCK_COMMENT("Entry:"); 3249 __ enter(); // required for proper stackwalking of RuntimeStub frame 3250 3251 __ kernel_crc32c(crc, buf, len, 3252 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3253 3254 __ leave(); // required for proper stackwalking of RuntimeStub frame 3255 __ ret(lr); 3256 3257 return start; 3258 } 3259 3260 /*** 3261 * Arguments: 3262 * 3263 * Inputs: 3264 * c_rarg0 - int adler 3265 * c_rarg1 - byte* buff 3266 * c_rarg2 - int len 3267 * 3268 * Output: 3269 * c_rarg0 - int adler result 3270 */ 3271 address generate_updateBytesAdler32() { 3272 __ align(CodeEntryAlignment); 3273 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3274 address start = __ pc(); 3275 3276 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3277 3278 // Aliases 3279 Register adler = c_rarg0; 3280 Register s1 = c_rarg0; 3281 Register s2 = c_rarg3; 3282 Register buff = c_rarg1; 3283 Register len = c_rarg2; 3284 Register nmax = r4; 3285 Register base = r5; 3286 Register count = r6; 3287 Register temp0 = rscratch1; 3288 Register temp1 = rscratch2; 3289 FloatRegister vbytes = v0; 3290
FloatRegister vs1acc = v1; 3291 FloatRegister vs2acc = v2; 3292 FloatRegister vtable = v3; 3293 3294 // Max number of bytes we can process before having to take the mod 3295 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3296 unsigned long BASE = 0xfff1; 3297 unsigned long NMAX = 0x15B0; 3298 3299 __ mov(base, BASE); 3300 __ mov(nmax, NMAX); 3301 3302 // Load accumulation coefficients for the upper 16 bits 3303 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3304 __ ld1(vtable, __ T16B, Address(temp0)); 3305 3306 // s1 is initialized to the lower 16 bits of adler 3307 // s2 is initialized to the upper 16 bits of adler 3308 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3309 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3310 3311 // The pipelined loop needs at least 16 elements for 1 iteration 3312 // It does check this, but it is more effective to skip to the cleanup loop 3313 __ cmp(len, (u1)16); 3314 __ br(Assembler::HS, L_nmax); 3315 __ cbz(len, L_combine); 3316 3317 __ bind(L_simple_by1_loop); 3318 __ ldrb(temp0, Address(__ post(buff, 1))); 3319 __ add(s1, s1, temp0); 3320 __ add(s2, s2, s1); 3321 __ subs(len, len, 1); 3322 __ br(Assembler::HI, L_simple_by1_loop); 3323 3324 // s1 = s1 % BASE 3325 __ subs(temp0, s1, base); 3326 __ csel(s1, temp0, s1, Assembler::HS); 3327 3328 // s2 = s2 % BASE 3329 __ lsr(temp0, s2, 16); 3330 __ lsl(temp1, temp0, 4); 3331 __ sub(temp1, temp1, temp0); 3332 __ add(s2, temp1, s2, ext::uxth); 3333 3334 __ subs(temp0, s2, base); 3335 __ csel(s2, temp0, s2, Assembler::HS); 3336 3337 __ b(L_combine); 3338 3339 __ bind(L_nmax); 3340 __ subs(len, len, nmax); 3341 __ sub(count, nmax, 16); 3342 __ br(Assembler::LO, L_by16); 3343 3344 __ bind(L_nmax_loop); 3345 3346 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3347 vbytes, vs1acc, vs2acc, vtable); 3348 3349 __ subs(count, count, 16); 3350 __ br(Assembler::HS, L_nmax_loop); 3351 3352 // s1 = s1 % BASE 3353 __ lsr(temp0, s1, 16); 3354 __ lsl(temp1, temp0, 4); 3355 __ sub(temp1, temp1, temp0); 3356 __ add(temp1, temp1, s1, ext::uxth); 3357 3358 __ lsr(temp0, temp1, 16); 3359 __ lsl(s1, temp0, 4); 3360 __ sub(s1, s1, temp0); 3361 __ add(s1, s1, temp1, ext:: uxth); 3362 3363 __ subs(temp0, s1, base); 3364 __ csel(s1, temp0, s1, Assembler::HS); 3365 3366 // s2 = s2 % BASE 3367 __ lsr(temp0, s2, 16); 3368 __ lsl(temp1, temp0, 4); 3369 __ sub(temp1, temp1, temp0); 3370 __ add(temp1, temp1, s2, ext::uxth); 3371 3372 __ lsr(temp0, temp1, 16); 3373 __ lsl(s2, temp0, 4); 3374 __ sub(s2, s2, temp0); 3375 __ add(s2, s2, temp1, ext:: uxth); 3376 3377 __ subs(temp0, s2, base); 3378 __ csel(s2, temp0, s2, Assembler::HS); 3379 3380 __ subs(len, len, nmax); 3381 __ sub(count, nmax, 16); 3382 __ br(Assembler::HS, L_nmax_loop); 3383 3384 __ bind(L_by16); 3385 __ adds(len, len, count); 3386 __ br(Assembler::LO, L_by1); 3387 3388 __ bind(L_by16_loop); 3389 3390 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3391 vbytes, vs1acc, vs2acc, vtable); 3392 3393 __ subs(len, len, 16); 3394 __ br(Assembler::HS, L_by16_loop); 3395 3396 __ bind(L_by1); 3397 __ adds(len, len, 15); 3398 __ br(Assembler::LO, L_do_mod); 3399 3400 __ bind(L_by1_loop); 3401 __ ldrb(temp0, Address(__ post(buff, 1))); 3402 __ add(s1, temp0, s1); 3403 __ add(s2, s2, s1); 3404 __ subs(len, len, 1); 3405 __ br(Assembler::HS, L_by1_loop); 3406 3407 __ bind(L_do_mod); 3408 // s1 = s1 % BASE 3409 __ lsr(temp0, s1, 16); 3410 __ lsl(temp1, temp0, 4); 3411 __ 
sub(temp1, temp1, temp0); 3412 __ add(temp1, temp1, s1, ext::uxth); 3413 3414 __ lsr(temp0, temp1, 16); 3415 __ lsl(s1, temp0, 4); 3416 __ sub(s1, s1, temp0); 3417 __ add(s1, s1, temp1, ext:: uxth); 3418 3419 __ subs(temp0, s1, base); 3420 __ csel(s1, temp0, s1, Assembler::HS); 3421 3422 // s2 = s2 % BASE 3423 __ lsr(temp0, s2, 16); 3424 __ lsl(temp1, temp0, 4); 3425 __ sub(temp1, temp1, temp0); 3426 __ add(temp1, temp1, s2, ext::uxth); 3427 3428 __ lsr(temp0, temp1, 16); 3429 __ lsl(s2, temp0, 4); 3430 __ sub(s2, s2, temp0); 3431 __ add(s2, s2, temp1, ext:: uxth); 3432 3433 __ subs(temp0, s2, base); 3434 __ csel(s2, temp0, s2, Assembler::HS); 3435 3436 // Combine lower bits and higher bits 3437 __ bind(L_combine); 3438 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3439 3440 __ ret(lr); 3441 3442 return start; 3443 } 3444 3445 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3446 Register temp0, Register temp1, FloatRegister vbytes, 3447 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3448 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3449 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3450 // In non-vectorized code, we update s1 and s2 as: 3451 // s1 <- s1 + b1 3452 // s2 <- s2 + s1 3453 // s1 <- s1 + b2 3454 // s2 <- s2 + b1 3455 // ... 3456 // s1 <- s1 + b16 3457 // s2 <- s2 + s1 3458 // Putting above assignments together, we have: 3459 // s1_new = s1 + b1 + b2 + ... + b16 3460 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3461 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3462 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3463 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3464 3465 // s2 = s2 + s1 * 16 3466 __ add(s2, s2, s1, Assembler::LSL, 4); 3467 3468 // vs1acc = b1 + b2 + b3 + ... + b16 3469 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 3470 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3471 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3472 __ uaddlv(vs1acc, __ T16B, vbytes); 3473 __ uaddlv(vs2acc, __ T8H, vs2acc); 3474 3475 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3476 __ fmovd(temp0, vs1acc); 3477 __ fmovd(temp1, vs2acc); 3478 __ add(s1, s1, temp0); 3479 __ add(s2, s2, temp1); 3480 } 3481 3482 /** 3483 * Arguments: 3484 * 3485 * Input: 3486 * c_rarg0 - x address 3487 * c_rarg1 - x length 3488 * c_rarg2 - y address 3489 * c_rarg3 - y lenth 3490 * c_rarg4 - z address 3491 * c_rarg5 - z length 3492 */ 3493 address generate_multiplyToLen() { 3494 __ align(CodeEntryAlignment); 3495 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3496 3497 address start = __ pc(); 3498 const Register x = r0; 3499 const Register xlen = r1; 3500 const Register y = r2; 3501 const Register ylen = r3; 3502 const Register z = r4; 3503 const Register zlen = r5; 3504 3505 const Register tmp1 = r10; 3506 const Register tmp2 = r11; 3507 const Register tmp3 = r12; 3508 const Register tmp4 = r13; 3509 const Register tmp5 = r14; 3510 const Register tmp6 = r15; 3511 const Register tmp7 = r16; 3512 3513 BLOCK_COMMENT("Entry:"); 3514 __ enter(); // required for proper stackwalking of RuntimeStub frame 3515 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3516 __ leave(); // required for proper stackwalking of RuntimeStub frame 3517 __ ret(lr); 3518 3519 return start; 3520 } 3521 3522 address generate_squareToLen() { 3523 // squareToLen algorithm for sizes 1..127 described in java code works 3524 // faster than multiply_to_len on some CPUs and slower on others, but 3525 // multiply_to_len shows a bit better overall results 3526 __ align(CodeEntryAlignment); 3527 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3528 address start = __ pc(); 3529 3530 const Register x = r0; 3531 const Register xlen = r1; 3532 const Register z = r2; 3533 const Register zlen = r3; 3534 const Register y = r4; // == x 3535 const Register ylen = r5; // == xlen 3536 3537 const Register tmp1 = r10; 3538 const Register tmp2 = r11; 3539 const Register tmp3 = r12; 3540 const Register tmp4 = r13; 3541 const Register tmp5 = r14; 3542 const Register tmp6 = r15; 3543 const Register tmp7 = r16; 3544 3545 RegSet spilled_regs = RegSet::of(y, ylen); 3546 BLOCK_COMMENT("Entry:"); 3547 __ enter(); 3548 __ push(spilled_regs, sp); 3549 __ mov(y, x); 3550 __ mov(ylen, xlen); 3551 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3552 __ pop(spilled_regs, sp); 3553 __ leave(); 3554 __ ret(lr); 3555 return start; 3556 } 3557 3558 address generate_mulAdd() { 3559 __ align(CodeEntryAlignment); 3560 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3561 3562 address start = __ pc(); 3563 3564 const Register out = r0; 3565 const Register in = r1; 3566 const Register offset = r2; 3567 const Register len = r3; 3568 const Register k = r4; 3569 3570 BLOCK_COMMENT("Entry:"); 3571 __ enter(); 3572 __ mul_add(out, in, offset, len, k); 3573 __ leave(); 3574 __ ret(lr); 3575 3576 return start; 3577 } 3578 3579 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3580 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3581 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3582 // Karatsuba multiplication performs a 128*128 -> 256-bit 3583 // multiplication in three 128-bit multiplications and a few 3584 // additions. 
3585 // 3586 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3587 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3588 // 3589 // Inputs: 3590 // 3591 // A0 in a.d[0] (subkey) 3592 // A1 in a.d[1] 3593 // (A1+A0) in a1_xor_a0.d[0] 3594 // 3595 // B0 in b.d[0] (state) 3596 // B1 in b.d[1] 3597 3598 __ ext(tmp1, __ T16B, b, b, 0x08); 3599 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3600 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3601 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3602 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3603 3604 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3605 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3606 __ eor(tmp2, __ T16B, tmp2, tmp4); 3607 __ eor(tmp2, __ T16B, tmp2, tmp3); 3608 3609 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3610 __ ins(result_hi, __ D, tmp2, 0, 1); 3611 __ ins(result_lo, __ D, tmp2, 1, 0); 3612 } 3613 3614 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3615 FloatRegister p, FloatRegister z, FloatRegister t1) { 3616 const FloatRegister t0 = result; 3617 3618 // The GCM field polynomial f is z^128 + p(z), where p = 3619 // z^7+z^2+z+1. 3620 // 3621 // z^128 === -p(z) (mod (z^128 + p(z))) 3622 // 3623 // so, given that the product we're reducing is 3624 // a == lo + hi * z^128 3625 // substituting, 3626 // === lo - hi * p(z) (mod (z^128 + p(z))) 3627 // 3628 // we reduce by multiplying hi by p(z) and subtracting the result 3629 // from (i.e. XORing it with) lo. Because p has no nonzero high 3630 // bits we can do this with two 64-bit multiplications, lo*p and 3631 // hi*p. 3632 3633 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3634 __ ext(t1, __ T16B, t0, z, 8); 3635 __ eor(hi, __ T16B, hi, t1); 3636 __ ext(t1, __ T16B, z, t0, 8); 3637 __ eor(lo, __ T16B, lo, t1); 3638 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3639 __ eor(result, __ T16B, lo, t0); 3640 } 3641 3642 address generate_has_negatives(address &has_negatives_long) { 3643 const u1 large_loop_size = 64; 3644 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3645 int dcache_line = VM_Version::dcache_line_size(); 3646 3647 Register ary1 = r1, len = r2, result = r0; 3648 3649 __ align(CodeEntryAlignment); 3650 3651 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3652 3653 address entry = __ pc(); 3654 3655 __ enter(); 3656 3657 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3658 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3659 3660 __ cmp(len, (u1)15); 3661 __ br(Assembler::GT, LEN_OVER_15); 3662 // The only case when execution falls into this code is when pointer is near 3663 // the end of memory page and we have to avoid reading next page 3664 __ add(ary1, ary1, len); 3665 __ subs(len, len, 8); 3666 __ br(Assembler::GT, LEN_OVER_8); 3667 __ ldr(rscratch2, Address(ary1, -8)); 3668 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
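    // Reference sketch of the short-array tail handled below (illustrative only,
    // not emitted code; load_u64 is a hypothetical little-endian load helper).
    // At this point ary1 has already been advanced by the original length len0,
    // and len0 <= 8, so the code effectively computes:
    //
    //   uint64_t w = load_u64(ary1 - 8);    // last 8 bytes; may start before the array
    //   w >>= (8 - len0) * 8;               // discard the bytes preceding the array
    //   return (w & UPPER_BIT_MASK) != 0;   // any sign bit set => negative byte found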
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dep., then sub can be executed while loading
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    has_negatives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
    __ add(ary1, ary1, rscratch1);
    __ sub(len, len, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load as an early return in the pre-loop to handle the
    // situation when an initially aligned large array has negative values at
    // its starting bytes, so LARGE_LOOP would do 4 reads instead of 1 (in the
    // worst case), which is slower. Cases with negative bytes further ahead
    // won't be affected much. In fact, it'll be faster due to the early loads,
    // fewer instructions and fewer branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue the load instructions first, since that can save a few CPU/MEM
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);"
    // (one for each ldp) it is better to generate 7 * orr(...) + 1 andr(...)
    // + 1 cbnz(...), which saves 3 instructions and has fewer branches, but
    // this approach disables the early return, so all 64 bytes are loaded and
    // checked every time.
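    // Scalar sketch of one LARGE_LOOP iteration below (illustrative only, not
    // emitted code; load_u64 is a hypothetical load helper):
    //
    //   uint64_t acc = 0;
    //   for (int i = 0; i < 8; i++) {
    //     acc |= load_u64(ary1 + 8 * i);          // OR all 64 bytes together
    //   }
    //   if (acc & UPPER_BIT_MASK) goto RET_TRUE;  // some byte has its top bit set
    //   ary1 += 64; len -= 64;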
3737 __ ldp(tmp2, tmp3, Address(ary1)); 3738 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3739 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3740 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3741 __ add(ary1, ary1, large_loop_size); 3742 __ sub(len, len, large_loop_size); 3743 __ orr(tmp2, tmp2, tmp3); 3744 __ orr(tmp4, tmp4, tmp5); 3745 __ orr(rscratch1, rscratch1, rscratch2); 3746 __ orr(tmp6, tmp6, tmp1); 3747 __ orr(tmp2, tmp2, tmp4); 3748 __ orr(rscratch1, rscratch1, tmp6); 3749 __ orr(tmp2, tmp2, rscratch1); 3750 __ tst(tmp2, UPPER_BIT_MASK); 3751 __ br(Assembler::NE, RET_TRUE); 3752 __ cmp(len, large_loop_size); 3753 __ br(Assembler::GE, LARGE_LOOP); 3754 3755 __ bind(CHECK_16); // small 16-byte load pre-loop 3756 __ cmp(len, (u1)16); 3757 __ br(Assembler::LT, POST_LOOP16); 3758 3759 __ bind(LOOP16); // small 16-byte load loop 3760 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3761 __ sub(len, len, 16); 3762 __ orr(tmp2, tmp2, tmp3); 3763 __ tst(tmp2, UPPER_BIT_MASK); 3764 __ br(Assembler::NE, RET_TRUE); 3765 __ cmp(len, (u1)16); 3766 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3767 3768 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3769 __ cmp(len, (u1)8); 3770 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3771 __ ldr(tmp3, Address(__ post(ary1, 8))); 3772 __ sub(len, len, 8); 3773 __ tst(tmp3, UPPER_BIT_MASK); 3774 __ br(Assembler::NE, RET_TRUE); 3775 3776 __ bind(POST_LOOP16_LOAD_TAIL); 3777 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3778 __ ldr(tmp1, Address(ary1)); 3779 __ mov(tmp2, 64); 3780 __ sub(tmp4, tmp2, len, __ LSL, 3); 3781 __ lslv(tmp1, tmp1, tmp4); 3782 __ tst(tmp1, UPPER_BIT_MASK); 3783 __ br(Assembler::NE, RET_TRUE); 3784 // Fallthrough 3785 3786 __ bind(RET_FALSE); 3787 __ pop(spilled_regs, sp); 3788 __ leave(); 3789 __ mov(result, zr); 3790 __ ret(lr); 3791 3792 __ bind(RET_TRUE); 3793 __ pop(spilled_regs, sp); 3794 __ bind(RET_TRUE_NO_POP); 3795 __ leave(); 3796 __ mov(result, 1); 3797 __ ret(lr); 3798 3799 __ bind(DONE); 3800 __ pop(spilled_regs, sp); 3801 __ leave(); 3802 __ ret(lr); 3803 return entry; 3804 } 3805 3806 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3807 bool usePrefetch, Label &NOT_EQUAL) { 3808 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3809 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3810 tmp7 = r12, tmp8 = r13; 3811 Label LOOP; 3812 3813 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3814 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3815 __ bind(LOOP); 3816 if (usePrefetch) { 3817 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3818 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3819 } 3820 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3821 __ eor(tmp1, tmp1, tmp2); 3822 __ eor(tmp3, tmp3, tmp4); 3823 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3824 __ orr(tmp1, tmp1, tmp3); 3825 __ cbnz(tmp1, NOT_EQUAL); 3826 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3827 __ eor(tmp5, tmp5, tmp6); 3828 __ eor(tmp7, tmp7, tmp8); 3829 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3830 __ orr(tmp5, tmp5, tmp7); 3831 __ cbnz(tmp5, NOT_EQUAL); 3832 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3833 __ eor(tmp1, tmp1, tmp2); 3834 __ eor(tmp3, tmp3, tmp4); 3835 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3836 __ orr(tmp1, tmp1, tmp3); 3837 __ cbnz(tmp1, NOT_EQUAL); 3838 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3839 __ eor(tmp5, tmp5, tmp6); 
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }

  void generate_large_array_equals_loop_simd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }

  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
  // r3-r5 are reserved temporary registers
  address generate_large_array_equals() {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // calculate if at least 32 prefetched bytes are used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
        tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", "large_array_equals");

    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently only 8-byte aligned, so, if needed, do an
      // additional 8-byte load, at least for the first address, to make it
      // 16-byte aligned.
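      // Sketch of the fixup below (illustrative only, not emitted code):
      // if a1 is 8-byte but not 16-byte aligned, compare one extra word up
      // front so the main loops see 16-byte-aligned addresses:
      //
      //   if ((uintptr_t)a1 & 8) {
      //     if (*a1++ != *a2++) goto NOT_EQUAL_NO_POP;   // a1/a2 viewed as uint64_t*
      //     cnt1 -= wordSize;                            // one fewer word left to check
      //   }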
3921 Label ALIGNED16; 3922 __ tbz(a1, 3, ALIGNED16); 3923 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3924 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3925 __ sub(cnt1, cnt1, wordSize); 3926 __ eor(tmp1, tmp1, tmp2); 3927 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3928 __ bind(ALIGNED16); 3929 } 3930 if (UseSIMDForArrayEquals) { 3931 if (SoftwarePrefetchHintDistance >= 0) { 3932 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3933 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3934 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3935 /* prfm = */ true, NOT_EQUAL); 3936 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3937 __ br(__ LT, TAIL); 3938 } 3939 __ bind(NO_PREFETCH_LARGE_LOOP); 3940 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3941 /* prfm = */ false, NOT_EQUAL); 3942 } else { 3943 __ push(spilled_regs, sp); 3944 if (SoftwarePrefetchHintDistance >= 0) { 3945 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3946 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3947 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3948 /* prfm = */ true, NOT_EQUAL); 3949 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3950 __ br(__ LT, TAIL); 3951 } 3952 __ bind(NO_PREFETCH_LARGE_LOOP); 3953 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3954 /* prfm = */ false, NOT_EQUAL); 3955 } 3956 __ bind(TAIL); 3957 __ cbz(cnt1, EQUAL); 3958 __ subs(cnt1, cnt1, wordSize); 3959 __ br(__ LE, POST_LOOP); 3960 __ bind(SMALL_LOOP); 3961 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3962 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3963 __ subs(cnt1, cnt1, wordSize); 3964 __ eor(tmp1, tmp1, tmp2); 3965 __ cbnz(tmp1, NOT_EQUAL); 3966 __ br(__ GT, SMALL_LOOP); 3967 __ bind(POST_LOOP); 3968 __ ldr(tmp1, Address(a1, cnt1)); 3969 __ ldr(tmp2, Address(a2, cnt1)); 3970 __ eor(tmp1, tmp1, tmp2); 3971 __ cbnz(tmp1, NOT_EQUAL); 3972 __ bind(EQUAL); 3973 __ mov(result, true); 3974 __ bind(NOT_EQUAL); 3975 if (!UseSIMDForArrayEquals) { 3976 __ pop(spilled_regs, sp); 3977 } 3978 __ bind(NOT_EQUAL_NO_POP); 3979 __ leave(); 3980 __ ret(lr); 3981 return entry; 3982 } 3983 3984 address generate_dsin_dcos(bool isCos) { 3985 __ align(CodeEntryAlignment); 3986 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3987 address start = __ pc(); 3988 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3989 (address)StubRoutines::aarch64::_two_over_pi, 3990 (address)StubRoutines::aarch64::_pio2, 3991 (address)StubRoutines::aarch64::_dsin_coef, 3992 (address)StubRoutines::aarch64::_dcos_coef); 3993 return start; 3994 } 3995 3996 address generate_dlog() { 3997 __ align(CodeEntryAlignment); 3998 StubCodeMark mark(this, "StubRoutines", "dlog"); 3999 address entry = __ pc(); 4000 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4001 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4002 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4003 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4004 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4005 return entry; 4006 } 4007 4008 // code for comparing 16 bytes of strings with same encoding 4009 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4010 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4011 __ ldr(rscratch1, Address(__ post(str1, 8))); 4012 __ eor(rscratch2, tmp1, tmp2); 4013 __ ldr(cnt1, Address(__ post(str2, 8))); 4014 __ cbnz(rscratch2, DIFF1); 4015 __ ldr(tmp1, Address(__ post(str1, 8))); 4016 __ eor(rscratch2, rscratch1, cnt1); 4017 __ ldr(tmp2, Address(__ post(str2, 8))); 4018 __ cbnz(rscratch2, DIFF2); 4019 } 4020 4021 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4022 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4023 Label &DIFF2) { 4024 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 4025 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4026 4027 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4028 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4029 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4030 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4031 4032 __ fmovd(tmpL, vtmp3); 4033 __ eor(rscratch2, tmp3, tmpL); 4034 __ cbnz(rscratch2, DIFF2); 4035 4036 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4037 __ umov(tmpL, vtmp3, __ D, 1); 4038 __ eor(rscratch2, tmpU, tmpL); 4039 __ cbnz(rscratch2, DIFF1); 4040 4041 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4042 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4043 __ fmovd(tmpL, vtmp); 4044 __ eor(rscratch2, tmp3, tmpL); 4045 __ cbnz(rscratch2, DIFF2); 4046 4047 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4048 __ umov(tmpL, vtmp, __ D, 1); 4049 __ eor(rscratch2, tmpU, tmpL); 4050 __ cbnz(rscratch2, DIFF1); 4051 } 4052 4053 // r0 = result 4054 // r1 = str1 4055 // r2 = cnt1 4056 // r3 = str2 4057 // r4 = cnt2 4058 // r10 = tmp1 4059 // r11 = tmp2 4060 address generate_compare_long_string_different_encoding(bool isLU) { 4061 __ align(CodeEntryAlignment); 4062 StubCodeMark mark(this, "StubRoutines", isLU 4063 ? 
"compare_long_string_different_encoding LU" 4064 : "compare_long_string_different_encoding UL"); 4065 address entry = __ pc(); 4066 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4067 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4068 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4069 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4070 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4071 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4072 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4073 4074 int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2); 4075 4076 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4077 // cnt2 == amount of characters left to compare 4078 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4079 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4080 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4081 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4082 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4083 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4084 __ eor(rscratch2, tmp1, tmp2); 4085 __ mov(rscratch1, tmp2); 4086 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4087 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4088 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4089 __ push(spilled_regs, sp); 4090 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 4091 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 4092 4093 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4094 4095 if (SoftwarePrefetchHintDistance >= 0) { 4096 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4097 __ br(__ LT, NO_PREFETCH); 4098 __ bind(LARGE_LOOP_PREFETCH); 4099 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4100 __ mov(tmp4, 2); 4101 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4102 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4103 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4104 __ subs(tmp4, tmp4, 1); 4105 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4106 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4107 __ mov(tmp4, 2); 4108 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4109 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4110 __ subs(tmp4, tmp4, 1); 4111 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4112 __ sub(cnt2, cnt2, 64); 4113 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4114 __ br(__ GE, LARGE_LOOP_PREFETCH); 4115 } 4116 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4117 __ bind(NO_PREFETCH); 4118 __ subs(cnt2, cnt2, 16); 4119 __ br(__ LT, TAIL); 4120 __ align(OptoLoopAlignment); 4121 __ bind(SMALL_LOOP); // smaller loop 4122 __ subs(cnt2, cnt2, 16); 4123 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4124 __ br(__ GE, SMALL_LOOP); 4125 __ cmn(cnt2, (u1)16); 4126 __ br(__ EQ, LOAD_LAST); 4127 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4128 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 4129 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4130 __ ldr(tmp3, Address(cnt1, -8)); 4131 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4132 __ b(LOAD_LAST); 4133 __ bind(DIFF2); 4134 __ mov(tmpU, tmp3); 4135 __ bind(DIFF1); 4136 __ pop(spilled_regs, sp); 4137 __ b(CALCULATE_DIFFERENCE); 4138 __ bind(LOAD_LAST); 4139 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 
4140 // No need to load it again 4141 __ mov(tmpU, tmp3); 4142 __ pop(spilled_regs, sp); 4143 4144 // tmp2 points to the address of the last 4 Latin1 characters right now 4145 __ ldrs(vtmp, Address(tmp2)); 4146 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4147 __ fmovd(tmpL, vtmp); 4148 4149 __ eor(rscratch2, tmpU, tmpL); 4150 __ cbz(rscratch2, DONE); 4151 4152 // Find the first different characters in the longwords and 4153 // compute their difference. 4154 __ bind(CALCULATE_DIFFERENCE); 4155 __ rev(rscratch2, rscratch2); 4156 __ clz(rscratch2, rscratch2); 4157 __ andr(rscratch2, rscratch2, -16); 4158 __ lsrv(tmp1, tmp1, rscratch2); 4159 __ uxthw(tmp1, tmp1); 4160 __ lsrv(rscratch1, rscratch1, rscratch2); 4161 __ uxthw(rscratch1, rscratch1); 4162 __ subw(result, tmp1, rscratch1); 4163 __ bind(DONE); 4164 __ ret(lr); 4165 return entry; 4166 } 4167 4168 address generate_method_entry_barrier() { 4169 __ align(CodeEntryAlignment); 4170 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 4171 4172 Label deoptimize_label; 4173 4174 address start = __ pc(); 4175 4176 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 4177 4178 __ enter(); 4179 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 4180 4181 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 4182 4183 __ push_call_clobbered_registers(); 4184 4185 __ mov(c_rarg0, rscratch2); 4186 __ call_VM_leaf 4187 (CAST_FROM_FN_PTR 4188 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 4189 4190 __ reset_last_Java_frame(true); 4191 4192 __ mov(rscratch1, r0); 4193 4194 __ pop_call_clobbered_registers(); 4195 4196 __ cbnz(rscratch1, deoptimize_label); 4197 4198 __ leave(); 4199 __ ret(lr); 4200 4201 __ BIND(deoptimize_label); 4202 4203 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 4204 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 4205 4206 __ mov(sp, rscratch1); 4207 __ br(rscratch2); 4208 4209 return start; 4210 } 4211 4212 // r0 = result 4213 // r1 = str1 4214 // r2 = cnt1 4215 // r3 = str2 4216 // r4 = cnt2 4217 // r10 = tmp1 4218 // r11 = tmp2 4219 address generate_compare_long_string_same_encoding(bool isLL) { 4220 __ align(CodeEntryAlignment); 4221 StubCodeMark mark(this, "StubRoutines", isLL 4222 ? "compare_long_string_same_encoding LL" 4223 : "compare_long_string_same_encoding UU"); 4224 address entry = __ pc(); 4225 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4226 tmp1 = r10, tmp2 = r11; 4227 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4228 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4229 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4230 // exit from large loop when less than 64 bytes left to read or we're about 4231 // to prefetch memory behind array border 4232 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4233 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4234 // update cnt2 counter with already loaded 8 bytes 4235 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4236 // update pointers, because of previous read 4237 __ add(str1, str1, wordSize); 4238 __ add(str2, str2, wordSize); 4239 if (SoftwarePrefetchHintDistance >= 0) { 4240 __ bind(LARGE_LOOP_PREFETCH); 4241 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4242 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4243 compare_string_16_bytes_same(DIFF, DIFF2); 4244 compare_string_16_bytes_same(DIFF, DIFF2); 4245 __ sub(cnt2, cnt2, isLL ? 
64 : 32); 4246 compare_string_16_bytes_same(DIFF, DIFF2); 4247 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4248 compare_string_16_bytes_same(DIFF, DIFF2); 4249 __ br(__ GT, LARGE_LOOP_PREFETCH); 4250 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4251 } 4252 // less than 16 bytes left? 4253 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4254 __ br(__ LT, TAIL); 4255 __ align(OptoLoopAlignment); 4256 __ bind(SMALL_LOOP); 4257 compare_string_16_bytes_same(DIFF, DIFF2); 4258 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4259 __ br(__ GE, SMALL_LOOP); 4260 __ bind(TAIL); 4261 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4262 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4263 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4264 __ br(__ LE, CHECK_LAST); 4265 __ eor(rscratch2, tmp1, tmp2); 4266 __ cbnz(rscratch2, DIFF); 4267 __ ldr(tmp1, Address(__ post(str1, 8))); 4268 __ ldr(tmp2, Address(__ post(str2, 8))); 4269 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4270 __ bind(CHECK_LAST); 4271 if (!isLL) { 4272 __ add(cnt2, cnt2, cnt2); // now in bytes 4273 } 4274 __ eor(rscratch2, tmp1, tmp2); 4275 __ cbnz(rscratch2, DIFF); 4276 __ ldr(rscratch1, Address(str1, cnt2)); 4277 __ ldr(cnt1, Address(str2, cnt2)); 4278 __ eor(rscratch2, rscratch1, cnt1); 4279 __ cbz(rscratch2, LENGTH_DIFF); 4280 // Find the first different characters in the longwords and 4281 // compute their difference. 4282 __ bind(DIFF2); 4283 __ rev(rscratch2, rscratch2); 4284 __ clz(rscratch2, rscratch2); 4285 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4286 __ lsrv(rscratch1, rscratch1, rscratch2); 4287 if (isLL) { 4288 __ lsrv(cnt1, cnt1, rscratch2); 4289 __ uxtbw(rscratch1, rscratch1); 4290 __ uxtbw(cnt1, cnt1); 4291 } else { 4292 __ lsrv(cnt1, cnt1, rscratch2); 4293 __ uxthw(rscratch1, rscratch1); 4294 __ uxthw(cnt1, cnt1); 4295 } 4296 __ subw(result, rscratch1, cnt1); 4297 __ b(LENGTH_DIFF); 4298 __ bind(DIFF); 4299 __ rev(rscratch2, rscratch2); 4300 __ clz(rscratch2, rscratch2); 4301 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 4302 __ lsrv(tmp1, tmp1, rscratch2); 4303 if (isLL) { 4304 __ lsrv(tmp2, tmp2, rscratch2); 4305 __ uxtbw(tmp1, tmp1); 4306 __ uxtbw(tmp2, tmp2); 4307 } else { 4308 __ lsrv(tmp2, tmp2, rscratch2); 4309 __ uxthw(tmp1, tmp1); 4310 __ uxthw(tmp2, tmp2); 4311 } 4312 __ subw(result, tmp1, tmp2); 4313 __ b(LENGTH_DIFF); 4314 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4315 __ eor(rscratch2, tmp1, tmp2); 4316 __ cbnz(rscratch2, DIFF); 4317 __ bind(LENGTH_DIFF); 4318 __ ret(lr); 4319 return entry; 4320 } 4321 4322 void generate_compare_long_strings() { 4323 StubRoutines::aarch64::_compare_long_string_LL 4324 = generate_compare_long_string_same_encoding(true); 4325 StubRoutines::aarch64::_compare_long_string_UU 4326 = generate_compare_long_string_same_encoding(false); 4327 StubRoutines::aarch64::_compare_long_string_LU 4328 = generate_compare_long_string_different_encoding(true); 4329 StubRoutines::aarch64::_compare_long_string_UL 4330 = generate_compare_long_string_different_encoding(false); 4331 } 4332 4333 // R0 = result 4334 // R1 = str2 4335 // R2 = cnt1 4336 // R3 = str1 4337 // R4 = cnt2 4338 // This generic linear code use few additional ideas, which makes it faster: 4339 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 4340 // in order to skip initial loading(help in systems with 1 ld pipeline) 4341 // 2) we can use "fast" algorithm of finding single character to search for 4342 // first symbol with less branches(1 branch per each loaded register instead 4343 // of branch for each symbol), so, this is where constants like 4344 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 4345 // 3) after loading and analyzing 1st register of source string, it can be 4346 // used to search for every 1st character entry, saving few loads in 4347 // comparison with "simplier-but-slower" implementation 4348 // 4) in order to avoid lots of push/pop operations, code below is heavily 4349 // re-using/re-initializing/compressing register values, which makes code 4350 // larger and a bit less readable, however, most of extra operations are 4351 // issued during loads or branches, so, penalty is minimal 4352 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4353 const char* stubName = str1_isL 4354 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4355 : "indexof_linear_uu"; 4356 __ align(CodeEntryAlignment); 4357 StubCodeMark mark(this, "StubRoutines", stubName); 4358 address entry = __ pc(); 4359 4360 int str1_chr_size = str1_isL ? 1 : 2; 4361 int str2_chr_size = str2_isL ? 1 : 2; 4362 int str1_chr_shift = str1_isL ? 0 : 1; 4363 int str2_chr_shift = str2_isL ? 0 : 1; 4364 bool isL = str1_isL && str2_isL; 4365 // parameters 4366 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4367 // temporary registers 4368 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4369 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4370 // redefinitions 4371 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4372 4373 __ push(spilled_regs, sp); 4374 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4375 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4376 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4377 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4378 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4379 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4380 // Read whole register from str1. 
It is safe, because length >=8 here 4381 __ ldr(ch1, Address(str1)); 4382 // Read whole register from str2. It is safe, because length >=8 here 4383 __ ldr(ch2, Address(str2)); 4384 __ sub(cnt2, cnt2, cnt1); 4385 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4386 if (str1_isL != str2_isL) { 4387 __ eor(v0, __ T16B, v0, v0); 4388 } 4389 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4390 __ mul(first, first, tmp1); 4391 // check if we have less than 1 register to check 4392 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4393 if (str1_isL != str2_isL) { 4394 __ fmovd(v1, ch1); 4395 } 4396 __ br(__ LE, L_SMALL); 4397 __ eor(ch2, first, ch2); 4398 if (str1_isL != str2_isL) { 4399 __ zip1(v1, __ T16B, v1, v0); 4400 } 4401 __ sub(tmp2, ch2, tmp1); 4402 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4403 __ bics(tmp2, tmp2, ch2); 4404 if (str1_isL != str2_isL) { 4405 __ fmovd(ch1, v1); 4406 } 4407 __ br(__ NE, L_HAS_ZERO); 4408 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4409 __ add(result, result, wordSize/str2_chr_size); 4410 __ add(str2, str2, wordSize); 4411 __ br(__ LT, L_POST_LOOP); 4412 __ BIND(L_LOOP); 4413 __ ldr(ch2, Address(str2)); 4414 __ eor(ch2, first, ch2); 4415 __ sub(tmp2, ch2, tmp1); 4416 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4417 __ bics(tmp2, tmp2, ch2); 4418 __ br(__ NE, L_HAS_ZERO); 4419 __ BIND(L_LOOP_PROCEED); 4420 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4421 __ add(str2, str2, wordSize); 4422 __ add(result, result, wordSize/str2_chr_size); 4423 __ br(__ GE, L_LOOP); 4424 __ BIND(L_POST_LOOP); 4425 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4426 __ br(__ LE, NOMATCH); 4427 __ ldr(ch2, Address(str2)); 4428 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4429 __ eor(ch2, first, ch2); 4430 __ sub(tmp2, ch2, tmp1); 4431 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4432 __ mov(tmp4, -1); // all bits set 4433 __ b(L_SMALL_PROCEED); 4434 __ align(OptoLoopAlignment); 4435 __ BIND(L_SMALL); 4436 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4437 __ eor(ch2, first, ch2); 4438 if (str1_isL != str2_isL) { 4439 __ zip1(v1, __ T16B, v1, v0); 4440 } 4441 __ sub(tmp2, ch2, tmp1); 4442 __ mov(tmp4, -1); // all bits set 4443 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4444 if (str1_isL != str2_isL) { 4445 __ fmovd(ch1, v1); // move converted 4 symbols 4446 } 4447 __ BIND(L_SMALL_PROCEED); 4448 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4449 __ bic(tmp2, tmp2, ch2); 4450 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4451 __ rbit(tmp2, tmp2); 4452 __ br(__ EQ, NOMATCH); 4453 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4454 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4455 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4456 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4457 if (str2_isL) { // LL 4458 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4459 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4460 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4461 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4462 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4463 } else { 4464 __ mov(ch2, 0xE); // all bits in byte set except last one 4465 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4466 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. 
Safe. 4467 __ lslv(tmp2, tmp2, tmp4); 4468 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4469 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4470 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4471 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4472 } 4473 __ cmp(ch1, ch2); 4474 __ mov(tmp4, wordSize/str2_chr_size); 4475 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4476 __ BIND(L_SMALL_CMP_LOOP); 4477 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4478 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4479 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4480 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4481 __ add(tmp4, tmp4, 1); 4482 __ cmp(tmp4, cnt1); 4483 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4484 __ cmp(first, ch2); 4485 __ br(__ EQ, L_SMALL_CMP_LOOP); 4486 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4487 __ cbz(tmp2, NOMATCH); // no more matches. exit 4488 __ clz(tmp4, tmp2); 4489 __ add(result, result, 1); // advance index 4490 __ add(str2, str2, str2_chr_size); // advance pointer 4491 __ b(L_SMALL_HAS_ZERO_LOOP); 4492 __ align(OptoLoopAlignment); 4493 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4494 __ cmp(first, ch2); 4495 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4496 __ b(DONE); 4497 __ align(OptoLoopAlignment); 4498 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4499 if (str2_isL) { // LL 4500 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4501 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4502 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4503 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4504 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4505 } else { 4506 __ mov(ch2, 0xE); // all bits in byte set except last one 4507 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4508 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4509 __ lslv(tmp2, tmp2, tmp4); 4510 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4511 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4512 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4513 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4514 } 4515 __ cmp(ch1, ch2); 4516 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4517 __ b(DONE); 4518 __ align(OptoLoopAlignment); 4519 __ BIND(L_HAS_ZERO); 4520 __ rbit(tmp2, tmp2); 4521 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4522 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4523 // It's fine because both counters are 32bit and are not changed in this 4524 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4525 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4526 __ sub(result, result, 1); 4527 __ BIND(L_HAS_ZERO_LOOP); 4528 __ mov(cnt1, wordSize/str2_chr_size); 4529 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4530 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4531 if (str2_isL) { 4532 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4533 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4534 __ lslv(tmp2, tmp2, tmp4); 4535 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4536 __ add(tmp4, tmp4, 1); 4537 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4538 __ lsl(tmp2, tmp2, 1); 4539 __ mov(tmp4, wordSize/str2_chr_size); 4540 } else { 4541 __ mov(ch2, 0xE); 4542 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4543 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4544 __ lslv(tmp2, tmp2, tmp4); 4545 __ add(tmp4, tmp4, 1); 4546 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4547 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4548 __ lsl(tmp2, tmp2, 1); 4549 __ mov(tmp4, wordSize/str2_chr_size); 4550 __ sub(str2, str2, str2_chr_size); 4551 } 4552 __ cmp(ch1, ch2); 4553 __ mov(tmp4, wordSize/str2_chr_size); 4554 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4555 __ BIND(L_CMP_LOOP); 4556 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4557 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4558 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4559 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4560 __ add(tmp4, tmp4, 1); 4561 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4562 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4563 __ cmp(cnt1, ch2); 4564 __ br(__ EQ, L_CMP_LOOP); 4565 __ BIND(L_CMP_LOOP_NOMATCH); 4566 // here we're not matched 4567 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4568 __ clz(tmp4, tmp2); 4569 __ add(str2, str2, str2_chr_size); // advance pointer 4570 __ b(L_HAS_ZERO_LOOP); 4571 __ align(OptoLoopAlignment); 4572 __ BIND(L_CMP_LOOP_LAST_CMP); 4573 __ cmp(cnt1, ch2); 4574 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4575 __ b(DONE); 4576 __ align(OptoLoopAlignment); 4577 __ BIND(L_CMP_LOOP_LAST_CMP2); 4578 if (str2_isL) { 4579 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4580 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4581 __ lslv(tmp2, tmp2, tmp4); 4582 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4583 __ add(tmp4, tmp4, 1); 4584 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4585 __ lsl(tmp2, tmp2, 1); 4586 } else { 4587 __ mov(ch2, 0xE); 4588 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4589 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4590 __ lslv(tmp2, tmp2, tmp4); 4591 __ add(tmp4, tmp4, 1); 4592 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4593 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4594 __ lsl(tmp2, tmp2, 1); 4595 __ sub(str2, str2, str2_chr_size); 4596 } 4597 __ cmp(ch1, ch2); 4598 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4599 __ b(DONE); 4600 __ align(OptoLoopAlignment); 4601 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4602 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4603 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4604 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4605 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4606 // result by analyzed characters value, so, we can just reset lower bits 4607 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4608 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4609 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4610 // index of last analyzed substring inside current octet. 
So, str2 in at 4611 // respective start address. We need to advance it to next octet 4612 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4613 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4614 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4615 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4616 __ movw(cnt2, cnt2); 4617 __ b(L_LOOP_PROCEED); 4618 __ align(OptoLoopAlignment); 4619 __ BIND(NOMATCH); 4620 __ mov(result, -1); 4621 __ BIND(DONE); 4622 __ pop(spilled_regs, sp); 4623 __ ret(lr); 4624 return entry; 4625 } 4626 4627 void generate_string_indexof_stubs() { 4628 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4629 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4630 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4631 } 4632 4633 void inflate_and_store_2_fp_registers(bool generatePrfm, 4634 FloatRegister src1, FloatRegister src2) { 4635 Register dst = r1; 4636 __ zip1(v1, __ T16B, src1, v0); 4637 __ zip2(v2, __ T16B, src1, v0); 4638 if (generatePrfm) { 4639 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4640 } 4641 __ zip1(v3, __ T16B, src2, v0); 4642 __ zip2(v4, __ T16B, src2, v0); 4643 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4644 } 4645 4646 // R0 = src 4647 // R1 = dst 4648 // R2 = len 4649 // R3 = len >> 3 4650 // V0 = 0 4651 // v1 = loaded 8 bytes 4652 address generate_large_byte_array_inflate() { 4653 __ align(CodeEntryAlignment); 4654 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4655 address entry = __ pc(); 4656 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4657 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4658 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4659 4660 // do one more 8-byte read to have address 16-byte aligned in most cases 4661 // also use single store instruction 4662 __ ldrd(v2, __ post(src, 8)); 4663 __ sub(octetCounter, octetCounter, 2); 4664 __ zip1(v1, __ T16B, v1, v0); 4665 __ zip1(v2, __ T16B, v2, v0); 4666 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4667 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4668 __ subs(rscratch1, octetCounter, large_loop_threshold); 4669 __ br(__ LE, LOOP_START); 4670 __ b(LOOP_PRFM_START); 4671 __ bind(LOOP_PRFM); 4672 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4673 __ bind(LOOP_PRFM_START); 4674 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4675 __ sub(octetCounter, octetCounter, 8); 4676 __ subs(rscratch1, octetCounter, large_loop_threshold); 4677 inflate_and_store_2_fp_registers(true, v3, v4); 4678 inflate_and_store_2_fp_registers(true, v5, v6); 4679 __ br(__ GT, LOOP_PRFM); 4680 __ cmp(octetCounter, (u1)8); 4681 __ br(__ LT, DONE); 4682 __ bind(LOOP); 4683 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4684 __ bind(LOOP_START); 4685 __ sub(octetCounter, octetCounter, 8); 4686 __ cmp(octetCounter, (u1)8); 4687 inflate_and_store_2_fp_registers(false, v3, v4); 4688 inflate_and_store_2_fp_registers(false, v5, v6); 4689 __ br(__ GE, LOOP); 4690 __ bind(DONE); 4691 __ ret(lr); 4692 return entry; 4693 } 4694 4695 /** 4696 * Arguments: 4697 * 4698 * Input: 4699 * c_rarg0 - current state address 4700 * c_rarg1 - H key address 4701 * c_rarg2 - data address 4702 * c_rarg3 - number of blocks 4703 * 4704 * Output: 4705 * Updated state at c_rarg0 4706 */ 4707 address 
generate_ghash_processBlocks() { 4708 // Bafflingly, GCM uses little-endian for the byte order, but 4709 // big-endian for the bit order. For example, the polynomial 1 is 4710 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4711 // 4712 // So, we must either reverse the bytes in each word and do 4713 // everything big-endian or reverse the bits in each byte and do 4714 // it little-endian. On AArch64 it's more idiomatic to reverse 4715 // the bits in each byte (we have an instruction, RBIT, to do 4716 // that) and keep the data in little-endian bit order throught the 4717 // calculation, bit-reversing the inputs and outputs. 4718 4719 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4720 __ align(wordSize * 2); 4721 address p = __ pc(); 4722 __ emit_int64(0x87); // The low-order bits of the field 4723 // polynomial (i.e. p = z^7+z^2+z+1) 4724 // repeated in the low and high parts of a 4725 // 128-bit vector 4726 __ emit_int64(0x87); 4727 4728 __ align(CodeEntryAlignment); 4729 address start = __ pc(); 4730 4731 Register state = c_rarg0; 4732 Register subkeyH = c_rarg1; 4733 Register data = c_rarg2; 4734 Register blocks = c_rarg3; 4735 4736 FloatRegister vzr = v30; 4737 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4738 4739 __ ldrq(v0, Address(state)); 4740 __ ldrq(v1, Address(subkeyH)); 4741 4742 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4743 __ rbit(v0, __ T16B, v0); 4744 __ rev64(v1, __ T16B, v1); 4745 __ rbit(v1, __ T16B, v1); 4746 4747 __ ldrq(v26, p); 4748 4749 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4750 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4751 4752 { 4753 Label L_ghash_loop; 4754 __ bind(L_ghash_loop); 4755 4756 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4757 // reversing each byte 4758 __ rbit(v2, __ T16B, v2); 4759 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4760 4761 // Multiply state in v2 by subkey in v1 4762 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4763 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4764 /*temps*/v6, v20, v18, v21); 4765 // Reduce v7:v5 by the field polynomial 4766 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4767 4768 __ sub(blocks, blocks, 1); 4769 __ cbnz(blocks, L_ghash_loop); 4770 } 4771 4772 // The bit-reversed result is at this point in v0 4773 __ rev64(v1, __ T16B, v0); 4774 __ rbit(v1, __ T16B, v1); 4775 4776 __ st1(v1, __ T16B, state); 4777 __ ret(lr); 4778 4779 return start; 4780 } 4781 4782 // Continuation point for throwing of implicit exceptions that are 4783 // not handled in the current activation. Fabricates an exception 4784 // oop and initiates normal exception dispatching in this 4785 // frame. Since we need to preserve callee-saved values (currently 4786 // only for C2, but done for C1 as well) we need a callee-saved oop 4787 // map and therefore have to make these stubs into RuntimeStubs 4788 // rather than BufferBlobs. If the compiler needs all registers to 4789 // be preserved between the fault point and the exception handler 4790 // then it must assume responsibility for that in 4791 // AbstractCompiler::continuation_for_implicit_null_exception or 4792 // continuation_for_implicit_division_by_zero_exception. 
All other 4793 // implicit exceptions (e.g., NullPointerException or 4794 // AbstractMethodError on entry) are either at call sites or 4795 // otherwise assume that stack unwinding will be initiated, so 4796 // caller saved registers were assumed volatile in the compiler. 4797 4798 #undef __ 4799 #define __ masm-> 4800 4801 address generate_throw_exception(const char* name, 4802 address runtime_entry, 4803 Register arg1 = noreg, 4804 Register arg2 = noreg) { 4805 // Information about frame layout at time of blocking runtime call. 4806 // Note that we only have to preserve callee-saved registers since 4807 // the compilers are responsible for supplying a continuation point 4808 // if they expect all registers to be preserved. 4809 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4810 enum layout { 4811 rfp_off = 0, 4812 rfp_off2, 4813 return_off, 4814 return_off2, 4815 framesize // inclusive of return address 4816 }; 4817 4818 int insts_size = 512; 4819 int locs_size = 64; 4820 4821 CodeBuffer code(name, insts_size, locs_size); 4822 OopMapSet* oop_maps = new OopMapSet(); 4823 MacroAssembler* masm = new MacroAssembler(&code); 4824 4825 address start = __ pc(); 4826 4827 // This is an inlined and slightly modified version of call_VM 4828 // which has the ability to fetch the return PC out of 4829 // thread-local storage and also sets up last_Java_sp slightly 4830 // differently than the real call_VM 4831 4832 __ enter(); // Save FP and LR before call 4833 4834 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4835 4836 // lr and fp are already in place 4837 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4838 4839 int frame_complete = __ pc() - start; 4840 4841 // Set up last_Java_sp and last_Java_fp 4842 address the_pc = __ pc(); 4843 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4844 4845 // Call runtime 4846 if (arg1 != noreg) { 4847 assert(arg2 != c_rarg1, "clobbered"); 4848 __ mov(c_rarg1, arg1); 4849 } 4850 if (arg2 != noreg) { 4851 __ mov(c_rarg2, arg2); 4852 } 4853 __ mov(c_rarg0, rthread); 4854 BLOCK_COMMENT("call runtime_entry"); 4855 __ mov(rscratch1, runtime_entry); 4856 __ blr(rscratch1); 4857 4858 // Generate oop map 4859 OopMap* map = new OopMap(framesize, 0); 4860 4861 oop_maps->add_gc_map(the_pc - start, map); 4862 4863 __ reset_last_Java_frame(true); 4864 __ maybe_isb(); 4865 4866 __ leave(); 4867 4868 // check for pending exceptions 4869 #ifdef ASSERT 4870 Label L; 4871 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4872 __ cbnz(rscratch1, L); 4873 __ should_not_reach_here(); 4874 __ bind(L); 4875 #endif // ASSERT 4876 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4877 4878 4879 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4880 RuntimeStub* stub = 4881 RuntimeStub::new_runtime_stub(name, 4882 &code, 4883 frame_complete, 4884 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4885 oop_maps, false); 4886 return stub->entry_point(); 4887 } 4888 4889 class MontgomeryMultiplyGenerator : public MacroAssembler { 4890 4891 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4892 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4893 4894 RegSet _toSave; 4895 bool _squaring; 4896 4897 public: 4898 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4899 : MacroAssembler(as->code()), _squaring(squaring) { 4900 4901 // Register allocation 4902 4903 Register reg = c_rarg0; 4904 Pa_base = reg; // Argument registers 4905 if 
(squaring) 4906 Pb_base = Pa_base; 4907 else 4908 Pb_base = ++reg; 4909 Pn_base = ++reg; 4910 Rlen= ++reg; 4911 inv = ++reg; 4912 Pm_base = ++reg; 4913 4914 // Working registers: 4915 Ra = ++reg; // The current digit of a, b, n, and m. 4916 Rb = ++reg; 4917 Rm = ++reg; 4918 Rn = ++reg; 4919 4920 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4921 Pb = ++reg; 4922 Pm = ++reg; 4923 Pn = ++reg; 4924 4925 t0 = ++reg; // Three registers which form a 4926 t1 = ++reg; // triple-precision accumuator. 4927 t2 = ++reg; 4928 4929 Ri = ++reg; // Inner and outer loop indexes. 4930 Rj = ++reg; 4931 4932 Rhi_ab = ++reg; // Product registers: low and high parts 4933 Rlo_ab = ++reg; // of a*b and m*n. 4934 Rhi_mn = ++reg; 4935 Rlo_mn = ++reg; 4936 4937 // r19 and up are callee-saved. 4938 _toSave = RegSet::range(r19, reg) + Pm_base; 4939 } 4940 4941 private: 4942 void save_regs() { 4943 push(_toSave, sp); 4944 } 4945 4946 void restore_regs() { 4947 pop(_toSave, sp); 4948 } 4949 4950 template <typename T> 4951 void unroll_2(Register count, T block) { 4952 Label loop, end, odd; 4953 tbnz(count, 0, odd); 4954 cbz(count, end); 4955 align(16); 4956 bind(loop); 4957 (this->*block)(); 4958 bind(odd); 4959 (this->*block)(); 4960 subs(count, count, 2); 4961 br(Assembler::GT, loop); 4962 bind(end); 4963 } 4964 4965 template <typename T> 4966 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4967 Label loop, end, odd; 4968 tbnz(count, 0, odd); 4969 cbz(count, end); 4970 align(16); 4971 bind(loop); 4972 (this->*block)(d, s, tmp); 4973 bind(odd); 4974 (this->*block)(d, s, tmp); 4975 subs(count, count, 2); 4976 br(Assembler::GT, loop); 4977 bind(end); 4978 } 4979 4980 void pre1(RegisterOrConstant i) { 4981 block_comment("pre1"); 4982 // Pa = Pa_base; 4983 // Pb = Pb_base + i; 4984 // Pm = Pm_base; 4985 // Pn = Pn_base + i; 4986 // Ra = *Pa; 4987 // Rb = *Pb; 4988 // Rm = *Pm; 4989 // Rn = *Pn; 4990 ldr(Ra, Address(Pa_base)); 4991 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4992 ldr(Rm, Address(Pm_base)); 4993 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4994 lea(Pa, Address(Pa_base)); 4995 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4996 lea(Pm, Address(Pm_base)); 4997 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4998 4999 // Zero the m*n result. 5000 mov(Rhi_mn, zr); 5001 mov(Rlo_mn, zr); 5002 } 5003 5004 // The core multiply-accumulate step of a Montgomery 5005 // multiplication. The idea is to schedule operations as a 5006 // pipeline so that instructions with long latencies (loads and 5007 // multiplies) have time to complete before their results are 5008 // used. This most benefits in-order implementations of the 5009 // architecture but out-of-order ones also benefit. 5010 void step() { 5011 block_comment("step"); 5012 // MACC(Ra, Rb, t0, t1, t2); 5013 // Ra = *++Pa; 5014 // Rb = *--Pb; 5015 umulh(Rhi_ab, Ra, Rb); 5016 mul(Rlo_ab, Ra, Rb); 5017 ldr(Ra, pre(Pa, wordSize)); 5018 ldr(Rb, pre(Pb, -wordSize)); 5019 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 5020 // previous iteration. 
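      // Reference sketch of MACC(a, b, t0, t1, t2) as used in the comments in
      // this class (illustrative only, not emitted code): t2:t1:t0 += a * b,
      // where t2:t1:t0 is the triple-precision accumulator. A 128-bit helper
      // type is used here purely for clarity:
      //
      //   unsigned __int128 p = (unsigned __int128) a * b;
      //   unsigned __int128 s = (unsigned __int128) t0 + (uint64_t) p;
      //   t0 = (uint64_t) s;                                        // add low half
      //   s = (unsigned __int128) t1 + (uint64_t)(p >> 64) + (uint64_t)(s >> 64);
      //   t1 = (uint64_t) s;                                        // add high half + carry
      //   t2 += (uint64_t)(s >> 64);                                // propagate final carry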
5021 // MACC(Rm, Rn, t0, t1, t2); 5022 // Rm = *++Pm; 5023 // Rn = *--Pn; 5024 umulh(Rhi_mn, Rm, Rn); 5025 mul(Rlo_mn, Rm, Rn); 5026 ldr(Rm, pre(Pm, wordSize)); 5027 ldr(Rn, pre(Pn, -wordSize)); 5028 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5029 } 5030 5031 void post1() { 5032 block_comment("post1"); 5033 5034 // MACC(Ra, Rb, t0, t1, t2); 5035 // Ra = *++Pa; 5036 // Rb = *--Pb; 5037 umulh(Rhi_ab, Ra, Rb); 5038 mul(Rlo_ab, Ra, Rb); 5039 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5040 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5041 5042 // *Pm = Rm = t0 * inv; 5043 mul(Rm, t0, inv); 5044 str(Rm, Address(Pm)); 5045 5046 // MACC(Rm, Rn, t0, t1, t2); 5047 // t0 = t1; t1 = t2; t2 = 0; 5048 umulh(Rhi_mn, Rm, Rn); 5049 5050 #ifndef PRODUCT 5051 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5052 { 5053 mul(Rlo_mn, Rm, Rn); 5054 add(Rlo_mn, t0, Rlo_mn); 5055 Label ok; 5056 cbz(Rlo_mn, ok); { 5057 stop("broken Montgomery multiply"); 5058 } bind(ok); 5059 } 5060 #endif 5061 // We have very carefully set things up so that 5062 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5063 // the lower half of Rm * Rn because we know the result already: 5064 // it must be -t0. t0 + (-t0) must generate a carry iff 5065 // t0 != 0. So, rather than do a mul and an adds we just set 5066 // the carry flag iff t0 is nonzero. 5067 // 5068 // mul(Rlo_mn, Rm, Rn); 5069 // adds(zr, t0, Rlo_mn); 5070 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5071 adcs(t0, t1, Rhi_mn); 5072 adc(t1, t2, zr); 5073 mov(t2, zr); 5074 } 5075 5076 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5077 block_comment("pre2"); 5078 // Pa = Pa_base + i-len; 5079 // Pb = Pb_base + len; 5080 // Pm = Pm_base + i-len; 5081 // Pn = Pn_base + len; 5082 5083 if (i.is_register()) { 5084 sub(Rj, i.as_register(), len); 5085 } else { 5086 mov(Rj, i.as_constant()); 5087 sub(Rj, Rj, len); 5088 } 5089 // Rj == i-len 5090 5091 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5092 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5093 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5094 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5095 5096 // Ra = *++Pa; 5097 // Rb = *--Pb; 5098 // Rm = *++Pm; 5099 // Rn = *--Pn; 5100 ldr(Ra, pre(Pa, wordSize)); 5101 ldr(Rb, pre(Pb, -wordSize)); 5102 ldr(Rm, pre(Pm, wordSize)); 5103 ldr(Rn, pre(Pn, -wordSize)); 5104 5105 mov(Rhi_mn, zr); 5106 mov(Rlo_mn, zr); 5107 } 5108 5109 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5110 block_comment("post2"); 5111 if (i.is_constant()) { 5112 mov(Rj, i.as_constant()-len.as_constant()); 5113 } else { 5114 sub(Rj, i.as_register(), len); 5115 } 5116 5117 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5118 5119 // As soon as we know the least significant digit of our result, 5120 // store it. 5121 // Pm_base[i-len] = t0; 5122 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5123 5124 // t0 = t1; t1 = t2; t2 = 0; 5125 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5126 adc(t1, t2, zr); 5127 mov(t2, zr); 5128 } 5129 5130 // A carry in t0 after Montgomery multiplication means that we 5131 // should subtract multiples of n from our result in m. We'll 5132 // keep doing that until there is no carry. 
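// For reference, a rough C model of the "sub" used in the sketches
// below (reconstructed from the loop that follows; it is not a helper
// defined in this file): a borrow-propagating m -= n over len words,
// returning t0 minus the final borrow.
//
//   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
//                     unsigned long t0, int len) {
//     unsigned long borrow = 0;
//     for (int i = 0; i < len; i++) {
//       unsigned long m = Pm[i], n = Pn[i];
//       Pm[i] = m - n - borrow;
//       borrow = (m < n) || (m == n && borrow);
//     }
//     return t0 - borrow;
//   }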
5133 void normalize(RegisterOrConstant len) { 5134 block_comment("normalize"); 5135 // while (t0) 5136 // t0 = sub(Pm_base, Pn_base, t0, len); 5137 Label loop, post, again; 5138 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5139 cbz(t0, post); { 5140 bind(again); { 5141 mov(i, zr); 5142 mov(cnt, len); 5143 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5144 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5145 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5146 align(16); 5147 bind(loop); { 5148 sbcs(Rm, Rm, Rn); 5149 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5150 add(i, i, 1); 5151 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5152 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5153 sub(cnt, cnt, 1); 5154 } cbnz(cnt, loop); 5155 sbc(t0, t0, zr); 5156 } cbnz(t0, again); 5157 } bind(post); 5158 } 5159 5160 // Move memory at s to d, reversing words. 5161 // Increments d to end of copied memory 5162 // Destroys tmp1, tmp2 5163 // Preserves len 5164 // Leaves s pointing to the address which was in d at start 5165 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5166 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5167 5168 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5169 mov(tmp1, len); 5170 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5171 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5172 } 5173 // where 5174 void reverse1(Register d, Register s, Register tmp) { 5175 ldr(tmp, pre(s, -wordSize)); 5176 ror(tmp, tmp, 32); 5177 str(tmp, post(d, wordSize)); 5178 } 5179 5180 void step_squaring() { 5181 // An extra ACC 5182 step(); 5183 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5184 } 5185 5186 void last_squaring(RegisterOrConstant i) { 5187 Label dont; 5188 // if ((i & 1) == 0) { 5189 tbnz(i.as_register(), 0, dont); { 5190 // MACC(Ra, Rb, t0, t1, t2); 5191 // Ra = *++Pa; 5192 // Rb = *--Pb; 5193 umulh(Rhi_ab, Ra, Rb); 5194 mul(Rlo_ab, Ra, Rb); 5195 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5196 } bind(dont); 5197 } 5198 5199 void extra_step_squaring() { 5200 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5201 5202 // MACC(Rm, Rn, t0, t1, t2); 5203 // Rm = *++Pm; 5204 // Rn = *--Pn; 5205 umulh(Rhi_mn, Rm, Rn); 5206 mul(Rlo_mn, Rm, Rn); 5207 ldr(Rm, pre(Pm, wordSize)); 5208 ldr(Rn, pre(Pn, -wordSize)); 5209 } 5210 5211 void post1_squaring() { 5212 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5213 5214 // *Pm = Rm = t0 * inv; 5215 mul(Rm, t0, inv); 5216 str(Rm, Address(Pm)); 5217 5218 // MACC(Rm, Rn, t0, t1, t2); 5219 // t0 = t1; t1 = t2; t2 = 0; 5220 umulh(Rhi_mn, Rm, Rn); 5221 5222 #ifndef PRODUCT 5223 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5224 { 5225 mul(Rlo_mn, Rm, Rn); 5226 add(Rlo_mn, t0, Rlo_mn); 5227 Label ok; 5228 cbz(Rlo_mn, ok); { 5229 stop("broken Montgomery multiply"); 5230 } bind(ok); 5231 } 5232 #endif 5233 // We have very carefully set things up so that 5234 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5235 // the lower half of Rm * Rn because we know the result already: 5236 // it must be -t0. t0 + (-t0) must generate a carry iff 5237 // t0 != 0. So, rather than do a mul and an adds we just set 5238 // the carry flag iff t0 is nonzero. 
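// (Concretely: "subs zr, t0, #1" borrows, and so clears the carry
// flag, only when t0 == 0; for any nonzero t0 the carry flag is set.
// That is exactly the carry adds(zr, t0, Rlo_mn) would produce, since
// Rlo_mn would hold -t0.)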
5239 // 5240 // mul(Rlo_mn, Rm, Rn); 5241 // adds(zr, t0, Rlo_mn); 5242 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5243 adcs(t0, t1, Rhi_mn); 5244 adc(t1, t2, zr); 5245 mov(t2, zr); 5246 } 5247 5248 void acc(Register Rhi, Register Rlo, 5249 Register t0, Register t1, Register t2) { 5250 adds(t0, t0, Rlo); 5251 adcs(t1, t1, Rhi); 5252 adc(t2, t2, zr); 5253 } 5254 5255 public: 5256 /** 5257 * Fast Montgomery multiplication. The derivation of the 5258 * algorithm is in A Cryptographic Library for the Motorola 5259 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5260 * 5261 * Arguments: 5262 * 5263 * Inputs for multiplication: 5264 * c_rarg0 - int array elements a 5265 * c_rarg1 - int array elements b 5266 * c_rarg2 - int array elements n (the modulus) 5267 * c_rarg3 - int length 5268 * c_rarg4 - int inv 5269 * c_rarg5 - int array elements m (the result) 5270 * 5271 * Inputs for squaring: 5272 * c_rarg0 - int array elements a 5273 * c_rarg1 - int array elements n (the modulus) 5274 * c_rarg2 - int length 5275 * c_rarg3 - int inv 5276 * c_rarg4 - int array elements m (the result) 5277 * 5278 */ 5279 address generate_multiply() { 5280 Label argh, nothing; 5281 bind(argh); 5282 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5283 5284 align(CodeEntryAlignment); 5285 address entry = pc(); 5286 5287 cbzw(Rlen, nothing); 5288 5289 enter(); 5290 5291 // Make room. 5292 cmpw(Rlen, 512); 5293 br(Assembler::HI, argh); 5294 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5295 andr(sp, Ra, -2 * wordSize); 5296 5297 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5298 5299 { 5300 // Copy input args, reversing as we go. We use Ra as a 5301 // temporary variable. 5302 reverse(Ra, Pa_base, Rlen, t0, t1); 5303 if (!_squaring) 5304 reverse(Ra, Pb_base, Rlen, t0, t1); 5305 reverse(Ra, Pn_base, Rlen, t0, t1); 5306 } 5307 5308 // Push all call-saved registers and also Pm_base which we'll need 5309 // at the end. 
5310 save_regs(); 5311 5312 #ifndef PRODUCT 5313 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5314 { 5315 ldr(Rn, Address(Pn_base, 0)); 5316 mul(Rlo_mn, Rn, inv); 5317 subs(zr, Rlo_mn, -1); 5318 Label ok; 5319 br(EQ, ok); { 5320 stop("broken inverse in Montgomery multiply"); 5321 } bind(ok); 5322 } 5323 #endif 5324 5325 mov(Pm_base, Ra); 5326 5327 mov(t0, zr); 5328 mov(t1, zr); 5329 mov(t2, zr); 5330 5331 block_comment("for (int i = 0; i < len; i++) {"); 5332 mov(Ri, zr); { 5333 Label loop, end; 5334 cmpw(Ri, Rlen); 5335 br(Assembler::GE, end); 5336 5337 bind(loop); 5338 pre1(Ri); 5339 5340 block_comment(" for (j = i; j; j--) {"); { 5341 movw(Rj, Ri); 5342 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5343 } block_comment(" } // j"); 5344 5345 post1(); 5346 addw(Ri, Ri, 1); 5347 cmpw(Ri, Rlen); 5348 br(Assembler::LT, loop); 5349 bind(end); 5350 block_comment("} // i"); 5351 } 5352 5353 block_comment("for (int i = len; i < 2*len; i++) {"); 5354 mov(Ri, Rlen); { 5355 Label loop, end; 5356 cmpw(Ri, Rlen, Assembler::LSL, 1); 5357 br(Assembler::GE, end); 5358 5359 bind(loop); 5360 pre2(Ri, Rlen); 5361 5362 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5363 lslw(Rj, Rlen, 1); 5364 subw(Rj, Rj, Ri); 5365 subw(Rj, Rj, 1); 5366 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5367 } block_comment(" } // j"); 5368 5369 post2(Ri, Rlen); 5370 addw(Ri, Ri, 1); 5371 cmpw(Ri, Rlen, Assembler::LSL, 1); 5372 br(Assembler::LT, loop); 5373 bind(end); 5374 } 5375 block_comment("} // i"); 5376 5377 normalize(Rlen); 5378 5379 mov(Ra, Pm_base); // Save Pm_base in Ra 5380 restore_regs(); // Restore caller's Pm_base 5381 5382 // Copy our result into caller's Pm_base 5383 reverse(Pm_base, Ra, Rlen, t0, t1); 5384 5385 leave(); 5386 bind(nothing); 5387 ret(lr); 5388 5389 return entry; 5390 } 5391 // In C, approximately: 5392 5393 // void 5394 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5395 // unsigned long Pn_base[], unsigned long Pm_base[], 5396 // unsigned long inv, int len) { 5397 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5398 // unsigned long *Pa, *Pb, *Pn, *Pm; 5399 // unsigned long Ra, Rb, Rn, Rm; 5400 5401 // int i; 5402 5403 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5404 5405 // for (i = 0; i < len; i++) { 5406 // int j; 5407 5408 // Pa = Pa_base; 5409 // Pb = Pb_base + i; 5410 // Pm = Pm_base; 5411 // Pn = Pn_base + i; 5412 5413 // Ra = *Pa; 5414 // Rb = *Pb; 5415 // Rm = *Pm; 5416 // Rn = *Pn; 5417 5418 // int iters = i; 5419 // for (j = 0; iters--; j++) { 5420 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5421 // MACC(Ra, Rb, t0, t1, t2); 5422 // Ra = *++Pa; 5423 // Rb = *--Pb; 5424 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5425 // MACC(Rm, Rn, t0, t1, t2); 5426 // Rm = *++Pm; 5427 // Rn = *--Pn; 5428 // } 5429 5430 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5431 // MACC(Ra, Rb, t0, t1, t2); 5432 // *Pm = Rm = t0 * inv; 5433 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5434 // MACC(Rm, Rn, t0, t1, t2); 5435 5436 // assert(t0 == 0, "broken Montgomery multiply"); 5437 5438 // t0 = t1; t1 = t2; t2 = 0; 5439 // } 5440 5441 // for (i = len; i < 2*len; i++) { 5442 // int j; 5443 5444 // Pa = Pa_base + i-len; 5445 // Pb = Pb_base + len; 5446 // Pm = Pm_base + i-len; 5447 // Pn = Pn_base + len; 5448 5449 // Ra = *++Pa; 5450 // Rb = *--Pb; 5451 // Rm = *++Pm; 5452 // Rn = *--Pn; 5453 5454 // int iters = len*2-i-1; 
5455 // for (j = i-len+1; iters--; j++) { 5456 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5457 // MACC(Ra, Rb, t0, t1, t2); 5458 // Ra = *++Pa; 5459 // Rb = *--Pb; 5460 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5461 // MACC(Rm, Rn, t0, t1, t2); 5462 // Rm = *++Pm; 5463 // Rn = *--Pn; 5464 // } 5465 5466 // Pm_base[i-len] = t0; 5467 // t0 = t1; t1 = t2; t2 = 0; 5468 // } 5469 5470 // while (t0) 5471 // t0 = sub(Pm_base, Pn_base, t0, len); 5472 // } 5473 5474 /** 5475 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5476 * multiplies than Montgomery multiplication so it should be up to 5477 * 25% faster. However, its loop control is more complex and it 5478 * may actually run slower on some machines. 5479 * 5480 * Arguments: 5481 * 5482 * Inputs: 5483 * c_rarg0 - int array elements a 5484 * c_rarg1 - int array elements n (the modulus) 5485 * c_rarg2 - int length 5486 * c_rarg3 - int inv 5487 * c_rarg4 - int array elements m (the result) 5488 * 5489 */ 5490 address generate_square() { 5491 Label argh; 5492 bind(argh); 5493 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5494 5495 align(CodeEntryAlignment); 5496 address entry = pc(); 5497 5498 enter(); 5499 5500 // Make room. 5501 cmpw(Rlen, 512); 5502 br(Assembler::HI, argh); 5503 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5504 andr(sp, Ra, -2 * wordSize); 5505 5506 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5507 5508 { 5509 // Copy input args, reversing as we go. We use Ra as a 5510 // temporary variable. 5511 reverse(Ra, Pa_base, Rlen, t0, t1); 5512 reverse(Ra, Pn_base, Rlen, t0, t1); 5513 } 5514 5515 // Push all call-saved registers and also Pm_base which we'll need 5516 // at the end. 5517 save_regs(); 5518 5519 mov(Pm_base, Ra); 5520 5521 mov(t0, zr); 5522 mov(t1, zr); 5523 mov(t2, zr); 5524 5525 block_comment("for (int i = 0; i < len; i++) {"); 5526 mov(Ri, zr); { 5527 Label loop, end; 5528 bind(loop); 5529 cmp(Ri, Rlen); 5530 br(Assembler::GE, end); 5531 5532 pre1(Ri); 5533 5534 block_comment("for (j = (i+1)/2; j; j--) {"); { 5535 add(Rj, Ri, 1); 5536 lsr(Rj, Rj, 1); 5537 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5538 } block_comment(" } // j"); 5539 5540 last_squaring(Ri); 5541 5542 block_comment(" for (j = i/2; j; j--) {"); { 5543 lsr(Rj, Ri, 1); 5544 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5545 } block_comment(" } // j"); 5546 5547 post1_squaring(); 5548 add(Ri, Ri, 1); 5549 cmp(Ri, Rlen); 5550 br(Assembler::LT, loop); 5551 5552 bind(end); 5553 block_comment("} // i"); 5554 } 5555 5556 block_comment("for (int i = len; i < 2*len; i++) {"); 5557 mov(Ri, Rlen); { 5558 Label loop, end; 5559 bind(loop); 5560 cmp(Ri, Rlen, Assembler::LSL, 1); 5561 br(Assembler::GE, end); 5562 5563 pre2(Ri, Rlen); 5564 5565 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5566 lsl(Rj, Rlen, 1); 5567 sub(Rj, Rj, Ri); 5568 sub(Rj, Rj, 1); 5569 lsr(Rj, Rj, 1); 5570 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5571 } block_comment(" } // j"); 5572 5573 last_squaring(Ri); 5574 5575 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5576 lsl(Rj, Rlen, 1); 5577 sub(Rj, Rj, Ri); 5578 lsr(Rj, Rj, 1); 5579 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5580 } block_comment(" } // j"); 5581 5582 post2(Ri, Rlen); 5583 add(Ri, Ri, 1); 5584 cmp(Ri, Rlen, Assembler::LSL, 1); 5585 5586 br(Assembler::LT, loop); 5587 bind(end); 5588 block_comment("} // i"); 5589 } 5590 5591 normalize(Rlen); 5592 5593 
mov(Ra, Pm_base); // Save Pm_base in Ra
5594 restore_regs(); // Restore caller's Pm_base
5595
5596 // Copy our result into caller's Pm_base
5597 reverse(Pm_base, Ra, Rlen, t0, t1);
5598
5599 leave();
5600 ret(lr);
5601
5602 return entry;
5603 }
5604 // In C, approximately:
5605
5606 // void
5607 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5608 // unsigned long Pm_base[], unsigned long inv, int len) {
5609 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5610 // unsigned long *Pa, *Pb, *Pn, *Pm;
5611 // unsigned long Ra, Rb, Rn, Rm;
5612
5613 // int i;
5614
5615 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5616
5617 // for (i = 0; i < len; i++) {
5618 // int j;
5619
5620 // Pa = Pa_base;
5621 // Pb = Pa_base + i;
5622 // Pm = Pm_base;
5623 // Pn = Pn_base + i;
5624
5625 // Ra = *Pa;
5626 // Rb = *Pb;
5627 // Rm = *Pm;
5628 // Rn = *Pn;
5629
5630 // int iters = (i+1)/2;
5631 // for (j = 0; iters--; j++) {
5632 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5633 // MACC2(Ra, Rb, t0, t1, t2);
5634 // Ra = *++Pa;
5635 // Rb = *--Pb;
5636 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5637 // MACC(Rm, Rn, t0, t1, t2);
5638 // Rm = *++Pm;
5639 // Rn = *--Pn;
5640 // }
5641 // if ((i & 1) == 0) {
5642 // assert(Ra == Pa_base[j], "must be");
5643 // MACC(Ra, Ra, t0, t1, t2);
5644 // }
5645 // iters = i/2;
5646 // assert(iters == i-j, "must be");
5647 // for (; iters--; j++) {
5648 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5649 // MACC(Rm, Rn, t0, t1, t2);
5650 // Rm = *++Pm;
5651 // Rn = *--Pn;
5652 // }
5653
5654 // *Pm = Rm = t0 * inv;
5655 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5656 // MACC(Rm, Rn, t0, t1, t2);
5657
5658 // assert(t0 == 0, "broken Montgomery multiply");
5659
5660 // t0 = t1; t1 = t2; t2 = 0;
5661 // }
5662
5663 // for (i = len; i < 2*len; i++) {
5664 // int start = i-len+1;
5665 // int end = start + (len - start)/2;
5666 // int j;
5667
5668 // Pa = Pa_base + i-len;
5669 // Pb = Pa_base + len;
5670 // Pm = Pm_base + i-len;
5671 // Pn = Pn_base + len;
5672
5673 // Ra = *++Pa;
5674 // Rb = *--Pb;
5675 // Rm = *++Pm;
5676 // Rn = *--Pn;
5677
5678 // int iters = (2*len-i-1)/2;
5679 // assert(iters == end-start, "must be");
5680 // for (j = start; iters--; j++) {
5681 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5682 // MACC2(Ra, Rb, t0, t1, t2);
5683 // Ra = *++Pa;
5684 // Rb = *--Pb;
5685 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5686 // MACC(Rm, Rn, t0, t1, t2);
5687 // Rm = *++Pm;
5688 // Rn = *--Pn;
5689 // }
5690 // if ((i & 1) == 0) {
5691 // assert(Ra == Pa_base[j], "must be");
5692 // MACC(Ra, Ra, t0, t1, t2);
5693 // }
5694 // iters = (2*len-i)/2;
5695 // assert(iters == len-j, "must be");
5696 // for (; iters--; j++) {
5697 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5698 // MACC(Rm, Rn, t0, t1, t2);
5699 // Rm = *++Pm;
5700 // Rn = *--Pn;
5701 // }
5702 // Pm_base[i-len] = t0;
5703 // t0 = t1; t1 = t2; t2 = 0;
5704 // }
5705
5706 // while (t0)
5707 // t0 = sub(Pm_base, Pn_base, t0, len);
5708 // }
5709 };
5710
5711
5712 // Initialization
5713 void generate_initial() {
5714 // Generate the initial stubs and initialize the entry points
5715
5716 // Entry points that exist on all platforms. Note: This is code
5717 // that could be shared among different platforms - however the
5718 // benefit seems to be smaller than the disadvantage of having a
5719 // much more complicated generator structure. See also comment in
5720 // stubRoutines.hpp.
5721
5722 StubRoutines::_forward_exception_entry = generate_forward_exception();
5723
5724 StubRoutines::_call_stub_entry =
5725 generate_call_stub(StubRoutines::_call_stub_return_address);
5726
5727 // is referenced by megamorphic call
5728 StubRoutines::_catch_exception_entry = generate_catch_exception();
5729
5730 // Build this early so it's available for the interpreter.
5731 StubRoutines::_throw_StackOverflowError_entry =
5732 generate_throw_exception("StackOverflowError throw_exception",
5733 CAST_FROM_FN_PTR(address,
5734 SharedRuntime::throw_StackOverflowError));
5735 StubRoutines::_throw_delayed_StackOverflowError_entry =
5736 generate_throw_exception("delayed StackOverflowError throw_exception",
5737 CAST_FROM_FN_PTR(address,
5738 SharedRuntime::throw_delayed_StackOverflowError));
5739 if (UseCRC32Intrinsics) {
5740 // Set the table address before stub generation, which uses it
5741 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5742 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5743 }
5744
5745 if (UseCRC32CIntrinsics) {
5746 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5747 }
5748
5749 // Disabled until JDK-8210858 is fixed
5750 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5751 // StubRoutines::_dlog = generate_dlog();
5752 // }
5753
5754 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5755 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5756 }
5757
5758 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5759 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5760 }
5761
5762 // Safefetch stubs.
5763 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
5764 &StubRoutines::_safefetch32_fault_pc,
5765 &StubRoutines::_safefetch32_continuation_pc);
5766 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5767 &StubRoutines::_safefetchN_fault_pc,
5768 &StubRoutines::_safefetchN_continuation_pc);
5769 }
5770
5771 void generate_all() {
5772 // support for verify_oop (must happen after universe_init)
5773 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5774 StubRoutines::_throw_AbstractMethodError_entry =
5775 generate_throw_exception("AbstractMethodError throw_exception",
5776 CAST_FROM_FN_PTR(address,
5777 SharedRuntime::
5778 throw_AbstractMethodError));
5779
5780 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5781 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5782 CAST_FROM_FN_PTR(address,
5783 SharedRuntime::
5784 throw_IncompatibleClassChangeError));
5785
5786 StubRoutines::_throw_NullPointerException_at_call_entry =
5787 generate_throw_exception("NullPointerException at call throw_exception",
5788 CAST_FROM_FN_PTR(address,
5789 SharedRuntime::
5790 throw_NullPointerException_at_call));
5791
5792 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
5793
5794 // arraycopy stubs used by compilers
5795 generate_arraycopy_stubs();
5796
5797 // has negatives stub for large arrays.
5798 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5799
5800 // array equals stub for large arrays.
5801 if (!UseSimpleArrayEquals) { 5802 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5803 } 5804 5805 generate_compare_long_strings(); 5806 5807 generate_string_indexof_stubs(); 5808 5809 // byte_array_inflate stub for large arrays. 5810 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5811 5812 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5813 if (bs_nm != NULL) { 5814 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 5815 } 5816 #ifdef COMPILER2 5817 if (UseMultiplyToLenIntrinsic) { 5818 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5819 } 5820 5821 if (UseSquareToLenIntrinsic) { 5822 StubRoutines::_squareToLen = generate_squareToLen(); 5823 } 5824 5825 if (UseMulAddIntrinsic) { 5826 StubRoutines::_mulAdd = generate_mulAdd(); 5827 } 5828 5829 if (UseMontgomeryMultiplyIntrinsic) { 5830 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5831 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5832 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5833 } 5834 5835 if (UseMontgomerySquareIntrinsic) { 5836 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5837 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5838 // We use generate_multiply() rather than generate_square() 5839 // because it's faster for the sizes of modulus we care about. 5840 StubRoutines::_montgomerySquare = g.generate_multiply(); 5841 } 5842 #endif // COMPILER2 5843 5844 // generate GHASH intrinsics code 5845 if (UseGHASHIntrinsics) { 5846 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5847 } 5848 5849 // data cache line writeback 5850 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 5851 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 5852 5853 if (UseAESIntrinsics) { 5854 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5855 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5856 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5857 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5858 } 5859 5860 if (UseSHA1Intrinsics) { 5861 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5862 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5863 } 5864 if (UseSHA256Intrinsics) { 5865 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5866 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5867 } 5868 5869 // generate Adler32 intrinsics code 5870 if (UseAdler32Intrinsics) { 5871 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5872 } 5873 5874 StubRoutines::aarch64::set_completed(); 5875 } 5876 5877 public: 5878 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5879 if (all) { 5880 generate_all(); 5881 } else { 5882 generate_initial(); 5883 } 5884 } 5885 }; // end class declaration 5886 5887 #define UCM_TABLE_MAX_ENTRIES 8 5888 void StubGenerator_generate(CodeBuffer* code, bool all) { 5889 if (UnsafeCopyMemory::_table == NULL) { 5890 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 5891 } 5892 StubGenerator g(code, all); 5893 }