/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
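    // n.b. with sp_after_call_off == -26 this drops sp 26 words
    // (208 bytes) below the new fp, i.e. exactly to sp_after_call in
    // the frame diagram above, leaving room for all the saves below.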
    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing method and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
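    // In outline the dispatch below is (illustrative pseudocode only;
    // T_OBJECT and T_VALUETYPE results are references, so they take
    // the same path as T_LONG):
    //
    //   switch (result_type) {
    //   case T_OBJECT: case T_VALUETYPE: case T_LONG:
    //     *(jlong *)result = r0; break;
    //   case T_FLOAT:  *(jfloat *)result = j_farg0; break;
    //   case T_DOUBLE: *(jdouble *)result = j_farg0; break;
    //   default:       *(jint *)result = r0; break;
    //   }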
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_VALUETYPE);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
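    // n.b. taken together the two steps above implement, without
    // touching the live flags, the check
    //   (obj & Universe::verify_oop_mask()) == Universe::verify_oop_bits()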
    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }

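  // n.b. a C sketch of the zero_blocks contract (illustrative only):
  //
  //   while (cnt >= zero_words_block_size) {
  //     // clear zero_words_block_size words, by DC ZVA or by stp
  //     base += zero_words_block_size;
  //     cnt  -= zero_words_block_size;
  //   }
  //   // return with r10 == base and r11 == cnt for the caller to finish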

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

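    // n.b. for a backwards copy the prefetch distance is negated, and
    // a negative immediate offset in prfm is limited to the signed
    // 9-bit range (-256..255); distances beyond that presumably need
    // the register-offset form, hence the stride register above.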
    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
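    // In outline, for a byte copy (granularity == 1, illustrative
    // pseudocode only):
    //
    //   if (count & 8) copy 8 bytes;
    //   if (count & 4) copy 4 bytes;
    //   if (count & 2) copy 2 bytes;
    //   if (count & 1) copy 1 byte;
    //
    // For wider elements the same tests shift down: e.g. for jshort
    // the word test uses bit 2 of count and the jint test bit 1.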

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

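    // n.b. send and dend point one element past the end of the copy.
    // The small cases below load the leading data from s and the
    // trailing data from send, then do all the stores; any overlap
    // between the leading and trailing chunks is harmless because
    // every load happens before the first store.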
    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
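          // e.g. count == 3: bytes at offsets 0 (first), 2 (last) and
          // 1 (count/2) are all copied; count == 1: offsets 0, 0 and 0,
          // i.e. the same byte three times, which is still correct.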
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
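  // n.b. both VerifyOops call sites below pass r16 as temp, so in the
  // narrow-oop path the ldrw into r16 and the decode_heap_oop of temp
  // operate on the same register.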
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

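    // e.g. for a byte copy with s == 100, d == 104, count == 16:
    // d - s == 4 < 16, the regions overlap, so we fall through and
    // copy backwards; with d == 120, d - s == 20 >= 16 and the
    // disjoint (forward) stub is safe. n.b. the unsigned HS compare
    // also routes d < s to the forward stub, since d - s then wraps
    // to a very large unsigned value.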
    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
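  // n.b. with UseCompressedOops each element is a 32-bit narrow oop,
  // hence the jint element size below; what distinguishes these stubs
  // from the plain int/long copies is is_oop == true, which makes
  // generate_disjoint_copy emit the GC barrier prologue/epilogue.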
sizeof (jint) : sizeof (jlong); 1678 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1679 } 1680 1681 // Arguments: 1682 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1683 // ignored 1684 // name - stub name string 1685 // 1686 // Inputs: 1687 // c_rarg0 - source array address 1688 // c_rarg1 - destination array address 1689 // c_rarg2 - element count, treated as size_t, can be zero 1690 // 1691 address generate_conjoint_oop_copy(bool aligned, 1692 address nooverlap_target, address *entry, 1693 const char *name, bool dest_uninitialized) { 1694 const bool is_oop = true; 1695 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1696 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1697 name, dest_uninitialized); 1698 } 1699 1700 1701 // Helper for generating a dynamic type check. 1702 // Smashes rscratch1, rscratch2. 1703 void generate_type_check(Register sub_klass, 1704 Register super_check_offset, 1705 Register super_klass, 1706 Label& L_success) { 1707 assert_different_registers(sub_klass, super_check_offset, super_klass); 1708 1709 BLOCK_COMMENT("type_check:"); 1710 1711 Label L_miss; 1712 1713 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1714 super_check_offset); 1715 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1716 1717 // Fall through on failure! 1718 __ BIND(L_miss); 1719 } 1720 1721 // 1722 // Generate checkcasting array copy stub 1723 // 1724 // Input: 1725 // c_rarg0 - source array address 1726 // c_rarg1 - destination array address 1727 // c_rarg2 - element count, treated as ssize_t, can be zero 1728 // c_rarg3 - size_t ckoff (super_check_offset) 1729 // c_rarg4 - oop ckval (super_klass) 1730 // 1731 // Output: 1732 // r0 == 0 - success 1733 // r0 == -1^K - failure, where K is partial transfer count 1734 // 1735 address generate_checkcast_copy(const char *name, address *entry, 1736 bool dest_uninitialized = false) { 1737 1738 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1739 1740 // Input registers (after setup_arg_regs) 1741 const Register from = c_rarg0; // source array address 1742 const Register to = c_rarg1; // destination array address 1743 const Register count = c_rarg2; // elements count 1744 const Register ckoff = c_rarg3; // super_check_offset 1745 const Register ckval = c_rarg4; // super_klass 1746 1747 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1748 RegSet wb_post_saved_regs = RegSet::of(count); 1749 1750 // Registers used as temps (r18, r19, r20 are save-on-entry) 1751 const Register count_save = r21; // orig elements count 1752 const Register start_to = r20; // destination array start address 1753 const Register copied_oop = r18; // actual oop copied 1754 const Register r19_klass = r19; // oop._klass 1755 1756 //--------------------------------------------------------------- 1757 // Assembler stub will be used for this call to arraycopy 1758 // if the two arrays are subtypes of Object[] but the 1759 // destination array type is not equal to or a supertype 1760 // of the source type. Each element must be separately 1761 // checked.
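// Illustrative Java-level example of when this stub is selected (a
// comment-only sketch, not generated code):
//
//   Object[] src = new Object[] { 1, 2, "three" };
//   Integer[] dst = new Integer[3];
//   System.arraycopy(src, 0, dst, 0, 3);
//
// The destination element type (Integer) is not a supertype of the
// source element type (Object), so safety cannot be proven up front:
// each element is checked as it is stored. Here the two Integers are
// copied, "three" fails the type check, and the partial transfer count
// K == 2 is reported back via r0 as described in the Output comment
// above, from which the runtime raises ArrayStoreException.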
1762 1763 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1764 copied_oop, r19_klass, count_save); 1765 1766 __ align(CodeEntryAlignment); 1767 StubCodeMark mark(this, "StubRoutines", name); 1768 address start = __ pc(); 1769 1770 __ enter(); // required for proper stackwalking of RuntimeStub frame 1771 1772 #ifdef ASSERT 1773 // caller guarantees that the arrays really are different 1774 // otherwise, we would have to make conjoint checks 1775 { Label L; 1776 array_overlap_test(L, TIMES_OOP); 1777 __ stop("checkcast_copy within a single array"); 1778 __ bind(L); 1779 } 1780 #endif //ASSERT 1781 1782 // Caller of this entry point must set up the argument registers. 1783 if (entry != NULL) { 1784 *entry = __ pc(); 1785 BLOCK_COMMENT("Entry:"); 1786 } 1787 1788 // Empty array: Nothing to do. 1789 __ cbz(count, L_done); 1790 1791 __ push(RegSet::of(r18, r19, r20, r21), sp); 1792 1793 #ifdef ASSERT 1794 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1795 // The ckoff and ckval must be mutually consistent, 1796 // even though caller generates both. 1797 { Label L; 1798 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1799 __ ldrw(start_to, Address(ckval, sco_offset)); 1800 __ cmpw(ckoff, start_to); 1801 __ br(Assembler::EQ, L); 1802 __ stop("super_check_offset inconsistent"); 1803 __ bind(L); 1804 } 1805 #endif //ASSERT 1806 1807 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1808 bool is_oop = true; 1809 if (dest_uninitialized) { 1810 decorators |= IS_DEST_UNINITIALIZED; 1811 } 1812 1813 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1814 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1815 1816 // save the original count 1817 __ mov(count_save, count); 1818 1819 // Copy from low to high addresses 1820 __ mov(start_to, to); // Save destination array start address 1821 __ b(L_load_element); 1822 1823 // ======== begin loop ======== 1824 // (Loop is rotated; its entry is L_load_element.) 1825 // Loop control: 1826 // for (; count != 0; count--) { 1827 // copied_oop = load_heap_oop(from++); 1828 // ... generate_type_check ...; 1829 // store_heap_oop(to++, copied_oop); 1830 // } 1831 __ align(OptoLoopAlignment); 1832 1833 __ BIND(L_store_element); 1834 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW); // store the oop 1835 __ sub(count, count, 1); 1836 __ cbz(count, L_do_card_marks); 1837 1838 // ======== loop entry is here ======== 1839 __ BIND(L_load_element); 1840 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1841 __ cbz(copied_oop, L_store_element); 1842 1843 __ load_klass(r19_klass, copied_oop);// query the object klass 1844 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1845 // ======== end loop ======== 1846 1847 // It was a real error; we must depend on the caller to finish the job. 1848 // Register count = remaining oops, count_save = total oops. 1849 // Emit GC store barriers for the oops we have copied and report 1850 // their number to the caller.
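// Worked example of the failure encoding below (illustrative comment
// only): if count_save = 3 oops were requested and count = 1 remains
// when the type check fails, then K = 3 - 1 = 2 oops were copied, and
// eon(count, count, zr) computes ~K = -1 ^ K = -3, which is returned in
// r0; the caller recovers K as ~r0. If the subs sets EQ (K == 0), no
// oops were stored, so the card-marking epilogue is skipped.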
1851 1852 __ subs(count, count_save, count); // K = partially copied oop count 1853 __ eon(count, count, zr); // report (-1^K) to caller 1854 __ br(Assembler::EQ, L_done_pop); 1855 1856 __ BIND(L_do_card_marks); 1857 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1858 1859 __ bind(L_done_pop); 1860 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1861 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1862 1863 __ bind(L_done); 1864 __ mov(r0, count); 1865 __ leave(); 1866 __ ret(lr); 1867 1868 return start; 1869 } 1870 1871 // Perform range checks on the proposed arraycopy. 1872 // Kills temp, but nothing else. 1873 // Also, clean the sign bits of src_pos and dst_pos. 1874 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1875 Register src_pos, // source position (c_rarg1) 1876 Register dst, // destination array oop (c_rarg2) 1877 Register dst_pos, // destination position (c_rarg3) 1878 Register length, 1879 Register temp, 1880 Label& L_failed) { 1881 BLOCK_COMMENT("arraycopy_range_checks:"); 1882 1883 assert_different_registers(rscratch1, temp); 1884 1885 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1886 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1887 __ addw(temp, length, src_pos); 1888 __ cmpw(temp, rscratch1); 1889 __ br(Assembler::HI, L_failed); 1890 1891 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1892 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1893 __ addw(temp, length, dst_pos); 1894 __ cmpw(temp, rscratch1); 1895 __ br(Assembler::HI, L_failed); 1896 1897 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1898 __ movw(src_pos, src_pos); 1899 __ movw(dst_pos, dst_pos); 1900 1901 BLOCK_COMMENT("arraycopy_range_checks done"); 1902 } 1903 1904 // These stubs get called from some dumb test routine. 1905 // I'll write them properly when they're called from 1906 // something that's actually doing something. 1907 static void fake_arraycopy_stub(address src, address dst, int count) { 1908 assert(count == 0, "huh?"); 1909 } 1910 1911 1912 // 1913 // Generate 'unsafe' array copy stub 1914 // Though just as safe as the other stubs, it takes an unscaled 1915 // size_t argument instead of an element count. 1916 // 1917 // Input: 1918 // c_rarg0 - source array address 1919 // c_rarg1 - destination array address 1920 // c_rarg2 - byte count, treated as ssize_t, can be zero 1921 // 1922 // Examines the alignment of the operands and dispatches 1923 // to a long, int, short, or byte copy loop.
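// Equivalent C-style sketch of the dispatch test emitted below
// (illustrative only; the generated code keeps the OR of all three
// operands in rscratch1):
//
//   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
//   if ((bits & (BytesPerLong - 1)) == 0) goto long_copy;  // all 8-byte aligned
//   if ((bits & (BytesPerInt - 1)) == 0)  goto int_copy;   // all 4-byte aligned
//   if ((bits & 1) == 0)                  goto short_copy; // all 2-byte aligned
//   goto byte_copy;
//
// On each taken branch the byte count is scaled down (lsr) to an
// element count before tail-calling the corresponding copy stub.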
1924 // 1925 address generate_unsafe_copy(const char *name, 1926 address byte_copy_entry, 1927 address short_copy_entry, 1928 address int_copy_entry, 1929 address long_copy_entry) { 1930 Label L_long_aligned, L_int_aligned, L_short_aligned; 1931 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1932 1933 __ align(CodeEntryAlignment); 1934 StubCodeMark mark(this, "StubRoutines", name); 1935 address start = __ pc(); 1936 __ enter(); // required for proper stackwalking of RuntimeStub frame 1937 1938 // bump this on entry, not on exit: 1939 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1940 1941 __ orr(rscratch1, s, d); 1942 __ orr(rscratch1, rscratch1, count); 1943 1944 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1945 __ cbz(rscratch1, L_long_aligned); 1946 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1947 __ cbz(rscratch1, L_int_aligned); 1948 __ tbz(rscratch1, 0, L_short_aligned); 1949 __ b(RuntimeAddress(byte_copy_entry)); 1950 1951 __ BIND(L_short_aligned); 1952 __ lsr(count, count, LogBytesPerShort); // size => short_count 1953 __ b(RuntimeAddress(short_copy_entry)); 1954 __ BIND(L_int_aligned); 1955 __ lsr(count, count, LogBytesPerInt); // size => int_count 1956 __ b(RuntimeAddress(int_copy_entry)); 1957 __ BIND(L_long_aligned); 1958 __ lsr(count, count, LogBytesPerLong); // size => long_count 1959 __ b(RuntimeAddress(long_copy_entry)); 1960 1961 return start; 1962 } 1963 1964 // 1965 // Generate generic array copy stubs 1966 // 1967 // Input: 1968 // c_rarg0 - src oop 1969 // c_rarg1 - src_pos (32-bits) 1970 // c_rarg2 - dst oop 1971 // c_rarg3 - dst_pos (32-bits) 1972 // c_rarg4 - element count (32-bits) 1973 // 1974 // Output: 1975 // r0 == 0 - success 1976 // r0 == -1^K - failure, where K is partial transfer count 1977 // 1978 address generate_generic_copy(const char *name, 1979 address byte_copy_entry, address short_copy_entry, 1980 address int_copy_entry, address oop_copy_entry, 1981 address long_copy_entry, address checkcast_copy_entry) { 1982 1983 Label L_failed, L_objArray; 1984 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1985 1986 // Input registers 1987 const Register src = c_rarg0; // source array oop 1988 const Register src_pos = c_rarg1; // source position 1989 const Register dst = c_rarg2; // destination array oop 1990 const Register dst_pos = c_rarg3; // destination position 1991 const Register length = c_rarg4; 1992 1993 1994 // Registers used as temps 1995 const Register dst_klass = c_rarg5; 1996 1997 __ align(CodeEntryAlignment); 1998 1999 StubCodeMark mark(this, "StubRoutines", name); 2000 2001 address start = __ pc(); 2002 2003 __ enter(); // required for proper stackwalking of RuntimeStub frame 2004 2005 // bump this on entry, not on exit: 2006 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2007 2008 //----------------------------------------------------------------------- 2009 // Assembler stub will be used for this call to arraycopy 2010 // if the following conditions are met: 2011 // 2012 // (1) src and dst must not be null. 2013 // (2) src_pos must not be negative. 2014 // (3) dst_pos must not be negative. 2015 // (4) length must not be negative. 2016 // (5) src klass and dst klass should be the same and not NULL. 2017 // (6) src and dst should be arrays. 2018 // (7) src_pos + length must not exceed length of src. 2019 // (8) dst_pos + length must not exceed length of dst. 
2020 // 2021 2022 // if (src == NULL) return -1; 2023 __ cbz(src, L_failed); 2024 2025 // if (src_pos < 0) return -1; 2026 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2027 2028 // if (dst == NULL) return -1; 2029 __ cbz(dst, L_failed); 2030 2031 // if (dst_pos < 0) return -1; 2032 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2033 2034 // registers used as temp 2035 const Register scratch_length = r16; // elements count to copy 2036 const Register scratch_src_klass = r17; // array klass 2037 const Register lh = r18; // layout helper 2038 2039 // if (length < 0) return -1; 2040 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2041 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2042 2043 __ load_klass(scratch_src_klass, src); 2044 #ifdef ASSERT 2045 // assert(src->klass() != NULL); 2046 { 2047 BLOCK_COMMENT("assert klasses not null {"); 2048 Label L1, L2; 2049 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2050 __ bind(L1); 2051 __ stop("broken null klass"); 2052 __ bind(L2); 2053 __ load_klass(rscratch1, dst); 2054 __ cbz(rscratch1, L1); // this would be broken also 2055 BLOCK_COMMENT("} assert klasses not null done"); 2056 } 2057 #endif 2058 2059 // Load layout helper (32-bits) 2060 // 2061 // |array_tag| | header_size | element_type | |log2_element_size| 2062 // 32 30 24 16 8 2 0 2063 // 2064 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2065 // 2066 2067 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2068 2069 // Handle objArrays completely differently... 2070 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2071 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2072 __ movw(rscratch1, objArray_lh); 2073 __ eorw(rscratch2, lh, rscratch1); 2074 __ cbzw(rscratch2, L_objArray); 2075 2076 // if (src->klass() != dst->klass()) return -1; 2077 __ load_klass(rscratch2, dst); 2078 __ eor(rscratch2, rscratch2, scratch_src_klass); 2079 __ cbnz(rscratch2, L_failed); 2080 2081 // if (!src->is_Array()) return -1; 2082 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2083 2084 // At this point, it is known to be a typeArray (array_tag 0x3). 
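// Illustrative decoding of a layout helper value (an assumption-based
// sketch; the exact header size depends on the build's object layout):
// for a jint[] array with a 16-byte header,
//
//   lh = (0x3 << 30) | (16 << 16) | (T_INT << 8) | 2  ==  0xC0100A02
//
// so the ubfx below extracts header_size == 16, the byte offset of
// element 0, and the low byte holds log2_element_size == 2 (4-byte jints).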
2085 #ifdef ASSERT 2086 { 2087 BLOCK_COMMENT("assert primitive array {"); 2088 Label L; 2089 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2090 __ cmpw(lh, rscratch2); 2091 __ br(Assembler::GE, L); 2092 __ stop("must be a primitive array"); 2093 __ bind(L); 2094 BLOCK_COMMENT("} assert primitive array done"); 2095 } 2096 #endif 2097 2098 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2099 rscratch2, L_failed); 2100 2101 // TypeArrayKlass 2102 // 2103 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2104 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2105 // 2106 2107 const Register rscratch1_offset = rscratch1; // array offset 2108 const Register r18_elsize = lh; // element size 2109 2110 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2111 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2112 __ add(src, src, rscratch1_offset); // src array offset 2113 __ add(dst, dst, rscratch1_offset); // dst array offset 2114 BLOCK_COMMENT("choose copy loop based on element size"); 2115 2116 // next registers should be set before the jump to corresponding stub 2117 const Register from = c_rarg0; // source array address 2118 const Register to = c_rarg1; // destination array address 2119 const Register count = c_rarg2; // elements count 2120 2121 // 'from', 'to', 'count' registers should be set in such order 2122 // since they are the same as 'src', 'src_pos', 'dst'. 2123 2124 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2125 2126 // The possible values of elsize are 0-3, i.e. exact_log2(element 2127 // size in bytes). We do a simple bitwise binary search. 2128 __ BIND(L_copy_bytes); 2129 __ tbnz(r18_elsize, 1, L_copy_ints); 2130 __ tbnz(r18_elsize, 0, L_copy_shorts); 2131 __ lea(from, Address(src, src_pos));// src_addr 2132 __ lea(to, Address(dst, dst_pos));// dst_addr 2133 __ movw(count, scratch_length); // length 2134 __ b(RuntimeAddress(byte_copy_entry)); 2135 2136 __ BIND(L_copy_shorts); 2137 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2138 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2139 __ movw(count, scratch_length); // length 2140 __ b(RuntimeAddress(short_copy_entry)); 2141 2142 __ BIND(L_copy_ints); 2143 __ tbnz(r18_elsize, 0, L_copy_longs); 2144 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2145 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2146 __ movw(count, scratch_length); // length 2147 __ b(RuntimeAddress(int_copy_entry)); 2148 2149 __ BIND(L_copy_longs); 2150 #ifdef ASSERT 2151 { 2152 BLOCK_COMMENT("assert long copy {"); 2153 Label L; 2154 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2155 __ cmpw(r18_elsize, LogBytesPerLong); 2156 __ br(Assembler::EQ, L); 2157 __ stop("must be long copy, but elsize is wrong"); 2158 __ bind(L); 2159 BLOCK_COMMENT("} assert long copy done"); 2160 } 2161 #endif 2162 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2163 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2164 __ movw(count, scratch_length); // length 2165 __ b(RuntimeAddress(long_copy_entry)); 2166 2167 // ObjArrayKlass 2168 __ BIND(L_objArray); 2169 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2170 2171 Label L_plain_copy, L_checkcast_copy; 2172 // test array classes for subtyping 2173 __ load_klass(r18, dst); 2174 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2175 __ br(Assembler::NE, L_checkcast_copy); 2176 2177 // Identically typed arrays can be copied without element-wise checks. 2178 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2179 rscratch2, L_failed); 2180 2181 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2182 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2183 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2184 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2185 __ movw(count, scratch_length); // length 2186 __ BIND(L_plain_copy); 2187 __ b(RuntimeAddress(oop_copy_entry)); 2188 2189 __ BIND(L_checkcast_copy); 2190 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2191 { 2192 // Before looking at dst.length, make sure dst is also an objArray. 2193 __ ldrw(rscratch1, Address(r18, lh_offset)); 2194 __ movw(rscratch2, objArray_lh); 2195 __ eorw(rscratch1, rscratch1, rscratch2); 2196 __ cbnzw(rscratch1, L_failed); 2197 2198 // It is safe to examine both src.length and dst.length. 2199 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2200 r18, L_failed); 2201 2202 __ load_klass(dst_klass, dst); // reload 2203 2204 // Marshal the base address arguments now, freeing registers. 2205 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2206 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2207 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2208 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2209 __ movw(count, length); // length (reloaded) 2210 Register sco_temp = c_rarg3; // this register is free now 2211 assert_different_registers(from, to, count, sco_temp, 2212 dst_klass, scratch_src_klass); 2213 // assert_clean_int(count, sco_temp); 2214 2215 // Generate the type check. 2216 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2217 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2218 2219 // Smashes rscratch1, rscratch2 2220 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2221 2222 // Fetch destination element klass from the ObjArrayKlass header. 2223 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2224 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2225 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2226 2227 // the checkcast_copy loop needs two extra arguments: 2228 assert(c_rarg3 == sco_temp, "#3 already in place"); 2229 // Set up arguments for checkcast_copy_entry. 2230 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2231 __ b(RuntimeAddress(checkcast_copy_entry)); 2232 } 2233 2234 __ BIND(L_failed); 2235 __ mov(r0, -1); 2236 __ leave(); // required for proper stackwalking of RuntimeStub frame 2237 __ ret(lr); 2238 2239 return start; 2240 } 2241 2242 // 2243 // Generate stub for array fill. If "aligned" is true, the 2244 // "to" address is assumed to be heapword aligned. 
2245 // 2246 // Arguments for generated stub: 2247 // to: c_rarg0 2248 // value: c_rarg1 2249 // count: c_rarg2 treated as signed 2250 // 2251 address generate_fill(BasicType t, bool aligned, const char *name) { 2252 __ align(CodeEntryAlignment); 2253 StubCodeMark mark(this, "StubRoutines", name); 2254 address start = __ pc(); 2255 2256 BLOCK_COMMENT("Entry:"); 2257 2258 const Register to = c_rarg0; // destination array address 2259 const Register value = c_rarg1; // value 2260 const Register count = c_rarg2; // elements count 2261 2262 const Register bz_base = r10; // base for block_zero routine 2263 const Register cnt_words = r11; // temp register 2264 2265 __ enter(); 2266 2267 Label L_fill_elements, L_exit1; 2268 2269 int shift = -1; 2270 switch (t) { 2271 case T_BYTE: 2272 shift = 0; 2273 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2274 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2275 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2276 __ br(Assembler::LO, L_fill_elements); 2277 break; 2278 case T_SHORT: 2279 shift = 1; 2280 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2281 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2282 __ br(Assembler::LO, L_fill_elements); 2283 break; 2284 case T_INT: 2285 shift = 2; 2286 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2287 __ br(Assembler::LO, L_fill_elements); 2288 break; 2289 default: ShouldNotReachHere(); 2290 } 2291 2292 // Align destination address to an 8-byte boundary. 2293 Label L_skip_align1, L_skip_align2, L_skip_align4; 2294 if (!aligned) { 2295 switch (t) { 2296 case T_BYTE: 2297 // One byte misalignment happens only for byte arrays. 2298 __ tbz(to, 0, L_skip_align1); 2299 __ strb(value, Address(__ post(to, 1))); 2300 __ subw(count, count, 1); 2301 __ bind(L_skip_align1); 2302 // Fallthrough 2303 case T_SHORT: 2304 // Two bytes misalignment happens only for byte and short (char) arrays. 2305 __ tbz(to, 1, L_skip_align2); 2306 __ strh(value, Address(__ post(to, 2))); 2307 __ subw(count, count, 2 >> shift); 2308 __ bind(L_skip_align2); 2309 // Fallthrough 2310 case T_INT: 2311 // Align to 8 bytes, we know we are 4 byte aligned to start. 2312 __ tbz(to, 2, L_skip_align4); 2313 __ strw(value, Address(__ post(to, 4))); 2314 __ subw(count, count, 4 >> shift); 2315 __ bind(L_skip_align4); 2316 break; 2317 default: ShouldNotReachHere(); 2318 } 2319 } 2320 2321 // 2322 // Fill large chunks 2323 // 2324 __ lsrw(cnt_words, count, 3 - shift); // number of words 2325 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2326 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2327 if (UseBlockZeroing) { 2328 Label non_block_zeroing, rest; 2329 // If the fill value is zero we can use the fast zero_words(). 2330 __ cbnz(value, non_block_zeroing); 2331 __ mov(bz_base, to); 2332 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2333 __ zero_words(bz_base, cnt_words); 2334 __ b(rest); 2335 __ bind(non_block_zeroing); 2336 __ fill_words(to, cnt_words, value); 2337 __ bind(rest); 2338 } else { 2339 __ fill_words(to, cnt_words, value); 2340 } 2341 2342 // Remaining count is less than 8 bytes. Fill it by a single store. 2343 // Note that the total length is no less than 8 bytes.
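// Worked example of the overlapping tail store below (comment only):
// filling 13 bytes with T_BYTE leaves count = 5 after the word loop.
// 'to' is advanced to the end of the array and one 64-bit store at
// (to - 8) writes the last 8 bytes; the leading 3 of those bytes were
// already filled with the same value, so rewriting them is harmless.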
2344 if (t == T_BYTE || t == T_SHORT) { 2345 Label L_exit1; 2346 __ cbzw(count, L_exit1); 2347 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2348 __ str(value, Address(to, -8)); // overwrite some elements 2349 __ bind(L_exit1); 2350 __ leave(); 2351 __ ret(lr); 2352 } 2353 2354 // Handle copies less than 8 bytes. 2355 Label L_fill_2, L_fill_4, L_exit2; 2356 __ bind(L_fill_elements); 2357 switch (t) { 2358 case T_BYTE: 2359 __ tbz(count, 0, L_fill_2); 2360 __ strb(value, Address(__ post(to, 1))); 2361 __ bind(L_fill_2); 2362 __ tbz(count, 1, L_fill_4); 2363 __ strh(value, Address(__ post(to, 2))); 2364 __ bind(L_fill_4); 2365 __ tbz(count, 2, L_exit2); 2366 __ strw(value, Address(to)); 2367 break; 2368 case T_SHORT: 2369 __ tbz(count, 0, L_fill_4); 2370 __ strh(value, Address(__ post(to, 2))); 2371 __ bind(L_fill_4); 2372 __ tbz(count, 1, L_exit2); 2373 __ strw(value, Address(to)); 2374 break; 2375 case T_INT: 2376 __ cbzw(count, L_exit2); 2377 __ strw(value, Address(to)); 2378 break; 2379 default: ShouldNotReachHere(); 2380 } 2381 __ bind(L_exit2); 2382 __ leave(); 2383 __ ret(lr); 2384 return start; 2385 } 2386 2387 void generate_arraycopy_stubs() { 2388 address entry; 2389 address entry_jbyte_arraycopy; 2390 address entry_jshort_arraycopy; 2391 address entry_jint_arraycopy; 2392 address entry_oop_arraycopy; 2393 address entry_jlong_arraycopy; 2394 address entry_checkcast_arraycopy; 2395 2396 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2397 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2398 2399 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2400 2401 //*** jbyte 2402 // Always need aligned and unaligned versions 2403 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2404 "jbyte_disjoint_arraycopy"); 2405 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2406 &entry_jbyte_arraycopy, 2407 "jbyte_arraycopy"); 2408 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2409 "arrayof_jbyte_disjoint_arraycopy"); 2410 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2411 "arrayof_jbyte_arraycopy"); 2412 2413 //*** jshort 2414 // Always need aligned and unaligned versions 2415 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2416 "jshort_disjoint_arraycopy"); 2417 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2418 &entry_jshort_arraycopy, 2419 "jshort_arraycopy"); 2420 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2421 "arrayof_jshort_disjoint_arraycopy"); 2422 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2423 "arrayof_jshort_arraycopy"); 2424 2425 //*** jint 2426 // Aligned versions 2427 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2428 "arrayof_jint_disjoint_arraycopy"); 2429 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2430 "arrayof_jint_arraycopy"); 2431 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2432 // entry_jint_arraycopy always points to the unaligned version 2433 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2434 "jint_disjoint_arraycopy"); 2435 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2436 &entry_jint_arraycopy, 2437 "jint_arraycopy"); 2438 2439 //*** jlong 2440 // It is always aligned 2441 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2442 "arrayof_jlong_disjoint_arraycopy"); 2443 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2444 "arrayof_jlong_arraycopy"); 2445 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2446 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2447 2448 //*** oops 2449 { 2450 // With compressed oops we need unaligned versions; notice that 2451 // we overwrite entry_oop_arraycopy. 2452 bool aligned = !UseCompressedOops; 2453 2454 StubRoutines::_arrayof_oop_disjoint_arraycopy 2455 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2456 /*dest_uninitialized*/false); 2457 StubRoutines::_arrayof_oop_arraycopy 2458 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2459 /*dest_uninitialized*/false); 2460 // Aligned versions without pre-barriers 2461 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2462 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2463 /*dest_uninitialized*/true); 2464 StubRoutines::_arrayof_oop_arraycopy_uninit 2465 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2466 /*dest_uninitialized*/true); 2467 } 2468 2469 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2470 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2471 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2472 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2473 2474 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2475 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2476 /*dest_uninitialized*/true); 2477 2478 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2479 entry_jbyte_arraycopy, 2480 entry_jshort_arraycopy, 2481 entry_jint_arraycopy, 2482 entry_jlong_arraycopy); 2483 2484 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2485 entry_jbyte_arraycopy, 2486 entry_jshort_arraycopy, 2487 entry_jint_arraycopy, 2488 entry_oop_arraycopy, 2489 entry_jlong_arraycopy, 2490 entry_checkcast_arraycopy); 2491 2492 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2493 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2494 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2495 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2496 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2497 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2498 } 2499 2500 void generate_math_stubs() { Unimplemented(); } 2501 2502 // Arguments: 2503 // 2504 // Inputs: 2505 // c_rarg0 - source byte array address 2506 // c_rarg1 - destination 
byte array address 2507 // c_rarg2 - K (key) in little endian int array 2508 // 2509 address generate_aescrypt_encryptBlock() { 2510 __ align(CodeEntryAlignment); 2511 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2512 2513 Label L_doLast; 2514 2515 const Register from = c_rarg0; // source array address 2516 const Register to = c_rarg1; // destination array address 2517 const Register key = c_rarg2; // key array address 2518 const Register keylen = rscratch1; 2519 2520 address start = __ pc(); 2521 __ enter(); 2522 2523 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2524 2525 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2526 2527 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2528 __ rev32(v1, __ T16B, v1); 2529 __ rev32(v2, __ T16B, v2); 2530 __ rev32(v3, __ T16B, v3); 2531 __ rev32(v4, __ T16B, v4); 2532 __ aese(v0, v1); 2533 __ aesmc(v0, v0); 2534 __ aese(v0, v2); 2535 __ aesmc(v0, v0); 2536 __ aese(v0, v3); 2537 __ aesmc(v0, v0); 2538 __ aese(v0, v4); 2539 __ aesmc(v0, v0); 2540 2541 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2542 __ rev32(v1, __ T16B, v1); 2543 __ rev32(v2, __ T16B, v2); 2544 __ rev32(v3, __ T16B, v3); 2545 __ rev32(v4, __ T16B, v4); 2546 __ aese(v0, v1); 2547 __ aesmc(v0, v0); 2548 __ aese(v0, v2); 2549 __ aesmc(v0, v0); 2550 __ aese(v0, v3); 2551 __ aesmc(v0, v0); 2552 __ aese(v0, v4); 2553 __ aesmc(v0, v0); 2554 2555 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2556 __ rev32(v1, __ T16B, v1); 2557 __ rev32(v2, __ T16B, v2); 2558 2559 __ cmpw(keylen, 44); 2560 __ br(Assembler::EQ, L_doLast); 2561 2562 __ aese(v0, v1); 2563 __ aesmc(v0, v0); 2564 __ aese(v0, v2); 2565 __ aesmc(v0, v0); 2566 2567 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2568 __ rev32(v1, __ T16B, v1); 2569 __ rev32(v2, __ T16B, v2); 2570 2571 __ cmpw(keylen, 52); 2572 __ br(Assembler::EQ, L_doLast); 2573 2574 __ aese(v0, v1); 2575 __ aesmc(v0, v0); 2576 __ aese(v0, v2); 2577 __ aesmc(v0, v0); 2578 2579 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2580 __ rev32(v1, __ T16B, v1); 2581 __ rev32(v2, __ T16B, v2); 2582 2583 __ BIND(L_doLast); 2584 2585 __ aese(v0, v1); 2586 __ aesmc(v0, v0); 2587 __ aese(v0, v2); 2588 2589 __ ld1(v1, __ T16B, key); 2590 __ rev32(v1, __ T16B, v1); 2591 __ eor(v0, __ T16B, v0, v1); 2592 2593 __ st1(v0, __ T16B, to); 2594 2595 __ mov(r0, 0); 2596 2597 __ leave(); 2598 __ ret(lr); 2599 2600 return start; 2601 } 2602 2603 // Arguments: 2604 // 2605 // Inputs: 2606 // c_rarg0 - source byte array address 2607 // c_rarg1 - destination byte array address 2608 // c_rarg2 - K (key) in little endian int array 2609 // 2610 address generate_aescrypt_decryptBlock() { 2611 assert(UseAES, "need AES instructions and misaligned SSE support"); 2612 __ align(CodeEntryAlignment); 2613 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2614 Label L_doLast; 2615 2616 const Register from = c_rarg0; // source array address 2617 const Register to = c_rarg1; // destination array address 2618 const Register key = c_rarg2; // key array address 2619 const Register keylen = rscratch1; 2620 2621 address start = __ pc(); 2622 __ enter(); // required for proper stackwalking of RuntimeStub frame 2623 2624 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2625 2626 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2627 2628 __ ld1(v5, __ T16B, __ post(key, 16)); 2629 __ rev32(v5, __ T16B, v5); 2630 2631 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2632 __ rev32(v1, __ T16B, v1); 2633 __ rev32(v2, __ T16B, v2); 2634 __ rev32(v3, __ T16B, v3); 2635 __ rev32(v4, __ T16B, v4); 2636 __ aesd(v0, v1); 2637 __ aesimc(v0, v0); 2638 __ aesd(v0, v2); 2639 __ aesimc(v0, v0); 2640 __ aesd(v0, v3); 2641 __ aesimc(v0, v0); 2642 __ aesd(v0, v4); 2643 __ aesimc(v0, v0); 2644 2645 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2646 __ rev32(v1, __ T16B, v1); 2647 __ rev32(v2, __ T16B, v2); 2648 __ rev32(v3, __ T16B, v3); 2649 __ rev32(v4, __ T16B, v4); 2650 __ aesd(v0, v1); 2651 __ aesimc(v0, v0); 2652 __ aesd(v0, v2); 2653 __ aesimc(v0, v0); 2654 __ aesd(v0, v3); 2655 __ aesimc(v0, v0); 2656 __ aesd(v0, v4); 2657 __ aesimc(v0, v0); 2658 2659 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2660 __ rev32(v1, __ T16B, v1); 2661 __ rev32(v2, __ T16B, v2); 2662 2663 __ cmpw(keylen, 44); 2664 __ br(Assembler::EQ, L_doLast); 2665 2666 __ aesd(v0, v1); 2667 __ aesimc(v0, v0); 2668 __ aesd(v0, v2); 2669 __ aesimc(v0, v0); 2670 2671 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2672 __ rev32(v1, __ T16B, v1); 2673 __ rev32(v2, __ T16B, v2); 2674 2675 __ cmpw(keylen, 52); 2676 __ br(Assembler::EQ, L_doLast); 2677 2678 __ aesd(v0, v1); 2679 __ aesimc(v0, v0); 2680 __ aesd(v0, v2); 2681 __ aesimc(v0, v0); 2682 2683 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2684 __ rev32(v1, __ T16B, v1); 2685 __ rev32(v2, __ T16B, v2); 2686 2687 __ BIND(L_doLast); 2688 2689 __ aesd(v0, v1); 2690 __ aesimc(v0, v0); 2691 __ aesd(v0, v2); 2692 2693 __ eor(v0, __ T16B, v0, v5); 2694 2695 __ st1(v0, __ T16B, to); 2696 2697 __ mov(r0, 0); 2698 2699 __ leave(); 2700 __ ret(lr); 2701 2702 return start; 2703 } 2704 2705 // Arguments: 2706 // 2707 // Inputs: 2708 // c_rarg0 - source byte array address 2709 // c_rarg1 - destination byte array address 2710 // c_rarg2 - K (key) in little endian int array 2711 // c_rarg3 - r vector byte array address 2712 // c_rarg4 - input length 2713 // 2714 // Output: 2715 // x0 - input length 2716 // 2717 address generate_cipherBlockChaining_encryptAESCrypt() { 2718 assert(UseAES, "need AES instructions and misaligned SSE support"); 2719 __ align(CodeEntryAlignment); 2720 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2721 2722 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2723 2724 const Register from = c_rarg0; // source array address 2725 const Register to = c_rarg1; // destination array address 2726 const Register key = c_rarg2; // key array address 2727 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2728 // and left with the results of the last encryption block 2729 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2730 const Register keylen = rscratch1; 2731 2732 address start = __ pc(); 2733 2734 __ enter(); 2735 2736 __ movw(rscratch2, len_reg); 2737 2738 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2739 2740 __ ld1(v0, __ T16B, rvec); 2741 2742 __ cmpw(keylen, 52); 2743 __ br(Assembler::CC, L_loadkeys_44); 2744 __ br(Assembler::EQ, L_loadkeys_52); 2745 2746 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2747 __ rev32(v17, __ T16B, v17); 2748 __ rev32(v18, __ T16B, v18); 2749 __ BIND(L_loadkeys_52); 2750 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2751 __ rev32(v19, __ T16B, v19); 2752 __ rev32(v20, __ T16B, v20); 2753 __ BIND(L_loadkeys_44); 2754 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2755 __ rev32(v21, __ 
T16B, v21); 2756 __ rev32(v22, __ T16B, v22); 2757 __ rev32(v23, __ T16B, v23); 2758 __ rev32(v24, __ T16B, v24); 2759 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2760 __ rev32(v25, __ T16B, v25); 2761 __ rev32(v26, __ T16B, v26); 2762 __ rev32(v27, __ T16B, v27); 2763 __ rev32(v28, __ T16B, v28); 2764 __ ld1(v29, v30, v31, __ T16B, key); 2765 __ rev32(v29, __ T16B, v29); 2766 __ rev32(v30, __ T16B, v30); 2767 __ rev32(v31, __ T16B, v31); 2768 2769 __ BIND(L_aes_loop); 2770 __ ld1(v1, __ T16B, __ post(from, 16)); 2771 __ eor(v0, __ T16B, v0, v1); 2772 2773 __ br(Assembler::CC, L_rounds_44); 2774 __ br(Assembler::EQ, L_rounds_52); 2775 2776 __ aese(v0, v17); __ aesmc(v0, v0); 2777 __ aese(v0, v18); __ aesmc(v0, v0); 2778 __ BIND(L_rounds_52); 2779 __ aese(v0, v19); __ aesmc(v0, v0); 2780 __ aese(v0, v20); __ aesmc(v0, v0); 2781 __ BIND(L_rounds_44); 2782 __ aese(v0, v21); __ aesmc(v0, v0); 2783 __ aese(v0, v22); __ aesmc(v0, v0); 2784 __ aese(v0, v23); __ aesmc(v0, v0); 2785 __ aese(v0, v24); __ aesmc(v0, v0); 2786 __ aese(v0, v25); __ aesmc(v0, v0); 2787 __ aese(v0, v26); __ aesmc(v0, v0); 2788 __ aese(v0, v27); __ aesmc(v0, v0); 2789 __ aese(v0, v28); __ aesmc(v0, v0); 2790 __ aese(v0, v29); __ aesmc(v0, v0); 2791 __ aese(v0, v30); 2792 __ eor(v0, __ T16B, v0, v31); 2793 2794 __ st1(v0, __ T16B, __ post(to, 16)); 2795 2796 __ subw(len_reg, len_reg, 16); 2797 __ cbnzw(len_reg, L_aes_loop); 2798 2799 __ st1(v0, __ T16B, rvec); 2800 2801 __ mov(r0, rscratch2); 2802 2803 __ leave(); 2804 __ ret(lr); 2805 2806 return start; 2807 } 2808 2809 // Arguments: 2810 // 2811 // Inputs: 2812 // c_rarg0 - source byte array address 2813 // c_rarg1 - destination byte array address 2814 // c_rarg2 - K (key) in little endian int array 2815 // c_rarg3 - r vector byte array address 2816 // c_rarg4 - input length 2817 // 2818 // Output: 2819 // r0 - input length 2820 // 2821 address generate_cipherBlockChaining_decryptAESCrypt() { 2822 assert(UseAES, "need AES instructions and misaligned SSE support"); 2823 __ align(CodeEntryAlignment); 2824 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2825 2826 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2827 2828 const Register from = c_rarg0; // source array address 2829 const Register to = c_rarg1; // destination array address 2830 const Register key = c_rarg2; // key array address 2831 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2832 // and left with the results of the last encryption block 2833 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2834 const Register keylen = rscratch1; 2835 2836 address start = __ pc(); 2837 2838 __ enter(); 2839 2840 __ movw(rscratch2, len_reg); 2841 2842 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2843 2844 __ ld1(v2, __ T16B, rvec); 2845 2846 __ ld1(v31, __ T16B, __ post(key, 16)); 2847 __ rev32(v31, __ T16B, v31); 2848 2849 __ cmpw(keylen, 52); 2850 __ br(Assembler::CC, L_loadkeys_44); 2851 __ br(Assembler::EQ, L_loadkeys_52); 2852 2853 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2854 __ rev32(v17, __ T16B, v17); 2855 __ rev32(v18, __ T16B, v18); 2856 __ BIND(L_loadkeys_52); 2857 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2858 __ rev32(v19, __ T16B, v19); 2859 __ rev32(v20, __ T16B, v20); 2860 __ BIND(L_loadkeys_44); 2861 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2862 __ rev32(v21, __ T16B, v21); 2863 
__ rev32(v22, __ T16B, v22); 2864 __ rev32(v23, __ T16B, v23); 2865 __ rev32(v24, __ T16B, v24); 2866 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2867 __ rev32(v25, __ T16B, v25); 2868 __ rev32(v26, __ T16B, v26); 2869 __ rev32(v27, __ T16B, v27); 2870 __ rev32(v28, __ T16B, v28); 2871 __ ld1(v29, v30, __ T16B, key); 2872 __ rev32(v29, __ T16B, v29); 2873 __ rev32(v30, __ T16B, v30); 2874 2875 __ BIND(L_aes_loop); 2876 __ ld1(v0, __ T16B, __ post(from, 16)); 2877 __ orr(v1, __ T16B, v0, v0); 2878 2879 __ br(Assembler::CC, L_rounds_44); 2880 __ br(Assembler::EQ, L_rounds_52); 2881 2882 __ aesd(v0, v17); __ aesimc(v0, v0); 2883 __ aesd(v0, v18); __ aesimc(v0, v0); 2884 __ BIND(L_rounds_52); 2885 __ aesd(v0, v19); __ aesimc(v0, v0); 2886 __ aesd(v0, v20); __ aesimc(v0, v0); 2887 __ BIND(L_rounds_44); 2888 __ aesd(v0, v21); __ aesimc(v0, v0); 2889 __ aesd(v0, v22); __ aesimc(v0, v0); 2890 __ aesd(v0, v23); __ aesimc(v0, v0); 2891 __ aesd(v0, v24); __ aesimc(v0, v0); 2892 __ aesd(v0, v25); __ aesimc(v0, v0); 2893 __ aesd(v0, v26); __ aesimc(v0, v0); 2894 __ aesd(v0, v27); __ aesimc(v0, v0); 2895 __ aesd(v0, v28); __ aesimc(v0, v0); 2896 __ aesd(v0, v29); __ aesimc(v0, v0); 2897 __ aesd(v0, v30); 2898 __ eor(v0, __ T16B, v0, v31); 2899 __ eor(v0, __ T16B, v0, v2); 2900 2901 __ st1(v0, __ T16B, __ post(to, 16)); 2902 __ orr(v2, __ T16B, v1, v1); 2903 2904 __ subw(len_reg, len_reg, 16); 2905 __ cbnzw(len_reg, L_aes_loop); 2906 2907 __ st1(v2, __ T16B, rvec); 2908 2909 __ mov(r0, rscratch2); 2910 2911 __ leave(); 2912 __ ret(lr); 2913 2914 return start; 2915 } 2916 2917 // Arguments: 2918 // 2919 // Inputs: 2920 // c_rarg0 - byte[] source+offset 2921 // c_rarg1 - int[] SHA.state 2922 // c_rarg2 - int offset 2923 // c_rarg3 - int limit 2924 // 2925 address generate_sha1_implCompress(bool multi_block, const char *name) { 2926 __ align(CodeEntryAlignment); 2927 StubCodeMark mark(this, "StubRoutines", name); 2928 address start = __ pc(); 2929 2930 Register buf = c_rarg0; 2931 Register state = c_rarg1; 2932 Register ofs = c_rarg2; 2933 Register limit = c_rarg3; 2934 2935 Label keys; 2936 Label sha1_loop; 2937 2938 // load the keys into v0..v3 2939 __ adr(rscratch1, keys); 2940 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2941 // load 5 words state into v6, v7 2942 __ ldrq(v6, Address(state, 0)); 2943 __ ldrs(v7, Address(state, 16)); 2944 2945 2946 __ BIND(sha1_loop); 2947 // load 64 bytes of data into v16..v19 2948 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2949 __ rev32(v16, __ T16B, v16); 2950 __ rev32(v17, __ T16B, v17); 2951 __ rev32(v18, __ T16B, v18); 2952 __ rev32(v19, __ T16B, v19); 2953 2954 // do the sha1 2955 __ addv(v4, __ T4S, v16, v0); 2956 __ orr(v20, __ T16B, v6, v6); 2957 2958 FloatRegister d0 = v16; 2959 FloatRegister d1 = v17; 2960 FloatRegister d2 = v18; 2961 FloatRegister d3 = v19; 2962 2963 for (int round = 0; round < 20; round++) { 2964 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2965 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2966 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2967 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2968 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2969 2970 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2971 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2972 __ sha1h(tmp2, __ T4S, v20); 2973 if (round < 5) 2974 __ sha1c(v20, __ T4S, tmp3, tmp4); 2975 else if (round < 10 || round >= 15) 2976 __ sha1p(v20, __ T4S, tmp3, tmp4); 2977 else 2978 __ sha1m(v20, __ T4S, tmp3, tmp4); 2979 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2980 2981 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2982 } 2983 2984 __ addv(v7, __ T2S, v7, v21); 2985 __ addv(v6, __ T4S, v6, v20); 2986 2987 if (multi_block) { 2988 __ add(ofs, ofs, 64); 2989 __ cmp(ofs, limit); 2990 __ br(Assembler::LE, sha1_loop); 2991 __ mov(c_rarg0, ofs); // return ofs 2992 } 2993 2994 __ strq(v6, Address(state, 0)); 2995 __ strs(v7, Address(state, 16)); 2996 2997 __ ret(lr); 2998 2999 __ bind(keys); 3000 __ emit_int32(0x5a827999); 3001 __ emit_int32(0x6ed9eba1); 3002 __ emit_int32(0x8f1bbcdc); 3003 __ emit_int32(0xca62c1d6); 3004 3005 return start; 3006 } 3007 3008 3009 // Arguments: 3010 // 3011 // Inputs: 3012 // c_rarg0 - byte[] source+offset 3013 // c_rarg1 - int[] SHA.state 3014 // c_rarg2 - int offset 3015 // c_rarg3 - int limit 3016 // 3017 address generate_sha256_implCompress(bool multi_block, const char *name) { 3018 static const uint32_t round_consts[64] = { 3019 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3020 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3021 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3022 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3023 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3024 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3025 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3026 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3027 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3028 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3029 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3030 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3031 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3032 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3033 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3034 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3035 }; 3036 __ align(CodeEntryAlignment); 3037 StubCodeMark mark(this, "StubRoutines", name); 3038 address start = __ pc(); 3039 3040 Register buf = c_rarg0; 3041 Register state = c_rarg1; 3042 Register ofs = c_rarg2; 3043 Register limit = c_rarg3; 3044 3045 Label sha1_loop; 3046 3047 __ stpd(v8, v9, __ pre(sp, -32)); 3048 __ stpd(v10, v11, Address(sp, 16)); 3049 3050 // dga == v0 3051 // dgb == v1 3052 // dg0 == v2 3053 // dg1 == v3 3054 // dg2 == v4 3055 // t0 == v6 3056 // t1 == v7 3057 3058 // load 16 keys to v16..v31 3059 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3060 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3061 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3062 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3063 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3064 3065 // load 8 words (256 bits) state 3066 __ ldpq(v0, v1, state); 3067 3068 __ BIND(sha1_loop); 3069 // load 64 bytes of data into v8..v11 3070 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3071 __ rev32(v8, __ T16B, v8); 3072 __ rev32(v9, __ T16B, v9); 3073 __ rev32(v10, __ T16B, v10); 3074 __ rev32(v11, __ T16B, v11); 3075 3076 __ addv(v6, __ T4S, v8, v16); 3077 __ orr(v2, __ T16B, v0, v0); 3078 __ orr(v3, __ T16B, v1, v1); 3079 3080 FloatRegister d0 = v8; 3081 FloatRegister d1 = v9; 3082 FloatRegister d2 = v10; 3083 FloatRegister d3 = v11; 3084 3085 3086 for (int round = 0; round < 16; round++) { 3087 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3088 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3089 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3090 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3091 3092 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3093 __ orr(v4, __ T16B, v2, v2); 3094 if (round < 15) 3095 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3096 __ sha256h(v2, __ T4S, v3, tmp2); 3097 __ sha256h2(v3, __ T4S, v4, tmp2); 3098 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3099 3100 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3101 } 3102 3103 __ addv(v0, __ T4S, v0, v2); 3104 __ addv(v1, __ T4S, v1, v3); 3105 3106 if (multi_block) { 3107 __ add(ofs, ofs, 64); 3108 __ cmp(ofs, limit); 3109 __ br(Assembler::LE, sha1_loop); 3110 __ mov(c_rarg0, ofs); // return ofs 3111 } 3112 3113 __ ldpd(v10, v11, Address(sp, 16)); 3114 __ ldpd(v8, v9, __ post(sp, 32)); 3115 3116 __ stpq(v0, v1, state); 3117 3118 __ ret(lr); 3119 3120 return start; 3121 } 3122 3123 #ifndef BUILTIN_SIM 3124 // Safefetch stubs. 3125 void generate_safefetch(const char* name, int size, address* entry, 3126 address* fault_pc, address* continuation_pc) { 3127 // safefetch signatures: 3128 // int SafeFetch32(int* adr, int errValue); 3129 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3130 // 3131 // arguments: 3132 // c_rarg0 = adr 3133 // c_rarg1 = errValue 3134 // 3135 // result: 3136 // r0 = *adr or errValue 3137 3138 StubCodeMark mark(this, "StubRoutines", name); 3139 3140 // Entry point, pc or function descriptor. 3141 *entry = __ pc(); 3142 3143 // Load *adr into c_rarg1, may fault.
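// If this load faults, the VM's signal handler recognizes the faulting
// pc as *fault_pc and resumes execution at *continuation_pc, where
// errValue (still live in c_rarg1) becomes the result. Illustrative use
// from VM code (a sketch, not generated here):
//   int v = SafeFetch32((int*)addr, -1);  // yields -1 instead of crashing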
3144 *fault_pc = __ pc(); 3145 switch (size) { 3146 case 4: 3147 // int32_t 3148 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3149 break; 3150 case 8: 3151 // int64_t 3152 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3153 break; 3154 default: 3155 ShouldNotReachHere(); 3156 } 3157 3158 // return errValue or *adr 3159 *continuation_pc = __ pc(); 3160 __ mov(r0, c_rarg1); 3161 __ ret(lr); 3162 } 3163 #endif 3164 3165 /** 3166 * Arguments: 3167 * 3168 * Inputs: 3169 * c_rarg0 - int crc 3170 * c_rarg1 - byte* buf 3171 * c_rarg2 - int length 3172 * 3173 * Output: 3174 * r0 - int crc result 3175 */ 3176 address generate_updateBytesCRC32() { 3177 assert(UseCRC32Intrinsics, "what are we doing here?"); 3178 3179 __ align(CodeEntryAlignment); 3180 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3181 3182 address start = __ pc(); 3183 3184 const Register crc = c_rarg0; // crc 3185 const Register buf = c_rarg1; // source java byte array address 3186 const Register len = c_rarg2; // length 3187 const Register table0 = c_rarg3; // crc_table address 3188 const Register table1 = c_rarg4; 3189 const Register table2 = c_rarg5; 3190 const Register table3 = c_rarg6; 3191 const Register tmp3 = c_rarg7; 3192 3193 BLOCK_COMMENT("Entry:"); 3194 __ enter(); // required for proper stackwalking of RuntimeStub frame 3195 3196 __ kernel_crc32(crc, buf, len, 3197 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3198 3199 __ leave(); // required for proper stackwalking of RuntimeStub frame 3200 __ ret(lr); 3201 3202 return start; 3203 } 3204 3205 /** 3206 * Arguments: 3207 * 3208 * Inputs: 3209 * c_rarg0 - int crc 3210 * c_rarg1 - byte* buf 3211 * c_rarg2 - int length 3212 * c_rarg3 - int* table 3213 * 3214 * Output: 3215 * r0 - int crc result 3216 */ 3217 address generate_updateBytesCRC32C() { 3218 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3219 3220 __ align(CodeEntryAlignment); 3221 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3222 3223 address start = __ pc(); 3224 3225 const Register crc = c_rarg0; // crc 3226 const Register buf = c_rarg1; // source java byte array address 3227 const Register len = c_rarg2; // length 3228 const Register table0 = c_rarg3; // crc_table address 3229 const Register table1 = c_rarg4; 3230 const Register table2 = c_rarg5; 3231 const Register table3 = c_rarg6; 3232 const Register tmp3 = c_rarg7; 3233 3234 BLOCK_COMMENT("Entry:"); 3235 __ enter(); // required for proper stackwalking of RuntimeStub frame 3236 3237 __ kernel_crc32c(crc, buf, len, 3238 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3239 3240 __ leave(); // required for proper stackwalking of RuntimeStub frame 3241 __ ret(lr); 3242 3243 return start; 3244 } 3245 3246 /** 3247 * Arguments: 3248 * 3249 * Inputs: 3250 * c_rarg0 - int adler 3251 * c_rarg1 - byte* buff 3252 * c_rarg2 - int len 3253 * 3254 * Output: 3255 * c_rarg0 - int adler result 3256 */ 3257 address generate_updateBytesAdler32() { 3258 __ align(CodeEntryAlignment); 3259 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3260 address start = __ pc(); 3261 3262 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3263 3264 // Aliases 3265 Register adler = c_rarg0; 3266 Register s1 = c_rarg0; 3267 Register s2 = c_rarg3; 3268 Register buff = c_rarg1; 3269 Register len = c_rarg2; 3270 Register nmax = r4; 3271 Register base = r5; 3272 Register count = r6; 3273 Register temp0 = rscratch1; 3274 Register temp1 = rscratch2; 3275 FloatRegister vbytes =
v0; 3276 FloatRegister vs1acc = v1; 3277 FloatRegister vs2acc = v2; 3278 FloatRegister vtable = v3; 3279 3280 // Max number of bytes we can process before having to take the mod 3281 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3282 unsigned long BASE = 0xfff1; 3283 unsigned long NMAX = 0x15B0; 3284 3285 __ mov(base, BASE); 3286 __ mov(nmax, NMAX); 3287 3288 // Load accumulation coefficients for the upper 16 bits 3289 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3290 __ ld1(vtable, __ T16B, Address(temp0)); 3291 3292 // s1 is initialized to the lower 16 bits of adler 3293 // s2 is initialized to the upper 16 bits of adler 3294 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3295 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3296 3297 // The pipelined loop needs at least 16 elements for 1 iteration 3298 // It does check this, but it is more effective to skip to the cleanup loop 3299 __ cmp(len, (u1)16); 3300 __ br(Assembler::HS, L_nmax); 3301 __ cbz(len, L_combine); 3302 3303 __ bind(L_simple_by1_loop); 3304 __ ldrb(temp0, Address(__ post(buff, 1))); 3305 __ add(s1, s1, temp0); 3306 __ add(s2, s2, s1); 3307 __ subs(len, len, 1); 3308 __ br(Assembler::HI, L_simple_by1_loop); 3309 3310 // s1 = s1 % BASE 3311 __ subs(temp0, s1, base); 3312 __ csel(s1, temp0, s1, Assembler::HS); 3313 3314 // s2 = s2 % BASE 3315 __ lsr(temp0, s2, 16); 3316 __ lsl(temp1, temp0, 4); 3317 __ sub(temp1, temp1, temp0); 3318 __ add(s2, temp1, s2, ext::uxth); 3319 3320 __ subs(temp0, s2, base); 3321 __ csel(s2, temp0, s2, Assembler::HS); 3322 3323 __ b(L_combine); 3324 3325 __ bind(L_nmax); 3326 __ subs(len, len, nmax); 3327 __ sub(count, nmax, 16); 3328 __ br(Assembler::LO, L_by16); 3329 3330 __ bind(L_nmax_loop); 3331 3332 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3333 vbytes, vs1acc, vs2acc, vtable); 3334 3335 __ subs(count, count, 16); 3336 __ br(Assembler::HS, L_nmax_loop); 3337 3338 // s1 = s1 % BASE 3339 __ lsr(temp0, s1, 16); 3340 __ lsl(temp1, temp0, 4); 3341 __ sub(temp1, temp1, temp0); 3342 __ add(temp1, temp1, s1, ext::uxth); 3343 3344 __ lsr(temp0, temp1, 16); 3345 __ lsl(s1, temp0, 4); 3346 __ sub(s1, s1, temp0); 3347 __ add(s1, s1, temp1, ext::uxth); 3348 3349 __ subs(temp0, s1, base); 3350 __ csel(s1, temp0, s1, Assembler::HS); 3351 3352 // s2 = s2 % BASE 3353 __ lsr(temp0, s2, 16); 3354 __ lsl(temp1, temp0, 4); 3355 __ sub(temp1, temp1, temp0); 3356 __ add(temp1, temp1, s2, ext::uxth); 3357 3358 __ lsr(temp0, temp1, 16); 3359 __ lsl(s2, temp0, 4); 3360 __ sub(s2, s2, temp0); 3361 __ add(s2, s2, temp1, ext::uxth); 3362 3363 __ subs(temp0, s2, base); 3364 __ csel(s2, temp0, s2, Assembler::HS); 3365 3366 __ subs(len, len, nmax); 3367 __ sub(count, nmax, 16); 3368 __ br(Assembler::HS, L_nmax_loop); 3369 3370 __ bind(L_by16); 3371 __ adds(len, len, count); 3372 __ br(Assembler::LO, L_by1); 3373 3374 __ bind(L_by16_loop); 3375 3376 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3377 vbytes, vs1acc, vs2acc, vtable); 3378 3379 __ subs(len, len, 16); 3380 __ br(Assembler::HS, L_by16_loop); 3381 3382 __ bind(L_by1); 3383 __ adds(len, len, 15); 3384 __ br(Assembler::LO, L_do_mod); 3385 3386 __ bind(L_by1_loop); 3387 __ ldrb(temp0, Address(__ post(buff, 1))); 3388 __ add(s1, temp0, s1); 3389 __ add(s2, s2, s1); 3390 __ subs(len, len, 1); 3391 __ br(Assembler::HS, L_by1_loop); 3392 3393 __ bind(L_do_mod); 3394 // s1 = s1 % BASE 3395 __ lsr(temp0, s1, 16); 3396 __ lsl(temp1, temp0, 4);
3397 __ sub(temp1, temp1, temp0); 3398 __ add(temp1, temp1, s1, ext::uxth); 3399 3400 __ lsr(temp0, temp1, 16); 3401 __ lsl(s1, temp0, 4); 3402 __ sub(s1, s1, temp0); 3403 __ add(s1, s1, temp1, ext:: uxth); 3404 3405 __ subs(temp0, s1, base); 3406 __ csel(s1, temp0, s1, Assembler::HS); 3407 3408 // s2 = s2 % BASE 3409 __ lsr(temp0, s2, 16); 3410 __ lsl(temp1, temp0, 4); 3411 __ sub(temp1, temp1, temp0); 3412 __ add(temp1, temp1, s2, ext::uxth); 3413 3414 __ lsr(temp0, temp1, 16); 3415 __ lsl(s2, temp0, 4); 3416 __ sub(s2, s2, temp0); 3417 __ add(s2, s2, temp1, ext:: uxth); 3418 3419 __ subs(temp0, s2, base); 3420 __ csel(s2, temp0, s2, Assembler::HS); 3421 3422 // Combine lower bits and higher bits 3423 __ bind(L_combine); 3424 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3425 3426 __ ret(lr); 3427 3428 return start; 3429 } 3430 3431 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3432 Register temp0, Register temp1, FloatRegister vbytes, 3433 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3434 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3435 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3436 // In non-vectorized code, we update s1 and s2 as: 3437 // s1 <- s1 + b1 3438 // s2 <- s2 + s1 3439 // s1 <- s1 + b2 3440 // s2 <- s2 + b1 3441 // ... 3442 // s1 <- s1 + b16 3443 // s2 <- s2 + s1 3444 // Putting above assignments together, we have: 3445 // s1_new = s1 + b1 + b2 + ... + b16 3446 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3447 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3448 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3449 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3450 3451 // s2 = s2 + s1 * 16 3452 __ add(s2, s2, s1, Assembler::LSL, 4); 3453 3454 // vs1acc = b1 + b2 + b3 + ... + b16 3455 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 3456 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3457 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3458 __ uaddlv(vs1acc, __ T16B, vbytes); 3459 __ uaddlv(vs2acc, __ T8H, vs2acc); 3460 3461 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3462 __ fmovd(temp0, vs1acc); 3463 __ fmovd(temp1, vs2acc); 3464 __ add(s1, s1, temp0); 3465 __ add(s2, s2, temp1); 3466 } 3467 3468 /** 3469 * Arguments: 3470 * 3471 * Input: 3472 * c_rarg0 - x address 3473 * c_rarg1 - x length 3474 * c_rarg2 - y address 3475 * c_rarg3 - y lenth 3476 * c_rarg4 - z address 3477 * c_rarg5 - z length 3478 */ 3479 address generate_multiplyToLen() { 3480 __ align(CodeEntryAlignment); 3481 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3482 3483 address start = __ pc(); 3484 const Register x = r0; 3485 const Register xlen = r1; 3486 const Register y = r2; 3487 const Register ylen = r3; 3488 const Register z = r4; 3489 const Register zlen = r5; 3490 3491 const Register tmp1 = r10; 3492 const Register tmp2 = r11; 3493 const Register tmp3 = r12; 3494 const Register tmp4 = r13; 3495 const Register tmp5 = r14; 3496 const Register tmp6 = r15; 3497 const Register tmp7 = r16; 3498 3499 BLOCK_COMMENT("Entry:"); 3500 __ enter(); // required for proper stackwalking of RuntimeStub frame 3501 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3502 __ leave(); // required for proper stackwalking of RuntimeStub frame 3503 __ ret(lr); 3504 3505 return start; 3506 } 3507 3508 address generate_squareToLen() { 3509 // squareToLen algorithm for sizes 1..127 described in java code works 3510 // faster than multiply_to_len on some CPUs and slower on others, but 3511 // multiply_to_len shows a bit better overall results 3512 __ align(CodeEntryAlignment); 3513 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3514 address start = __ pc(); 3515 3516 const Register x = r0; 3517 const Register xlen = r1; 3518 const Register z = r2; 3519 const Register zlen = r3; 3520 const Register y = r4; // == x 3521 const Register ylen = r5; // == xlen 3522 3523 const Register tmp1 = r10; 3524 const Register tmp2 = r11; 3525 const Register tmp3 = r12; 3526 const Register tmp4 = r13; 3527 const Register tmp5 = r14; 3528 const Register tmp6 = r15; 3529 const Register tmp7 = r16; 3530 3531 RegSet spilled_regs = RegSet::of(y, ylen); 3532 BLOCK_COMMENT("Entry:"); 3533 __ enter(); 3534 __ push(spilled_regs, sp); 3535 __ mov(y, x); 3536 __ mov(ylen, xlen); 3537 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3538 __ pop(spilled_regs, sp); 3539 __ leave(); 3540 __ ret(lr); 3541 return start; 3542 } 3543 3544 address generate_mulAdd() { 3545 __ align(CodeEntryAlignment); 3546 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3547 3548 address start = __ pc(); 3549 3550 const Register out = r0; 3551 const Register in = r1; 3552 const Register offset = r2; 3553 const Register len = r3; 3554 const Register k = r4; 3555 3556 BLOCK_COMMENT("Entry:"); 3557 __ enter(); 3558 __ mul_add(out, in, offset, len, k); 3559 __ leave(); 3560 __ ret(lr); 3561 3562 return start; 3563 } 3564 3565 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3566 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3567 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3568 // Karatsuba multiplication performs a 128*128 -> 256-bit 3569 // multiplication in three 128-bit multiplications and a few 3570 // additions. 
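// For reference: in GF(2) "addition" is XOR, so the recombination can be
// sketched in scalar pseudo-C as (illustrative only; these names do not
// appear in the generated code):
//   uint128 C = clmul(A1, B1), D = clmul(A0, B0);
//   uint128 E = clmul(A0 ^ A1, B0 ^ B1);
//   uint128 mid = C ^ D ^ E;               // cross term A1*B0 + A0*B1
//   result = (C << 128) ^ (mid << 64) ^ D; // the 256-bit product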
3571 // 3572 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3573 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3574 // 3575 // Inputs: 3576 // 3577 // A0 in a.d[0] (subkey) 3578 // A1 in a.d[1] 3579 // (A1+A0) in a1_xor_a0.d[0] 3580 // 3581 // B0 in b.d[0] (state) 3582 // B1 in b.d[1] 3583 3584 __ ext(tmp1, __ T16B, b, b, 0x08); 3585 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3586 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3587 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3588 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3589 3590 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3591 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3592 __ eor(tmp2, __ T16B, tmp2, tmp4); 3593 __ eor(tmp2, __ T16B, tmp2, tmp3); 3594 3595 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3596 __ ins(result_hi, __ D, tmp2, 0, 1); 3597 __ ins(result_lo, __ D, tmp2, 1, 0); 3598 } 3599 3600 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3601 FloatRegister p, FloatRegister z, FloatRegister t1) { 3602 const FloatRegister t0 = result; 3603 3604 // The GCM field polynomial f is z^128 + p(z), where p = 3605 // z^7+z^2+z+1. 3606 // 3607 // z^128 === -p(z) (mod (z^128 + p(z))) 3608 // 3609 // so, given that the product we're reducing is 3610 // a == lo + hi * z^128 3611 // substituting, 3612 // === lo - hi * p(z) (mod (z^128 + p(z))) 3613 // 3614 // we reduce by multiplying hi by p(z) and subtracting the result 3615 // from (i.e. XORing it with) lo. Because p has no nonzero high 3616 // bits we can do this with two 64-bit multiplications, lo*p and 3617 // hi*p. 3618 3619 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3620 __ ext(t1, __ T16B, t0, z, 8); 3621 __ eor(hi, __ T16B, hi, t1); 3622 __ ext(t1, __ T16B, z, t0, 8); 3623 __ eor(lo, __ T16B, lo, t1); 3624 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3625 __ eor(result, __ T16B, lo, t0); 3626 } 3627 3628 address generate_has_negatives(address &has_negatives_long) { 3629 const u1 large_loop_size = 64; 3630 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3631 int dcache_line = VM_Version::dcache_line_size(); 3632 3633 Register ary1 = r1, len = r2, result = r0; 3634 3635 __ align(CodeEntryAlignment); 3636 3637 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3638 3639 address entry = __ pc(); 3640 3641 __ enter(); 3642 3643 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3644 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3645 3646 __ cmp(len, (u1)15); 3647 __ br(Assembler::GT, LEN_OVER_15); 3648 // The only case when execution falls into this code is when pointer is near 3649 // the end of memory page and we have to avoid reading next page 3650 __ add(ary1, ary1, len); 3651 __ subs(len, len, 8); 3652 __ br(Assembler::GT, LEN_OVER_8); 3653 __ ldr(rscratch2, Address(ary1, -8)); 3654 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
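// At this point len holds (original_len - 8) <= 0, so rscratch1 =
// -(len * 8) = 64 - original_len * 8 (variable shifts on AArch64 use the
// amount modulo 64). The lsrv below therefore discards the low-order
// bytes of the loaded word that lie before the array, keeping only the
// original_len valid bytes.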
3655 __ lsrv(rscratch2, rscratch2, rscratch1);
3656 __ tst(rscratch2, UPPER_BIT_MASK);
3657 __ cset(result, Assembler::NE);
3658 __ leave();
3659 __ ret(lr);
3660 __ bind(LEN_OVER_8);
3661 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3662 __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
3663 __ tst(rscratch2, UPPER_BIT_MASK);
3664 __ br(Assembler::NE, RET_TRUE_NO_POP);
3665 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3666 __ lsrv(rscratch1, rscratch1, rscratch2);
3667 __ tst(rscratch1, UPPER_BIT_MASK);
3668 __ cset(result, Assembler::NE);
3669 __ leave();
3670 __ ret(lr);
3671
3672 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3673 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3674
3675 has_negatives_long = __ pc(); // 2nd entry point
3676
3677 __ enter();
3678
3679 __ bind(LEN_OVER_15);
3680 __ push(spilled_regs, sp);
3681 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3682 __ cbz(rscratch2, ALIGNED);
3683 __ ldp(tmp6, tmp1, Address(ary1));
3684 __ mov(tmp5, 16);
3685 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3686 __ add(ary1, ary1, rscratch1);
3687 __ sub(len, len, rscratch1);
3688 __ orr(tmp6, tmp6, tmp1);
3689 __ tst(tmp6, UPPER_BIT_MASK);
3690 __ br(Assembler::NE, RET_TRUE);
3691
3692 __ bind(ALIGNED);
3693 __ cmp(len, large_loop_size);
3694 __ br(Assembler::LT, CHECK_16);
3695 // Perform a 16-byte load as an early-return check in the pre-loop, to
3696 // handle the case where an initially aligned large array has negative
3697 // values in its starting bytes; otherwise LARGE_LOOP would do 4 reads
3698 // instead of 1 in the worst case, which is slower. Cases with negative
3699 // bytes further ahead are barely affected, and in fact get faster due to
3700 // the early loads and the fewer instructions and branches in LARGE_LOOP.
3701 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3702 __ sub(len, len, 16);
3703 __ orr(tmp6, tmp6, tmp1);
3704 __ tst(tmp6, UPPER_BIT_MASK);
3705 __ br(Assembler::NE, RET_TRUE);
3706 __ cmp(len, large_loop_size);
3707 __ br(Assembler::LT, CHECK_16);
3708
3709 if (SoftwarePrefetchHintDistance >= 0
3710 && SoftwarePrefetchHintDistance >= dcache_line) {
3711 // initial prefetch
3712 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3713 }
3714 __ bind(LARGE_LOOP);
3715 if (SoftwarePrefetchHintDistance >= 0) {
3716 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3717 }
3718 // Issue the load instructions first, since that can save a few CPU/MEM
3719 // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
3720 // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
3721 // 1 cbnz(...), which saves 3 instructions and has fewer branches; but this
3722 // approach disables the early return, so all 64 bytes are loaded and checked every time.
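// For reference, the per-word check is equivalent to the scalar test
// (illustrative only):
//   bool has_negative = (w & 0x8080808080808080ULL) != 0;
// since a signed byte is negative iff its top bit is set; OR-ing the words
// together lets a single tst against UPPER_BIT_MASK cover all of them.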
3723 __ ldp(tmp2, tmp3, Address(ary1)); 3724 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3725 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3726 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3727 __ add(ary1, ary1, large_loop_size); 3728 __ sub(len, len, large_loop_size); 3729 __ orr(tmp2, tmp2, tmp3); 3730 __ orr(tmp4, tmp4, tmp5); 3731 __ orr(rscratch1, rscratch1, rscratch2); 3732 __ orr(tmp6, tmp6, tmp1); 3733 __ orr(tmp2, tmp2, tmp4); 3734 __ orr(rscratch1, rscratch1, tmp6); 3735 __ orr(tmp2, tmp2, rscratch1); 3736 __ tst(tmp2, UPPER_BIT_MASK); 3737 __ br(Assembler::NE, RET_TRUE); 3738 __ cmp(len, large_loop_size); 3739 __ br(Assembler::GE, LARGE_LOOP); 3740 3741 __ bind(CHECK_16); // small 16-byte load pre-loop 3742 __ cmp(len, (u1)16); 3743 __ br(Assembler::LT, POST_LOOP16); 3744 3745 __ bind(LOOP16); // small 16-byte load loop 3746 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3747 __ sub(len, len, 16); 3748 __ orr(tmp2, tmp2, tmp3); 3749 __ tst(tmp2, UPPER_BIT_MASK); 3750 __ br(Assembler::NE, RET_TRUE); 3751 __ cmp(len, (u1)16); 3752 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3753 3754 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3755 __ cmp(len, (u1)8); 3756 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3757 __ ldr(tmp3, Address(__ post(ary1, 8))); 3758 __ sub(len, len, 8); 3759 __ tst(tmp3, UPPER_BIT_MASK); 3760 __ br(Assembler::NE, RET_TRUE); 3761 3762 __ bind(POST_LOOP16_LOAD_TAIL); 3763 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3764 __ ldr(tmp1, Address(ary1)); 3765 __ mov(tmp2, 64); 3766 __ sub(tmp4, tmp2, len, __ LSL, 3); 3767 __ lslv(tmp1, tmp1, tmp4); 3768 __ tst(tmp1, UPPER_BIT_MASK); 3769 __ br(Assembler::NE, RET_TRUE); 3770 // Fallthrough 3771 3772 __ bind(RET_FALSE); 3773 __ pop(spilled_regs, sp); 3774 __ leave(); 3775 __ mov(result, zr); 3776 __ ret(lr); 3777 3778 __ bind(RET_TRUE); 3779 __ pop(spilled_regs, sp); 3780 __ bind(RET_TRUE_NO_POP); 3781 __ leave(); 3782 __ mov(result, 1); 3783 __ ret(lr); 3784 3785 __ bind(DONE); 3786 __ pop(spilled_regs, sp); 3787 __ leave(); 3788 __ ret(lr); 3789 return entry; 3790 } 3791 3792 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3793 bool usePrefetch, Label &NOT_EQUAL) { 3794 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3795 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3796 tmp7 = r12, tmp8 = r13; 3797 Label LOOP; 3798 3799 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3800 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3801 __ bind(LOOP); 3802 if (usePrefetch) { 3803 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3804 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3805 } 3806 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3807 __ eor(tmp1, tmp1, tmp2); 3808 __ eor(tmp3, tmp3, tmp4); 3809 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3810 __ orr(tmp1, tmp1, tmp3); 3811 __ cbnz(tmp1, NOT_EQUAL); 3812 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3813 __ eor(tmp5, tmp5, tmp6); 3814 __ eor(tmp7, tmp7, tmp8); 3815 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3816 __ orr(tmp5, tmp5, tmp7); 3817 __ cbnz(tmp5, NOT_EQUAL); 3818 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3819 __ eor(tmp1, tmp1, tmp2); 3820 __ eor(tmp3, tmp3, tmp4); 3821 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3822 __ orr(tmp1, tmp1, tmp3); 3823 __ cbnz(tmp1, NOT_EQUAL); 3824 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3825 __ eor(tmp5, tmp5, tmp6); 
3826 __ sub(cnt1, cnt1, 8 * wordSize);
3827 __ eor(tmp7, tmp7, tmp8);
3828 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3829 // tmp6 is not used. MacroAssembler::subs is used here (rather than
3830 // cmp) because subs allows an unlimited range for the immediate operand.
3831 __ subs(tmp6, cnt1, loopThreshold);
3832 __ orr(tmp5, tmp5, tmp7);
3833 __ cbnz(tmp5, NOT_EQUAL);
3834 __ br(__ GE, LOOP);
3835 // post-loop
3836 __ eor(tmp1, tmp1, tmp2);
3837 __ eor(tmp3, tmp3, tmp4);
3838 __ orr(tmp1, tmp1, tmp3);
3839 __ sub(cnt1, cnt1, 2 * wordSize);
3840 __ cbnz(tmp1, NOT_EQUAL);
3841 }
3842
3843 void generate_large_array_equals_loop_simd(int loopThreshold,
3844 bool usePrefetch, Label &NOT_EQUAL) {
3845 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3846 tmp2 = rscratch2;
3847 Label LOOP;
3848
3849 __ bind(LOOP);
3850 if (usePrefetch) {
3851 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3852 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3853 }
3854 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3855 __ sub(cnt1, cnt1, 8 * wordSize);
3856 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3857 __ subs(tmp1, cnt1, loopThreshold);
3858 __ eor(v0, __ T16B, v0, v4);
3859 __ eor(v1, __ T16B, v1, v5);
3860 __ eor(v2, __ T16B, v2, v6);
3861 __ eor(v3, __ T16B, v3, v7);
3862 __ orr(v0, __ T16B, v0, v1);
3863 __ orr(v1, __ T16B, v2, v3);
3864 __ orr(v0, __ T16B, v0, v1);
3865 __ umov(tmp1, v0, __ D, 0);
3866 __ umov(tmp2, v0, __ D, 1);
3867 __ orr(tmp1, tmp1, tmp2);
3868 __ cbnz(tmp1, NOT_EQUAL);
3869 __ br(__ GE, LOOP);
3870 }
3871
3872 // a1 = r1 - array1 address
3873 // a2 = r2 - array2 address
3874 // result = r0 - return value. Already contains "false"
3875 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
3876 // r3-r5 are reserved temporary registers
3877 address generate_large_array_equals() {
3878 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3879 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3880 tmp7 = r12, tmp8 = r13;
3881 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3882 SMALL_LOOP, POST_LOOP;
3883 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3884 // threshold chosen so that at least 32 of the prefetched bytes are used
3885 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3886 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3887 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3888 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3889 tmp5, tmp6, tmp7, tmp8);
3890
3891 __ align(CodeEntryAlignment);
3892
3893 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3894
3895 address entry = __ pc();
3896 __ enter();
3897 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3898 // also advance pointers to use post-increment instead of pre-increment
3899 __ add(a1, a1, wordSize);
3900 __ add(a2, a2, wordSize);
3901 if (AvoidUnalignedAccesses) {
3902 // Both implementations (SIMD/non-SIMD) use relatively large load
3903 // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3904 // time) on some CPUs when the address is not at least 16-byte aligned.
3905 // Arrays are currently 8-byte aligned, so we can do an extra 8-byte load
3906 // if needed to make at least the first address 16-byte aligned.
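// For reference, the peel below is the usual alignment idiom; as an
// illustrative C sketch over uint64_t* pointers (names not from this file):
//   if (((uintptr_t)a1 & 8) != 0) {   // bit 3 set: only 8-byte aligned
//     if (*a1++ != *a2++) return false;
//     cnt -= 8;
//   }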
3907 Label ALIGNED16; 3908 __ tbz(a1, 3, ALIGNED16); 3909 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3910 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3911 __ sub(cnt1, cnt1, wordSize); 3912 __ eor(tmp1, tmp1, tmp2); 3913 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3914 __ bind(ALIGNED16); 3915 } 3916 if (UseSIMDForArrayEquals) { 3917 if (SoftwarePrefetchHintDistance >= 0) { 3918 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3919 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3920 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3921 /* prfm = */ true, NOT_EQUAL); 3922 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3923 __ br(__ LT, TAIL); 3924 } 3925 __ bind(NO_PREFETCH_LARGE_LOOP); 3926 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3927 /* prfm = */ false, NOT_EQUAL); 3928 } else { 3929 __ push(spilled_regs, sp); 3930 if (SoftwarePrefetchHintDistance >= 0) { 3931 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3932 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3933 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3934 /* prfm = */ true, NOT_EQUAL); 3935 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3936 __ br(__ LT, TAIL); 3937 } 3938 __ bind(NO_PREFETCH_LARGE_LOOP); 3939 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3940 /* prfm = */ false, NOT_EQUAL); 3941 } 3942 __ bind(TAIL); 3943 __ cbz(cnt1, EQUAL); 3944 __ subs(cnt1, cnt1, wordSize); 3945 __ br(__ LE, POST_LOOP); 3946 __ bind(SMALL_LOOP); 3947 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3948 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3949 __ subs(cnt1, cnt1, wordSize); 3950 __ eor(tmp1, tmp1, tmp2); 3951 __ cbnz(tmp1, NOT_EQUAL); 3952 __ br(__ GT, SMALL_LOOP); 3953 __ bind(POST_LOOP); 3954 __ ldr(tmp1, Address(a1, cnt1)); 3955 __ ldr(tmp2, Address(a2, cnt1)); 3956 __ eor(tmp1, tmp1, tmp2); 3957 __ cbnz(tmp1, NOT_EQUAL); 3958 __ bind(EQUAL); 3959 __ mov(result, true); 3960 __ bind(NOT_EQUAL); 3961 if (!UseSIMDForArrayEquals) { 3962 __ pop(spilled_regs, sp); 3963 } 3964 __ bind(NOT_EQUAL_NO_POP); 3965 __ leave(); 3966 __ ret(lr); 3967 return entry; 3968 } 3969 3970 address generate_dsin_dcos(bool isCos) { 3971 __ align(CodeEntryAlignment); 3972 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3973 address start = __ pc(); 3974 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3975 (address)StubRoutines::aarch64::_two_over_pi, 3976 (address)StubRoutines::aarch64::_pio2, 3977 (address)StubRoutines::aarch64::_dsin_coef, 3978 (address)StubRoutines::aarch64::_dcos_coef); 3979 return start; 3980 } 3981 3982 address generate_dlog() { 3983 __ align(CodeEntryAlignment); 3984 StubCodeMark mark(this, "StubRoutines", "dlog"); 3985 address entry = __ pc(); 3986 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 3987 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 3988 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 3989 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 3990 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 3991 return entry; 3992 } 3993 3994 // code for comparing 16 bytes of strings with same encoding 3995 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 3996 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 3997 __ ldr(rscratch1, Address(__ post(str1, 8))); 3998 __ eor(rscratch2, tmp1, tmp2); 3999 __ ldr(cnt1, Address(__ post(str2, 8))); 4000 __ cbnz(rscratch2, DIFF1); 4001 __ ldr(tmp1, Address(__ post(str1, 8))); 4002 __ eor(rscratch2, rscratch1, cnt1); 4003 __ ldr(tmp2, Address(__ post(str2, 8))); 4004 __ cbnz(rscratch2, DIFF2); 4005 } 4006 4007 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4008 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4009 Label &DIFF2) { 4010 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4011 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4012 4013 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4014 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4015 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4016 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4017 4018 __ fmovd(tmpL, vtmp3); 4019 __ eor(rscratch2, tmp3, tmpL); 4020 __ cbnz(rscratch2, DIFF2); 4021 4022 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4023 __ umov(tmpL, vtmp3, __ D, 1); 4024 __ eor(rscratch2, tmpU, tmpL); 4025 __ cbnz(rscratch2, DIFF1); 4026 4027 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4028 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4029 __ fmovd(tmpL, vtmp); 4030 __ eor(rscratch2, tmp3, tmpL); 4031 __ cbnz(rscratch2, DIFF2); 4032 4033 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4034 __ umov(tmpL, vtmp, __ D, 1); 4035 __ eor(rscratch2, tmpU, tmpL); 4036 __ cbnz(rscratch2, DIFF1); 4037 } 4038 4039 // r0 = result 4040 // r1 = str1 4041 // r2 = cnt1 4042 // r3 = str2 4043 // r4 = cnt2 4044 // r10 = tmp1 4045 // r11 = tmp2 4046 address generate_compare_long_string_different_encoding(bool isLU) { 4047 __ align(CodeEntryAlignment); 4048 StubCodeMark mark(this, "StubRoutines", isLU 4049 ? 
"compare_long_string_different_encoding LU" 4050 : "compare_long_string_different_encoding UL"); 4051 address entry = __ pc(); 4052 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4053 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4054 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4055 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4056 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4057 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4058 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4059 4060 int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2); 4061 4062 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4063 // cnt2 == amount of characters left to compare 4064 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4065 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4066 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4067 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4068 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4069 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4070 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4071 __ eor(rscratch2, tmp1, tmp2); 4072 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4073 __ mov(rscratch1, tmp2); 4074 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4075 Register strU = isLU ? str2 : str1, 4076 strL = isLU ? str1 : str2, 4077 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4078 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4079 __ push(spilled_regs, sp); 4080 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4081 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4082 4083 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4084 4085 if (SoftwarePrefetchHintDistance >= 0) { 4086 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4087 __ br(__ LT, NO_PREFETCH); 4088 __ bind(LARGE_LOOP_PREFETCH); 4089 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4090 __ mov(tmp4, 2); 4091 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4092 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4093 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4094 __ subs(tmp4, tmp4, 1); 4095 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4096 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4097 __ mov(tmp4, 2); 4098 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4099 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4100 __ subs(tmp4, tmp4, 1); 4101 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4102 __ sub(cnt2, cnt2, 64); 4103 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4104 __ br(__ GE, LARGE_LOOP_PREFETCH); 4105 } 4106 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4107 __ bind(NO_PREFETCH); 4108 __ subs(cnt2, cnt2, 16); 4109 __ br(__ LT, TAIL); 4110 __ bind(SMALL_LOOP); // smaller loop 4111 __ subs(cnt2, cnt2, 16); 4112 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4113 __ br(__ GE, SMALL_LOOP); 4114 __ cmn(cnt2, (u1)16); 4115 __ br(__ EQ, LOAD_LAST); 4116 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4117 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string 4118 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4119 __ ldr(tmp3, Address(cnt1, -8)); 4120 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4121 __ b(LOAD_LAST); 4122 __ bind(DIFF2); 4123 __ mov(tmpU, tmp3); 4124 __ bind(DIFF1); 4125 __ pop(spilled_regs, sp); 4126 __ b(CALCULATE_DIFFERENCE); 4127 
__ bind(LOAD_LAST); 4128 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 4129 // No need to load it again 4130 __ mov(tmpU, tmp3); 4131 __ pop(spilled_regs, sp); 4132 4133 __ ldrs(vtmp, Address(strL)); 4134 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4135 __ fmovd(tmpL, vtmp); 4136 4137 __ eor(rscratch2, tmpU, tmpL); 4138 __ cbz(rscratch2, DONE); 4139 4140 // Find the first different characters in the longwords and 4141 // compute their difference. 4142 __ bind(CALCULATE_DIFFERENCE); 4143 __ rev(rscratch2, rscratch2); 4144 __ clz(rscratch2, rscratch2); 4145 __ andr(rscratch2, rscratch2, -16); 4146 __ lsrv(tmp1, tmp1, rscratch2); 4147 __ uxthw(tmp1, tmp1); 4148 __ lsrv(rscratch1, rscratch1, rscratch2); 4149 __ uxthw(rscratch1, rscratch1); 4150 __ subw(result, tmp1, rscratch1); 4151 __ bind(DONE); 4152 __ ret(lr); 4153 return entry; 4154 } 4155 4156 // r0 = result 4157 // r1 = str1 4158 // r2 = cnt1 4159 // r3 = str2 4160 // r4 = cnt2 4161 // r10 = tmp1 4162 // r11 = tmp2 4163 address generate_compare_long_string_same_encoding(bool isLL) { 4164 __ align(CodeEntryAlignment); 4165 StubCodeMark mark(this, "StubRoutines", isLL 4166 ? "compare_long_string_same_encoding LL" 4167 : "compare_long_string_same_encoding UU"); 4168 address entry = __ pc(); 4169 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4170 tmp1 = r10, tmp2 = r11; 4171 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4172 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4173 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4174 // exit from large loop when less than 64 bytes left to read or we're about 4175 // to prefetch memory behind array border 4176 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4177 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4178 // update cnt2 counter with already loaded 8 bytes 4179 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4180 // update pointers, because of previous read 4181 __ add(str1, str1, wordSize); 4182 __ add(str2, str2, wordSize); 4183 if (SoftwarePrefetchHintDistance >= 0) { 4184 __ bind(LARGE_LOOP_PREFETCH); 4185 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4186 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4187 compare_string_16_bytes_same(DIFF, DIFF2); 4188 compare_string_16_bytes_same(DIFF, DIFF2); 4189 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4190 compare_string_16_bytes_same(DIFF, DIFF2); 4191 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4192 compare_string_16_bytes_same(DIFF, DIFF2); 4193 __ br(__ GT, LARGE_LOOP_PREFETCH); 4194 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4195 } 4196 // less than 16 bytes left? 4197 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4198 __ br(__ LT, TAIL); 4199 __ bind(SMALL_LOOP); 4200 compare_string_16_bytes_same(DIFF, DIFF2); 4201 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4202 __ br(__ GE, SMALL_LOOP); 4203 __ bind(TAIL); 4204 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4205 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4206 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4207 __ br(__ LE, CHECK_LAST); 4208 __ eor(rscratch2, tmp1, tmp2); 4209 __ cbnz(rscratch2, DIFF); 4210 __ ldr(tmp1, Address(__ post(str1, 8))); 4211 __ ldr(tmp2, Address(__ post(str2, 8))); 4212 __ sub(cnt2, cnt2, isLL ? 
8 : 4);
4213 __ bind(CHECK_LAST);
4214 if (!isLL) {
4215 __ add(cnt2, cnt2, cnt2); // now in bytes
4216 }
4217 __ eor(rscratch2, tmp1, tmp2);
4218 __ cbnz(rscratch2, DIFF);
4219 __ ldr(rscratch1, Address(str1, cnt2));
4220 __ ldr(cnt1, Address(str2, cnt2));
4221 __ eor(rscratch2, rscratch1, cnt1);
4222 __ cbz(rscratch2, LENGTH_DIFF);
4223 // Find the first different characters in the longwords and
4224 // compute their difference.
4225 __ bind(DIFF2);
4226 __ rev(rscratch2, rscratch2);
4227 __ clz(rscratch2, rscratch2);
4228 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4229 __ lsrv(rscratch1, rscratch1, rscratch2);
4230 if (isLL) {
4231 __ lsrv(cnt1, cnt1, rscratch2);
4232 __ uxtbw(rscratch1, rscratch1);
4233 __ uxtbw(cnt1, cnt1);
4234 } else {
4235 __ lsrv(cnt1, cnt1, rscratch2);
4236 __ uxthw(rscratch1, rscratch1);
4237 __ uxthw(cnt1, cnt1);
4238 }
4239 __ subw(result, rscratch1, cnt1);
4240 __ b(LENGTH_DIFF);
4241 __ bind(DIFF);
4242 __ rev(rscratch2, rscratch2);
4243 __ clz(rscratch2, rscratch2);
4244 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4245 __ lsrv(tmp1, tmp1, rscratch2);
4246 if (isLL) {
4247 __ lsrv(tmp2, tmp2, rscratch2);
4248 __ uxtbw(tmp1, tmp1);
4249 __ uxtbw(tmp2, tmp2);
4250 } else {
4251 __ lsrv(tmp2, tmp2, rscratch2);
4252 __ uxthw(tmp1, tmp1);
4253 __ uxthw(tmp2, tmp2);
4254 }
4255 __ subw(result, tmp1, tmp2);
4256 __ b(LENGTH_DIFF);
4257 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4258 __ eor(rscratch2, tmp1, tmp2);
4259 __ cbnz(rscratch2, DIFF);
4260 __ bind(LENGTH_DIFF);
4261 __ ret(lr);
4262 return entry;
4263 }
4264
4265 void generate_compare_long_strings() {
4266 StubRoutines::aarch64::_compare_long_string_LL
4267 = generate_compare_long_string_same_encoding(true);
4268 StubRoutines::aarch64::_compare_long_string_UU
4269 = generate_compare_long_string_same_encoding(false);
4270 StubRoutines::aarch64::_compare_long_string_LU
4271 = generate_compare_long_string_different_encoding(true);
4272 StubRoutines::aarch64::_compare_long_string_UL
4273 = generate_compare_long_string_different_encoding(false);
4274 }
4275
4276 // R0 = result
4277 // R1 = str2
4278 // R2 = cnt1
4279 // R3 = str1
4280 // R4 = cnt2
4281 // This generic linear code uses a few additional ideas that make it faster:
4282 // 1) we can safely keep at least the 1st register of the pattern (since
4283 // length >= 8) in order to skip its initial load (helps on systems with
4284 // a single load pipeline)
4285 // 2) we can use the "fast" algorithm for finding a single character with
4286 // fewer branches (1 branch per loaded register instead of 1 per symbol);
4287 // this is where constants like 0x0101...01, 0x00010001...0001,
4288 // 0x7f7f...7f, 0x7fff7fff...7fff come from
4289 // 3) after the 1st register of the source string is loaded and analyzed,
4290 // it can be reused to search for every occurrence of the 1st character,
4291 // saving a few loads compared with a "simpler-but-slower" implementation
4292 // 4) to avoid lots of push/pop operations, the code below heavily reuses,
4293 // re-initializes and compresses register values, which makes the code
4294 // larger and a bit less readable; however, most of the extra operations are issued during loads or branches, so the penalty is minimal
4295 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4296 const char* stubName = str1_isL
4297 ? (str2_isL ?
"indexof_linear_ll" : "indexof_linear_ul") 4298 : "indexof_linear_uu"; 4299 __ align(CodeEntryAlignment); 4300 StubCodeMark mark(this, "StubRoutines", stubName); 4301 address entry = __ pc(); 4302 4303 int str1_chr_size = str1_isL ? 1 : 2; 4304 int str2_chr_size = str2_isL ? 1 : 2; 4305 int str1_chr_shift = str1_isL ? 0 : 1; 4306 int str2_chr_shift = str2_isL ? 0 : 1; 4307 bool isL = str1_isL && str2_isL; 4308 // parameters 4309 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4310 // temporary registers 4311 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4312 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4313 // redefinitions 4314 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4315 4316 __ push(spilled_regs, sp); 4317 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4318 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4319 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4320 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4321 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4322 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4323 // Read whole register from str1. It is safe, because length >=8 here 4324 __ ldr(ch1, Address(str1)); 4325 // Read whole register from str2. It is safe, because length >=8 here 4326 __ ldr(ch2, Address(str2)); 4327 __ sub(cnt2, cnt2, cnt1); 4328 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4329 if (str1_isL != str2_isL) { 4330 __ eor(v0, __ T16B, v0, v0); 4331 } 4332 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4333 __ mul(first, first, tmp1); 4334 // check if we have less than 1 register to check 4335 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4336 if (str1_isL != str2_isL) { 4337 __ fmovd(v1, ch1); 4338 } 4339 __ br(__ LE, L_SMALL); 4340 __ eor(ch2, first, ch2); 4341 if (str1_isL != str2_isL) { 4342 __ zip1(v1, __ T16B, v1, v0); 4343 } 4344 __ sub(tmp2, ch2, tmp1); 4345 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4346 __ bics(tmp2, tmp2, ch2); 4347 if (str1_isL != str2_isL) { 4348 __ fmovd(ch1, v1); 4349 } 4350 __ br(__ NE, L_HAS_ZERO); 4351 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4352 __ add(result, result, wordSize/str2_chr_size); 4353 __ add(str2, str2, wordSize); 4354 __ br(__ LT, L_POST_LOOP); 4355 __ BIND(L_LOOP); 4356 __ ldr(ch2, Address(str2)); 4357 __ eor(ch2, first, ch2); 4358 __ sub(tmp2, ch2, tmp1); 4359 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4360 __ bics(tmp2, tmp2, ch2); 4361 __ br(__ NE, L_HAS_ZERO); 4362 __ BIND(L_LOOP_PROCEED); 4363 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4364 __ add(str2, str2, wordSize); 4365 __ add(result, result, wordSize/str2_chr_size); 4366 __ br(__ GE, L_LOOP); 4367 __ BIND(L_POST_LOOP); 4368 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4369 __ br(__ LE, NOMATCH); 4370 __ ldr(ch2, Address(str2)); 4371 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4372 __ eor(ch2, first, ch2); 4373 __ sub(tmp2, ch2, tmp1); 4374 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4375 __ mov(tmp4, -1); // all bits set 4376 __ b(L_SMALL_PROCEED); 4377 __ align(OptoLoopAlignment); 4378 __ BIND(L_SMALL); 4379 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4380 __ eor(ch2, first, ch2); 4381 if (str1_isL != str2_isL) { 4382 __ zip1(v1, __ T16B, v1, v0); 4383 } 4384 __ sub(tmp2, ch2, tmp1); 4385 __ mov(tmp4, -1); // all bits set 4386 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4387 if (str1_isL != str2_isL) { 4388 __ fmovd(ch1, v1); // move converted 4 symbols 4389 } 4390 __ BIND(L_SMALL_PROCEED); 4391 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4392 __ bic(tmp2, tmp2, ch2); 4393 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4394 __ rbit(tmp2, tmp2); 4395 __ br(__ EQ, NOMATCH); 4396 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4397 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4398 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4399 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4400 if (str2_isL) { // LL 4401 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4402 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4403 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4404 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4405 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4406 } else { 4407 __ mov(ch2, 0xE); // all bits in byte set except last one 4408 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4409 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4410 __ lslv(tmp2, tmp2, tmp4); 4411 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4412 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4413 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4414 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4415 } 4416 __ cmp(ch1, ch2); 4417 __ mov(tmp4, wordSize/str2_chr_size); 4418 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4419 __ BIND(L_SMALL_CMP_LOOP); 4420 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4421 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4422 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4423 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4424 __ add(tmp4, tmp4, 1); 4425 __ cmp(tmp4, cnt1); 4426 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4427 __ cmp(first, ch2); 4428 __ br(__ EQ, L_SMALL_CMP_LOOP); 4429 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4430 __ cbz(tmp2, NOMATCH); // no more matches. exit 4431 __ clz(tmp4, tmp2); 4432 __ add(result, result, 1); // advance index 4433 __ add(str2, str2, str2_chr_size); // advance pointer 4434 __ b(L_SMALL_HAS_ZERO_LOOP); 4435 __ align(OptoLoopAlignment); 4436 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4437 __ cmp(first, ch2); 4438 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4439 __ b(DONE); 4440 __ align(OptoLoopAlignment); 4441 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4442 if (str2_isL) { // LL 4443 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4444 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4445 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4446 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4447 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4448 } else { 4449 __ mov(ch2, 0xE); // all bits in byte set except last one 4450 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4451 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4452 __ lslv(tmp2, tmp2, tmp4); 4453 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4454 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4455 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4456 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4457 } 4458 __ cmp(ch1, ch2); 4459 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4460 __ b(DONE); 4461 __ align(OptoLoopAlignment); 4462 __ BIND(L_HAS_ZERO); 4463 __ rbit(tmp2, tmp2); 4464 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4465 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4466 // It's fine because both counters are 32bit and are not changed in this 4467 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4468 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4469 __ sub(result, result, 1); 4470 __ BIND(L_HAS_ZERO_LOOP); 4471 __ mov(cnt1, wordSize/str2_chr_size); 4472 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4473 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4474 if (str2_isL) { 4475 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4476 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4477 __ lslv(tmp2, tmp2, tmp4); 4478 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4479 __ add(tmp4, tmp4, 1); 4480 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4481 __ lsl(tmp2, tmp2, 1); 4482 __ mov(tmp4, wordSize/str2_chr_size); 4483 } else { 4484 __ mov(ch2, 0xE); 4485 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4486 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4487 __ lslv(tmp2, tmp2, tmp4); 4488 __ add(tmp4, tmp4, 1); 4489 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4490 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4491 __ lsl(tmp2, tmp2, 1); 4492 __ mov(tmp4, wordSize/str2_chr_size); 4493 __ sub(str2, str2, str2_chr_size); 4494 } 4495 __ cmp(ch1, ch2); 4496 __ mov(tmp4, wordSize/str2_chr_size); 4497 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4498 __ BIND(L_CMP_LOOP); 4499 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4500 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4501 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4502 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4503 __ add(tmp4, tmp4, 1); 4504 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4505 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4506 __ cmp(cnt1, ch2); 4507 __ br(__ EQ, L_CMP_LOOP); 4508 __ BIND(L_CMP_LOOP_NOMATCH); 4509 // here we're not matched 4510 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4511 __ clz(tmp4, tmp2); 4512 __ add(str2, str2, str2_chr_size); // advance pointer 4513 __ b(L_HAS_ZERO_LOOP); 4514 __ align(OptoLoopAlignment); 4515 __ BIND(L_CMP_LOOP_LAST_CMP); 4516 __ cmp(cnt1, ch2); 4517 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4518 __ b(DONE); 4519 __ align(OptoLoopAlignment); 4520 __ BIND(L_CMP_LOOP_LAST_CMP2); 4521 if (str2_isL) { 4522 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4523 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4524 __ lslv(tmp2, tmp2, tmp4);
4525 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4526 __ add(tmp4, tmp4, 1);
4527 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4528 __ lsl(tmp2, tmp2, 1);
4529 } else {
4530 __ mov(ch2, 0xE);
4531 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4532 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4533 __ lslv(tmp2, tmp2, tmp4);
4534 __ add(tmp4, tmp4, 1);
4535 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4536 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4537 __ lsl(tmp2, tmp2, 1);
4538 __ sub(str2, str2, str2_chr_size);
4539 }
4540 __ cmp(ch1, ch2);
4541 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4542 __ b(DONE);
4543 __ align(OptoLoopAlignment);
4544 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4545 // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4546 // until the L_HAS_ZERO block. The byte octet was analyzed in
4547 // L_HAS_ZERO_LOOP, so result was increased by at most
4548 // wordSize/str2_chr_size - 1, leaving the respective high bits unchanged.
4549 // L_LOOP_PROCEED will increase result by the number of analyzed
4550 // characters, so we can just reset the lower bits of result here:
4551 // clear the 2 lower bits for UU/UL and 3 bits for LL.
4552 // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4553 // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index
4554 // of the last analyzed substring inside the current octet, so str2 points at its start address; we need to advance it to the next octet.
4555 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4556 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4557 __ bfm(result, zr, 0, 2 - str2_chr_shift);
4558 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4559 __ movw(cnt2, cnt2);
4560 __ b(L_LOOP_PROCEED);
4561 __ align(OptoLoopAlignment);
4562 __ BIND(NOMATCH);
4563 __ mov(result, -1);
4564 __ BIND(DONE);
4565 __ pop(spilled_regs, sp);
4566 __ ret(lr);
4567 return entry;
4568 }
4569
4570 void generate_string_indexof_stubs() {
4571 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4572 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4573 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4574 }
4575
4576 void inflate_and_store_2_fp_registers(bool generatePrfm,
4577 FloatRegister src1, FloatRegister src2) {
4578 Register dst = r1;
4579 __ zip1(v1, __ T16B, src1, v0);
4580 __ zip2(v2, __ T16B, src1, v0);
4581 if (generatePrfm) {
4582 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4583 }
4584 __ zip1(v3, __ T16B, src2, v0);
4585 __ zip2(v4, __ T16B, src2, v0);
4586 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4587 }
4588
4589 // R0 = src
4590 // R1 = dst
4591 // R2 = len
4592 // R3 = len >> 3
4593 // V0 = 0
4594 // v1 = loaded 8 bytes
4595 address generate_large_byte_array_inflate() {
4596 __ align(CodeEntryAlignment);
4597 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4598 address entry = __ pc();
4599 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4600 Register src = r0, dst = r1, len = r2, octetCounter = r3;
4601 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4602
4603 // Do one more 8-byte read so that the address is 16-byte aligned in most
4604 // cases; this also lets us use a single store instruction
4605 __ ldrd(v2, __ post(src, 8));
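// For reference, zip1 with the zero vector v0 interleaves each Latin-1 byte
// with 0x00, widening bytes to little-endian UTF-16 code units
// (illustrative): the bytes 61 62 ... become 61 00 62 00 ...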
4606 __ sub(octetCounter, octetCounter, 2);
4607 __ zip1(v1, __ T16B, v1, v0);
4608 __ zip1(v2, __ T16B, v2, v0);
4609 __ st1(v1, v2, __ T16B, __ post(dst, 32));
4610 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4611 __ subs(rscratch1, octetCounter, large_loop_threshold);
4612 __ br(__ LE, LOOP_START);
4613 __ b(LOOP_PRFM_START);
4614 __ bind(LOOP_PRFM);
4615 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4616 __ bind(LOOP_PRFM_START);
4617 __ prfm(Address(src, SoftwarePrefetchHintDistance));
4618 __ sub(octetCounter, octetCounter, 8);
4619 __ subs(rscratch1, octetCounter, large_loop_threshold);
4620 inflate_and_store_2_fp_registers(true, v3, v4);
4621 inflate_and_store_2_fp_registers(true, v5, v6);
4622 __ br(__ GT, LOOP_PRFM);
4623 __ cmp(octetCounter, (u1)8);
4624 __ br(__ LT, DONE);
4625 __ bind(LOOP);
4626 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4627 __ bind(LOOP_START);
4628 __ sub(octetCounter, octetCounter, 8);
4629 __ cmp(octetCounter, (u1)8);
4630 inflate_and_store_2_fp_registers(false, v3, v4);
4631 inflate_and_store_2_fp_registers(false, v5, v6);
4632 __ br(__ GE, LOOP);
4633 __ bind(DONE);
4634 __ ret(lr);
4635 return entry;
4636 }
4637
4638 /**
4639 * Arguments:
4640 *
4641 * Input:
4642 * c_rarg0 - current state address
4643 * c_rarg1 - H key address
4644 * c_rarg2 - data address
4645 * c_rarg3 - number of blocks
4646 *
4647 * Output:
4648 * Updated state at c_rarg0
4649 */
4650 address generate_ghash_processBlocks() {
4651 // Bafflingly, GCM uses little-endian for the byte order, but
4652 // big-endian for the bit order. For example, the polynomial 1 is
4653 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4654 //
4655 // So, we must either reverse the bytes in each word and do
4656 // everything big-endian or reverse the bits in each byte and do
4657 // it little-endian. On AArch64 it's more idiomatic to reverse
4658 // the bits in each byte (we have an instruction, RBIT, to do
4659 // that) and keep the data in little-endian bit order throughout the
4660 // calculation, bit-reversing the inputs and outputs.
4661
4662 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4663 __ align(wordSize * 2);
4664 address p = __ pc();
4665 __ emit_int64(0x87); // The low-order bits of the field
4666 // polynomial (i.e.
p = z^7+z^2+z+1) 4667 // repeated in the low and high parts of a 4668 // 128-bit vector 4669 __ emit_int64(0x87); 4670 4671 __ align(CodeEntryAlignment); 4672 address start = __ pc(); 4673 4674 Register state = c_rarg0; 4675 Register subkeyH = c_rarg1; 4676 Register data = c_rarg2; 4677 Register blocks = c_rarg3; 4678 4679 FloatRegister vzr = v30; 4680 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4681 4682 __ ldrq(v0, Address(state)); 4683 __ ldrq(v1, Address(subkeyH)); 4684 4685 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4686 __ rbit(v0, __ T16B, v0); 4687 __ rev64(v1, __ T16B, v1); 4688 __ rbit(v1, __ T16B, v1); 4689 4690 __ ldrq(v26, p); 4691 4692 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4693 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4694 4695 { 4696 Label L_ghash_loop; 4697 __ bind(L_ghash_loop); 4698 4699 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4700 // reversing each byte 4701 __ rbit(v2, __ T16B, v2); 4702 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4703 4704 // Multiply state in v2 by subkey in v1 4705 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4706 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4707 /*temps*/v6, v20, v18, v21); 4708 // Reduce v7:v5 by the field polynomial 4709 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4710 4711 __ sub(blocks, blocks, 1); 4712 __ cbnz(blocks, L_ghash_loop); 4713 } 4714 4715 // The bit-reversed result is at this point in v0 4716 __ rev64(v1, __ T16B, v0); 4717 __ rbit(v1, __ T16B, v1); 4718 4719 __ st1(v1, __ T16B, state); 4720 __ ret(lr); 4721 4722 return start; 4723 } 4724 4725 // Continuation point for throwing of implicit exceptions that are 4726 // not handled in the current activation. Fabricates an exception 4727 // oop and initiates normal exception dispatching in this 4728 // frame. Since we need to preserve callee-saved values (currently 4729 // only for C2, but done for C1 as well) we need a callee-saved oop 4730 // map and therefore have to make these stubs into RuntimeStubs 4731 // rather than BufferBlobs. If the compiler needs all registers to 4732 // be preserved between the fault point and the exception handler 4733 // then it must assume responsibility for that in 4734 // AbstractCompiler::continuation_for_implicit_null_exception or 4735 // continuation_for_implicit_division_by_zero_exception. All other 4736 // implicit exceptions (e.g., NullPointerException or 4737 // AbstractMethodError on entry) are either at call sites or 4738 // otherwise assume that stack unwinding will be initiated, so 4739 // caller saved registers were assumed volatile in the compiler. 4740 4741 #undef __ 4742 #define __ masm-> 4743 4744 address generate_throw_exception(const char* name, 4745 address runtime_entry, 4746 Register arg1 = noreg, 4747 Register arg2 = noreg) { 4748 // Information about frame layout at time of blocking runtime call. 4749 // Note that we only have to preserve callee-saved registers since 4750 // the compilers are responsible for supplying a continuation point 4751 // if they expect all registers to be preserved. 4752 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4753 enum layout {
4754 rfp_off = 0,
4755 rfp_off2,
4756 return_off,
4757 return_off2,
4758 framesize // inclusive of return address
4759 };
4760
4761 int insts_size = 512;
4762 int locs_size = 64;
4763
4764 CodeBuffer code(name, insts_size, locs_size);
4765 OopMapSet* oop_maps = new OopMapSet();
4766 MacroAssembler* masm = new MacroAssembler(&code);
4767
4768 address start = __ pc();
4769
4770 // This is an inlined and slightly modified version of call_VM
4771 // which has the ability to fetch the return PC out of
4772 // thread-local storage and also sets up last_Java_sp slightly
4773 // differently than the real call_VM
4774
4775 __ enter(); // Save FP and LR before call
4776
4777 assert(is_even(framesize/2), "sp not 16-byte aligned");
4778
4779 // lr and fp are already in place
4780 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4781
4782 int frame_complete = __ pc() - start;
4783
4784 // Set up last_Java_sp and last_Java_fp
4785 address the_pc = __ pc();
4786 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
4787
4788 // Call runtime
4789 if (arg1 != noreg) {
4790 assert(arg2 != c_rarg1, "clobbered");
4791 __ mov(c_rarg1, arg1);
4792 }
4793 if (arg2 != noreg) {
4794 __ mov(c_rarg2, arg2);
4795 }
4796 __ mov(c_rarg0, rthread);
4797 BLOCK_COMMENT("call runtime_entry");
4798 __ mov(rscratch1, runtime_entry);
4799 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4800
4801 // Generate oop map
4802 OopMap* map = new OopMap(framesize, 0);
4803
4804 oop_maps->add_gc_map(the_pc - start, map);
4805
4806 __ reset_last_Java_frame(true);
4807 __ maybe_isb();
4808
4809 __ leave();
4810
4811 // check for pending exceptions
4812 #ifdef ASSERT
4813 Label L;
4814 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4815 __ cbnz(rscratch1, L);
4816 __ should_not_reach_here();
4817 __ bind(L);
4818 #endif // ASSERT
4819 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4820
4821
4822 // codeBlob framesize is in words (not VMRegImpl::slot_size)
4823 RuntimeStub* stub =
4824 RuntimeStub::new_runtime_stub(name,
4825 &code,
4826 frame_complete,
4827 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4828 oop_maps, false);
4829 return stub->entry_point();
4830 }
4831
4832 class MontgomeryMultiplyGenerator : public MacroAssembler {
4833
4834 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4835 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4836
4837 RegSet _toSave;
4838 bool _squaring;
4839
4840 public:
4841 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4842 : MacroAssembler(as->code()), _squaring(squaring) {
4843
4844 // Register allocation
4845
4846 Register reg = c_rarg0;
4847 Pa_base = reg; // Argument registers
4848 if (squaring)
4849 Pb_base = Pa_base;
4850 else
4851 Pb_base = ++reg;
4852 Pn_base = ++reg;
4853 Rlen = ++reg;
4854 inv = ++reg;
4855 Pm_base = ++reg;
4856
4857 // Working registers:
4858 Ra = ++reg; // The current digit of a, b, n, and m.
4859 Rb = ++reg;
4860 Rm = ++reg;
4861 Rn = ++reg;
4862
4863 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
4864 Pb = ++reg;
4865 Pm = ++reg;
4866 Pn = ++reg;
4867
4868 t0 = ++reg; // Three registers which form a
4869 t1 = ++reg; // triple-precision accumulator.
4870 t2 = ++reg;
4871
4872 Ri = ++reg; // Inner and outer loop indexes.
4873 Rj = ++reg;
4874
4875 Rhi_ab = ++reg; // Product registers: low and high parts
4876 Rlo_ab = ++reg; // of a*b and m*n.
4877 Rhi_mn = ++reg; 4878 Rlo_mn = ++reg; 4879 4880 // r19 and up are callee-saved. 4881 _toSave = RegSet::range(r19, reg) + Pm_base; 4882 } 4883 4884 private: 4885 void save_regs() { 4886 push(_toSave, sp); 4887 } 4888 4889 void restore_regs() { 4890 pop(_toSave, sp); 4891 } 4892 4893 template <typename T> 4894 void unroll_2(Register count, T block) { 4895 Label loop, end, odd; 4896 tbnz(count, 0, odd); 4897 cbz(count, end); 4898 align(16); 4899 bind(loop); 4900 (this->*block)(); 4901 bind(odd); 4902 (this->*block)(); 4903 subs(count, count, 2); 4904 br(Assembler::GT, loop); 4905 bind(end); 4906 } 4907 4908 template <typename T> 4909 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4910 Label loop, end, odd; 4911 tbnz(count, 0, odd); 4912 cbz(count, end); 4913 align(16); 4914 bind(loop); 4915 (this->*block)(d, s, tmp); 4916 bind(odd); 4917 (this->*block)(d, s, tmp); 4918 subs(count, count, 2); 4919 br(Assembler::GT, loop); 4920 bind(end); 4921 } 4922 4923 void pre1(RegisterOrConstant i) { 4924 block_comment("pre1"); 4925 // Pa = Pa_base; 4926 // Pb = Pb_base + i; 4927 // Pm = Pm_base; 4928 // Pn = Pn_base + i; 4929 // Ra = *Pa; 4930 // Rb = *Pb; 4931 // Rm = *Pm; 4932 // Rn = *Pn; 4933 ldr(Ra, Address(Pa_base)); 4934 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4935 ldr(Rm, Address(Pm_base)); 4936 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4937 lea(Pa, Address(Pa_base)); 4938 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4939 lea(Pm, Address(Pm_base)); 4940 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4941 4942 // Zero the m*n result. 4943 mov(Rhi_mn, zr); 4944 mov(Rlo_mn, zr); 4945 } 4946 4947 // The core multiply-accumulate step of a Montgomery 4948 // multiplication. The idea is to schedule operations as a 4949 // pipeline so that instructions with long latencies (loads and 4950 // multiplies) have time to complete before their results are 4951 // used. This most benefits in-order implementations of the 4952 // architecture but out-of-order ones also benefit. 4953 void step() { 4954 block_comment("step"); 4955 // MACC(Ra, Rb, t0, t1, t2); 4956 // Ra = *++Pa; 4957 // Rb = *--Pb; 4958 umulh(Rhi_ab, Ra, Rb); 4959 mul(Rlo_ab, Ra, Rb); 4960 ldr(Ra, pre(Pa, wordSize)); 4961 ldr(Rb, pre(Pb, -wordSize)); 4962 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4963 // previous iteration. 
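// For reference, MACC(A, B, t0, t1, t2) in the commented pseudo-code means
// a multiply-accumulate into the triple-precision accumulator:
//   t0:t1:t2 += A * B   (a 128-bit product added into 192 bits),
// and acc(hi, lo, t0, t1, t2) adds an already-computed hi:lo product into
// the same accumulator (illustrative notation only).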
4964 // MACC(Rm, Rn, t0, t1, t2); 4965 // Rm = *++Pm; 4966 // Rn = *--Pn; 4967 umulh(Rhi_mn, Rm, Rn); 4968 mul(Rlo_mn, Rm, Rn); 4969 ldr(Rm, pre(Pm, wordSize)); 4970 ldr(Rn, pre(Pn, -wordSize)); 4971 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4972 } 4973 4974 void post1() { 4975 block_comment("post1"); 4976 4977 // MACC(Ra, Rb, t0, t1, t2); 4978 // Ra = *++Pa; 4979 // Rb = *--Pb; 4980 umulh(Rhi_ab, Ra, Rb); 4981 mul(Rlo_ab, Ra, Rb); 4982 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4983 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4984 4985 // *Pm = Rm = t0 * inv; 4986 mul(Rm, t0, inv); 4987 str(Rm, Address(Pm)); 4988 4989 // MACC(Rm, Rn, t0, t1, t2); 4990 // t0 = t1; t1 = t2; t2 = 0; 4991 umulh(Rhi_mn, Rm, Rn); 4992 4993 #ifndef PRODUCT 4994 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4995 { 4996 mul(Rlo_mn, Rm, Rn); 4997 add(Rlo_mn, t0, Rlo_mn); 4998 Label ok; 4999 cbz(Rlo_mn, ok); { 5000 stop("broken Montgomery multiply"); 5001 } bind(ok); 5002 } 5003 #endif 5004 // We have very carefully set things up so that 5005 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5006 // the lower half of Rm * Rn because we know the result already: 5007 // it must be -t0. t0 + (-t0) must generate a carry iff 5008 // t0 != 0. So, rather than do a mul and an adds we just set 5009 // the carry flag iff t0 is nonzero. 5010 // 5011 // mul(Rlo_mn, Rm, Rn); 5012 // adds(zr, t0, Rlo_mn); 5013 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5014 adcs(t0, t1, Rhi_mn); 5015 adc(t1, t2, zr); 5016 mov(t2, zr); 5017 } 5018 5019 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5020 block_comment("pre2"); 5021 // Pa = Pa_base + i-len; 5022 // Pb = Pb_base + len; 5023 // Pm = Pm_base + i-len; 5024 // Pn = Pn_base + len; 5025 5026 if (i.is_register()) { 5027 sub(Rj, i.as_register(), len); 5028 } else { 5029 mov(Rj, i.as_constant()); 5030 sub(Rj, Rj, len); 5031 } 5032 // Rj == i-len 5033 5034 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5035 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5036 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5037 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5038 5039 // Ra = *++Pa; 5040 // Rb = *--Pb; 5041 // Rm = *++Pm; 5042 // Rn = *--Pn; 5043 ldr(Ra, pre(Pa, wordSize)); 5044 ldr(Rb, pre(Pb, -wordSize)); 5045 ldr(Rm, pre(Pm, wordSize)); 5046 ldr(Rn, pre(Pn, -wordSize)); 5047 5048 mov(Rhi_mn, zr); 5049 mov(Rlo_mn, zr); 5050 } 5051 5052 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5053 block_comment("post2"); 5054 if (i.is_constant()) { 5055 mov(Rj, i.as_constant()-len.as_constant()); 5056 } else { 5057 sub(Rj, i.as_register(), len); 5058 } 5059 5060 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5061 5062 // As soon as we know the least significant digit of our result, 5063 // store it. 5064 // Pm_base[i-len] = t0; 5065 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5066 5067 // t0 = t1; t1 = t2; t2 = 0; 5068 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5069 adc(t1, t2, zr); 5070 mov(t2, zr); 5071 } 5072 5073 // A carry in t0 after Montgomery multiplication means that we 5074 // should subtract multiples of n from our result in m. We'll 5075 // keep doing that until there is no carry. 
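  // The sub() helper named in normalize()'s pseudocode below (and in
  // the "In C, approximately" blocks further down) is not defined in
  // this file; here is a hedged C sketch matching the sbcs/sbc loop
  // that normalize() emits: subtract n from m once, rippling the
  // borrow through all len digits, then fold the final borrow into t0.
  //
  //   static unsigned long sub(unsigned long Pm_base[],
  //                            unsigned long Pn_base[],
  //                            unsigned long t0, int len) {
  //     unsigned long borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned long m = Pm_base[i], n = Pn_base[i];
  //       Pm_base[i] = m - n - borrow;                 // sbcs
  //       borrow = (m < n) || (borrow && m == n);      // borrow out?
  //     }
  //     return t0 - borrow;                            // sbc(t0, t0, zr)
  //   }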
5076 void normalize(RegisterOrConstant len) { 5077 block_comment("normalize"); 5078 // while (t0) 5079 // t0 = sub(Pm_base, Pn_base, t0, len); 5080 Label loop, post, again; 5081 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5082 cbz(t0, post); { 5083 bind(again); { 5084 mov(i, zr); 5085 mov(cnt, len); 5086 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5087 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5088 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5089 align(16); 5090 bind(loop); { 5091 sbcs(Rm, Rm, Rn); 5092 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5093 add(i, i, 1); 5094 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5095 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5096 sub(cnt, cnt, 1); 5097 } cbnz(cnt, loop); 5098 sbc(t0, t0, zr); 5099 } cbnz(t0, again); 5100 } bind(post); 5101 } 5102 5103 // Move memory at s to d, reversing words. 5104 // Increments d to end of copied memory 5105 // Destroys tmp1, tmp2 5106 // Preserves len 5107 // Leaves s pointing to the address which was in d at start 5108 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5109 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5110 5111 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5112 mov(tmp1, len); 5113 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5114 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5115 } 5116 // where 5117 void reverse1(Register d, Register s, Register tmp) { 5118 ldr(tmp, pre(s, -wordSize)); 5119 ror(tmp, tmp, 32); 5120 str(tmp, post(d, wordSize)); 5121 } 5122 5123 void step_squaring() { 5124 // An extra ACC 5125 step(); 5126 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5127 } 5128 5129 void last_squaring(RegisterOrConstant i) { 5130 Label dont; 5131 // if ((i & 1) == 0) { 5132 tbnz(i.as_register(), 0, dont); { 5133 // MACC(Ra, Rb, t0, t1, t2); 5134 // Ra = *++Pa; 5135 // Rb = *--Pb; 5136 umulh(Rhi_ab, Ra, Rb); 5137 mul(Rlo_ab, Ra, Rb); 5138 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5139 } bind(dont); 5140 } 5141 5142 void extra_step_squaring() { 5143 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5144 5145 // MACC(Rm, Rn, t0, t1, t2); 5146 // Rm = *++Pm; 5147 // Rn = *--Pn; 5148 umulh(Rhi_mn, Rm, Rn); 5149 mul(Rlo_mn, Rm, Rn); 5150 ldr(Rm, pre(Pm, wordSize)); 5151 ldr(Rn, pre(Pn, -wordSize)); 5152 } 5153 5154 void post1_squaring() { 5155 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5156 5157 // *Pm = Rm = t0 * inv; 5158 mul(Rm, t0, inv); 5159 str(Rm, Address(Pm)); 5160 5161 // MACC(Rm, Rn, t0, t1, t2); 5162 // t0 = t1; t1 = t2; t2 = 0; 5163 umulh(Rhi_mn, Rm, Rn); 5164 5165 #ifndef PRODUCT 5166 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5167 { 5168 mul(Rlo_mn, Rm, Rn); 5169 add(Rlo_mn, t0, Rlo_mn); 5170 Label ok; 5171 cbz(Rlo_mn, ok); { 5172 stop("broken Montgomery multiply"); 5173 } bind(ok); 5174 } 5175 #endif 5176 // We have very carefully set things up so that 5177 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5178 // the lower half of Rm * Rn because we know the result already: 5179 // it must be -t0. t0 + (-t0) must generate a carry iff 5180 // t0 != 0. So, rather than do a mul and an adds we just set 5181 // the carry flag iff t0 is nonzero. 
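    // (A worked check of the trick above, for clarity: SUBS computes
    // t0 - 1 and sets the carry flag exactly when no borrow occurs,
    // i.e. when the unsigned value t0 >= 1. So t0 = 5 leaves carry
    // set, just as 5 + (-5) would carry out of 64 bits, while t0 = 0
    // borrows and leaves carry clear.)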
5182 // 5183 // mul(Rlo_mn, Rm, Rn); 5184 // adds(zr, t0, Rlo_mn); 5185 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5186 adcs(t0, t1, Rhi_mn); 5187 adc(t1, t2, zr); 5188 mov(t2, zr); 5189 } 5190 5191 void acc(Register Rhi, Register Rlo, 5192 Register t0, Register t1, Register t2) { 5193 adds(t0, t0, Rlo); 5194 adcs(t1, t1, Rhi); 5195 adc(t2, t2, zr); 5196 } 5197 5198 public: 5199 /** 5200 * Fast Montgomery multiplication. The derivation of the 5201 * algorithm is in A Cryptographic Library for the Motorola 5202 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5203 * 5204 * Arguments: 5205 * 5206 * Inputs for multiplication: 5207 * c_rarg0 - int array elements a 5208 * c_rarg1 - int array elements b 5209 * c_rarg2 - int array elements n (the modulus) 5210 * c_rarg3 - int length 5211 * c_rarg4 - int inv 5212 * c_rarg5 - int array elements m (the result) 5213 * 5214 * Inputs for squaring: 5215 * c_rarg0 - int array elements a 5216 * c_rarg1 - int array elements n (the modulus) 5217 * c_rarg2 - int length 5218 * c_rarg3 - int inv 5219 * c_rarg4 - int array elements m (the result) 5220 * 5221 */ 5222 address generate_multiply() { 5223 Label argh, nothing; 5224 bind(argh); 5225 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5226 5227 align(CodeEntryAlignment); 5228 address entry = pc(); 5229 5230 cbzw(Rlen, nothing); 5231 5232 enter(); 5233 5234 // Make room. 5235 cmpw(Rlen, 512); 5236 br(Assembler::HI, argh); 5237 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5238 andr(sp, Ra, -2 * wordSize); 5239 5240 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5241 5242 { 5243 // Copy input args, reversing as we go. We use Ra as a 5244 // temporary variable. 5245 reverse(Ra, Pa_base, Rlen, t0, t1); 5246 if (!_squaring) 5247 reverse(Ra, Pb_base, Rlen, t0, t1); 5248 reverse(Ra, Pn_base, Rlen, t0, t1); 5249 } 5250 5251 // Push all call-saved registers and also Pm_base which we'll need 5252 // at the end. 
5253 save_regs(); 5254 5255 #ifndef PRODUCT 5256 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5257 { 5258 ldr(Rn, Address(Pn_base, 0)); 5259 mul(Rlo_mn, Rn, inv); 5260 subs(zr, Rlo_mn, -1); 5261 Label ok; 5262 br(EQ, ok); { 5263 stop("broken inverse in Montgomery multiply"); 5264 } bind(ok); 5265 } 5266 #endif 5267 5268 mov(Pm_base, Ra); 5269 5270 mov(t0, zr); 5271 mov(t1, zr); 5272 mov(t2, zr); 5273 5274 block_comment("for (int i = 0; i < len; i++) {"); 5275 mov(Ri, zr); { 5276 Label loop, end; 5277 cmpw(Ri, Rlen); 5278 br(Assembler::GE, end); 5279 5280 bind(loop); 5281 pre1(Ri); 5282 5283 block_comment(" for (j = i; j; j--) {"); { 5284 movw(Rj, Ri); 5285 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5286 } block_comment(" } // j"); 5287 5288 post1(); 5289 addw(Ri, Ri, 1); 5290 cmpw(Ri, Rlen); 5291 br(Assembler::LT, loop); 5292 bind(end); 5293 block_comment("} // i"); 5294 } 5295 5296 block_comment("for (int i = len; i < 2*len; i++) {"); 5297 mov(Ri, Rlen); { 5298 Label loop, end; 5299 cmpw(Ri, Rlen, Assembler::LSL, 1); 5300 br(Assembler::GE, end); 5301 5302 bind(loop); 5303 pre2(Ri, Rlen); 5304 5305 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5306 lslw(Rj, Rlen, 1); 5307 subw(Rj, Rj, Ri); 5308 subw(Rj, Rj, 1); 5309 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5310 } block_comment(" } // j"); 5311 5312 post2(Ri, Rlen); 5313 addw(Ri, Ri, 1); 5314 cmpw(Ri, Rlen, Assembler::LSL, 1); 5315 br(Assembler::LT, loop); 5316 bind(end); 5317 } 5318 block_comment("} // i"); 5319 5320 normalize(Rlen); 5321 5322 mov(Ra, Pm_base); // Save Pm_base in Ra 5323 restore_regs(); // Restore caller's Pm_base 5324 5325 // Copy our result into caller's Pm_base 5326 reverse(Pm_base, Ra, Rlen, t0, t1); 5327 5328 leave(); 5329 bind(nothing); 5330 ret(lr); 5331 5332 return entry; 5333 } 5334 // In C, approximately: 5335 5336 // void 5337 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5338 // unsigned long Pn_base[], unsigned long Pm_base[], 5339 // unsigned long inv, int len) { 5340 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5341 // unsigned long *Pa, *Pb, *Pn, *Pm; 5342 // unsigned long Ra, Rb, Rn, Rm; 5343 5344 // int i; 5345 5346 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5347 5348 // for (i = 0; i < len; i++) { 5349 // int j; 5350 5351 // Pa = Pa_base; 5352 // Pb = Pb_base + i; 5353 // Pm = Pm_base; 5354 // Pn = Pn_base + i; 5355 5356 // Ra = *Pa; 5357 // Rb = *Pb; 5358 // Rm = *Pm; 5359 // Rn = *Pn; 5360 5361 // int iters = i; 5362 // for (j = 0; iters--; j++) { 5363 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5364 // MACC(Ra, Rb, t0, t1, t2); 5365 // Ra = *++Pa; 5366 // Rb = *--Pb; 5367 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5368 // MACC(Rm, Rn, t0, t1, t2); 5369 // Rm = *++Pm; 5370 // Rn = *--Pn; 5371 // } 5372 5373 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5374 // MACC(Ra, Rb, t0, t1, t2); 5375 // *Pm = Rm = t0 * inv; 5376 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5377 // MACC(Rm, Rn, t0, t1, t2); 5378 5379 // assert(t0 == 0, "broken Montgomery multiply"); 5380 5381 // t0 = t1; t1 = t2; t2 = 0; 5382 // } 5383 5384 // for (i = len; i < 2*len; i++) { 5385 // int j; 5386 5387 // Pa = Pa_base + i-len; 5388 // Pb = Pb_base + len; 5389 // Pm = Pm_base + i-len; 5390 // Pn = Pn_base + len; 5391 5392 // Ra = *++Pa; 5393 // Rb = *--Pb; 5394 // Rm = *++Pm; 5395 // Rn = *--Pn; 5396 5397 // int iters = len*2-i-1; 
5398 // for (j = i-len+1; iters--; j++) { 5399 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5400 // MACC(Ra, Rb, t0, t1, t2); 5401 // Ra = *++Pa; 5402 // Rb = *--Pb; 5403 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5404 // MACC(Rm, Rn, t0, t1, t2); 5405 // Rm = *++Pm; 5406 // Rn = *--Pn; 5407 // } 5408 5409 // Pm_base[i-len] = t0; 5410 // t0 = t1; t1 = t2; t2 = 0; 5411 // } 5412 5413 // while (t0) 5414 // t0 = sub(Pm_base, Pn_base, t0, len); 5415 // } 5416 5417 /** 5418 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5419 * multiplies than Montgomery multiplication so it should be up to 5420 * 25% faster. However, its loop control is more complex and it 5421 * may actually run slower on some machines. 5422 * 5423 * Arguments: 5424 * 5425 * Inputs: 5426 * c_rarg0 - int array elements a 5427 * c_rarg1 - int array elements n (the modulus) 5428 * c_rarg2 - int length 5429 * c_rarg3 - int inv 5430 * c_rarg4 - int array elements m (the result) 5431 * 5432 */ 5433 address generate_square() { 5434 Label argh; 5435 bind(argh); 5436 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5437 5438 align(CodeEntryAlignment); 5439 address entry = pc(); 5440 5441 enter(); 5442 5443 // Make room. 5444 cmpw(Rlen, 512); 5445 br(Assembler::HI, argh); 5446 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5447 andr(sp, Ra, -2 * wordSize); 5448 5449 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5450 5451 { 5452 // Copy input args, reversing as we go. We use Ra as a 5453 // temporary variable. 5454 reverse(Ra, Pa_base, Rlen, t0, t1); 5455 reverse(Ra, Pn_base, Rlen, t0, t1); 5456 } 5457 5458 // Push all call-saved registers and also Pm_base which we'll need 5459 // at the end. 5460 save_regs(); 5461 5462 mov(Pm_base, Ra); 5463 5464 mov(t0, zr); 5465 mov(t1, zr); 5466 mov(t2, zr); 5467 5468 block_comment("for (int i = 0; i < len; i++) {"); 5469 mov(Ri, zr); { 5470 Label loop, end; 5471 bind(loop); 5472 cmp(Ri, Rlen); 5473 br(Assembler::GE, end); 5474 5475 pre1(Ri); 5476 5477 block_comment("for (j = (i+1)/2; j; j--) {"); { 5478 add(Rj, Ri, 1); 5479 lsr(Rj, Rj, 1); 5480 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5481 } block_comment(" } // j"); 5482 5483 last_squaring(Ri); 5484 5485 block_comment(" for (j = i/2; j; j--) {"); { 5486 lsr(Rj, Ri, 1); 5487 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5488 } block_comment(" } // j"); 5489 5490 post1_squaring(); 5491 add(Ri, Ri, 1); 5492 cmp(Ri, Rlen); 5493 br(Assembler::LT, loop); 5494 5495 bind(end); 5496 block_comment("} // i"); 5497 } 5498 5499 block_comment("for (int i = len; i < 2*len; i++) {"); 5500 mov(Ri, Rlen); { 5501 Label loop, end; 5502 bind(loop); 5503 cmp(Ri, Rlen, Assembler::LSL, 1); 5504 br(Assembler::GE, end); 5505 5506 pre2(Ri, Rlen); 5507 5508 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5509 lsl(Rj, Rlen, 1); 5510 sub(Rj, Rj, Ri); 5511 sub(Rj, Rj, 1); 5512 lsr(Rj, Rj, 1); 5513 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5514 } block_comment(" } // j"); 5515 5516 last_squaring(Ri); 5517 5518 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5519 lsl(Rj, Rlen, 1); 5520 sub(Rj, Rj, Ri); 5521 lsr(Rj, Rj, 1); 5522 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5523 } block_comment(" } // j"); 5524 5525 post2(Ri, Rlen); 5526 add(Ri, Ri, 1); 5527 cmp(Ri, Rlen, Assembler::LSL, 1); 5528 5529 br(Assembler::LT, loop); 5530 bind(end); 5531 block_comment("} // i"); 5532 } 5533 5534 normalize(Rlen); 5535 5536 
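    // For reference, a hedged C sketch (an assumption, not code from
    // this file) of what reverse()/reverse1() do; the copy-back just
    // below and the argument copies above rely on it. It reverses the
    // order of the len 64-bit words while swapping the 32-bit halves
    // of each word, so that an array of 32-bit digits stored
    // most-significant-first can be processed as 64-bit digits
    // least-significant-first:
    //
    //   static void reverse(unsigned long d[], const unsigned long s[],
    //                       int len) {
    //     for (int i = 0; i < len; i++) {
    //       unsigned long w = s[len - 1 - i];
    //       d[i] = (w << 32) | (w >> 32);   // ror(tmp, tmp, 32)
    //     }
    //   }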
mov(Ra, Pm_base); // Save Pm_base in Ra 5537 restore_regs(); // Restore caller's Pm_base 5538 5539 // Copy our result into caller's Pm_base 5540 reverse(Pm_base, Ra, Rlen, t0, t1); 5541 5542 leave(); 5543 ret(lr); 5544 5545 return entry; 5546 } 5547 // In C, approximately: 5548 5549 // void 5550 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 5551 // unsigned long Pm_base[], unsigned long inv, int len) { 5552 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5553 // unsigned long *Pa, *Pb, *Pn, *Pm; 5554 // unsigned long Ra, Rb, Rn, Rm; 5555 5556 // int i; 5557 5558 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5559 5560 // for (i = 0; i < len; i++) { 5561 // int j; 5562 5563 // Pa = Pa_base; 5564 // Pb = Pa_base + i; 5565 // Pm = Pm_base; 5566 // Pn = Pn_base + i; 5567 5568 // Ra = *Pa; 5569 // Rb = *Pb; 5570 // Rm = *Pm; 5571 // Rn = *Pn; 5572 5573 // int iters = (i+1)/2; 5574 // for (j = 0; iters--; j++) { 5575 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5576 // MACC2(Ra, Rb, t0, t1, t2); 5577 // Ra = *++Pa; 5578 // Rb = *--Pb; 5579 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5580 // MACC(Rm, Rn, t0, t1, t2); 5581 // Rm = *++Pm; 5582 // Rn = *--Pn; 5583 // } 5584 // if ((i & 1) == 0) { 5585 // assert(Ra == Pa_base[j], "must be"); 5586 // MACC(Ra, Ra, t0, t1, t2); 5587 // } 5588 // iters = i/2; 5589 // assert(iters == i-j, "must be"); 5590 // for (; iters--; j++) { 5591 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5592 // MACC(Rm, Rn, t0, t1, t2); 5593 // Rm = *++Pm; 5594 // Rn = *--Pn; 5595 // } 5596 5597 // *Pm = Rm = t0 * inv; 5598 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5599 // MACC(Rm, Rn, t0, t1, t2); 5600 5601 // assert(t0 == 0, "broken Montgomery multiply"); 5602 5603 // t0 = t1; t1 = t2; t2 = 0; 5604 // } 5605 5606 // for (i = len; i < 2*len; i++) { 5607 // int start = i-len+1; 5608 // int end = start + (len - start)/2; 5609 // int j; 5610 5611 // Pa = Pa_base + i-len; 5612 // Pb = Pa_base + len; 5613 // Pm = Pm_base + i-len; 5614 // Pn = Pn_base + len; 5615 5616 // Ra = *++Pa; 5617 // Rb = *--Pb; 5618 // Rm = *++Pm; 5619 // Rn = *--Pn; 5620 5621 // int iters = (2*len-i-1)/2; 5622 // assert(iters == end-start, "must be"); 5623 // for (j = start; iters--; j++) { 5624 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5625 // MACC2(Ra, Rb, t0, t1, t2); 5626 // Ra = *++Pa; 5627 // Rb = *--Pb; 5628 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5629 // MACC(Rm, Rn, t0, t1, t2); 5630 // Rm = *++Pm; 5631 // Rn = *--Pn; 5632 // } 5633 // if ((i & 1) == 0) { 5634 // assert(Ra == Pa_base[j], "must be"); 5635 // MACC(Ra, Ra, t0, t1, t2); 5636 // } 5637 // iters = (2*len-i)/2; 5638 // assert(iters == len-j, "must be"); 5639 // for (; iters--; j++) { 5640 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5641 // MACC(Rm, Rn, t0, t1, t2); 5642 // Rm = *++Pm; 5643 // Rn = *--Pn; 5644 // } 5645 // Pm_base[i-len] = t0; 5646 // t0 = t1; t1 = t2; t2 = 0; 5647 // } 5648 5649 // while (t0) 5650 // t0 = sub(Pm_base, Pn_base, t0, len); 5651 // } 5652 }; 5653 5654 5655 // Call here from the interpreter or compiled code to either load 5656 // multiple returned values from the value type instance being 5657 // returned to registers or to store returned values to a newly 5658 // allocated value type instance. 
5659 address generate_return_value_stub(address destination, const char* name, bool has_res) { 5660 5661 // Information about frame layout at time of blocking runtime call. 5662 // Note that we only have to preserve callee-saved registers since 5663 // the compilers are responsible for supplying a continuation point 5664 // if they expect all registers to be preserved. 5665 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 5666 enum layout { 5667 rfp_off = 0, rfp_off2, 5668 5669 j_rarg7_off, j_rarg7_2, 5670 j_rarg6_off, j_rarg6_2, 5671 j_rarg5_off, j_rarg5_2, 5672 j_rarg4_off, j_rarg4_2, 5673 j_rarg3_off, j_rarg3_2, 5674 j_rarg2_off, j_rarg2_2, 5675 j_rarg1_off, j_rarg1_2, 5676 j_rarg0_off, j_rarg0_2, 5677 5678 j_farg0_off, j_farg0_2, 5679 j_farg1_off, j_farg1_2, 5680 j_farg2_off, j_farg2_2, 5681 j_farg3_off, j_farg3_2, 5682 j_farg4_off, j_farg4_2, 5683 j_farg5_off, j_farg5_2, 5684 j_farg6_off, j_farg6_2, 5685 j_farg7_off, j_farg7_2, 5686 5687 return_off, return_off2, 5688 framesize // inclusive of return address 5689 }; 5690 5691 int insts_size = 512; 5692 int locs_size = 64; 5693 5694 CodeBuffer code(name, insts_size, locs_size); 5695 OopMapSet* oop_maps = new OopMapSet(); 5696 MacroAssembler* masm = new MacroAssembler(&code); 5697 5698 address start = __ pc(); 5699 5700 const Address f7_save (rfp, j_farg7_off * wordSize); 5701 const Address f6_save (rfp, j_farg6_off * wordSize); 5702 const Address f5_save (rfp, j_farg5_off * wordSize); 5703 const Address f4_save (rfp, j_farg4_off * wordSize); 5704 const Address f3_save (rfp, j_farg3_off * wordSize); 5705 const Address f2_save (rfp, j_farg2_off * wordSize); 5706 const Address f1_save (rfp, j_farg1_off * wordSize); 5707 const Address f0_save (rfp, j_farg0_off * wordSize); 5708 5709 const Address r0_save (rfp, j_rarg0_off * wordSize); 5710 const Address r1_save (rfp, j_rarg1_off * wordSize); 5711 const Address r2_save (rfp, j_rarg2_off * wordSize); 5712 const Address r3_save (rfp, j_rarg3_off * wordSize); 5713 const Address r4_save (rfp, j_rarg4_off * wordSize); 5714 const Address r5_save (rfp, j_rarg5_off * wordSize); 5715 const Address r6_save (rfp, j_rarg6_off * wordSize); 5716 const Address r7_save (rfp, j_rarg7_off * wordSize); 5717 5718 // Generate oop map 5719 OopMap* map = new OopMap(framesize, 0); 5720 5721 map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg()); 5722 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 5723 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 5724 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 5725 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 5726 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 5727 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 5728 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 5729 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 5730 5731 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 5732 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 5733 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 5734 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 5735 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 5736 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), 
j_farg5->as_VMReg()); 5737 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 5738 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 5739 5740 // This is an inlined and slightly modified version of call_VM 5741 // which has the ability to fetch the return PC out of 5742 // thread-local storage and also sets up last_Java_sp slightly 5743 // differently than the real call_VM 5744 5745 __ enter(); // Save FP and LR before call 5746 5747 assert(is_even(framesize/2), "sp not 16-byte aligned"); 5748 5749 // lr and fp are already in place 5750 __ sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt); // prolog 5751 5752 __ strd(j_farg7, f7_save); 5753 __ strd(j_farg6, f6_save); 5754 __ strd(j_farg5, f5_save); 5755 __ strd(j_farg4, f4_save); 5756 __ strd(j_farg3, f3_save); 5757 __ strd(j_farg2, f2_save); 5758 __ strd(j_farg1, f1_save); 5759 __ strd(j_farg0, f0_save); 5760 5761 __ str(j_rarg0, r0_save); 5762 __ str(j_rarg1, r1_save); 5763 __ str(j_rarg2, r2_save); 5764 __ str(j_rarg3, r3_save); 5765 __ str(j_rarg4, r4_save); 5766 __ str(j_rarg5, r5_save); 5767 __ str(j_rarg6, r6_save); 5768 __ str(j_rarg7, r7_save); 5769 5770 int frame_complete = __ pc() - start; 5771 5772 // Set up last_Java_sp and last_Java_fp 5773 address the_pc = __ pc(); 5774 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 5775 5776 // Call runtime 5777 __ mov(c_rarg0, rthread); 5778 __ mov(c_rarg1, r0); 5779 5780 BLOCK_COMMENT("call runtime_entry"); 5781 __ mov(rscratch1, destination); 5782 __ blrt(rscratch1, 2 /* number_of_arguments */, 0, 1); 5783 5784 oop_maps->add_gc_map(the_pc - start, map); 5785 5786 __ reset_last_Java_frame(false); 5787 __ maybe_isb(); 5788 5789 __ ldrd(j_farg7, f7_save); 5790 __ ldrd(j_farg6, f6_save); 5791 __ ldrd(j_farg5, f5_save); 5792 __ ldrd(j_farg4, f4_save); 5793 __ ldrd(j_farg3, f3_save); 5794 __ ldrd(j_farg2, f2_save); 5795 __ ldrd(j_farg1, f1_save); 5796 __ ldrd(j_farg0, f0_save); 5797 5798 __ ldr(j_rarg0, r0_save); 5799 __ ldr(j_rarg1, r1_save); 5800 __ ldr(j_rarg2, r2_save); 5801 __ ldr(j_rarg3, r3_save); 5802 __ ldr(j_rarg4, r4_save); 5803 __ ldr(j_rarg5, r5_save); 5804 __ ldr(j_rarg6, r6_save); 5805 __ ldr(j_rarg7, r7_save); 5806 5807 __ leave(); 5808 5809 // check for pending exceptions 5810 Label pending; 5811 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 5812 __ cmp(rscratch1, (u1)NULL_WORD); 5813 __ br(Assembler::NE, pending); 5814 5815 if (has_res) { 5816 __ get_vm_result(r0, rthread); 5817 } 5818 __ ret(lr); 5819 5820 __ bind(pending); 5821 __ ldr(r0, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 5822 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 5823 5824 5825 // codeBlob framesize is in words (not VMRegImpl::slot_size) 5826 int frame_size_in_words = (framesize >> (LogBytesPerWord - LogBytesPerInt)); 5827 RuntimeStub* stub = 5828 RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false); 5829 5830 return stub->entry_point(); 5831 } 5832 5833 // Initialization 5834 void generate_initial() { 5835 // Generate initial stubs and initialize the entry points 5836 5837 // entry points that exist in all platforms. Note: This is code 5838 // that could be shared among different platforms - however the 5839 // benefit seems to be smaller than the disadvantage of having a 5840 // much more complicated generator structure. See also comment in 5841 // stubRoutines.hpp.
5842 5843 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5844 5845 StubRoutines::_call_stub_entry = 5846 generate_call_stub(StubRoutines::_call_stub_return_address); 5847 5848 // is referenced by megamorphic call 5849 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5850 5851 // Build this early so it's available for the interpreter. 5852 StubRoutines::_throw_StackOverflowError_entry = 5853 generate_throw_exception("StackOverflowError throw_exception", 5854 CAST_FROM_FN_PTR(address, 5855 SharedRuntime::throw_StackOverflowError)); 5856 StubRoutines::_throw_delayed_StackOverflowError_entry = 5857 generate_throw_exception("delayed StackOverflowError throw_exception", 5858 CAST_FROM_FN_PTR(address, 5859 SharedRuntime::throw_delayed_StackOverflowError)); 5860 if (UseCRC32Intrinsics) { 5861 // set table address before stub generation, which uses it 5862 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5863 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5864 } 5865 5866 if (UseCRC32CIntrinsics) { 5867 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5868 } 5869 5870 // Disabled until JDK-8210858 is fixed 5871 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5872 // StubRoutines::_dlog = generate_dlog(); 5873 // } 5874 5875 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5876 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 5877 } 5878 5879 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5880 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 5881 } 5882 5883 5884 StubRoutines::_load_value_type_fields_in_regs = 5885 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_value_type_fields_in_regs), "load_value_type_fields_in_regs", false); 5886 StubRoutines::_store_value_type_fields_to_buf = 5887 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_value_type_fields_to_buf), "store_value_type_fields_to_buf", true); 5888 } 5889 5890 void generate_all() { 5891 // support for verify_oop (must happen after universe_init) 5892 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5893 StubRoutines::_throw_AbstractMethodError_entry = 5894 generate_throw_exception("AbstractMethodError throw_exception", 5895 CAST_FROM_FN_PTR(address, 5896 SharedRuntime:: 5897 throw_AbstractMethodError)); 5898 5899 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5900 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5901 CAST_FROM_FN_PTR(address, 5902 SharedRuntime:: 5903 throw_IncompatibleClassChangeError)); 5904 5905 StubRoutines::_throw_NullPointerException_at_call_entry = 5906 generate_throw_exception("NullPointerException at call throw_exception", 5907 CAST_FROM_FN_PTR(address, 5908 SharedRuntime:: 5909 throw_NullPointerException_at_call)); 5910 5911 // arraycopy stubs used by compilers 5912 generate_arraycopy_stubs(); 5913 5914 // has negatives stub for large arrays. 5915 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5916 5917 // array equals stub for large arrays. 5918 if (!UseSimpleArrayEquals) { 5919 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5920 } 5921 5922 generate_compare_long_strings(); 5923 5924 generate_string_indexof_stubs(); 5925 5926 // byte_array_inflate stub for large arrays.
5927 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5928 5929 #ifdef COMPILER2 5930 if (UseMultiplyToLenIntrinsic) { 5931 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5932 } 5933 5934 if (UseSquareToLenIntrinsic) { 5935 StubRoutines::_squareToLen = generate_squareToLen(); 5936 } 5937 5938 if (UseMulAddIntrinsic) { 5939 StubRoutines::_mulAdd = generate_mulAdd(); 5940 } 5941 5942 if (UseMontgomeryMultiplyIntrinsic) { 5943 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5944 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5945 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5946 } 5947 5948 if (UseMontgomerySquareIntrinsic) { 5949 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5950 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5951 // We use generate_multiply() rather than generate_square() 5952 // because it's faster for the sizes of modulus we care about. 5953 StubRoutines::_montgomerySquare = g.generate_multiply(); 5954 } 5955 #endif // COMPILER2 5956 5957 #ifndef BUILTIN_SIM 5958 // generate GHASH intrinsics code 5959 if (UseGHASHIntrinsics) { 5960 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5961 } 5962 5963 if (UseAESIntrinsics) { 5964 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5965 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5966 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5967 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5968 } 5969 5970 if (UseSHA1Intrinsics) { 5971 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5972 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5973 } 5974 if (UseSHA256Intrinsics) { 5975 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5976 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5977 } 5978 5979 // generate Adler32 intrinsics code 5980 if (UseAdler32Intrinsics) { 5981 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5982 } 5983 5984 // Safefetch stubs. 5985 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5986 &StubRoutines::_safefetch32_fault_pc, 5987 &StubRoutines::_safefetch32_continuation_pc); 5988 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5989 &StubRoutines::_safefetchN_fault_pc, 5990 &StubRoutines::_safefetchN_continuation_pc); 5991 #endif 5992 StubRoutines::aarch64::set_completed(); 5993 } 5994 5995 public: 5996 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5997 if (all) { 5998 generate_all(); 5999 } else { 6000 generate_initial(); 6001 } 6002 } 6003 }; // end class declaration 6004 6005 void StubGenerator_generate(CodeBuffer* code, bool all) { 6006 StubGenerator g(code, all); 6007 }
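// For illustration only (a hedged sketch, not code from this file):
// once StubGenerator_generate has run, a generated stub such as the
// Montgomery multiply above can be reached through its StubRoutines
// entry. Assuming the argument layout documented at
// generate_multiply() (the function-pointer type, the 64-bit inv, and
// the helper name are assumptions made for this sketch):
//
//   typedef void (*montgomery_multiply_fn)(jint* a, jint* b, jint* n,
//                                          int len, unsigned long inv,
//                                          jint* m);
//
//   void call_montgomery_multiply(jint* a, jint* b, jint* n, int len,
//                                 unsigned long inv, jint* m) {
//     montgomery_multiply_fn f =
//         (montgomery_multiply_fn)StubRoutines::_montgomeryMultiply;
//     f(a, b, n, len, inv, m);   // the result is written into m
//   }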