1 /* 2 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "gc/shared/barrierSet.hpp" 30 #include "gc/shared/barrierSetAssembler.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "nativeInst_aarch64.hpp" 33 #include "oops/instanceOop.hpp" 34 #include "oops/method.hpp" 35 #include "oops/objArrayKlass.hpp" 36 #include "oops/oop.inline.hpp" 37 #include "prims/methodHandles.hpp" 38 #include "runtime/frame.inline.hpp" 39 #include "runtime/handles.inline.hpp" 40 #include "runtime/sharedRuntime.hpp" 41 #include "runtime/stubCodeGenerator.hpp" 42 #include "runtime/stubRoutines.hpp" 43 #include "runtime/thread.inline.hpp" 44 #include "utilities/align.hpp" 45 #ifdef COMPILER2 46 #include "opto/runtime.hpp" 47 #endif 48 49 #ifdef BUILTIN_SIM 50 #include "../../../../../../simulator/simulator.hpp" 51 #endif 52 53 // Declaration and definition of StubGenerator (no .hpp file). 54 // For a more detailed description of the stub routine structure 55 // see the comment in stubRoutines.hpp 56 57 #undef __ 58 #define __ _masm-> 59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 60 61 #ifdef PRODUCT 62 #define BLOCK_COMMENT(str) /* nothing */ 63 #else 64 #define BLOCK_COMMENT(str) __ block_comment(str) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Stub Code definitions 70 71 class StubGenerator: public StubCodeGenerator { 72 private: 73 74 #ifdef PRODUCT 75 #define inc_counter_np(counter) ((void)0) 76 #else 77 void inc_counter_np_(int& counter) { 78 __ lea(rscratch2, ExternalAddress((address)&counter)); 79 __ ldrw(rscratch1, Address(rscratch2)); 80 __ addw(rscratch1, rscratch1, 1); 81 __ strw(rscratch1, Address(rscratch2)); 82 } 83 #define inc_counter_np(counter) \ 84 BLOCK_COMMENT("inc_counter " #counter); \ 85 inc_counter_np_(counter); 86 #endif 87 88 // Call stubs are used to call Java from C 89 // 90 // Arguments: 91 // c_rarg0: call wrapper address address 92 // c_rarg1: result address 93 // c_rarg2: result type BasicType 94 // c_rarg3: method Method* 95 // c_rarg4: (interpreter) entry point address 96 // c_rarg5: parameters intptr_t* 97 // c_rarg6: parameter size (in words) int 98 // c_rarg7: thread Thread* 99 // 100 // There is no return from the stub itself as any Java result 101 // is written to result 102 // 103 // we save r30 (lr) as the return PC at the base of the frame and 104 // link r29 (fp) below it as the frame pointer installing sp (r31) 105 // into fp. 106 // 107 // we save r0-r7, which accounts for all the c arguments. 108 // 109 // TODO: strictly do we need to save them all? they are treated as 110 // volatile by C so could we omit saving the ones we are going to 111 // place in global registers (thread? method?) or those we only use 112 // during setup of the Java call? 113 // 114 // we don't need to save r8 which C uses as an indirect result location 115 // return register. 116 // 117 // we don't need to save r9-r15 which both C and Java treat as 118 // volatile 119 // 120 // we don't need to save r16-18 because Java does not use them 121 // 122 // we save r19-r28 which Java uses as scratch registers and C 123 // expects to be callee-save 124 // 125 // we save the bottom 64 bits of each value stored in v8-v15; it is 126 // the responsibility of the caller to preserve larger values. 127 // 128 // so the stub frame looks like this when we enter Java code 129 // 130 // [ return_from_Java ] <--- sp 131 // [ argument word n ] 132 // ... 
133 // -27 [ argument word 1 ] 134 // -26 [ saved v15 ] <--- sp_after_call 135 // -25 [ saved v14 ] 136 // -24 [ saved v13 ] 137 // -23 [ saved v12 ] 138 // -22 [ saved v11 ] 139 // -21 [ saved v10 ] 140 // -20 [ saved v9 ] 141 // -19 [ saved v8 ] 142 // -18 [ saved r28 ] 143 // -17 [ saved r27 ] 144 // -16 [ saved r26 ] 145 // -15 [ saved r25 ] 146 // -14 [ saved r24 ] 147 // -13 [ saved r23 ] 148 // -12 [ saved r22 ] 149 // -11 [ saved r21 ] 150 // -10 [ saved r20 ] 151 // -9 [ saved r19 ] 152 // -8 [ call wrapper (r0) ] 153 // -7 [ result (r1) ] 154 // -6 [ result type (r2) ] 155 // -5 [ method (r3) ] 156 // -4 [ entry point (r4) ] 157 // -3 [ parameters (r5) ] 158 // -2 [ parameter size (r6) ] 159 // -1 [ thread (r7) ] 160 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 161 // 1 [ saved lr (r30) ] 162 163 // Call stub stack layout word offsets from fp 164 enum call_stub_layout { 165 sp_after_call_off = -26, 166 167 d15_off = -26, 168 d13_off = -24, 169 d11_off = -22, 170 d9_off = -20, 171 172 r28_off = -18, 173 r26_off = -16, 174 r24_off = -14, 175 r22_off = -12, 176 r20_off = -10, 177 call_wrapper_off = -8, 178 result_off = -7, 179 result_type_off = -6, 180 method_off = -5, 181 entry_point_off = -4, 182 parameter_size_off = -2, 183 thread_off = -1, 184 fp_f = 0, 185 retaddr_off = 1, 186 }; 187 188 address generate_call_stub(address& return_address) { 189 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 190 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 191 "adjust this code"); 192 193 StubCodeMark mark(this, "StubRoutines", "call_stub"); 194 address start = __ pc(); 195 196 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 197 198 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 199 const Address result (rfp, result_off * wordSize); 200 const Address result_type (rfp, result_type_off * wordSize); 201 const Address method (rfp, method_off * wordSize); 202 const Address entry_point (rfp, entry_point_off * wordSize); 203 const Address parameter_size(rfp, parameter_size_off * wordSize); 204 205 const Address thread (rfp, thread_off * wordSize); 206 207 const Address d15_save (rfp, d15_off * wordSize); 208 const Address d13_save (rfp, d13_off * wordSize); 209 const Address d11_save (rfp, d11_off * wordSize); 210 const Address d9_save (rfp, d9_off * wordSize); 211 212 const Address r28_save (rfp, r28_off * wordSize); 213 const Address r26_save (rfp, r26_off * wordSize); 214 const Address r24_save (rfp, r24_off * wordSize); 215 const Address r22_save (rfp, r22_off * wordSize); 216 const Address r20_save (rfp, r20_off * wordSize); 217 218 // stub code 219 220 // we need a C prolog to bootstrap the x86 caller into the sim 221 __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void); 222 223 address aarch64_entry = __ pc(); 224 225 #ifdef BUILTIN_SIM 226 // Save sender's SP for stack traces. 227 __ mov(rscratch1, sp); 228 __ str(rscratch1, Address(__ pre(sp, -2 * wordSize))); 229 #endif 230 // set up frame and move sp to end of save area 231 __ enter(); 232 __ sub(sp, rfp, -sp_after_call_off * wordSize); 233 234 // save register parameters and Java scratch/global registers 235 // n.b. 
we save thread even though it gets installed in 236 // rthread because we want to sanity check rthread later 237 __ str(c_rarg7, thread); 238 __ strw(c_rarg6, parameter_size); 239 __ stp(c_rarg4, c_rarg5, entry_point); 240 __ stp(c_rarg2, c_rarg3, result_type); 241 __ stp(c_rarg0, c_rarg1, call_wrapper); 242 243 __ stp(r20, r19, r20_save); 244 __ stp(r22, r21, r22_save); 245 __ stp(r24, r23, r24_save); 246 __ stp(r26, r25, r26_save); 247 __ stp(r28, r27, r28_save); 248 249 __ stpd(v9, v8, d9_save); 250 __ stpd(v11, v10, d11_save); 251 __ stpd(v13, v12, d13_save); 252 __ stpd(v15, v14, d15_save); 253 254 // install Java thread in global register now we have saved 255 // whatever value it held 256 __ mov(rthread, c_rarg7); 257 // And method 258 __ mov(rmethod, c_rarg3); 259 260 // set up the heapbase register 261 __ reinit_heapbase(); 262 263 #ifdef ASSERT 264 // make sure we have no pending exceptions 265 { 266 Label L; 267 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 268 __ cmp(rscratch1, (u1)NULL_WORD); 269 __ br(Assembler::EQ, L); 270 __ stop("StubRoutines::call_stub: entered with pending exception"); 271 __ BIND(L); 272 } 273 #endif 274 // pass parameters if any 275 __ mov(esp, sp); 276 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 277 __ andr(sp, rscratch1, -2 * wordSize); 278 279 BLOCK_COMMENT("pass parameters if any"); 280 Label parameters_done; 281 // parameter count is still in c_rarg6 282 // and parameter pointer identifying param 1 is in c_rarg5 283 __ cbzw(c_rarg6, parameters_done); 284 285 address loop = __ pc(); 286 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 287 __ subsw(c_rarg6, c_rarg6, 1); 288 __ push(rscratch1); 289 __ br(Assembler::GT, loop); 290 291 __ BIND(parameters_done); 292 293 // call Java entry -- passing methdoOop, and current sp 294 // rmethod: Method* 295 // r13: sender sp 296 BLOCK_COMMENT("call Java function"); 297 __ mov(r13, sp); 298 __ blr(c_rarg4); 299 300 // tell the simulator we have returned to the stub 301 302 // we do this here because the notify will already have been done 303 // if we get to the next instruction via an exception 304 // 305 // n.b. adding this instruction here affects the calculation of 306 // whether or not a routine returns to the call stub (used when 307 // doing stack walks) since the normal test is to check the return 308 // pc against the address saved below. so we may need to allow for 309 // this extra instruction in the check. 310 311 if (NotifySimulator) { 312 __ notify(Assembler::method_reentry); 313 } 314 // save current address for use by exception handling code 315 316 return_address = __ pc(); 317 318 // store result depending on type (everything that is not 319 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 320 // n.b. 
this assumes Java returns an integral result in r0 321 // and a floating result in j_farg0 322 __ ldr(j_rarg2, result); 323 Label is_long, is_float, is_double, exit; 324 __ ldr(j_rarg1, result_type); 325 __ cmp(j_rarg1, (u1)T_OBJECT); 326 __ br(Assembler::EQ, is_long); 327 __ cmp(j_rarg1, (u1)T_LONG); 328 __ br(Assembler::EQ, is_long); 329 __ cmp(j_rarg1, (u1)T_FLOAT); 330 __ br(Assembler::EQ, is_float); 331 __ cmp(j_rarg1, (u1)T_DOUBLE); 332 __ br(Assembler::EQ, is_double); 333 334 // handle T_INT case 335 __ strw(r0, Address(j_rarg2)); 336 337 __ BIND(exit); 338 339 // pop parameters 340 __ sub(esp, rfp, -sp_after_call_off * wordSize); 341 342 #ifdef ASSERT 343 // verify that threads correspond 344 { 345 Label L, S; 346 __ ldr(rscratch1, thread); 347 __ cmp(rthread, rscratch1); 348 __ br(Assembler::NE, S); 349 __ get_thread(rscratch1); 350 __ cmp(rthread, rscratch1); 351 __ br(Assembler::EQ, L); 352 __ BIND(S); 353 __ stop("StubRoutines::call_stub: threads must correspond"); 354 __ BIND(L); 355 } 356 #endif 357 358 // restore callee-save registers 359 __ ldpd(v15, v14, d15_save); 360 __ ldpd(v13, v12, d13_save); 361 __ ldpd(v11, v10, d11_save); 362 __ ldpd(v9, v8, d9_save); 363 364 __ ldp(r28, r27, r28_save); 365 __ ldp(r26, r25, r26_save); 366 __ ldp(r24, r23, r24_save); 367 __ ldp(r22, r21, r22_save); 368 __ ldp(r20, r19, r20_save); 369 370 __ ldp(c_rarg0, c_rarg1, call_wrapper); 371 __ ldrw(c_rarg2, result_type); 372 __ ldr(c_rarg3, method); 373 __ ldp(c_rarg4, c_rarg5, entry_point); 374 __ ldp(c_rarg6, c_rarg7, parameter_size); 375 376 #ifndef PRODUCT 377 // tell the simulator we are about to end Java execution 378 if (NotifySimulator) { 379 __ notify(Assembler::method_exit); 380 } 381 #endif 382 // leave frame and return to caller 383 __ leave(); 384 __ ret(lr); 385 386 // handle return types different from T_INT 387 388 __ BIND(is_long); 389 __ str(r0, Address(j_rarg2, 0)); 390 __ br(Assembler::AL, exit); 391 392 __ BIND(is_float); 393 __ strs(j_farg0, Address(j_rarg2, 0)); 394 __ br(Assembler::AL, exit); 395 396 __ BIND(is_double); 397 __ strd(j_farg0, Address(j_rarg2, 0)); 398 __ br(Assembler::AL, exit); 399 400 return start; 401 } 402 403 // Return point for a Java call if there's an exception thrown in 404 // Java code. The exception is caught and transformed into a 405 // pending exception stored in JavaThread that can be tested from 406 // within the VM. 407 // 408 // Note: Usually the parameters are removed by the callee. In case 409 // of an exception crossing an activation frame boundary, that is 410 // not the case if the callee is compiled code => need to setup the 411 // rsp. 412 // 413 // r0: exception oop 414 415 // NOTE: this is used as a target from the signal handler so it 416 // needs an x86 prolog which returns into the current simulator 417 // executing the generated catch_exception code. so the prolog 418 // needs to install rax in a sim register and adjust the sim's 419 // restart pc to enter the generated code at the start position 420 // then return from native to simulated execution. 
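// In outline the stub below does the equivalent of (a sketch only, not
// additional emitted code):
//
//   rthread->_pending_exception = r0;   // the exception oop
//   rthread->_exception_file    = __FILE__;
//   rthread->_exception_line    = __LINE__;
//   goto *StubRoutines::_call_stub_return_address;
//
// i.e. it records the exception as pending on the current thread and
// resumes the call stub as if the Java call had returned normally,
// leaving the caller to test for and handle the pending exception.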
421 422 address generate_catch_exception() { 423 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 424 address start = __ pc(); 425 426 // same as in generate_call_stub(): 427 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 428 const Address thread (rfp, thread_off * wordSize); 429 430 #ifdef ASSERT 431 // verify that threads correspond 432 { 433 Label L, S; 434 __ ldr(rscratch1, thread); 435 __ cmp(rthread, rscratch1); 436 __ br(Assembler::NE, S); 437 __ get_thread(rscratch1); 438 __ cmp(rthread, rscratch1); 439 __ br(Assembler::EQ, L); 440 __ bind(S); 441 __ stop("StubRoutines::catch_exception: threads must correspond"); 442 __ bind(L); 443 } 444 #endif 445 446 // set pending exception 447 __ verify_oop(r0); 448 449 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 450 __ mov(rscratch1, (address)__FILE__); 451 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 452 __ movw(rscratch1, (int)__LINE__); 453 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 454 455 // complete return to VM 456 assert(StubRoutines::_call_stub_return_address != NULL, 457 "_call_stub_return_address must have been generated before"); 458 __ b(StubRoutines::_call_stub_return_address); 459 460 return start; 461 } 462 463 // Continuation point for runtime calls returning with a pending 464 // exception. The pending exception check happened in the runtime 465 // or native call stub. The pending exception in Thread is 466 // converted into a Java-level exception. 467 // 468 // Contract with Java-level exception handlers: 469 // r0: exception 470 // r3: throwing pc 471 // 472 // NOTE: At entry of this stub, exception-pc must be in LR !! 473 474 // NOTE: this is always used as a jump target within generated code 475 // so it just needs to be generated code wiht no x86 prolog 476 477 address generate_forward_exception() { 478 StubCodeMark mark(this, "StubRoutines", "forward exception"); 479 address start = __ pc(); 480 481 // Upon entry, LR points to the return address returning into 482 // Java (interpreted or compiled) code; i.e., the return address 483 // becomes the throwing pc. 484 // 485 // Arguments pushed before the runtime call are still on the stack 486 // but the exception handler will reset the stack pointer -> 487 // ignore them. A potential result in registers can be ignored as 488 // well. 489 490 #ifdef ASSERT 491 // make sure this code is only executed if there is a pending exception 492 { 493 Label L; 494 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 495 __ cbnz(rscratch1, L); 496 __ stop("StubRoutines::forward exception: no pending exception (1)"); 497 __ bind(L); 498 } 499 #endif 500 501 // compute exception handler into r19 502 503 // call the VM to find the handler address associated with the 504 // caller address. pass thread in r0 and caller pc (ret address) 505 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 506 // the stack. 507 __ mov(c_rarg1, lr); 508 // lr will be trashed by the VM call so we move it to R19 509 // (callee-saved) because we also need to pass it to the handler 510 // returned by this call. 511 __ mov(r19, lr); 512 BLOCK_COMMENT("call exception_handler_for_return_address"); 513 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 514 SharedRuntime::exception_handler_for_return_address), 515 rthread, c_rarg1); 516 // we should not really care that lr is no longer the callee 517 // address. 
we saved the value the handler needs in r19 so we can 518 // just copy it to r3. however, the C2 handler will push its own 519 // frame and then calls into the VM and the VM code asserts that 520 // the PC for the frame above the handler belongs to a compiled 521 // Java method. So, we restore lr here to satisfy that assert. 522 __ mov(lr, r19); 523 // setup r0 & r3 & clear pending exception 524 __ mov(r3, r19); 525 __ mov(r19, r0); 526 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 527 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 528 529 #ifdef ASSERT 530 // make sure exception is set 531 { 532 Label L; 533 __ cbnz(r0, L); 534 __ stop("StubRoutines::forward exception: no pending exception (2)"); 535 __ bind(L); 536 } 537 #endif 538 539 // continue at exception handler 540 // r0: exception 541 // r3: throwing pc 542 // r19: exception handler 543 __ verify_oop(r0); 544 __ br(r19); 545 546 return start; 547 } 548 549 // Non-destructive plausibility checks for oops 550 // 551 // Arguments: 552 // r0: oop to verify 553 // rscratch1: error message 554 // 555 // Stack after saving c_rarg3: 556 // [tos + 0]: saved c_rarg3 557 // [tos + 1]: saved c_rarg2 558 // [tos + 2]: saved lr 559 // [tos + 3]: saved rscratch2 560 // [tos + 4]: saved r0 561 // [tos + 5]: saved rscratch1 562 address generate_verify_oop() { 563 564 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 565 address start = __ pc(); 566 567 Label exit, error; 568 569 // save c_rarg2 and c_rarg3 570 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 571 572 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 573 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 574 __ ldr(c_rarg3, Address(c_rarg2)); 575 __ add(c_rarg3, c_rarg3, 1); 576 __ str(c_rarg3, Address(c_rarg2)); 577 578 // object is in r0 579 // make sure object is 'reasonable' 580 __ cbz(r0, exit); // if obj is NULL it is OK 581 582 // Check if the oop is in the right area of memory 583 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 584 __ andr(c_rarg2, r0, c_rarg3); 585 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 586 587 // Compare c_rarg2 and c_rarg3. We don't use a compare 588 // instruction here because the flags register is live. 589 __ eor(c_rarg2, c_rarg2, c_rarg3); 590 __ cbnz(c_rarg2, error); 591 592 // make sure klass is 'reasonable', which is not zero. 593 __ load_klass(r0, r0); // get klass 594 __ cbz(r0, error); // if klass is NULL it is broken 595 596 // return if everything seems ok 597 __ bind(exit); 598 599 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 600 __ ret(lr); 601 602 // handle errors 603 __ bind(error); 604 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 605 606 __ push(RegSet::range(r0, r29), sp); 607 // debug(char* msg, int64_t pc, int64_t regs[]) 608 __ mov(c_rarg0, rscratch1); // pass address of error message 609 __ mov(c_rarg1, lr); // pass return address 610 __ mov(c_rarg2, sp); // pass address of regs on stack 611 #ifndef PRODUCT 612 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 613 #endif 614 BLOCK_COMMENT("call MacroAssembler::debug"); 615 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 616 __ blrt(rscratch1, 3, 0, 1); 617 618 return start; 619 } 620 621 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 622 623 // The inner part of zero_words(). 
This is the bulk operation, 624 // zeroing words in blocks, possibly using DC ZVA to do it. The 625 // caller is responsible for zeroing the last few words. 626 // 627 // Inputs: 628 // r10: the HeapWord-aligned base address of an array to zero. 629 // r11: the count in HeapWords, r11 > 0. 630 // 631 // Returns r10 and r11, adjusted for the caller to clear. 632 // r10: the base address of the tail of words left to clear. 633 // r11: the number of words in the tail. 634 // r11 < MacroAssembler::zero_words_block_size. 635 636 address generate_zero_blocks() { 637 Label done; 638 Label base_aligned; 639 640 Register base = r10, cnt = r11; 641 642 __ align(CodeEntryAlignment); 643 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 644 address start = __ pc(); 645 646 if (UseBlockZeroing) { 647 int zva_length = VM_Version::zva_length(); 648 649 // Ensure ZVA length can be divided by 16. This is required by 650 // the subsequent operations. 651 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 652 653 __ tbz(base, 3, base_aligned); 654 __ str(zr, Address(__ post(base, 8))); 655 __ sub(cnt, cnt, 1); 656 __ bind(base_aligned); 657 658 // Ensure count >= zva_length * 2 so that it still deserves a zva after 659 // alignment. 660 Label small; 661 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 662 __ subs(rscratch1, cnt, low_limit >> 3); 663 __ br(Assembler::LT, small); 664 __ zero_dcache_blocks(base, cnt); 665 __ bind(small); 666 } 667 668 { 669 // Number of stp instructions we'll unroll 670 const int unroll = 671 MacroAssembler::zero_words_block_size / 2; 672 // Clear the remaining blocks. 673 Label loop; 674 __ subs(cnt, cnt, unroll * 2); 675 __ br(Assembler::LT, done); 676 __ bind(loop); 677 for (int i = 0; i < unroll; i++) 678 __ stp(zr, zr, __ post(base, 16)); 679 __ subs(cnt, cnt, unroll * 2); 680 __ br(Assembler::GE, loop); 681 __ bind(done); 682 __ add(cnt, cnt, unroll * 2); 683 } 684 685 __ ret(lr); 686 687 return start; 688 } 689 690 691 typedef enum { 692 copy_forwards = 1, 693 copy_backwards = -1 694 } copy_direction; 695 696 // Bulk copy of blocks of 8 words. 697 // 698 // count is a count of words. 699 // 700 // Precondition: count >= 8 701 // 702 // Postconditions: 703 // 704 // The least significant bit of count contains the remaining count 705 // of words to copy. The rest of count is trash. 706 // 707 // s and d are adjusted to point to the remaining words to copy 708 // 709 void generate_copy_longs(Label &start, Register s, Register d, Register count, 710 copy_direction direction) { 711 int unit = wordSize * direction; 712 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize;
713
714 int offset;
715 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
716 t4 = r7, t5 = r10, t6 = r11, t7 = r12;
717 const Register stride = r13;
718
719 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
720 assert_different_registers(s, d, count, rscratch1);
721
722 Label again, drain;
723 const char *stub_name;
724 if (direction == copy_forwards)
725 stub_name = "forward_copy_longs";
726 else
727 stub_name = "backward_copy_longs";
728
729 __ align(CodeEntryAlignment);
730
731 StubCodeMark mark(this, "StubRoutines", stub_name);
732
733 __ bind(start);
734
735 Label unaligned_copy_long;
736 if (AvoidUnalignedAccesses) {
737 __ tbnz(d, 3, unaligned_copy_long);
738 }
739
740 if (direction == copy_forwards) {
741 __ sub(s, s, bias);
742 __ sub(d, d, bias);
743 }
744
745 #ifdef ASSERT
746 // Make sure we are never given < 8 words
747 {
748 Label L;
749 __ cmp(count, (u1)8);
750 __ br(Assembler::GE, L);
751 __ stop("generate_copy_longs called with < 8 words");
752 __ bind(L);
753 }
754 #endif
755
756 // Fill 8 registers
757 if (UseSIMDForMemoryOps) {
758 __ ldpq(v0, v1, Address(s, 4 * unit));
759 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
760 } else {
761 __ ldp(t0, t1, Address(s, 2 * unit));
762 __ ldp(t2, t3, Address(s, 4 * unit));
763 __ ldp(t4, t5, Address(s, 6 * unit));
764 __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
765 }
766
767 __ subs(count, count, 16);
768 __ br(Assembler::LO, drain);
769
770 int prefetch = PrefetchCopyIntervalInBytes;
771 bool use_stride = false;
772 if (direction == copy_backwards) {
773 use_stride = prefetch > 256;
774 prefetch = -prefetch;
775 if (use_stride) __ mov(stride, prefetch);
776 }
777
778 __ bind(again);
779
780 if (PrefetchCopyIntervalInBytes > 0)
781 __ prfm(use_stride ?
Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
782
783 if (UseSIMDForMemoryOps) {
784 __ stpq(v0, v1, Address(d, 4 * unit));
785 __ ldpq(v0, v1, Address(s, 4 * unit));
786 __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
787 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
788 } else {
789 __ stp(t0, t1, Address(d, 2 * unit));
790 __ ldp(t0, t1, Address(s, 2 * unit));
791 __ stp(t2, t3, Address(d, 4 * unit));
792 __ ldp(t2, t3, Address(s, 4 * unit));
793 __ stp(t4, t5, Address(d, 6 * unit));
794 __ ldp(t4, t5, Address(s, 6 * unit));
795 __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
796 __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
797 }
798
799 __ subs(count, count, 8);
800 __ br(Assembler::HS, again);
801
802 // Drain
803 __ bind(drain);
804 if (UseSIMDForMemoryOps) {
805 __ stpq(v0, v1, Address(d, 4 * unit));
806 __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
807 } else {
808 __ stp(t0, t1, Address(d, 2 * unit));
809 __ stp(t2, t3, Address(d, 4 * unit));
810 __ stp(t4, t5, Address(d, 6 * unit));
811 __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
812 }
813
814 {
815 Label L1, L2;
816 __ tbz(count, exact_log2(4), L1);
817 if (UseSIMDForMemoryOps) {
818 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
819 __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
820 } else {
821 __ ldp(t0, t1, Address(s, 2 * unit));
822 __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
823 __ stp(t0, t1, Address(d, 2 * unit));
824 __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
825 }
826 __ bind(L1);
827
828 if (direction == copy_forwards) {
829 __ add(s, s, bias);
830 __ add(d, d, bias);
831 }
832
833 __ tbz(count, 1, L2);
834 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
835 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
836 __ bind(L2);
837 }
838
839 __ ret(lr);
840
841 if (AvoidUnalignedAccesses) {
842 Label drain, again;
843 // Register order for storing. Order is different for backward copy.
844
845 __ bind(unaligned_copy_long);
846
847 // source address is even aligned, target odd aligned
848 //
849 // when forward copying word pairs we read long pairs at offsets
850 // {0, 2, 4, 6} (in long words). when backwards copying we read
851 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
852 // address by -2 in the forwards case so we can compute the
853 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
854 // or -1.
855 //
856 // when forward copying we need to store 1 word, 3 pairs and
857 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
858 // zero offset we adjust the destination by -1 which means we
859 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
860 //
861 // When backwards copying we need to store 1 word, 3 pairs and
862 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
863 // offsets {1, 3, 5, 7, 8} * unit.
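// Concretely, for the forwards case (unit == 1): after the -16/-8 bias
// applied just below, each full iteration loads the 64 byte source
// block with ldp at (biased) s offsets {2, 4, 6, 8} * 8 and stores it
// with str/stp/stp/stp/str at (biased) d offsets {1, 2, 4, 6, 8} * 8,
// so every stp lands on a 16 byte boundary even though d itself is
// only 8 byte aligned.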
864 865 if (direction == copy_forwards) { 866 __ sub(s, s, 16); 867 __ sub(d, d, 8); 868 } 869 870 // Fill 8 registers 871 // 872 // for forwards copy s was offset by -16 from the original input 873 // value of s so the register contents are at these offsets 874 // relative to the 64 bit block addressed by that original input 875 // and so on for each successive 64 byte block when s is updated 876 // 877 // t0 at offset 0, t1 at offset 8 878 // t2 at offset 16, t3 at offset 24 879 // t4 at offset 32, t5 at offset 40 880 // t6 at offset 48, t7 at offset 56 881 882 // for backwards copy s was not offset so the register contents 883 // are at these offsets into the preceding 64 byte block 884 // relative to that original input and so on for each successive 885 // preceding 64 byte block when s is updated. this explains the 886 // slightly counter-intuitive looking pattern of register usage 887 // in the stp instructions for backwards copy. 888 // 889 // t0 at offset -16, t1 at offset -8 890 // t2 at offset -32, t3 at offset -24 891 // t4 at offset -48, t5 at offset -40 892 // t6 at offset -64, t7 at offset -56 893 894 __ ldp(t0, t1, Address(s, 2 * unit)); 895 __ ldp(t2, t3, Address(s, 4 * unit)); 896 __ ldp(t4, t5, Address(s, 6 * unit)); 897 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 898 899 __ subs(count, count, 16); 900 __ br(Assembler::LO, drain); 901 902 int prefetch = PrefetchCopyIntervalInBytes; 903 bool use_stride = false; 904 if (direction == copy_backwards) { 905 use_stride = prefetch > 256; 906 prefetch = -prefetch; 907 if (use_stride) __ mov(stride, prefetch); 908 } 909 910 __ bind(again); 911 912 if (PrefetchCopyIntervalInBytes > 0) 913 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 914 915 if (direction == copy_forwards) { 916 // allowing for the offset of -8 the store instructions place 917 // registers into the target 64 bit block at the following 918 // offsets 919 // 920 // t0 at offset 0 921 // t1 at offset 8, t2 at offset 16 922 // t3 at offset 24, t4 at offset 32 923 // t5 at offset 40, t6 at offset 48 924 // t7 at offset 56 925 926 __ str(t0, Address(d, 1 * unit)); 927 __ stp(t1, t2, Address(d, 2 * unit)); 928 __ ldp(t0, t1, Address(s, 2 * unit)); 929 __ stp(t3, t4, Address(d, 4 * unit)); 930 __ ldp(t2, t3, Address(s, 4 * unit)); 931 __ stp(t5, t6, Address(d, 6 * unit)); 932 __ ldp(t4, t5, Address(s, 6 * unit)); 933 __ str(t7, Address(__ pre(d, 8 * unit))); 934 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 935 } else { 936 // d was not offset when we started so the registers are 937 // written into the 64 bit block preceding d with the following 938 // offsets 939 // 940 // t1 at offset -8 941 // t3 at offset -24, t0 at offset -16 942 // t5 at offset -48, t2 at offset -32 943 // t7 at offset -56, t4 at offset -48 944 // t6 at offset -64 945 // 946 // note that this matches the offsets previously noted for the 947 // loads 948 949 __ str(t1, Address(d, 1 * unit)); 950 __ stp(t3, t0, Address(d, 3 * unit)); 951 __ ldp(t0, t1, Address(s, 2 * unit)); 952 __ stp(t5, t2, Address(d, 5 * unit)); 953 __ ldp(t2, t3, Address(s, 4 * unit)); 954 __ stp(t7, t4, Address(d, 7 * unit)); 955 __ ldp(t4, t5, Address(s, 6 * unit)); 956 __ str(t6, Address(__ pre(d, 8 * unit))); 957 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 958 } 959 960 __ subs(count, count, 8); 961 __ br(Assembler::HS, again); 962 963 // Drain 964 // 965 // this uses the same pattern of offsets and register arguments 966 // as above 967 __ bind(drain); 968 if (direction == copy_forwards) { 
969 __ str(t0, Address(d, 1 * unit));
970 __ stp(t1, t2, Address(d, 2 * unit));
971 __ stp(t3, t4, Address(d, 4 * unit));
972 __ stp(t5, t6, Address(d, 6 * unit));
973 __ str(t7, Address(__ pre(d, 8 * unit)));
974 } else {
975 __ str(t1, Address(d, 1 * unit));
976 __ stp(t3, t0, Address(d, 3 * unit));
977 __ stp(t5, t2, Address(d, 5 * unit));
978 __ stp(t7, t4, Address(d, 7 * unit));
979 __ str(t6, Address(__ pre(d, 8 * unit)));
980 }
981 // now we need to copy any remaining part block which may
982 // include a 4 word subblock and/or a 2 word subblock.
983 // bits 2 and 1 in the count are the tell-tale for whether we
984 // have each such subblock
985 {
986 Label L1, L2;
987 __ tbz(count, exact_log2(4), L1);
988 // this is the same as above but copying only 4 longs hence
989 // with only one intervening stp between the str instructions
990 // but note that the offsets and registers still follow the
991 // same pattern
992 __ ldp(t0, t1, Address(s, 2 * unit));
993 __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
994 if (direction == copy_forwards) {
995 __ str(t0, Address(d, 1 * unit));
996 __ stp(t1, t2, Address(d, 2 * unit));
997 __ str(t3, Address(__ pre(d, 4 * unit)));
998 } else {
999 __ str(t1, Address(d, 1 * unit));
1000 __ stp(t3, t0, Address(d, 3 * unit));
1001 __ str(t2, Address(__ pre(d, 4 * unit)));
1002 }
1003 __ bind(L1);
1004
1005 __ tbz(count, 1, L2);
1006 // this is the same as above but copying only 2 longs hence
1007 // there is no intervening stp between the str instructions
1008 // but note that the offset and register patterns are still
1009 // the same
1010 __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1011 if (direction == copy_forwards) {
1012 __ str(t0, Address(d, 1 * unit));
1013 __ str(t1, Address(__ pre(d, 2 * unit)));
1014 } else {
1015 __ str(t1, Address(d, 1 * unit));
1016 __ str(t0, Address(__ pre(d, 2 * unit)));
1017 }
1018 __ bind(L2);
1019
1020 // for forwards copy we need to re-adjust the offsets we
1021 // applied so that s and d follow the last words written
1022
1023 if (direction == copy_forwards) {
1024 __ add(s, s, 16);
1025 __ add(d, d, 8);
1026 }
1027
1028 }
1029
1030 __ ret(lr);
1031 }
1032 }
1033
1034 // Small copy: less than 16 bytes.
1035 //
1036 // NB: Ignores all of the bits of count which represent more than 15
1037 // bytes, so a caller doesn't have to mask them.
1038
1039 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1040 bool is_backwards = step < 0;
1041 size_t granularity = uabs(step);
1042 int direction = is_backwards ? -1 : 1;
1043 int unit = wordSize * direction;
1044
1045 Label Lword, Lint, Lshort, Lbyte;
1046
1047 assert(granularity
1048 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1049
1050 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1051
1052 // ??? I don't know if this bit-test-and-branch is the right thing
1053 // to do. It does a lot of jumping, resulting in several
1054 // mispredicted branches. It might make more sense to do this
1055 // with something like Duff's device with a single computed branch.
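// Roughly, for the forwards case the cascade below expands to
// (a sketch in C, assuming granularity divides each chunk size):
//
//   if (count & (8 / granularity)) copy 8 bytes;
//   if (count & (4 / granularity)) copy 4 bytes;  // only emitted if granularity <= 4
//   if (count & (2 / granularity)) copy 2 bytes;  // only emitted if granularity <= 2
//   if (count & 1)                 copy 1 byte;   // only emitted if granularity == 1
//
// each tbz tests the bit of the residual element count that corresponds
// to one power-of-two chunk and skips that chunk's copy if it is clear.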
1056 1057 __ tbz(count, 3 - exact_log2(granularity), Lword); 1058 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1059 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1060 __ bind(Lword); 1061 1062 if (granularity <= sizeof (jint)) { 1063 __ tbz(count, 2 - exact_log2(granularity), Lint); 1064 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1065 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1066 __ bind(Lint); 1067 } 1068 1069 if (granularity <= sizeof (jshort)) { 1070 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1071 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1072 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1073 __ bind(Lshort); 1074 } 1075 1076 if (granularity <= sizeof (jbyte)) { 1077 __ tbz(count, 0, Lbyte); 1078 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1079 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1080 __ bind(Lbyte); 1081 } 1082 } 1083 1084 Label copy_f, copy_b; 1085 1086 // All-singing all-dancing memory copy. 1087 // 1088 // Copy count units of memory from s to d. The size of a unit is 1089 // step, which can be positive or negative depending on the direction 1090 // of copy. If is_aligned is false, we align the source address. 1091 // 1092 1093 void copy_memory(bool is_aligned, Register s, Register d, 1094 Register count, Register tmp, int step) { 1095 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1096 bool is_backwards = step < 0; 1097 int granularity = uabs(step); 1098 const Register t0 = r3, t1 = r4; 1099 1100 // <= 96 bytes do inline. Direction doesn't matter because we always 1101 // load all the data before writing anything 1102 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1103 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1104 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1105 const Register send = r17, dend = r18; 1106 1107 if (PrefetchCopyIntervalInBytes > 0) 1108 __ prfm(Address(s, 0), PLDL1KEEP); 1109 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity));
1110 __ br(Assembler::HI, copy_big);
1111
1112 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1113 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1114
1115 __ cmp(count, u1(16/granularity));
1116 __ br(Assembler::LS, copy16);
1117
1118 __ cmp(count, u1(64/granularity));
1119 __ br(Assembler::HI, copy80);
1120
1121 __ cmp(count, u1(32/granularity));
1122 __ br(Assembler::LS, copy32);
1123
1124 // 33..64 bytes
1125 if (UseSIMDForMemoryOps) {
1126 __ ldpq(v0, v1, Address(s, 0));
1127 __ ldpq(v2, v3, Address(send, -32));
1128 __ stpq(v0, v1, Address(d, 0));
1129 __ stpq(v2, v3, Address(dend, -32));
1130 } else {
1131 __ ldp(t0, t1, Address(s, 0));
1132 __ ldp(t2, t3, Address(s, 16));
1133 __ ldp(t4, t5, Address(send, -32));
1134 __ ldp(t6, t7, Address(send, -16));
1135
1136 __ stp(t0, t1, Address(d, 0));
1137 __ stp(t2, t3, Address(d, 16));
1138 __ stp(t4, t5, Address(dend, -32));
1139 __ stp(t6, t7, Address(dend, -16));
1140 }
1141 __ b(finish);
1142
1143 // 17..32 bytes
1144 __ bind(copy32);
1145 __ ldp(t0, t1, Address(s, 0));
1146 __ ldp(t2, t3, Address(send, -16));
1147 __ stp(t0, t1, Address(d, 0));
1148 __ stp(t2, t3, Address(dend, -16));
1149 __ b(finish);
1150
1151 // 65..80/96 bytes
1152 // (96 bytes if SIMD because we do 32 bytes per instruction)
1153 __ bind(copy80);
1154 if (UseSIMDForMemoryOps) {
1155 __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1156 __ ldpq(v4, v5, Address(send, -32));
1157 __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1158 __ stpq(v4, v5, Address(dend, -32));
1159 } else {
1160 __ ldp(t0, t1, Address(s, 0));
1161 __ ldp(t2, t3, Address(s, 16));
1162 __ ldp(t4, t5, Address(s, 32));
1163 __ ldp(t6, t7, Address(s, 48));
1164 __ ldp(t8, t9, Address(send, -16));
1165
1166 __ stp(t0, t1, Address(d, 0));
1167 __ stp(t2, t3, Address(d, 16));
1168 __ stp(t4, t5, Address(d, 32));
1169 __ stp(t6, t7, Address(d, 48));
1170 __ stp(t8, t9, Address(dend, -16));
1171 }
1172 __ b(finish);
1173
1174 // 0..16 bytes
1175 __ bind(copy16);
1176 __ cmp(count, u1(8/granularity));
1177 __ br(Assembler::LO, copy8);
1178
1179 // 8..16 bytes
1180 __ ldr(t0, Address(s, 0));
1181 __ ldr(t1, Address(send, -8));
1182 __ str(t0, Address(d, 0));
1183 __ str(t1, Address(dend, -8));
1184 __ b(finish);
1185
1186 if (granularity < 8) {
1187 // 4..7 bytes
1188 __ bind(copy8);
1189 __ tbz(count, 2 - exact_log2(granularity), copy4);
1190 __ ldrw(t0, Address(s, 0));
1191 __ ldrw(t1, Address(send, -4));
1192 __ strw(t0, Address(d, 0));
1193 __ strw(t1, Address(dend, -4));
1194 __ b(finish);
1195 if (granularity < 4) {
1196 // 0..3 bytes
1197 __ bind(copy4);
1198 __ cbz(count, finish); // get rid of 0 case
1199 if (granularity == 2) {
1200 __ ldrh(t0, Address(s, 0));
1201 __ strh(t0, Address(d, 0));
1202 } else { // granularity == 1
1203 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1204 // the first and last byte.
1205 // Handle the 3 byte case by loading and storing base + count/2
1206 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1207 // This does mean in the 1 byte case we load/store the same
1208 // byte 3 times.
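// For example, with count == 3: count/2 == 1, so the code below copies
// s[0] -> d[0], s[2] -> d[2] (via send/dend - 1) and s[1] -> d[1]
// (via base + count/2); with count == 1 all three transfers hit the
// same single byte.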
1209 __ lsr(count, count, 1); 1210 __ ldrb(t0, Address(s, 0)); 1211 __ ldrb(t1, Address(send, -1)); 1212 __ ldrb(t2, Address(s, count)); 1213 __ strb(t0, Address(d, 0)); 1214 __ strb(t1, Address(dend, -1)); 1215 __ strb(t2, Address(d, count)); 1216 } 1217 __ b(finish); 1218 } 1219 } 1220 1221 __ bind(copy_big); 1222 if (is_backwards) { 1223 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1224 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1225 } 1226 1227 // Now we've got the small case out of the way we can align the 1228 // source address on a 2-word boundary. 1229 1230 Label aligned; 1231 1232 if (is_aligned) { 1233 // We may have to adjust by 1 word to get s 2-word-aligned. 1234 __ tbz(s, exact_log2(wordSize), aligned); 1235 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1236 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1237 __ sub(count, count, wordSize/granularity); 1238 } else { 1239 if (is_backwards) { 1240 __ andr(rscratch2, s, 2 * wordSize - 1); 1241 } else { 1242 __ neg(rscratch2, s); 1243 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1244 } 1245 // rscratch2 is the byte adjustment needed to align s. 1246 __ cbz(rscratch2, aligned); 1247 int shift = exact_log2(granularity); 1248 if (shift) __ lsr(rscratch2, rscratch2, shift); 1249 __ sub(count, count, rscratch2); 1250 1251 #if 0 1252 // ?? This code is only correct for a disjoint copy. It may or 1253 // may not make sense to use it in that case. 1254 1255 // Copy the first pair; s and d may not be aligned. 1256 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1257 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1258 1259 // Align s and d, adjust count 1260 if (is_backwards) { 1261 __ sub(s, s, rscratch2); 1262 __ sub(d, d, rscratch2); 1263 } else { 1264 __ add(s, s, rscratch2); 1265 __ add(d, d, rscratch2); 1266 } 1267 #else 1268 copy_memory_small(s, d, rscratch2, rscratch1, step); 1269 #endif 1270 } 1271 1272 __ bind(aligned); 1273 1274 // s is now 2-word-aligned. 1275 1276 // We have a count of units and some trailing bytes. Adjust the 1277 // count and do a bulk copy of words. 1278 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1279 if (direction == copy_forwards) 1280 __ bl(copy_f); 1281 else 1282 __ bl(copy_b); 1283 1284 // And the tail. 1285 copy_memory_small(s, d, count, tmp, step); 1286 1287 if (granularity >= 8) __ bind(copy8); 1288 if (granularity >= 4) __ bind(copy4); 1289 __ bind(finish); 1290 } 1291 1292 1293 void clobber_registers() { 1294 #ifdef ASSERT 1295 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1296 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1297 for (Register r = r3; r <= r18; r++) 1298 if (r != rscratch1) __ mov(r, rscratch1); 1299 #endif 1300 } 1301 1302 // Scan over array at a for count oops, verifying each one. 1303 // Preserves a and count, clobbers rscratch1 and rscratch2. 
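// n.b. the narrow-oop branch below loads into r16 directly rather than
// temp, so callers are expected to pass r16 as the temp register (as
// the copy stubs in this file do).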
1304 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1305 Label loop, end; 1306 __ mov(rscratch1, a); 1307 __ mov(rscratch2, zr); 1308 __ bind(loop); 1309 __ cmp(rscratch2, count); 1310 __ br(Assembler::HS, end); 1311 if (size == (size_t)wordSize) { 1312 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1313 __ verify_oop(temp); 1314 } else { 1315 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1316 __ decode_heap_oop(temp); // calls verify_oop 1317 } 1318 __ add(rscratch2, rscratch2, size); 1319 __ b(loop); 1320 __ bind(end); 1321 } 1322 1323 // Arguments: 1324 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1325 // ignored 1326 // is_oop - true => oop array, so generate store check code 1327 // name - stub name string 1328 // 1329 // Inputs: 1330 // c_rarg0 - source array address 1331 // c_rarg1 - destination array address 1332 // c_rarg2 - element count, treated as ssize_t, can be zero 1333 // 1334 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1335 // the hardware handle it. The two dwords within qwords that span 1336 // cache line boundaries will still be loaded and stored atomicly. 1337 // 1338 // Side Effects: 1339 // disjoint_int_copy_entry is set to the no-overlap entry point 1340 // used by generate_conjoint_int_oop_copy(). 1341 // 1342 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, 1343 const char *name, bool dest_uninitialized = false) { 1344 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1345 RegSet saved_reg = RegSet::of(s, d, count); 1346 __ align(CodeEntryAlignment); 1347 StubCodeMark mark(this, "StubRoutines", name); 1348 address start = __ pc(); 1349 __ enter(); 1350 1351 if (entry != NULL) { 1352 *entry = __ pc(); 1353 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1354 BLOCK_COMMENT("Entry:"); 1355 } 1356 1357 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1358 if (dest_uninitialized) { 1359 decorators |= IS_DEST_UNINITIALIZED; 1360 } 1361 if (aligned) { 1362 decorators |= ARRAYCOPY_ALIGNED; 1363 } 1364 1365 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1366 bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg); 1367 1368 if (is_oop) { 1369 // save regs before copy_memory 1370 __ push(RegSet::of(d, count), sp); 1371 } 1372 copy_memory(aligned, s, d, count, rscratch1, size); 1373 1374 if (is_oop) { 1375 __ pop(RegSet::of(d, count), sp); 1376 if (VerifyOops) 1377 verify_oop_array(size, d, count, r16); 1378 __ sub(count, count, 1); // make an inclusive end pointer 1379 __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); 1380 } 1381 1382 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1383 1384 __ leave(); 1385 __ mov(r0, zr); // return 0 1386 __ ret(lr); 1387 #ifdef BUILTIN_SIM 1388 { 1389 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1390 sim->notifyCompile(const_cast<char*>(name), start); 1391 } 1392 #endif 1393 return start; 1394 } 1395 1396 // Arguments: 1397 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1398 // ignored 1399 // is_oop - true => oop array, so generate store check code 1400 // name - stub name string 1401 // 1402 // Inputs: 1403 // c_rarg0 - source array address 1404 // c_rarg1 - destination array address 1405 // c_rarg2 - element count, treated as ssize_t, can be zero 1406 // 
1407 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1408 // the hardware handle it. The two dwords within qwords that span
1409 // cache line boundaries will still be loaded and stored atomically.
1410 //
1411 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1412 address *entry, const char *name,
1413 bool dest_uninitialized = false) {
1414 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1415 RegSet saved_regs = RegSet::of(s, d, count);
1416 StubCodeMark mark(this, "StubRoutines", name);
1417 address start = __ pc();
1418 __ enter();
1419
1420 if (entry != NULL) {
1421 *entry = __ pc();
1422 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1423 BLOCK_COMMENT("Entry:");
1424 }
1425
1426 // use fwd copy when (d-s) above_equal (count*size)
1427 __ sub(rscratch1, d, s);
1428 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1429 __ br(Assembler::HS, nooverlap_target);
1430
1431 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1432 if (dest_uninitialized) {
1433 decorators |= IS_DEST_UNINITIALIZED;
1434 }
1435 if (aligned) {
1436 decorators |= ARRAYCOPY_ALIGNED;
1437 }
1438
1439 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1440 bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1441
1442 if (is_oop) {
1443 // save regs before copy_memory
1444 __ push(RegSet::of(d, count), sp);
1445 }
1446 copy_memory(aligned, s, d, count, rscratch1, -size);
1447 if (is_oop) {
1448 __ pop(RegSet::of(d, count), sp);
1449 if (VerifyOops)
1450 verify_oop_array(size, d, count, r16);
1451 __ sub(count, count, 1); // make an inclusive end pointer
1452 __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1453 }
1454 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1455 __ leave();
1456 __ mov(r0, zr); // return 0
1457 __ ret(lr);
1458 #ifdef BUILTIN_SIM
1459 {
1460 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1461 sim->notifyCompile(const_cast<char*>(name), start);
1462 }
1463 #endif
1464 return start;
1465 }
1466
1467 // Arguments:
1468 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1469 // ignored
1470 // name - stub name string
1471 //
1472 // Inputs:
1473 // c_rarg0 - source array address
1474 // c_rarg1 - destination array address
1475 // c_rarg2 - element count, treated as ssize_t, can be zero
1476 //
1477 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1478 // we let the hardware handle it. The one to eight bytes within words,
1479 // dwords or qwords that span cache line boundaries will still be loaded
1480 // and stored atomically.
1481 //
1482 // Side Effects:
1490 // disjoint_byte_copy_entry is set to the no-overlap entry point
1491 // used by generate_conjoint_byte_copy().
1492 // 1493 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1494 const bool not_oop = false; 1495 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1496 } 1497 1498 // Arguments: 1499 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1500 // ignored 1501 // name - stub name string 1502 // 1503 // Inputs: 1504 // c_rarg0 - source array address 1505 // c_rarg1 - destination array address 1506 // c_rarg2 - element count, treated as ssize_t, can be zero 1507 // 1508 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1509 // we let the hardware handle it. The one to eight bytes within words, 1510 // dwords or qwords that span cache line boundaries will still be loaded 1511 // and stored atomically. 1512 // 1513 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1514 address* entry, const char *name) { 1515 const bool not_oop = false; 1516 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1517 } 1518 1519 // Arguments: 1520 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1521 // ignored 1522 // name - stub name string 1523 // 1524 // Inputs: 1525 // c_rarg0 - source array address 1526 // c_rarg1 - destination array address 1527 // c_rarg2 - element count, treated as ssize_t, can be zero 1528 // 1529 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1530 // let the hardware handle it. The two or four words within dwords 1531 // or qwords that span cache line boundaries will still be loaded 1532 // and stored atomically. 1533 // 1534 // Side Effects: 1535 // disjoint_short_copy_entry is set to the no-overlap entry point 1536 // used by generate_conjoint_short_copy(). 1537 // 1538 address generate_disjoint_short_copy(bool aligned, 1539 address* entry, const char *name) { 1540 const bool not_oop = false; 1541 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1542 } 1543 1544 // Arguments: 1545 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1546 // ignored 1547 // name - stub name string 1548 // 1549 // Inputs: 1550 // c_rarg0 - source array address 1551 // c_rarg1 - destination array address 1552 // c_rarg2 - element count, treated as ssize_t, can be zero 1553 // 1554 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1555 // let the hardware handle it. The two or four words within dwords 1556 // or qwords that span cache line boundaries will still be loaded 1557 // and stored atomically. 1558 // 1559 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1560 address *entry, const char *name) { 1561 const bool not_oop = false; 1562 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1563 1564 } 1565 // Arguments: 1566 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1567 // ignored 1568 // name - stub name string 1569 // 1570 // Inputs: 1571 // c_rarg0 - source array address 1572 // c_rarg1 - destination array address 1573 // c_rarg2 - element count, treated as ssize_t, can be zero 1574 // 1575 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1576 // the hardware handle it. The two dwords within qwords that span 1577 // cache line boundaries will still be loaded and stored atomicly. 
1578 // 1579 // Side Effects: 1580 // disjoint_int_copy_entry is set to the no-overlap entry point 1581 // used by generate_conjoint_int_oop_copy(). 1582 // 1583 address generate_disjoint_int_copy(bool aligned, address *entry, 1584 const char *name, bool dest_uninitialized = false) { 1585 const bool not_oop = false; 1586 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1587 } 1588 1589 // Arguments: 1590 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1591 // ignored 1592 // name - stub name string 1593 // 1594 // Inputs: 1595 // c_rarg0 - source array address 1596 // c_rarg1 - destination array address 1597 // c_rarg2 - element count, treated as ssize_t, can be zero 1598 // 1599 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1600 // the hardware handle it. The two dwords within qwords that span 1601 // cache line boundaries will still be loaded and stored atomicly. 1602 // 1603 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1604 address *entry, const char *name, 1605 bool dest_uninitialized = false) { 1606 const bool not_oop = false; 1607 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1608 } 1609 1610 1611 // Arguments: 1612 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1613 // ignored 1614 // name - stub name string 1615 // 1616 // Inputs: 1617 // c_rarg0 - source array address 1618 // c_rarg1 - destination array address 1619 // c_rarg2 - element count, treated as size_t, can be zero 1620 // 1621 // Side Effects: 1622 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1623 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1624 // 1625 address generate_disjoint_long_copy(bool aligned, address *entry, 1626 const char *name, bool dest_uninitialized = false) { 1627 const bool not_oop = false; 1628 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1629 } 1630 1631 // Arguments: 1632 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1633 // ignored 1634 // name - stub name string 1635 // 1636 // Inputs: 1637 // c_rarg0 - source array address 1638 // c_rarg1 - destination array address 1639 // c_rarg2 - element count, treated as size_t, can be zero 1640 // 1641 address generate_conjoint_long_copy(bool aligned, 1642 address nooverlap_target, address *entry, 1643 const char *name, bool dest_uninitialized = false) { 1644 const bool not_oop = false; 1645 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1646 } 1647 1648 // Arguments: 1649 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1650 // ignored 1651 // name - stub name string 1652 // 1653 // Inputs: 1654 // c_rarg0 - source array address 1655 // c_rarg1 - destination array address 1656 // c_rarg2 - element count, treated as size_t, can be zero 1657 // 1658 // Side Effects: 1659 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1660 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1661 // 1662 address generate_disjoint_oop_copy(bool aligned, address *entry, 1663 const char *name, bool dest_uninitialized) { 1664 const bool is_oop = true; 1665 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1666 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1667 } 1668 1669 // Arguments: 1670 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1671 // ignored 1672 // name - stub name string 1673 // 1674 // Inputs: 1675 // c_rarg0 - source array address 1676 // c_rarg1 - destination array address 1677 // c_rarg2 - element count, treated as size_t, can be zero 1678 // 1679 address generate_conjoint_oop_copy(bool aligned, 1680 address nooverlap_target, address *entry, 1681 const char *name, bool dest_uninitialized) { 1682 const bool is_oop = true; 1683 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1684 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1685 name, dest_uninitialized); 1686 } 1687 1688 1689 // Helper for generating a dynamic type check. 1690 // Smashes rscratch1, rscratch2. 1691 void generate_type_check(Register sub_klass, 1692 Register super_check_offset, 1693 Register super_klass, 1694 Label& L_success) { 1695 assert_different_registers(sub_klass, super_check_offset, super_klass); 1696 1697 BLOCK_COMMENT("type_check:"); 1698 1699 Label L_miss; 1700 1701 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1702 super_check_offset); 1703 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1704 1705 // Fall through on failure! 1706 __ BIND(L_miss); 1707 } 1708 1709 // 1710 // Generate checkcasting array copy stub 1711 // 1712 // Input: 1713 // c_rarg0 - source array address 1714 // c_rarg1 - destination array address 1715 // c_rarg2 - element count, treated as ssize_t, can be zero 1716 // c_rarg3 - size_t ckoff (super_check_offset) 1717 // c_rarg4 - oop ckval (super_klass) 1718 // 1719 // Output: 1720 // r0 == 0 - success 1721 // r0 == -1^K - failure, where K is partial transfer count 1722 // 1723 address generate_checkcast_copy(const char *name, address *entry, 1724 bool dest_uninitialized = false) { 1725 1726 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1727 1728 // Input registers (after setup_arg_regs) 1729 const Register from = c_rarg0; // source array address 1730 const Register to = c_rarg1; // destination array address 1731 const Register count = c_rarg2; // elementscount 1732 const Register ckoff = c_rarg3; // super_check_offset 1733 const Register ckval = c_rarg4; // super_klass 1734 1735 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1736 RegSet wb_post_saved_regs = RegSet::of(count); 1737 1738 // Registers used as temps (r18, r19, r20 are save-on-entry) 1739 const Register count_save = r21; // orig elementscount 1740 const Register start_to = r20; // destination array start address 1741 const Register copied_oop = r18; // actual oop copied 1742 const Register r19_klass = r19; // oop._klass 1743 1744 //--------------------------------------------------------------- 1745 // Assembler stub will be used for this call to arraycopy 1746 // if the two arrays are subtypes of Object[] but the 1747 // destination array type is not equal to or a supertype 1748 // of the source type. Each element must be separately 1749 // checked. 
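// For example (illustrative Java, not part of this stub):
//
//   Object[] dst = new String[n];
//   Object[] src = new Object[n];        // may reference non-Strings
//   System.arraycopy(src, 0, dst, 0, n);
//
// every element loaded from src must pass a subtype check against the
// destination element klass (here String) before it may be stored.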
1750 1751 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1752 copied_oop, r19_klass, count_save); 1753 1754 __ align(CodeEntryAlignment); 1755 StubCodeMark mark(this, "StubRoutines", name); 1756 address start = __ pc(); 1757 1758 __ enter(); // required for proper stackwalking of RuntimeStub frame 1759 1760 #ifdef ASSERT 1761 // caller guarantees that the arrays really are different 1762 // otherwise, we would have to make conjoint checks 1763 { Label L; 1764 array_overlap_test(L, TIMES_OOP); 1765 __ stop("checkcast_copy within a single array"); 1766 __ bind(L); 1767 } 1768 #endif //ASSERT 1769 1770 // Caller of this entry point must set up the argument registers. 1771 if (entry != NULL) { 1772 *entry = __ pc(); 1773 BLOCK_COMMENT("Entry:"); 1774 } 1775 1776 // Empty array: Nothing to do. 1777 __ cbz(count, L_done); 1778 1779 __ push(RegSet::of(r18, r19, r20, r21), sp); 1780 1781 #ifdef ASSERT 1782 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1783 // The ckoff and ckval must be mutually consistent, 1784 // even though caller generates both. 1785 { Label L; 1786 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1787 __ ldrw(start_to, Address(ckval, sco_offset)); 1788 __ cmpw(ckoff, start_to); 1789 __ br(Assembler::EQ, L); 1790 __ stop("super_check_offset inconsistent"); 1791 __ bind(L); 1792 } 1793 #endif //ASSERT 1794 1795 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 1796 bool is_oop = true; 1797 if (dest_uninitialized) { 1798 decorators |= IS_DEST_UNINITIALIZED; 1799 } 1800 1801 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1802 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1803 1804 // save the original count 1805 __ mov(count_save, count); 1806 1807 // Copy from low to high addresses 1808 __ mov(start_to, to); // Save destination array start address 1809 __ b(L_load_element); 1810 1811 // ======== begin loop ======== 1812 // (Loop is rotated; its entry is L_load_element.) 1813 // Loop control: 1814 // for (; count != 0; count--) { 1815 // copied_oop = load_heap_oop(from++); 1816 // ... generate_type_check ...; 1817 // store_heap_oop(to++, copied_oop); 1818 // } 1819 __ align(OptoLoopAlignment); 1820 1821 __ BIND(L_store_element); 1822 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1823 __ sub(count, count, 1); 1824 __ cbz(count, L_do_card_marks); 1825 1826 // ======== loop entry is here ======== 1827 __ BIND(L_load_element); 1828 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1829 __ cbz(copied_oop, L_store_element); 1830 1831 __ load_klass(r19_klass, copied_oop);// query the object klass 1832 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1833 // ======== end loop ======== 1834 1835 // It was a real error; we must depend on the caller to finish the job. 1836 // Register count = remaining oops, count_orig = total oops. 1837 // Emit GC store barriers for the oops we have copied and report 1838 // their number to the caller. 
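    //
    // For example (explanatory note only): if the type check fails after
    // K == 3 oops have been stored, the subs below leaves count == 3 and
    // the eon with zr computes ~3 == -4, which reaches r0 via the mov at
    // L_done; the caller recovers K as ~r0.  The EQ branch skips the
    // card-marking epilogue when K == 0, i.e. when nothing at all was
    // stored.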
1839 1840 __ subs(count, count_save, count); // K = partially copied oop count 1841 __ eon(count, count, zr); // report (-1^K) to caller 1842 __ br(Assembler::EQ, L_done_pop); 1843 1844 __ BIND(L_do_card_marks); 1845 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1846 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs); 1847 1848 __ bind(L_done_pop); 1849 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1850 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1851 1852 __ bind(L_done); 1853 __ mov(r0, count); 1854 __ leave(); 1855 __ ret(lr); 1856 1857 return start; 1858 } 1859 1860 // Perform range checks on the proposed arraycopy. 1861 // Kills temp, but nothing else. 1862 // Also, clean the sign bits of src_pos and dst_pos. 1863 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1864 Register src_pos, // source position (c_rarg1) 1865 Register dst, // destination array oo (c_rarg2) 1866 Register dst_pos, // destination position (c_rarg3) 1867 Register length, 1868 Register temp, 1869 Label& L_failed) { 1870 BLOCK_COMMENT("arraycopy_range_checks:"); 1871 1872 assert_different_registers(rscratch1, temp); 1873 1874 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1875 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1876 __ addw(temp, length, src_pos); 1877 __ cmpw(temp, rscratch1); 1878 __ br(Assembler::HI, L_failed); 1879 1880 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1881 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1882 __ addw(temp, length, dst_pos); 1883 __ cmpw(temp, rscratch1); 1884 __ br(Assembler::HI, L_failed); 1885 1886 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1887 __ movw(src_pos, src_pos); 1888 __ movw(dst_pos, dst_pos); 1889 1890 BLOCK_COMMENT("arraycopy_range_checks done"); 1891 } 1892 1893 // These stubs get called from some dumb test routine. 1894 // I'll write them properly when they're called from 1895 // something that's actually doing something. 1896 static void fake_arraycopy_stub(address src, address dst, int count) { 1897 assert(count == 0, "huh?"); 1898 } 1899 1900 1901 // 1902 // Generate 'unsafe' array copy stub 1903 // Though just as safe as the other stubs, it takes an unscaled 1904 // size_t argument instead of an element count. 1905 // 1906 // Input: 1907 // c_rarg0 - source array address 1908 // c_rarg1 - destination array address 1909 // c_rarg2 - byte count, treated as ssize_t, can be zero 1910 // 1911 // Examines the alignment of the operands and dispatches 1912 // to a long, int, short, or byte copy loop. 
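  //
  // Worked example (illustration only): with s == 0x1000, d == 0x2004 and
  // a byte count of 12, (s | d | count) == 0x300c.  Its low three bits are
  // 0b100, so the 8-byte alignment test fails, the 4-byte test passes, and
  // we tail-call the int copy stub with count >> 2 == 3 elements.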
1913 // 1914 address generate_unsafe_copy(const char *name, 1915 address byte_copy_entry, 1916 address short_copy_entry, 1917 address int_copy_entry, 1918 address long_copy_entry) { 1919 Label L_long_aligned, L_int_aligned, L_short_aligned; 1920 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1921 1922 __ align(CodeEntryAlignment); 1923 StubCodeMark mark(this, "StubRoutines", name); 1924 address start = __ pc(); 1925 __ enter(); // required for proper stackwalking of RuntimeStub frame 1926 1927 // bump this on entry, not on exit: 1928 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1929 1930 __ orr(rscratch1, s, d); 1931 __ orr(rscratch1, rscratch1, count); 1932 1933 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1934 __ cbz(rscratch1, L_long_aligned); 1935 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1936 __ cbz(rscratch1, L_int_aligned); 1937 __ tbz(rscratch1, 0, L_short_aligned); 1938 __ b(RuntimeAddress(byte_copy_entry)); 1939 1940 __ BIND(L_short_aligned); 1941 __ lsr(count, count, LogBytesPerShort); // size => short_count 1942 __ b(RuntimeAddress(short_copy_entry)); 1943 __ BIND(L_int_aligned); 1944 __ lsr(count, count, LogBytesPerInt); // size => int_count 1945 __ b(RuntimeAddress(int_copy_entry)); 1946 __ BIND(L_long_aligned); 1947 __ lsr(count, count, LogBytesPerLong); // size => long_count 1948 __ b(RuntimeAddress(long_copy_entry)); 1949 1950 return start; 1951 } 1952 1953 // 1954 // Generate generic array copy stubs 1955 // 1956 // Input: 1957 // c_rarg0 - src oop 1958 // c_rarg1 - src_pos (32-bits) 1959 // c_rarg2 - dst oop 1960 // c_rarg3 - dst_pos (32-bits) 1961 // c_rarg4 - element count (32-bits) 1962 // 1963 // Output: 1964 // r0 == 0 - success 1965 // r0 == -1^K - failure, where K is partial transfer count 1966 // 1967 address generate_generic_copy(const char *name, 1968 address byte_copy_entry, address short_copy_entry, 1969 address int_copy_entry, address oop_copy_entry, 1970 address long_copy_entry, address checkcast_copy_entry) { 1971 1972 Label L_failed, L_objArray; 1973 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1974 1975 // Input registers 1976 const Register src = c_rarg0; // source array oop 1977 const Register src_pos = c_rarg1; // source position 1978 const Register dst = c_rarg2; // destination array oop 1979 const Register dst_pos = c_rarg3; // destination position 1980 const Register length = c_rarg4; 1981 1982 1983 // Registers used as temps 1984 const Register dst_klass = c_rarg5; 1985 1986 __ align(CodeEntryAlignment); 1987 1988 StubCodeMark mark(this, "StubRoutines", name); 1989 1990 address start = __ pc(); 1991 1992 __ enter(); // required for proper stackwalking of RuntimeStub frame 1993 1994 // bump this on entry, not on exit: 1995 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1996 1997 //----------------------------------------------------------------------- 1998 // Assembler stub will be used for this call to arraycopy 1999 // if the following conditions are met: 2000 // 2001 // (1) src and dst must not be null. 2002 // (2) src_pos must not be negative. 2003 // (3) dst_pos must not be negative. 2004 // (4) length must not be negative. 2005 // (5) src klass and dst klass should be the same and not NULL. 2006 // (6) src and dst should be arrays. 2007 // (7) src_pos + length must not exceed length of src. 2008 // (8) dst_pos + length must not exceed length of dst. 
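    //
    // (Explanatory note: if any of these checks fails, the stub returns -1
    // in r0 without copying anything; the caller is then expected to fall
    // back to the slow, fully-checked arraycopy path.  The "< 0" checks
    // below are done with tbnz on bit 31, i.e. on the sign bit of the
    // 32-bit value.)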
2009 // 2010 2011 // if (src == NULL) return -1; 2012 __ cbz(src, L_failed); 2013 2014 // if (src_pos < 0) return -1; 2015 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2016 2017 // if (dst == NULL) return -1; 2018 __ cbz(dst, L_failed); 2019 2020 // if (dst_pos < 0) return -1; 2021 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2022 2023 // registers used as temp 2024 const Register scratch_length = r16; // elements count to copy 2025 const Register scratch_src_klass = r17; // array klass 2026 const Register lh = r18; // layout helper 2027 2028 // if (length < 0) return -1; 2029 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2030 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2031 2032 __ load_klass(scratch_src_klass, src); 2033 #ifdef ASSERT 2034 // assert(src->klass() != NULL); 2035 { 2036 BLOCK_COMMENT("assert klasses not null {"); 2037 Label L1, L2; 2038 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2039 __ bind(L1); 2040 __ stop("broken null klass"); 2041 __ bind(L2); 2042 __ load_klass(rscratch1, dst); 2043 __ cbz(rscratch1, L1); // this would be broken also 2044 BLOCK_COMMENT("} assert klasses not null done"); 2045 } 2046 #endif 2047 2048 // Load layout helper (32-bits) 2049 // 2050 // |array_tag| | header_size | element_type | |log2_element_size| 2051 // 32 30 24 16 8 2 0 2052 // 2053 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2054 // 2055 2056 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2057 2058 // Handle objArrays completely differently... 2059 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2060 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2061 __ movw(rscratch1, objArray_lh); 2062 __ eorw(rscratch2, lh, rscratch1); 2063 __ cbzw(rscratch2, L_objArray); 2064 2065 // if (src->klass() != dst->klass()) return -1; 2066 __ load_klass(rscratch2, dst); 2067 __ eor(rscratch2, rscratch2, scratch_src_klass); 2068 __ cbnz(rscratch2, L_failed); 2069 2070 // if (!src->is_Array()) return -1; 2071 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2072 2073 // At this point, it is known to be a typeArray (array_tag 0x3). 
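    //
    // Illustrative example (not generated code): for a jint[] the layout
    // helper is roughly
    //
    //   (0x3 << 30) | (array_base_offset << 16) | (T_INT << 8) | 2
    //
    // a negative 32-bit value.  The tbz(lh, 31, L_failed) above relies on
    // both array tags having their top bit set, and the low two bits
    // (log2 element size == 2 here) steer the bitwise binary search below
    // to the int copy stub.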
2074 #ifdef ASSERT 2075 { 2076 BLOCK_COMMENT("assert primitive array {"); 2077 Label L; 2078 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2079 __ cmpw(lh, rscratch2); 2080 __ br(Assembler::GE, L); 2081 __ stop("must be a primitive array"); 2082 __ bind(L); 2083 BLOCK_COMMENT("} assert primitive array done"); 2084 } 2085 #endif 2086 2087 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2088 rscratch2, L_failed); 2089 2090 // TypeArrayKlass 2091 // 2092 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2093 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2094 // 2095 2096 const Register rscratch1_offset = rscratch1; // array offset 2097 const Register r18_elsize = lh; // element size 2098 2099 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2100 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2101 __ add(src, src, rscratch1_offset); // src array offset 2102 __ add(dst, dst, rscratch1_offset); // dst array offset 2103 BLOCK_COMMENT("choose copy loop based on element size"); 2104 2105 // next registers should be set before the jump to corresponding stub 2106 const Register from = c_rarg0; // source array address 2107 const Register to = c_rarg1; // destination array address 2108 const Register count = c_rarg2; // elements count 2109 2110 // 'from', 'to', 'count' registers should be set in such order 2111 // since they are the same as 'src', 'src_pos', 'dst'. 2112 2113 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2114 2115 // The possible values of elsize are 0-3, i.e. exact_log2(element 2116 // size in bytes). We do a simple bitwise binary search. 2117 __ BIND(L_copy_bytes); 2118 __ tbnz(r18_elsize, 1, L_copy_ints); 2119 __ tbnz(r18_elsize, 0, L_copy_shorts); 2120 __ lea(from, Address(src, src_pos));// src_addr 2121 __ lea(to, Address(dst, dst_pos));// dst_addr 2122 __ movw(count, scratch_length); // length 2123 __ b(RuntimeAddress(byte_copy_entry)); 2124 2125 __ BIND(L_copy_shorts); 2126 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2127 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2128 __ movw(count, scratch_length); // length 2129 __ b(RuntimeAddress(short_copy_entry)); 2130 2131 __ BIND(L_copy_ints); 2132 __ tbnz(r18_elsize, 0, L_copy_longs); 2133 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2134 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2135 __ movw(count, scratch_length); // length 2136 __ b(RuntimeAddress(int_copy_entry)); 2137 2138 __ BIND(L_copy_longs); 2139 #ifdef ASSERT 2140 { 2141 BLOCK_COMMENT("assert long copy {"); 2142 Label L; 2143 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2144 __ cmpw(r18_elsize, LogBytesPerLong); 2145 __ br(Assembler::EQ, L); 2146 __ stop("must be long copy, but elsize is wrong"); 2147 __ bind(L); 2148 BLOCK_COMMENT("} assert long copy done"); 2149 } 2150 #endif 2151 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2152 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2153 __ movw(count, scratch_length); // length 2154 __ b(RuntimeAddress(long_copy_entry)); 2155 2156 // ObjArrayKlass 2157 __ BIND(L_objArray); 2158 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2159 2160 Label L_plain_copy, L_checkcast_copy; 2161 // test array classes for subtyping 2162 __ load_klass(r18, dst); 2163 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2164 __ br(Assembler::NE, L_checkcast_copy); 2165 2166 // Identically typed arrays can be copied without element-wise checks. 2167 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2168 rscratch2, L_failed); 2169 2170 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2171 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2172 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2173 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2174 __ movw(count, scratch_length); // length 2175 __ BIND(L_plain_copy); 2176 __ b(RuntimeAddress(oop_copy_entry)); 2177 2178 __ BIND(L_checkcast_copy); 2179 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2180 { 2181 // Before looking at dst.length, make sure dst is also an objArray. 2182 __ ldrw(rscratch1, Address(r18, lh_offset)); 2183 __ movw(rscratch2, objArray_lh); 2184 __ eorw(rscratch1, rscratch1, rscratch2); 2185 __ cbnzw(rscratch1, L_failed); 2186 2187 // It is safe to examine both src.length and dst.length. 2188 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2189 r18, L_failed); 2190 2191 __ load_klass(dst_klass, dst); // reload 2192 2193 // Marshal the base address arguments now, freeing registers. 2194 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2195 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2196 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2197 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2198 __ movw(count, length); // length (reloaded) 2199 Register sco_temp = c_rarg3; // this register is free now 2200 assert_different_registers(from, to, count, sco_temp, 2201 dst_klass, scratch_src_klass); 2202 // assert_clean_int(count, sco_temp); 2203 2204 // Generate the type check. 2205 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2206 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2207 2208 // Smashes rscratch1, rscratch2 2209 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2210 2211 // Fetch destination element klass from the ObjArrayKlass header. 2212 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2213 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2214 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2215 2216 // the checkcast_copy loop needs two extra arguments: 2217 assert(c_rarg3 == sco_temp, "#3 already in place"); 2218 // Set up arguments for checkcast_copy_entry. 2219 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2220 __ b(RuntimeAddress(checkcast_copy_entry)); 2221 } 2222 2223 __ BIND(L_failed); 2224 __ mov(r0, -1); 2225 __ leave(); // required for proper stackwalking of RuntimeStub frame 2226 __ ret(lr); 2227 2228 return start; 2229 } 2230 2231 // 2232 // Generate stub for array fill. If "aligned" is true, the 2233 // "to" address is assumed to be heapword aligned. 
2234 // 2235 // Arguments for generated stub: 2236 // to: c_rarg0 2237 // value: c_rarg1 2238 // count: c_rarg2 treated as signed 2239 // 2240 address generate_fill(BasicType t, bool aligned, const char *name) { 2241 __ align(CodeEntryAlignment); 2242 StubCodeMark mark(this, "StubRoutines", name); 2243 address start = __ pc(); 2244 2245 BLOCK_COMMENT("Entry:"); 2246 2247 const Register to = c_rarg0; // source array address 2248 const Register value = c_rarg1; // value 2249 const Register count = c_rarg2; // elements count 2250 2251 const Register bz_base = r10; // base for block_zero routine 2252 const Register cnt_words = r11; // temp register 2253 2254 __ enter(); 2255 2256 Label L_fill_elements, L_exit1; 2257 2258 int shift = -1; 2259 switch (t) { 2260 case T_BYTE: 2261 shift = 0; 2262 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2263 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2264 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2265 __ br(Assembler::LO, L_fill_elements); 2266 break; 2267 case T_SHORT: 2268 shift = 1; 2269 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2270 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2271 __ br(Assembler::LO, L_fill_elements); 2272 break; 2273 case T_INT: 2274 shift = 2; 2275 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2276 __ br(Assembler::LO, L_fill_elements); 2277 break; 2278 default: ShouldNotReachHere(); 2279 } 2280 2281 // Align source address at 8 bytes address boundary. 2282 Label L_skip_align1, L_skip_align2, L_skip_align4; 2283 if (!aligned) { 2284 switch (t) { 2285 case T_BYTE: 2286 // One byte misalignment happens only for byte arrays. 2287 __ tbz(to, 0, L_skip_align1); 2288 __ strb(value, Address(__ post(to, 1))); 2289 __ subw(count, count, 1); 2290 __ bind(L_skip_align1); 2291 // Fallthrough 2292 case T_SHORT: 2293 // Two bytes misalignment happens only for byte and short (char) arrays. 2294 __ tbz(to, 1, L_skip_align2); 2295 __ strh(value, Address(__ post(to, 2))); 2296 __ subw(count, count, 2 >> shift); 2297 __ bind(L_skip_align2); 2298 // Fallthrough 2299 case T_INT: 2300 // Align to 8 bytes, we know we are 4 byte aligned to start. 2301 __ tbz(to, 2, L_skip_align4); 2302 __ strw(value, Address(__ post(to, 4))); 2303 __ subw(count, count, 4 >> shift); 2304 __ bind(L_skip_align4); 2305 break; 2306 default: ShouldNotReachHere(); 2307 } 2308 } 2309 2310 // 2311 // Fill large chunks 2312 // 2313 __ lsrw(cnt_words, count, 3 - shift); // number of words 2314 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2315 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2316 if (UseBlockZeroing) { 2317 Label non_block_zeroing, rest; 2318 // If the fill value is zero we can use the fast zero_words(). 2319 __ cbnz(value, non_block_zeroing); 2320 __ mov(bz_base, to); 2321 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2322 __ zero_words(bz_base, cnt_words); 2323 __ b(rest); 2324 __ bind(non_block_zeroing); 2325 __ fill_words(to, cnt_words, value); 2326 __ bind(rest); 2327 } else { 2328 __ fill_words(to, cnt_words, value); 2329 } 2330 2331 // Remaining count is less than 8 bytes. Fill it by a single store. 2332 // Note that the total length is no less than 8 bytes. 
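    //
    // Worked example (illustration only): a T_BYTE fill of 13 bytes leaves
    // the loop above with one 8-byte word written and count == 5.  The add
    // below moves 'to' to the end of the array and the str at [to, -8]
    // writes bytes 5..12, re-writing bytes 5..7 with the same value, which
    // is harmless.  This is why the trick needs a total length of at least
    // 8 bytes.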
2333 if (t == T_BYTE || t == T_SHORT) { 2334 Label L_exit1; 2335 __ cbzw(count, L_exit1); 2336 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2337 __ str(value, Address(to, -8)); // overwrite some elements 2338 __ bind(L_exit1); 2339 __ leave(); 2340 __ ret(lr); 2341 } 2342 2343 // Handle copies less than 8 bytes. 2344 Label L_fill_2, L_fill_4, L_exit2; 2345 __ bind(L_fill_elements); 2346 switch (t) { 2347 case T_BYTE: 2348 __ tbz(count, 0, L_fill_2); 2349 __ strb(value, Address(__ post(to, 1))); 2350 __ bind(L_fill_2); 2351 __ tbz(count, 1, L_fill_4); 2352 __ strh(value, Address(__ post(to, 2))); 2353 __ bind(L_fill_4); 2354 __ tbz(count, 2, L_exit2); 2355 __ strw(value, Address(to)); 2356 break; 2357 case T_SHORT: 2358 __ tbz(count, 0, L_fill_4); 2359 __ strh(value, Address(__ post(to, 2))); 2360 __ bind(L_fill_4); 2361 __ tbz(count, 1, L_exit2); 2362 __ strw(value, Address(to)); 2363 break; 2364 case T_INT: 2365 __ cbzw(count, L_exit2); 2366 __ strw(value, Address(to)); 2367 break; 2368 default: ShouldNotReachHere(); 2369 } 2370 __ bind(L_exit2); 2371 __ leave(); 2372 __ ret(lr); 2373 return start; 2374 } 2375 2376 void generate_arraycopy_stubs() { 2377 address entry; 2378 address entry_jbyte_arraycopy; 2379 address entry_jshort_arraycopy; 2380 address entry_jint_arraycopy; 2381 address entry_oop_arraycopy; 2382 address entry_jlong_arraycopy; 2383 address entry_checkcast_arraycopy; 2384 2385 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2386 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2387 2388 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2389 2390 //*** jbyte 2391 // Always need aligned and unaligned versions 2392 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2393 "jbyte_disjoint_arraycopy"); 2394 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2395 &entry_jbyte_arraycopy, 2396 "jbyte_arraycopy"); 2397 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2398 "arrayof_jbyte_disjoint_arraycopy"); 2399 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2400 "arrayof_jbyte_arraycopy"); 2401 2402 //*** jshort 2403 // Always need aligned and unaligned versions 2404 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2405 "jshort_disjoint_arraycopy"); 2406 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2407 &entry_jshort_arraycopy, 2408 "jshort_arraycopy"); 2409 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2410 "arrayof_jshort_disjoint_arraycopy"); 2411 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2412 "arrayof_jshort_arraycopy"); 2413 2414 //*** jint 2415 // Aligned versions 2416 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2417 "arrayof_jint_disjoint_arraycopy"); 2418 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2419 "arrayof_jint_arraycopy"); 2420 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2421 // entry_jint_arraycopy always points to the unaligned version 2422 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2423 "jint_disjoint_arraycopy"); 2424 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2425 &entry_jint_arraycopy, 2426 "jint_arraycopy"); 2427 2428 //*** jlong 2429 // It is always aligned 2430 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2431 "arrayof_jlong_disjoint_arraycopy"); 2432 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2433 "arrayof_jlong_arraycopy"); 2434 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2435 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2436 2437 //*** oops 2438 { 2439 // With compressed oops we need unaligned versions; notice that 2440 // we overwrite entry_oop_arraycopy. 2441 bool aligned = !UseCompressedOops; 2442 2443 StubRoutines::_arrayof_oop_disjoint_arraycopy 2444 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2445 /*dest_uninitialized*/false); 2446 StubRoutines::_arrayof_oop_arraycopy 2447 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2448 /*dest_uninitialized*/false); 2449 // Aligned versions without pre-barriers 2450 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2451 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2452 /*dest_uninitialized*/true); 2453 StubRoutines::_arrayof_oop_arraycopy_uninit 2454 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2455 /*dest_uninitialized*/true); 2456 } 2457 2458 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2459 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2460 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2461 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2462 2463 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2464 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2465 /*dest_uninitialized*/true); 2466 2467 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2468 entry_jbyte_arraycopy, 2469 entry_jshort_arraycopy, 2470 entry_jint_arraycopy, 2471 entry_jlong_arraycopy); 2472 2473 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2474 entry_jbyte_arraycopy, 2475 entry_jshort_arraycopy, 2476 entry_jint_arraycopy, 2477 entry_oop_arraycopy, 2478 entry_jlong_arraycopy, 2479 entry_checkcast_arraycopy); 2480 2481 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2482 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2483 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2484 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2485 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2486 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2487 } 2488 2489 void generate_math_stubs() { Unimplemented(); } 2490 2491 // Arguments: 2492 // 2493 // Inputs: 2494 // c_rarg0 - source byte array address 2495 // c_rarg1 - destination 
byte array address 2496 // c_rarg2 - K (key) in little endian int array 2497 // 2498 address generate_aescrypt_encryptBlock() { 2499 __ align(CodeEntryAlignment); 2500 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2501 2502 Label L_doLast; 2503 2504 const Register from = c_rarg0; // source array address 2505 const Register to = c_rarg1; // destination array address 2506 const Register key = c_rarg2; // key array address 2507 const Register keylen = rscratch1; 2508 2509 address start = __ pc(); 2510 __ enter(); 2511 2512 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2513 2514 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2515 2516 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2517 __ rev32(v1, __ T16B, v1); 2518 __ rev32(v2, __ T16B, v2); 2519 __ rev32(v3, __ T16B, v3); 2520 __ rev32(v4, __ T16B, v4); 2521 __ aese(v0, v1); 2522 __ aesmc(v0, v0); 2523 __ aese(v0, v2); 2524 __ aesmc(v0, v0); 2525 __ aese(v0, v3); 2526 __ aesmc(v0, v0); 2527 __ aese(v0, v4); 2528 __ aesmc(v0, v0); 2529 2530 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2531 __ rev32(v1, __ T16B, v1); 2532 __ rev32(v2, __ T16B, v2); 2533 __ rev32(v3, __ T16B, v3); 2534 __ rev32(v4, __ T16B, v4); 2535 __ aese(v0, v1); 2536 __ aesmc(v0, v0); 2537 __ aese(v0, v2); 2538 __ aesmc(v0, v0); 2539 __ aese(v0, v3); 2540 __ aesmc(v0, v0); 2541 __ aese(v0, v4); 2542 __ aesmc(v0, v0); 2543 2544 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2545 __ rev32(v1, __ T16B, v1); 2546 __ rev32(v2, __ T16B, v2); 2547 2548 __ cmpw(keylen, 44); 2549 __ br(Assembler::EQ, L_doLast); 2550 2551 __ aese(v0, v1); 2552 __ aesmc(v0, v0); 2553 __ aese(v0, v2); 2554 __ aesmc(v0, v0); 2555 2556 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2557 __ rev32(v1, __ T16B, v1); 2558 __ rev32(v2, __ T16B, v2); 2559 2560 __ cmpw(keylen, 52); 2561 __ br(Assembler::EQ, L_doLast); 2562 2563 __ aese(v0, v1); 2564 __ aesmc(v0, v0); 2565 __ aese(v0, v2); 2566 __ aesmc(v0, v0); 2567 2568 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2569 __ rev32(v1, __ T16B, v1); 2570 __ rev32(v2, __ T16B, v2); 2571 2572 __ BIND(L_doLast); 2573 2574 __ aese(v0, v1); 2575 __ aesmc(v0, v0); 2576 __ aese(v0, v2); 2577 2578 __ ld1(v1, __ T16B, key); 2579 __ rev32(v1, __ T16B, v1); 2580 __ eor(v0, __ T16B, v0, v1); 2581 2582 __ st1(v0, __ T16B, to); 2583 2584 __ mov(r0, 0); 2585 2586 __ leave(); 2587 __ ret(lr); 2588 2589 return start; 2590 } 2591 2592 // Arguments: 2593 // 2594 // Inputs: 2595 // c_rarg0 - source byte array address 2596 // c_rarg1 - destination byte array address 2597 // c_rarg2 - K (key) in little endian int array 2598 // 2599 address generate_aescrypt_decryptBlock() { 2600 assert(UseAES, "need AES instructions and misaligned SSE support"); 2601 __ align(CodeEntryAlignment); 2602 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2603 Label L_doLast; 2604 2605 const Register from = c_rarg0; // source array address 2606 const Register to = c_rarg1; // destination array address 2607 const Register key = c_rarg2; // key array address 2608 const Register keylen = rscratch1; 2609 2610 address start = __ pc(); 2611 __ enter(); // required for proper stackwalking of RuntimeStub frame 2612 2613 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2614 2615 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2616 2617 __ ld1(v5, __ T16B, __ post(key, 16)); 2618 __ rev32(v5, __ T16B, v5); 2619 2620 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2621 __ rev32(v1, __ T16B, v1); 2622 __ rev32(v2, __ T16B, v2); 2623 __ rev32(v3, __ T16B, v3); 2624 __ rev32(v4, __ T16B, v4); 2625 __ aesd(v0, v1); 2626 __ aesimc(v0, v0); 2627 __ aesd(v0, v2); 2628 __ aesimc(v0, v0); 2629 __ aesd(v0, v3); 2630 __ aesimc(v0, v0); 2631 __ aesd(v0, v4); 2632 __ aesimc(v0, v0); 2633 2634 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2635 __ rev32(v1, __ T16B, v1); 2636 __ rev32(v2, __ T16B, v2); 2637 __ rev32(v3, __ T16B, v3); 2638 __ rev32(v4, __ T16B, v4); 2639 __ aesd(v0, v1); 2640 __ aesimc(v0, v0); 2641 __ aesd(v0, v2); 2642 __ aesimc(v0, v0); 2643 __ aesd(v0, v3); 2644 __ aesimc(v0, v0); 2645 __ aesd(v0, v4); 2646 __ aesimc(v0, v0); 2647 2648 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2649 __ rev32(v1, __ T16B, v1); 2650 __ rev32(v2, __ T16B, v2); 2651 2652 __ cmpw(keylen, 44); 2653 __ br(Assembler::EQ, L_doLast); 2654 2655 __ aesd(v0, v1); 2656 __ aesimc(v0, v0); 2657 __ aesd(v0, v2); 2658 __ aesimc(v0, v0); 2659 2660 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2661 __ rev32(v1, __ T16B, v1); 2662 __ rev32(v2, __ T16B, v2); 2663 2664 __ cmpw(keylen, 52); 2665 __ br(Assembler::EQ, L_doLast); 2666 2667 __ aesd(v0, v1); 2668 __ aesimc(v0, v0); 2669 __ aesd(v0, v2); 2670 __ aesimc(v0, v0); 2671 2672 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2673 __ rev32(v1, __ T16B, v1); 2674 __ rev32(v2, __ T16B, v2); 2675 2676 __ BIND(L_doLast); 2677 2678 __ aesd(v0, v1); 2679 __ aesimc(v0, v0); 2680 __ aesd(v0, v2); 2681 2682 __ eor(v0, __ T16B, v0, v5); 2683 2684 __ st1(v0, __ T16B, to); 2685 2686 __ mov(r0, 0); 2687 2688 __ leave(); 2689 __ ret(lr); 2690 2691 return start; 2692 } 2693 2694 // Arguments: 2695 // 2696 // Inputs: 2697 // c_rarg0 - source byte array address 2698 // c_rarg1 - destination byte array address 2699 // c_rarg2 - K (key) in little endian int array 2700 // c_rarg3 - r vector byte array address 2701 // c_rarg4 - input length 2702 // 2703 // Output: 2704 // x0 - input length 2705 // 2706 address generate_cipherBlockChaining_encryptAESCrypt() { 2707 assert(UseAES, "need AES instructions and misaligned SSE support"); 2708 __ align(CodeEntryAlignment); 2709 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2710 2711 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2712 2713 const Register from = c_rarg0; // source array address 2714 const Register to = c_rarg1; // destination array address 2715 const Register key = c_rarg2; // key array address 2716 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2717 // and left with the results of the last encryption block 2718 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2719 const Register keylen = rscratch1; 2720 2721 address start = __ pc(); 2722 2723 __ enter(); 2724 2725 __ movw(rscratch2, len_reg); 2726 2727 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2728 2729 __ ld1(v0, __ T16B, rvec); 2730 2731 __ cmpw(keylen, 52); 2732 __ br(Assembler::CC, L_loadkeys_44); 2733 __ br(Assembler::EQ, L_loadkeys_52); 2734 2735 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2736 __ rev32(v17, __ T16B, v17); 2737 __ rev32(v18, __ T16B, v18); 2738 __ BIND(L_loadkeys_52); 2739 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2740 __ rev32(v19, __ T16B, v19); 2741 __ rev32(v20, __ T16B, v20); 2742 __ BIND(L_loadkeys_44); 2743 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2744 __ rev32(v21, __ 
T16B, v21); 2745 __ rev32(v22, __ T16B, v22); 2746 __ rev32(v23, __ T16B, v23); 2747 __ rev32(v24, __ T16B, v24); 2748 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2749 __ rev32(v25, __ T16B, v25); 2750 __ rev32(v26, __ T16B, v26); 2751 __ rev32(v27, __ T16B, v27); 2752 __ rev32(v28, __ T16B, v28); 2753 __ ld1(v29, v30, v31, __ T16B, key); 2754 __ rev32(v29, __ T16B, v29); 2755 __ rev32(v30, __ T16B, v30); 2756 __ rev32(v31, __ T16B, v31); 2757 2758 __ BIND(L_aes_loop); 2759 __ ld1(v1, __ T16B, __ post(from, 16)); 2760 __ eor(v0, __ T16B, v0, v1); 2761 2762 __ br(Assembler::CC, L_rounds_44); 2763 __ br(Assembler::EQ, L_rounds_52); 2764 2765 __ aese(v0, v17); __ aesmc(v0, v0); 2766 __ aese(v0, v18); __ aesmc(v0, v0); 2767 __ BIND(L_rounds_52); 2768 __ aese(v0, v19); __ aesmc(v0, v0); 2769 __ aese(v0, v20); __ aesmc(v0, v0); 2770 __ BIND(L_rounds_44); 2771 __ aese(v0, v21); __ aesmc(v0, v0); 2772 __ aese(v0, v22); __ aesmc(v0, v0); 2773 __ aese(v0, v23); __ aesmc(v0, v0); 2774 __ aese(v0, v24); __ aesmc(v0, v0); 2775 __ aese(v0, v25); __ aesmc(v0, v0); 2776 __ aese(v0, v26); __ aesmc(v0, v0); 2777 __ aese(v0, v27); __ aesmc(v0, v0); 2778 __ aese(v0, v28); __ aesmc(v0, v0); 2779 __ aese(v0, v29); __ aesmc(v0, v0); 2780 __ aese(v0, v30); 2781 __ eor(v0, __ T16B, v0, v31); 2782 2783 __ st1(v0, __ T16B, __ post(to, 16)); 2784 2785 __ subw(len_reg, len_reg, 16); 2786 __ cbnzw(len_reg, L_aes_loop); 2787 2788 __ st1(v0, __ T16B, rvec); 2789 2790 __ mov(r0, rscratch2); 2791 2792 __ leave(); 2793 __ ret(lr); 2794 2795 return start; 2796 } 2797 2798 // Arguments: 2799 // 2800 // Inputs: 2801 // c_rarg0 - source byte array address 2802 // c_rarg1 - destination byte array address 2803 // c_rarg2 - K (key) in little endian int array 2804 // c_rarg3 - r vector byte array address 2805 // c_rarg4 - input length 2806 // 2807 // Output: 2808 // r0 - input length 2809 // 2810 address generate_cipherBlockChaining_decryptAESCrypt() { 2811 assert(UseAES, "need AES instructions and misaligned SSE support"); 2812 __ align(CodeEntryAlignment); 2813 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2814 2815 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2816 2817 const Register from = c_rarg0; // source array address 2818 const Register to = c_rarg1; // destination array address 2819 const Register key = c_rarg2; // key array address 2820 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2821 // and left with the results of the last encryption block 2822 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2823 const Register keylen = rscratch1; 2824 2825 address start = __ pc(); 2826 2827 __ enter(); 2828 2829 __ movw(rscratch2, len_reg); 2830 2831 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2832 2833 __ ld1(v2, __ T16B, rvec); 2834 2835 __ ld1(v31, __ T16B, __ post(key, 16)); 2836 __ rev32(v31, __ T16B, v31); 2837 2838 __ cmpw(keylen, 52); 2839 __ br(Assembler::CC, L_loadkeys_44); 2840 __ br(Assembler::EQ, L_loadkeys_52); 2841 2842 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2843 __ rev32(v17, __ T16B, v17); 2844 __ rev32(v18, __ T16B, v18); 2845 __ BIND(L_loadkeys_52); 2846 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2847 __ rev32(v19, __ T16B, v19); 2848 __ rev32(v20, __ T16B, v20); 2849 __ BIND(L_loadkeys_44); 2850 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2851 __ rev32(v21, __ T16B, v21); 2852 
__ rev32(v22, __ T16B, v22); 2853 __ rev32(v23, __ T16B, v23); 2854 __ rev32(v24, __ T16B, v24); 2855 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2856 __ rev32(v25, __ T16B, v25); 2857 __ rev32(v26, __ T16B, v26); 2858 __ rev32(v27, __ T16B, v27); 2859 __ rev32(v28, __ T16B, v28); 2860 __ ld1(v29, v30, __ T16B, key); 2861 __ rev32(v29, __ T16B, v29); 2862 __ rev32(v30, __ T16B, v30); 2863 2864 __ BIND(L_aes_loop); 2865 __ ld1(v0, __ T16B, __ post(from, 16)); 2866 __ orr(v1, __ T16B, v0, v0); 2867 2868 __ br(Assembler::CC, L_rounds_44); 2869 __ br(Assembler::EQ, L_rounds_52); 2870 2871 __ aesd(v0, v17); __ aesimc(v0, v0); 2872 __ aesd(v0, v18); __ aesimc(v0, v0); 2873 __ BIND(L_rounds_52); 2874 __ aesd(v0, v19); __ aesimc(v0, v0); 2875 __ aesd(v0, v20); __ aesimc(v0, v0); 2876 __ BIND(L_rounds_44); 2877 __ aesd(v0, v21); __ aesimc(v0, v0); 2878 __ aesd(v0, v22); __ aesimc(v0, v0); 2879 __ aesd(v0, v23); __ aesimc(v0, v0); 2880 __ aesd(v0, v24); __ aesimc(v0, v0); 2881 __ aesd(v0, v25); __ aesimc(v0, v0); 2882 __ aesd(v0, v26); __ aesimc(v0, v0); 2883 __ aesd(v0, v27); __ aesimc(v0, v0); 2884 __ aesd(v0, v28); __ aesimc(v0, v0); 2885 __ aesd(v0, v29); __ aesimc(v0, v0); 2886 __ aesd(v0, v30); 2887 __ eor(v0, __ T16B, v0, v31); 2888 __ eor(v0, __ T16B, v0, v2); 2889 2890 __ st1(v0, __ T16B, __ post(to, 16)); 2891 __ orr(v2, __ T16B, v1, v1); 2892 2893 __ subw(len_reg, len_reg, 16); 2894 __ cbnzw(len_reg, L_aes_loop); 2895 2896 __ st1(v2, __ T16B, rvec); 2897 2898 __ mov(r0, rscratch2); 2899 2900 __ leave(); 2901 __ ret(lr); 2902 2903 return start; 2904 } 2905 2906 // Arguments: 2907 // 2908 // Inputs: 2909 // c_rarg0 - byte[] source+offset 2910 // c_rarg1 - int[] SHA.state 2911 // c_rarg2 - int offset 2912 // c_rarg3 - int limit 2913 // 2914 address generate_sha1_implCompress(bool multi_block, const char *name) { 2915 __ align(CodeEntryAlignment); 2916 StubCodeMark mark(this, "StubRoutines", name); 2917 address start = __ pc(); 2918 2919 Register buf = c_rarg0; 2920 Register state = c_rarg1; 2921 Register ofs = c_rarg2; 2922 Register limit = c_rarg3; 2923 2924 Label keys; 2925 Label sha1_loop; 2926 2927 // load the keys into v0..v3 2928 __ adr(rscratch1, keys); 2929 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2930 // load 5 words state into v6, v7 2931 __ ldrq(v6, Address(state, 0)); 2932 __ ldrs(v7, Address(state, 16)); 2933 2934 2935 __ BIND(sha1_loop); 2936 // load 64 bytes of data into v16..v19 2937 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2938 __ rev32(v16, __ T16B, v16); 2939 __ rev32(v17, __ T16B, v17); 2940 __ rev32(v18, __ T16B, v18); 2941 __ rev32(v19, __ T16B, v19); 2942 2943 // do the sha1 2944 __ addv(v4, __ T4S, v16, v0); 2945 __ orr(v20, __ T16B, v6, v6); 2946 2947 FloatRegister d0 = v16; 2948 FloatRegister d1 = v17; 2949 FloatRegister d2 = v18; 2950 FloatRegister d3 = v19; 2951 2952 for (int round = 0; round < 20; round++) { 2953 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2954 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2955 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2956 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2957 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2958 2959 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2960 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2961 __ sha1h(tmp2, __ T4S, v20); 2962 if (round < 5) 2963 __ sha1c(v20, __ T4S, tmp3, tmp4); 2964 else if (round < 10 || round >= 15) 2965 __ sha1p(v20, __ T4S, tmp3, tmp4); 2966 else 2967 __ sha1m(v20, __ T4S, tmp3, tmp4); 2968 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2969 2970 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2971 } 2972 2973 __ addv(v7, __ T2S, v7, v21); 2974 __ addv(v6, __ T4S, v6, v20); 2975 2976 if (multi_block) { 2977 __ add(ofs, ofs, 64); 2978 __ cmp(ofs, limit); 2979 __ br(Assembler::LE, sha1_loop); 2980 __ mov(c_rarg0, ofs); // return ofs 2981 } 2982 2983 __ strq(v6, Address(state, 0)); 2984 __ strs(v7, Address(state, 16)); 2985 2986 __ ret(lr); 2987 2988 __ bind(keys); 2989 __ emit_int32(0x5a827999); 2990 __ emit_int32(0x6ed9eba1); 2991 __ emit_int32(0x8f1bbcdc); 2992 __ emit_int32(0xca62c1d6); 2993 2994 return start; 2995 } 2996 2997 2998 // Arguments: 2999 // 3000 // Inputs: 3001 // c_rarg0 - byte[] source+offset 3002 // c_rarg1 - int[] SHA.state 3003 // c_rarg2 - int offset 3004 // c_rarg3 - int limit 3005 // 3006 address generate_sha256_implCompress(bool multi_block, const char *name) { 3007 static const uint32_t round_consts[64] = { 3008 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3009 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3010 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3011 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3012 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3013 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3014 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3015 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3016 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3017 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3018 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3019 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3020 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3021 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3022 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3023 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3024 }; 3025 __ align(CodeEntryAlignment); 3026 StubCodeMark mark(this, "StubRoutines", name); 3027 address start = __ pc(); 3028 3029 Register buf = c_rarg0; 3030 Register state = c_rarg1; 3031 Register ofs = c_rarg2; 3032 Register limit = c_rarg3; 3033 3034 Label sha1_loop; 3035 3036 __ stpd(v8, v9, __ pre(sp, -32)); 3037 __ stpd(v10, v11, Address(sp, 16)); 3038 3039 // dga == v0 3040 // dgb == v1 3041 // dg0 == v2 3042 // dg1 == v3 3043 // dg2 == v4 3044 // t0 == v6 3045 // t1 == v7 3046 3047 // load 16 keys to v16..v31 3048 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3049 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3050 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3051 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3052 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3053 3054 // load 8 words (256 bits) state 3055 __ ldpq(v0, v1, state); 3056 3057 __ BIND(sha1_loop); 3058 // load 64 bytes of data into v8..v11 3059 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3060 __ rev32(v8, __ T16B, v8); 3061 __ rev32(v9, __ T16B, v9); 3062 __ rev32(v10, __ T16B, v10); 3063 __ rev32(v11, __ T16B, v11); 3064 3065 __ addv(v6, __ T4S, v8, v16); 3066 __ orr(v2, __ T16B, v0, v0); 3067 __ orr(v3, __ T16B, v1, v1); 3068 3069 FloatRegister d0 = v8; 3070 FloatRegister d1 = v9; 3071 FloatRegister d2 = v10; 3072 FloatRegister d3 = v11; 3073 3074 3075 for (int round = 0; round < 16; round++) { 3076 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3077 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3078 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3079 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3080 3081 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3082 __ orr(v4, __ T16B, v2, v2); 3083 if (round < 15) 3084 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3085 __ sha256h(v2, __ T4S, v3, tmp2); 3086 __ sha256h2(v3, __ T4S, v4, tmp2); 3087 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3088 3089 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3090 } 3091 3092 __ addv(v0, __ T4S, v0, v2); 3093 __ addv(v1, __ T4S, v1, v3); 3094 3095 if (multi_block) { 3096 __ add(ofs, ofs, 64); 3097 __ cmp(ofs, limit); 3098 __ br(Assembler::LE, sha1_loop); 3099 __ mov(c_rarg0, ofs); // return ofs 3100 } 3101 3102 __ ldpd(v10, v11, Address(sp, 16)); 3103 __ ldpd(v8, v9, __ post(sp, 32)); 3104 3105 __ stpq(v0, v1, state); 3106 3107 __ ret(lr); 3108 3109 return start; 3110 } 3111 3112 #ifndef BUILTIN_SIM 3113 // Safefetch stubs. 3114 void generate_safefetch(const char* name, int size, address* entry, 3115 address* fault_pc, address* continuation_pc) { 3116 // safefetch signatures: 3117 // int SafeFetch32(int* adr, int errValue); 3118 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3119 // 3120 // arguments: 3121 // c_rarg0 = adr 3122 // c_rarg1 = errValue 3123 // 3124 // result: 3125 // PPC_RET = *adr or errValue 3126 3127 StubCodeMark mark(this, "StubRoutines", name); 3128 3129 // Entry point, pc or function descriptor. 3130 *entry = __ pc(); 3131 3132 // Load *adr into c_rarg1, may fault. 
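    // (Explanatory note, relying on the surrounding VM machinery rather
    // than on code in this file: if the load below faults, the signal
    // handler recognises the faulting pc as *fault_pc and resumes at
    // *continuation_pc with c_rarg1 still holding errValue; either way,
    // whatever is in c_rarg1 becomes the return value in r0.)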
3133 *fault_pc = __ pc(); 3134 switch (size) { 3135 case 4: 3136 // int32_t 3137 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3138 break; 3139 case 8: 3140 // int64_t 3141 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3142 break; 3143 default: 3144 ShouldNotReachHere(); 3145 } 3146 3147 // return errValue or *adr 3148 *continuation_pc = __ pc(); 3149 __ mov(r0, c_rarg1); 3150 __ ret(lr); 3151 } 3152 #endif 3153 3154 /** 3155 * Arguments: 3156 * 3157 * Inputs: 3158 * c_rarg0 - int crc 3159 * c_rarg1 - byte* buf 3160 * c_rarg2 - int length 3161 * 3162 * Ouput: 3163 * rax - int crc result 3164 */ 3165 address generate_updateBytesCRC32() { 3166 assert(UseCRC32Intrinsics, "what are we doing here?"); 3167 3168 __ align(CodeEntryAlignment); 3169 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3170 3171 address start = __ pc(); 3172 3173 const Register crc = c_rarg0; // crc 3174 const Register buf = c_rarg1; // source java byte array address 3175 const Register len = c_rarg2; // length 3176 const Register table0 = c_rarg3; // crc_table address 3177 const Register table1 = c_rarg4; 3178 const Register table2 = c_rarg5; 3179 const Register table3 = c_rarg6; 3180 const Register tmp3 = c_rarg7; 3181 3182 BLOCK_COMMENT("Entry:"); 3183 __ enter(); // required for proper stackwalking of RuntimeStub frame 3184 3185 __ kernel_crc32(crc, buf, len, 3186 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3187 3188 __ leave(); // required for proper stackwalking of RuntimeStub frame 3189 __ ret(lr); 3190 3191 return start; 3192 } 3193 3194 /** 3195 * Arguments: 3196 * 3197 * Inputs: 3198 * c_rarg0 - int crc 3199 * c_rarg1 - byte* buf 3200 * c_rarg2 - int length 3201 * c_rarg3 - int* table 3202 * 3203 * Ouput: 3204 * r0 - int crc result 3205 */ 3206 address generate_updateBytesCRC32C() { 3207 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3208 3209 __ align(CodeEntryAlignment); 3210 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3211 3212 address start = __ pc(); 3213 3214 const Register crc = c_rarg0; // crc 3215 const Register buf = c_rarg1; // source java byte array address 3216 const Register len = c_rarg2; // length 3217 const Register table0 = c_rarg3; // crc_table address 3218 const Register table1 = c_rarg4; 3219 const Register table2 = c_rarg5; 3220 const Register table3 = c_rarg6; 3221 const Register tmp3 = c_rarg7; 3222 3223 BLOCK_COMMENT("Entry:"); 3224 __ enter(); // required for proper stackwalking of RuntimeStub frame 3225 3226 __ kernel_crc32c(crc, buf, len, 3227 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3228 3229 __ leave(); // required for proper stackwalking of RuntimeStub frame 3230 __ ret(lr); 3231 3232 return start; 3233 } 3234 3235 /*** 3236 * Arguments: 3237 * 3238 * Inputs: 3239 * c_rarg0 - int adler 3240 * c_rarg1 - byte* buff 3241 * c_rarg2 - int len 3242 * 3243 * Output: 3244 * c_rarg0 - int adler result 3245 */ 3246 address generate_updateBytesAdler32() { 3247 __ align(CodeEntryAlignment); 3248 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3249 address start = __ pc(); 3250 3251 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3252 3253 // Aliases 3254 Register adler = c_rarg0; 3255 Register s1 = c_rarg0; 3256 Register s2 = c_rarg3; 3257 Register buff = c_rarg1; 3258 Register len = c_rarg2; 3259 Register nmax = r4; 3260 Register base = r5; 3261 Register count = r6; 3262 Register temp0 = rscratch1; 3263 Register temp1 = rscratch2; 3264 FloatRegister vbytes = 
v0; 3265 FloatRegister vs1acc = v1; 3266 FloatRegister vs2acc = v2; 3267 FloatRegister vtable = v3; 3268 3269 // Max number of bytes we can process before having to take the mod 3270 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3271 unsigned long BASE = 0xfff1; 3272 unsigned long NMAX = 0x15B0; 3273 3274 __ mov(base, BASE); 3275 __ mov(nmax, NMAX); 3276 3277 // Load accumulation coefficients for the upper 16 bits 3278 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3279 __ ld1(vtable, __ T16B, Address(temp0)); 3280 3281 // s1 is initialized to the lower 16 bits of adler 3282 // s2 is initialized to the upper 16 bits of adler 3283 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3284 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3285 3286 // The pipelined loop needs at least 16 elements for 1 iteration 3287 // It does check this, but it is more effective to skip to the cleanup loop 3288 __ cmp(len, (u1)16); 3289 __ br(Assembler::HS, L_nmax); 3290 __ cbz(len, L_combine); 3291 3292 __ bind(L_simple_by1_loop); 3293 __ ldrb(temp0, Address(__ post(buff, 1))); 3294 __ add(s1, s1, temp0); 3295 __ add(s2, s2, s1); 3296 __ subs(len, len, 1); 3297 __ br(Assembler::HI, L_simple_by1_loop); 3298 3299 // s1 = s1 % BASE 3300 __ subs(temp0, s1, base); 3301 __ csel(s1, temp0, s1, Assembler::HS); 3302 3303 // s2 = s2 % BASE 3304 __ lsr(temp0, s2, 16); 3305 __ lsl(temp1, temp0, 4); 3306 __ sub(temp1, temp1, temp0); 3307 __ add(s2, temp1, s2, ext::uxth); 3308 3309 __ subs(temp0, s2, base); 3310 __ csel(s2, temp0, s2, Assembler::HS); 3311 3312 __ b(L_combine); 3313 3314 __ bind(L_nmax); 3315 __ subs(len, len, nmax); 3316 __ sub(count, nmax, 16); 3317 __ br(Assembler::LO, L_by16); 3318 3319 __ bind(L_nmax_loop); 3320 3321 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3322 vbytes, vs1acc, vs2acc, vtable); 3323 3324 __ subs(count, count, 16); 3325 __ br(Assembler::HS, L_nmax_loop); 3326 3327 // s1 = s1 % BASE 3328 __ lsr(temp0, s1, 16); 3329 __ lsl(temp1, temp0, 4); 3330 __ sub(temp1, temp1, temp0); 3331 __ add(temp1, temp1, s1, ext::uxth); 3332 3333 __ lsr(temp0, temp1, 16); 3334 __ lsl(s1, temp0, 4); 3335 __ sub(s1, s1, temp0); 3336 __ add(s1, s1, temp1, ext:: uxth); 3337 3338 __ subs(temp0, s1, base); 3339 __ csel(s1, temp0, s1, Assembler::HS); 3340 3341 // s2 = s2 % BASE 3342 __ lsr(temp0, s2, 16); 3343 __ lsl(temp1, temp0, 4); 3344 __ sub(temp1, temp1, temp0); 3345 __ add(temp1, temp1, s2, ext::uxth); 3346 3347 __ lsr(temp0, temp1, 16); 3348 __ lsl(s2, temp0, 4); 3349 __ sub(s2, s2, temp0); 3350 __ add(s2, s2, temp1, ext:: uxth); 3351 3352 __ subs(temp0, s2, base); 3353 __ csel(s2, temp0, s2, Assembler::HS); 3354 3355 __ subs(len, len, nmax); 3356 __ sub(count, nmax, 16); 3357 __ br(Assembler::HS, L_nmax_loop); 3358 3359 __ bind(L_by16); 3360 __ adds(len, len, count); 3361 __ br(Assembler::LO, L_by1); 3362 3363 __ bind(L_by16_loop); 3364 3365 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3366 vbytes, vs1acc, vs2acc, vtable); 3367 3368 __ subs(len, len, 16); 3369 __ br(Assembler::HS, L_by16_loop); 3370 3371 __ bind(L_by1); 3372 __ adds(len, len, 15); 3373 __ br(Assembler::LO, L_do_mod); 3374 3375 __ bind(L_by1_loop); 3376 __ ldrb(temp0, Address(__ post(buff, 1))); 3377 __ add(s1, temp0, s1); 3378 __ add(s2, s2, s1); 3379 __ subs(len, len, 1); 3380 __ br(Assembler::HS, L_by1_loop); 3381 3382 __ bind(L_do_mod); 3383 // s1 = s1 % BASE 3384 __ lsr(temp0, s1, 16); 3385 __ lsl(temp1, temp0, 4); 
3386 __ sub(temp1, temp1, temp0); 3387 __ add(temp1, temp1, s1, ext::uxth); 3388 3389 __ lsr(temp0, temp1, 16); 3390 __ lsl(s1, temp0, 4); 3391 __ sub(s1, s1, temp0); 3392 __ add(s1, s1, temp1, ext:: uxth); 3393 3394 __ subs(temp0, s1, base); 3395 __ csel(s1, temp0, s1, Assembler::HS); 3396 3397 // s2 = s2 % BASE 3398 __ lsr(temp0, s2, 16); 3399 __ lsl(temp1, temp0, 4); 3400 __ sub(temp1, temp1, temp0); 3401 __ add(temp1, temp1, s2, ext::uxth); 3402 3403 __ lsr(temp0, temp1, 16); 3404 __ lsl(s2, temp0, 4); 3405 __ sub(s2, s2, temp0); 3406 __ add(s2, s2, temp1, ext:: uxth); 3407 3408 __ subs(temp0, s2, base); 3409 __ csel(s2, temp0, s2, Assembler::HS); 3410 3411 // Combine lower bits and higher bits 3412 __ bind(L_combine); 3413 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3414 3415 __ ret(lr); 3416 3417 return start; 3418 } 3419 3420 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3421 Register temp0, Register temp1, FloatRegister vbytes, 3422 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3423 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3424 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3425 // In non-vectorized code, we update s1 and s2 as: 3426 // s1 <- s1 + b1 3427 // s2 <- s2 + s1 3428 // s1 <- s1 + b2 3429 // s2 <- s2 + b1 3430 // ... 3431 // s1 <- s1 + b16 3432 // s2 <- s2 + s1 3433 // Putting above assignments together, we have: 3434 // s1_new = s1 + b1 + b2 + ... + b16 3435 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3436 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3437 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3438 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3439 3440 // s2 = s2 + s1 * 16 3441 __ add(s2, s2, s1, Assembler::LSL, 4); 3442 3443 // vs1acc = b1 + b2 + b3 + ... + b16 3444 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 3445 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3446 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3447 __ uaddlv(vs1acc, __ T16B, vbytes); 3448 __ uaddlv(vs2acc, __ T8H, vs2acc); 3449 3450 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3451 __ fmovd(temp0, vs1acc); 3452 __ fmovd(temp1, vs2acc); 3453 __ add(s1, s1, temp0); 3454 __ add(s2, s2, temp1); 3455 } 3456 3457 /** 3458 * Arguments: 3459 * 3460 * Input: 3461 * c_rarg0 - x address 3462 * c_rarg1 - x length 3463 * c_rarg2 - y address 3464 * c_rarg3 - y lenth 3465 * c_rarg4 - z address 3466 * c_rarg5 - z length 3467 */ 3468 address generate_multiplyToLen() { 3469 __ align(CodeEntryAlignment); 3470 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3471 3472 address start = __ pc(); 3473 const Register x = r0; 3474 const Register xlen = r1; 3475 const Register y = r2; 3476 const Register ylen = r3; 3477 const Register z = r4; 3478 const Register zlen = r5; 3479 3480 const Register tmp1 = r10; 3481 const Register tmp2 = r11; 3482 const Register tmp3 = r12; 3483 const Register tmp4 = r13; 3484 const Register tmp5 = r14; 3485 const Register tmp6 = r15; 3486 const Register tmp7 = r16; 3487 3488 BLOCK_COMMENT("Entry:"); 3489 __ enter(); // required for proper stackwalking of RuntimeStub frame 3490 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3491 __ leave(); // required for proper stackwalking of RuntimeStub frame 3492 __ ret(lr); 3493 3494 return start; 3495 } 3496 3497 address generate_squareToLen() { 3498 // squareToLen algorithm for sizes 1..127 described in java code works 3499 // faster than multiply_to_len on some CPUs and slower on others, but 3500 // multiply_to_len shows a bit better overall results 3501 __ align(CodeEntryAlignment); 3502 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3503 address start = __ pc(); 3504 3505 const Register x = r0; 3506 const Register xlen = r1; 3507 const Register z = r2; 3508 const Register zlen = r3; 3509 const Register y = r4; // == x 3510 const Register ylen = r5; // == xlen 3511 3512 const Register tmp1 = r10; 3513 const Register tmp2 = r11; 3514 const Register tmp3 = r12; 3515 const Register tmp4 = r13; 3516 const Register tmp5 = r14; 3517 const Register tmp6 = r15; 3518 const Register tmp7 = r16; 3519 3520 RegSet spilled_regs = RegSet::of(y, ylen); 3521 BLOCK_COMMENT("Entry:"); 3522 __ enter(); 3523 __ push(spilled_regs, sp); 3524 __ mov(y, x); 3525 __ mov(ylen, xlen); 3526 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3527 __ pop(spilled_regs, sp); 3528 __ leave(); 3529 __ ret(lr); 3530 return start; 3531 } 3532 3533 address generate_mulAdd() { 3534 __ align(CodeEntryAlignment); 3535 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3536 3537 address start = __ pc(); 3538 3539 const Register out = r0; 3540 const Register in = r1; 3541 const Register offset = r2; 3542 const Register len = r3; 3543 const Register k = r4; 3544 3545 BLOCK_COMMENT("Entry:"); 3546 __ enter(); 3547 __ mul_add(out, in, offset, len, k); 3548 __ leave(); 3549 __ ret(lr); 3550 3551 return start; 3552 } 3553 3554 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3555 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3556 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3557 // Karatsuba multiplication performs a 128*128 -> 256-bit 3558 // multiplication in three 128-bit multiplications and a few 3559 // additions. 
3560 // 3561 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3562 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3563 // 3564 // Inputs: 3565 // 3566 // A0 in a.d[0] (subkey) 3567 // A1 in a.d[1] 3568 // (A1+A0) in a1_xor_a0.d[0] 3569 // 3570 // B0 in b.d[0] (state) 3571 // B1 in b.d[1] 3572 3573 __ ext(tmp1, __ T16B, b, b, 0x08); 3574 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3575 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3576 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3577 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3578 3579 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3580 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3581 __ eor(tmp2, __ T16B, tmp2, tmp4); 3582 __ eor(tmp2, __ T16B, tmp2, tmp3); 3583 3584 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3585 __ ins(result_hi, __ D, tmp2, 0, 1); 3586 __ ins(result_lo, __ D, tmp2, 1, 0); 3587 } 3588 3589 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3590 FloatRegister p, FloatRegister z, FloatRegister t1) { 3591 const FloatRegister t0 = result; 3592 3593 // The GCM field polynomial f is z^128 + p(z), where p = 3594 // z^7+z^2+z+1. 3595 // 3596 // z^128 === -p(z) (mod (z^128 + p(z))) 3597 // 3598 // so, given that the product we're reducing is 3599 // a == lo + hi * z^128 3600 // substituting, 3601 // === lo - hi * p(z) (mod (z^128 + p(z))) 3602 // 3603 // we reduce by multiplying hi by p(z) and subtracting the result 3604 // from (i.e. XORing it with) lo. Because p has no nonzero high 3605 // bits we can do this with two 64-bit multiplications, lo*p and 3606 // hi*p. 3607 3608 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3609 __ ext(t1, __ T16B, t0, z, 8); 3610 __ eor(hi, __ T16B, hi, t1); 3611 __ ext(t1, __ T16B, z, t0, 8); 3612 __ eor(lo, __ T16B, lo, t1); 3613 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3614 __ eor(result, __ T16B, lo, t0); 3615 } 3616 3617 address generate_has_negatives(address &has_negatives_long) { 3618 const u1 large_loop_size = 64; 3619 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3620 int dcache_line = VM_Version::dcache_line_size(); 3621 3622 Register ary1 = r1, len = r2, result = r0; 3623 3624 __ align(CodeEntryAlignment); 3625 3626 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3627 3628 address entry = __ pc(); 3629 3630 __ enter(); 3631 3632 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3633 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3634 3635 __ cmp(len, (u1)15); 3636 __ br(Assembler::GT, LEN_OVER_15); 3637 // The only case when execution falls into this code is when pointer is near 3638 // the end of memory page and we have to avoid reading next page 3639 __ add(ary1, ary1, len); 3640 __ subs(len, len, 8); 3641 __ br(Assembler::GT, LEN_OVER_8); 3642 __ ldr(rscratch2, Address(ary1, -8)); 3643 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
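    // At this point len holds (length - 8) <= 0 and ary1 points just past the
    // end of the array, so the ldr above read the 8 bytes ending at the array
    // end; up to (8 - length) of its low-order bytes (little-endian load) lie
    // before the array start. rscratch1 == (8 - length) * 8, so the lsrv below
    // shifts those out-of-range bytes away before the UPPER_BIT_MASK test.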
3644 __ lsrv(rscratch2, rscratch2, rscratch1); 3645 __ tst(rscratch2, UPPER_BIT_MASK); 3646 __ cset(result, Assembler::NE); 3647 __ leave(); 3648 __ ret(lr); 3649 __ bind(LEN_OVER_8); 3650 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3651 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3652 __ tst(rscratch2, UPPER_BIT_MASK); 3653 __ br(Assembler::NE, RET_TRUE_NO_POP); 3654 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3655 __ lsrv(rscratch1, rscratch1, rscratch2); 3656 __ tst(rscratch1, UPPER_BIT_MASK); 3657 __ cset(result, Assembler::NE); 3658 __ leave(); 3659 __ ret(lr); 3660 3661 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3662 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3663 3664 has_negatives_long = __ pc(); // 2nd entry point 3665 3666 __ enter(); 3667 3668 __ bind(LEN_OVER_15); 3669 __ push(spilled_regs, sp); 3670 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3671 __ cbz(rscratch2, ALIGNED); 3672 __ ldp(tmp6, tmp1, Address(ary1)); 3673 __ mov(tmp5, 16); 3674 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3675 __ add(ary1, ary1, rscratch1); 3676 __ sub(len, len, rscratch1); 3677 __ orr(tmp6, tmp6, tmp1); 3678 __ tst(tmp6, UPPER_BIT_MASK); 3679 __ br(Assembler::NE, RET_TRUE); 3680 3681 __ bind(ALIGNED); 3682 __ cmp(len, large_loop_size); 3683 __ br(Assembler::LT, CHECK_16); 3684 // Perform 16-byte load as early return in pre-loop to handle situation 3685 // when initially aligned large array has negative values at starting bytes, 3686 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3687 // slower. Cases with negative bytes further ahead won't be affected that 3688 // much. In fact, it'll be faster due to early loads, less instructions and 3689 // less branches in LARGE_LOOP. 3690 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3691 __ sub(len, len, 16); 3692 __ orr(tmp6, tmp6, tmp1); 3693 __ tst(tmp6, UPPER_BIT_MASK); 3694 __ br(Assembler::NE, RET_TRUE); 3695 __ cmp(len, large_loop_size); 3696 __ br(Assembler::LT, CHECK_16); 3697 3698 if (SoftwarePrefetchHintDistance >= 0 3699 && SoftwarePrefetchHintDistance >= dcache_line) { 3700 // initial prefetch 3701 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3702 } 3703 __ bind(LARGE_LOOP); 3704 if (SoftwarePrefetchHintDistance >= 0) { 3705 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3706 } 3707 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3708 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3709 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3710 // instructions per cycle and have less branches, but this approach disables 3711 // early return, thus, all 64 bytes are loaded and checked every time. 
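    // For reference, the check below is equivalent to this scalar sketch
    // (illustration only, not generated code):
    //
    //   uint64_t w[8];                       // the 64 bytes of this iteration
    //   memcpy(w, ary1, sizeof w);
    //   uint64_t acc = (w[0] | w[1]) | (w[2] | w[3]) |
    //                  (w[4] | w[5]) | (w[6] | w[7]);
    //   if (acc & UPPER_BIT_MASK) return true;   // some byte has bit 7 set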
3712 __ ldp(tmp2, tmp3, Address(ary1)); 3713 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3714 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3715 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3716 __ add(ary1, ary1, large_loop_size); 3717 __ sub(len, len, large_loop_size); 3718 __ orr(tmp2, tmp2, tmp3); 3719 __ orr(tmp4, tmp4, tmp5); 3720 __ orr(rscratch1, rscratch1, rscratch2); 3721 __ orr(tmp6, tmp6, tmp1); 3722 __ orr(tmp2, tmp2, tmp4); 3723 __ orr(rscratch1, rscratch1, tmp6); 3724 __ orr(tmp2, tmp2, rscratch1); 3725 __ tst(tmp2, UPPER_BIT_MASK); 3726 __ br(Assembler::NE, RET_TRUE); 3727 __ cmp(len, large_loop_size); 3728 __ br(Assembler::GE, LARGE_LOOP); 3729 3730 __ bind(CHECK_16); // small 16-byte load pre-loop 3731 __ cmp(len, (u1)16); 3732 __ br(Assembler::LT, POST_LOOP16); 3733 3734 __ bind(LOOP16); // small 16-byte load loop 3735 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3736 __ sub(len, len, 16); 3737 __ orr(tmp2, tmp2, tmp3); 3738 __ tst(tmp2, UPPER_BIT_MASK); 3739 __ br(Assembler::NE, RET_TRUE); 3740 __ cmp(len, (u1)16); 3741 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3742 3743 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3744 __ cmp(len, (u1)8); 3745 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3746 __ ldr(tmp3, Address(__ post(ary1, 8))); 3747 __ sub(len, len, 8); 3748 __ tst(tmp3, UPPER_BIT_MASK); 3749 __ br(Assembler::NE, RET_TRUE); 3750 3751 __ bind(POST_LOOP16_LOAD_TAIL); 3752 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3753 __ ldr(tmp1, Address(ary1)); 3754 __ mov(tmp2, 64); 3755 __ sub(tmp4, tmp2, len, __ LSL, 3); 3756 __ lslv(tmp1, tmp1, tmp4); 3757 __ tst(tmp1, UPPER_BIT_MASK); 3758 __ br(Assembler::NE, RET_TRUE); 3759 // Fallthrough 3760 3761 __ bind(RET_FALSE); 3762 __ pop(spilled_regs, sp); 3763 __ leave(); 3764 __ mov(result, zr); 3765 __ ret(lr); 3766 3767 __ bind(RET_TRUE); 3768 __ pop(spilled_regs, sp); 3769 __ bind(RET_TRUE_NO_POP); 3770 __ leave(); 3771 __ mov(result, 1); 3772 __ ret(lr); 3773 3774 __ bind(DONE); 3775 __ pop(spilled_regs, sp); 3776 __ leave(); 3777 __ ret(lr); 3778 return entry; 3779 } 3780 3781 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3782 bool usePrefetch, Label &NOT_EQUAL) { 3783 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3784 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3785 tmp7 = r12, tmp8 = r13; 3786 Label LOOP; 3787 3788 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3789 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3790 __ bind(LOOP); 3791 if (usePrefetch) { 3792 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3793 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3794 } 3795 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3796 __ eor(tmp1, tmp1, tmp2); 3797 __ eor(tmp3, tmp3, tmp4); 3798 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3799 __ orr(tmp1, tmp1, tmp3); 3800 __ cbnz(tmp1, NOT_EQUAL); 3801 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3802 __ eor(tmp5, tmp5, tmp6); 3803 __ eor(tmp7, tmp7, tmp8); 3804 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3805 __ orr(tmp5, tmp5, tmp7); 3806 __ cbnz(tmp5, NOT_EQUAL); 3807 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3808 __ eor(tmp1, tmp1, tmp2); 3809 __ eor(tmp3, tmp3, tmp4); 3810 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3811 __ orr(tmp1, tmp1, tmp3); 3812 __ cbnz(tmp1, NOT_EQUAL); 3813 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3814 __ eor(tmp5, tmp5, tmp6); 
3815 __ sub(cnt1, cnt1, 8 * wordSize); 3816 __ eor(tmp7, tmp7, tmp8); 3817 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3818 // tmp6 is not used. MacroAssembler::subs is used here (rather than 3819 // cmp) because subs allows an unlimited range of immediate operand. 3820 __ subs(tmp6, cnt1, loopThreshold); 3821 __ orr(tmp5, tmp5, tmp7); 3822 __ cbnz(tmp5, NOT_EQUAL); 3823 __ br(__ GE, LOOP); 3824 // post-loop 3825 __ eor(tmp1, tmp1, tmp2); 3826 __ eor(tmp3, tmp3, tmp4); 3827 __ orr(tmp1, tmp1, tmp3); 3828 __ sub(cnt1, cnt1, 2 * wordSize); 3829 __ cbnz(tmp1, NOT_EQUAL); 3830 } 3831 3832 void generate_large_array_equals_loop_simd(int loopThreshold, 3833 bool usePrefetch, Label &NOT_EQUAL) { 3834 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3835 tmp2 = rscratch2; 3836 Label LOOP; 3837 3838 __ bind(LOOP); 3839 if (usePrefetch) { 3840 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3841 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3842 } 3843 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3844 __ sub(cnt1, cnt1, 8 * wordSize); 3845 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3846 __ subs(tmp1, cnt1, loopThreshold); 3847 __ eor(v0, __ T16B, v0, v4); 3848 __ eor(v1, __ T16B, v1, v5); 3849 __ eor(v2, __ T16B, v2, v6); 3850 __ eor(v3, __ T16B, v3, v7); 3851 __ orr(v0, __ T16B, v0, v1); 3852 __ orr(v1, __ T16B, v2, v3); 3853 __ orr(v0, __ T16B, v0, v1); 3854 __ umov(tmp1, v0, __ D, 0); 3855 __ umov(tmp2, v0, __ D, 1); 3856 __ orr(tmp1, tmp1, tmp2); 3857 __ cbnz(tmp1, NOT_EQUAL); 3858 __ br(__ GE, LOOP); 3859 } 3860 3861 // a1 = r1 - array1 address 3862 // a2 = r2 - array2 address 3863 // result = r0 - return value. Already contains "false" 3864 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 3865 // r3-r5 are reserved temporary registers 3866 address generate_large_array_equals() { 3867 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3868 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3869 tmp7 = r12, tmp8 = r13; 3870 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 3871 SMALL_LOOP, POST_LOOP; 3872 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 3873 // calculate if at least 32 prefetched bytes are used 3874 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 3875 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 3876 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 3877 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 3878 tmp5, tmp6, tmp7, tmp8); 3879 3880 __ align(CodeEntryAlignment); 3881 3882 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 3883 3884 address entry = __ pc(); 3885 __ enter(); 3886 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 3887 // also advance pointers to use post-increment instead of pre-increment 3888 __ add(a1, a1, wordSize); 3889 __ add(a2, a2, wordSize); 3890 if (AvoidUnalignedAccesses) { 3891 // both implementations (SIMD/nonSIMD) are using relatively large load 3892 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 3893 // on some CPUs in case of address is not at least 16-byte aligned. 3894 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 3895 // load if needed at least for 1st address and make if 16-byte aligned. 
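      // In outline, the peel below does (illustrative sketch only):
      //
      //   if ((uintptr_t)a1 & 8) {          // 8-byte but not 16-byte aligned
      //     if (*(uint64_t*)a1 != *(uint64_t*)a2) return false;
      //     a1 += 8; a2 += 8; cnt1 -= 8;    // a1 is now 16-byte aligned
      //   }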
3896 Label ALIGNED16; 3897 __ tbz(a1, 3, ALIGNED16); 3898 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3899 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3900 __ sub(cnt1, cnt1, wordSize); 3901 __ eor(tmp1, tmp1, tmp2); 3902 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3903 __ bind(ALIGNED16); 3904 } 3905 if (UseSIMDForArrayEquals) { 3906 if (SoftwarePrefetchHintDistance >= 0) { 3907 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3908 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3909 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3910 /* prfm = */ true, NOT_EQUAL); 3911 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3912 __ br(__ LT, TAIL); 3913 } 3914 __ bind(NO_PREFETCH_LARGE_LOOP); 3915 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3916 /* prfm = */ false, NOT_EQUAL); 3917 } else { 3918 __ push(spilled_regs, sp); 3919 if (SoftwarePrefetchHintDistance >= 0) { 3920 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3921 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3922 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3923 /* prfm = */ true, NOT_EQUAL); 3924 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3925 __ br(__ LT, TAIL); 3926 } 3927 __ bind(NO_PREFETCH_LARGE_LOOP); 3928 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3929 /* prfm = */ false, NOT_EQUAL); 3930 } 3931 __ bind(TAIL); 3932 __ cbz(cnt1, EQUAL); 3933 __ subs(cnt1, cnt1, wordSize); 3934 __ br(__ LE, POST_LOOP); 3935 __ bind(SMALL_LOOP); 3936 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3937 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3938 __ subs(cnt1, cnt1, wordSize); 3939 __ eor(tmp1, tmp1, tmp2); 3940 __ cbnz(tmp1, NOT_EQUAL); 3941 __ br(__ GT, SMALL_LOOP); 3942 __ bind(POST_LOOP); 3943 __ ldr(tmp1, Address(a1, cnt1)); 3944 __ ldr(tmp2, Address(a2, cnt1)); 3945 __ eor(tmp1, tmp1, tmp2); 3946 __ cbnz(tmp1, NOT_EQUAL); 3947 __ bind(EQUAL); 3948 __ mov(result, true); 3949 __ bind(NOT_EQUAL); 3950 if (!UseSIMDForArrayEquals) { 3951 __ pop(spilled_regs, sp); 3952 } 3953 __ bind(NOT_EQUAL_NO_POP); 3954 __ leave(); 3955 __ ret(lr); 3956 return entry; 3957 } 3958 3959 address generate_dsin_dcos(bool isCos) { 3960 __ align(CodeEntryAlignment); 3961 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3962 address start = __ pc(); 3963 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3964 (address)StubRoutines::aarch64::_two_over_pi, 3965 (address)StubRoutines::aarch64::_pio2, 3966 (address)StubRoutines::aarch64::_dsin_coef, 3967 (address)StubRoutines::aarch64::_dcos_coef); 3968 return start; 3969 } 3970 3971 address generate_dlog() { 3972 __ align(CodeEntryAlignment); 3973 StubCodeMark mark(this, "StubRoutines", "dlog"); 3974 address entry = __ pc(); 3975 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 3976 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 3977 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 3978 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 3979 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 3980 return entry; 3981 } 3982 3983 // code for comparing 16 bytes of strings with same encoding 3984 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 3985 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 3986 __ ldr(rscratch1, Address(__ post(str1, 8))); 3987 __ eor(rscratch2, tmp1, tmp2); 3988 __ ldr(cnt1, Address(__ post(str2, 8))); 3989 __ cbnz(rscratch2, DIFF1); 3990 __ ldr(tmp1, Address(__ post(str1, 8))); 3991 __ eor(rscratch2, rscratch1, cnt1); 3992 __ ldr(tmp2, Address(__ post(str2, 8))); 3993 __ cbnz(rscratch2, DIFF2); 3994 } 3995 3996 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 3997 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 3998 Label &DIFF2) { 3999 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4000 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4001 4002 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4003 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4004 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4005 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4006 4007 __ fmovd(tmpL, vtmp3); 4008 __ eor(rscratch2, tmp3, tmpL); 4009 __ cbnz(rscratch2, DIFF2); 4010 4011 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4012 __ umov(tmpL, vtmp3, __ D, 1); 4013 __ eor(rscratch2, tmpU, tmpL); 4014 __ cbnz(rscratch2, DIFF1); 4015 4016 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4017 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4018 __ fmovd(tmpL, vtmp); 4019 __ eor(rscratch2, tmp3, tmpL); 4020 __ cbnz(rscratch2, DIFF2); 4021 4022 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4023 __ umov(tmpL, vtmp, __ D, 1); 4024 __ eor(rscratch2, tmpU, tmpL); 4025 __ cbnz(rscratch2, DIFF1); 4026 } 4027 4028 // r0 = result 4029 // r1 = str1 4030 // r2 = cnt1 4031 // r3 = str2 4032 // r4 = cnt2 4033 // r10 = tmp1 4034 // r11 = tmp2 4035 address generate_compare_long_string_different_encoding(bool isLU) { 4036 __ align(CodeEntryAlignment); 4037 StubCodeMark mark(this, "StubRoutines", isLU 4038 ? 
"compare_long_string_different_encoding LU" 4039 : "compare_long_string_different_encoding UL"); 4040 address entry = __ pc(); 4041 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4042 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4043 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4044 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4045 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4046 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4047 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4048 4049 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4050 4051 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4052 // cnt2 == amount of characters left to compare 4053 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4054 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4055 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4056 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4057 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4058 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4059 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4060 __ eor(rscratch2, tmp1, tmp2); 4061 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4062 __ mov(rscratch1, tmp2); 4063 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4064 Register strU = isLU ? str2 : str1, 4065 strL = isLU ? str1 : str2, 4066 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4067 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4068 __ push(spilled_regs, sp); 4069 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4070 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4071 4072 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4073 4074 if (SoftwarePrefetchHintDistance >= 0) { 4075 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4076 __ br(__ LT, SMALL_LOOP); 4077 __ bind(LARGE_LOOP_PREFETCH); 4078 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4079 __ mov(tmp4, 2); 4080 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4081 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4082 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4083 __ subs(tmp4, tmp4, 1); 4084 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4085 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4086 __ mov(tmp4, 2); 4087 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4088 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4089 __ subs(tmp4, tmp4, 1); 4090 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4091 __ sub(cnt2, cnt2, 64); 4092 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4093 __ br(__ GE, LARGE_LOOP_PREFETCH); 4094 } 4095 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4096 __ subs(cnt2, cnt2, 16); 4097 __ br(__ LT, TAIL); 4098 __ b(SMALL_LOOP_ENTER); 4099 __ bind(SMALL_LOOP); // smaller loop 4100 __ subs(cnt2, cnt2, 16); 4101 __ bind(SMALL_LOOP_ENTER); 4102 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4103 __ br(__ GE, SMALL_LOOP); 4104 __ cbz(cnt2, LOAD_LAST); 4105 __ bind(TAIL); // 1..15 characters left 4106 __ subs(zr, cnt2, -8); 4107 __ br(__ GT, TAIL_LOAD_16); 4108 __ ldrd(vtmp, Address(tmp2)); 4109 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4110 4111 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4112 __ fmovd(tmpL, vtmp3); 4113 __ eor(rscratch2, tmp3, tmpL); 4114 __ cbnz(rscratch2, DIFF2); 4115 __ umov(tmpL, vtmp3, __ D, 1); 4116 __ eor(rscratch2, tmpU, tmpL); 4117 __ cbnz(rscratch2, DIFF1); 4118 __ b(LOAD_LAST); 4119 __ bind(TAIL_LOAD_16); 4120 __ ldrq(vtmp, Address(tmp2)); 4121 __ ldr(tmpU, Address(__ post(cnt1, 
8))); 4122 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4123 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4124 __ fmovd(tmpL, vtmp3); 4125 __ eor(rscratch2, tmp3, tmpL); 4126 __ cbnz(rscratch2, DIFF2); 4127 4128 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4129 __ umov(tmpL, vtmp3, __ D, 1); 4130 __ eor(rscratch2, tmpU, tmpL); 4131 __ cbnz(rscratch2, DIFF1); 4132 4133 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4134 __ fmovd(tmpL, vtmp); 4135 __ eor(rscratch2, tmp3, tmpL); 4136 __ cbnz(rscratch2, DIFF2); 4137 4138 __ umov(tmpL, vtmp, __ D, 1); 4139 __ eor(rscratch2, tmpU, tmpL); 4140 __ cbnz(rscratch2, DIFF1); 4141 __ b(LOAD_LAST); 4142 __ bind(DIFF2); 4143 __ mov(tmpU, tmp3); 4144 __ bind(DIFF1); 4145 __ pop(spilled_regs, sp); 4146 __ b(CALCULATE_DIFFERENCE); 4147 __ bind(LOAD_LAST); 4148 __ pop(spilled_regs, sp); 4149 4150 __ ldrs(vtmp, Address(strL)); 4151 __ ldr(tmpU, Address(strU)); 4152 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4153 __ fmovd(tmpL, vtmp); 4154 4155 __ eor(rscratch2, tmpU, tmpL); 4156 __ cbz(rscratch2, DONE); 4157 4158 // Find the first different characters in the longwords and 4159 // compute their difference. 4160 __ bind(CALCULATE_DIFFERENCE); 4161 __ rev(rscratch2, rscratch2); 4162 __ clz(rscratch2, rscratch2); 4163 __ andr(rscratch2, rscratch2, -16); 4164 __ lsrv(tmp1, tmp1, rscratch2); 4165 __ uxthw(tmp1, tmp1); 4166 __ lsrv(rscratch1, rscratch1, rscratch2); 4167 __ uxthw(rscratch1, rscratch1); 4168 __ subw(result, tmp1, rscratch1); 4169 __ bind(DONE); 4170 __ ret(lr); 4171 return entry; 4172 } 4173 4174 // r0 = result 4175 // r1 = str1 4176 // r2 = cnt1 4177 // r3 = str2 4178 // r4 = cnt2 4179 // r10 = tmp1 4180 // r11 = tmp2 4181 address generate_compare_long_string_same_encoding(bool isLL) { 4182 __ align(CodeEntryAlignment); 4183 StubCodeMark mark(this, "StubRoutines", isLL 4184 ? "compare_long_string_same_encoding LL" 4185 : "compare_long_string_same_encoding UU"); 4186 address entry = __ pc(); 4187 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4188 tmp1 = r10, tmp2 = r11; 4189 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4190 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4191 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4192 // exit from large loop when less than 64 bytes left to read or we're about 4193 // to prefetch memory behind array border 4194 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4195 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4196 // update cnt2 counter with already loaded 8 bytes 4197 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4198 // update pointers, because of previous read 4199 __ add(str1, str1, wordSize); 4200 __ add(str2, str2, wordSize); 4201 if (SoftwarePrefetchHintDistance >= 0) { 4202 __ bind(LARGE_LOOP_PREFETCH); 4203 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4204 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4205 compare_string_16_bytes_same(DIFF, DIFF2); 4206 compare_string_16_bytes_same(DIFF, DIFF2); 4207 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4208 compare_string_16_bytes_same(DIFF, DIFF2); 4209 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4210 compare_string_16_bytes_same(DIFF, DIFF2); 4211 __ br(__ GT, LARGE_LOOP_PREFETCH); 4212 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4213 // less than 16 bytes left? 4214 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4215 __ br(__ LT, TAIL); 4216 } 4217 __ bind(SMALL_LOOP); 4218 compare_string_16_bytes_same(DIFF, DIFF2); 4219 __ subs(cnt2, cnt2, isLL ? 
16 : 8); 4220 __ br(__ GE, SMALL_LOOP); 4221 __ bind(TAIL); 4222 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4223 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4224 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4225 __ br(__ LE, CHECK_LAST); 4226 __ eor(rscratch2, tmp1, tmp2); 4227 __ cbnz(rscratch2, DIFF); 4228 __ ldr(tmp1, Address(__ post(str1, 8))); 4229 __ ldr(tmp2, Address(__ post(str2, 8))); 4230 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4231 __ bind(CHECK_LAST); 4232 if (!isLL) { 4233 __ add(cnt2, cnt2, cnt2); // now in bytes 4234 } 4235 __ eor(rscratch2, tmp1, tmp2); 4236 __ cbnz(rscratch2, DIFF); 4237 __ ldr(rscratch1, Address(str1, cnt2)); 4238 __ ldr(cnt1, Address(str2, cnt2)); 4239 __ eor(rscratch2, rscratch1, cnt1); 4240 __ cbz(rscratch2, LENGTH_DIFF); 4241 // Find the first different characters in the longwords and 4242 // compute their difference. 4243 __ bind(DIFF2); 4244 __ rev(rscratch2, rscratch2); 4245 __ clz(rscratch2, rscratch2); 4246 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4247 __ lsrv(rscratch1, rscratch1, rscratch2); 4248 if (isLL) { 4249 __ lsrv(cnt1, cnt1, rscratch2); 4250 __ uxtbw(rscratch1, rscratch1); 4251 __ uxtbw(cnt1, cnt1); 4252 } else { 4253 __ lsrv(cnt1, cnt1, rscratch2); 4254 __ uxthw(rscratch1, rscratch1); 4255 __ uxthw(cnt1, cnt1); 4256 } 4257 __ subw(result, rscratch1, cnt1); 4258 __ b(LENGTH_DIFF); 4259 __ bind(DIFF); 4260 __ rev(rscratch2, rscratch2); 4261 __ clz(rscratch2, rscratch2); 4262 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4263 __ lsrv(tmp1, tmp1, rscratch2); 4264 if (isLL) { 4265 __ lsrv(tmp2, tmp2, rscratch2); 4266 __ uxtbw(tmp1, tmp1); 4267 __ uxtbw(tmp2, tmp2); 4268 } else { 4269 __ lsrv(tmp2, tmp2, rscratch2); 4270 __ uxthw(tmp1, tmp1); 4271 __ uxthw(tmp2, tmp2); 4272 } 4273 __ subw(result, tmp1, tmp2); 4274 __ b(LENGTH_DIFF); 4275 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4276 __ eor(rscratch2, tmp1, tmp2); 4277 __ cbnz(rscratch2, DIFF); 4278 __ bind(LENGTH_DIFF); 4279 __ ret(lr); 4280 return entry; 4281 } 4282 4283 void generate_compare_long_strings() { 4284 StubRoutines::aarch64::_compare_long_string_LL 4285 = generate_compare_long_string_same_encoding(true); 4286 StubRoutines::aarch64::_compare_long_string_UU 4287 = generate_compare_long_string_same_encoding(false); 4288 StubRoutines::aarch64::_compare_long_string_LU 4289 = generate_compare_long_string_different_encoding(true); 4290 StubRoutines::aarch64::_compare_long_string_UL 4291 = generate_compare_long_string_different_encoding(false); 4292 } 4293 4294 // R0 = result 4295 // R1 = str2 4296 // R2 = cnt1 4297 // R3 = str1 4298 // R4 = cnt2 4299 // This generic linear code use few additional ideas, which makes it faster: 4300 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 4301 // in order to skip initial loading(help in systems with 1 ld pipeline) 4302 // 2) we can use "fast" algorithm of finding single character to search for 4303 // first symbol with less branches(1 branch per each loaded register instead 4304 // of branch for each symbol), so, this is where constants like 4305 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 4306 // 3) after loading and analyzing 1st register of source string, it can be 4307 // used to search for every 1st character entry, saving few loads in 4308 // comparison with "simplier-but-slower" implementation 4309 // 4) in order to avoid lots of push/pop operations, code below is heavily 4310 // re-using/re-initializing/compressing register values, which makes code 4311 // larger and a bit less readable, however, 
most of extra operations are 4312 // issued during loads or branches, so, penalty is minimal 4313 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4314 const char* stubName = str1_isL 4315 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4316 : "indexof_linear_uu"; 4317 __ align(CodeEntryAlignment); 4318 StubCodeMark mark(this, "StubRoutines", stubName); 4319 address entry = __ pc(); 4320 4321 int str1_chr_size = str1_isL ? 1 : 2; 4322 int str2_chr_size = str2_isL ? 1 : 2; 4323 int str1_chr_shift = str1_isL ? 0 : 1; 4324 int str2_chr_shift = str2_isL ? 0 : 1; 4325 bool isL = str1_isL && str2_isL; 4326 // parameters 4327 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4328 // temporary registers 4329 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4330 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4331 // redefinitions 4332 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4333 4334 __ push(spilled_regs, sp); 4335 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4336 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4337 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4338 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4339 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4340 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4341 // Read whole register from str1. It is safe, because length >=8 here 4342 __ ldr(ch1, Address(str1)); 4343 // Read whole register from str2. It is safe, because length >=8 here 4344 __ ldr(ch2, Address(str2)); 4345 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4346 if (str1_isL != str2_isL) { 4347 __ eor(v0, __ T16B, v0, v0); 4348 } 4349 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4350 __ mul(first, first, tmp1); 4351 // check if we have less than 1 register to check 4352 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4353 if (str1_isL != str2_isL) { 4354 __ fmovd(v1, ch1); 4355 } 4356 __ br(__ LE, L_SMALL); 4357 __ eor(ch2, first, ch2); 4358 if (str1_isL != str2_isL) { 4359 __ zip1(v1, __ T16B, v1, v0); 4360 } 4361 __ sub(tmp2, ch2, tmp1); 4362 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4363 __ bics(tmp2, tmp2, ch2); 4364 if (str1_isL != str2_isL) { 4365 __ fmovd(ch1, v1); 4366 } 4367 __ br(__ NE, L_HAS_ZERO); 4368 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4369 __ add(result, result, wordSize/str2_chr_size); 4370 __ add(str2, str2, wordSize); 4371 __ br(__ LT, L_POST_LOOP); 4372 __ BIND(L_LOOP); 4373 __ ldr(ch2, Address(str2)); 4374 __ eor(ch2, first, ch2); 4375 __ sub(tmp2, ch2, tmp1); 4376 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4377 __ bics(tmp2, tmp2, ch2); 4378 __ br(__ NE, L_HAS_ZERO); 4379 __ BIND(L_LOOP_PROCEED); 4380 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4381 __ add(str2, str2, wordSize); 4382 __ add(result, result, wordSize/str2_chr_size); 4383 __ br(__ GE, L_LOOP); 4384 __ BIND(L_POST_LOOP); 4385 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4386 __ br(__ LE, NOMATCH); 4387 __ ldr(ch2, Address(str2)); 4388 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4389 __ eor(ch2, first, ch2); 4390 __ sub(tmp2, ch2, tmp1); 4391 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4392 __ mov(tmp4, -1); // all bits set 4393 __ b(L_SMALL_PROCEED); 4394 __ align(OptoLoopAlignment); 4395 __ BIND(L_SMALL); 4396 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4397 __ eor(ch2, first, ch2); 4398 if (str1_isL != str2_isL) { 4399 __ zip1(v1, __ T16B, v1, v0); 4400 } 4401 __ sub(tmp2, ch2, tmp1); 4402 __ mov(tmp4, -1); // all bits set 4403 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4404 if (str1_isL != str2_isL) { 4405 __ fmovd(ch1, v1); // move converted 4 symbols 4406 } 4407 __ BIND(L_SMALL_PROCEED); 4408 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4409 __ bic(tmp2, tmp2, ch2); 4410 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4411 __ rbit(tmp2, tmp2); 4412 __ br(__ EQ, NOMATCH); 4413 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4414 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4415 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4416 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4417 if (str2_isL) { // LL 4418 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4419 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4420 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4421 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4422 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4423 } else { 4424 __ mov(ch2, 0xE); // all bits in byte set except last one 4425 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4426 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4427 __ lslv(tmp2, tmp2, tmp4); 4428 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4429 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4430 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4431 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4432 } 4433 __ cmp(ch1, ch2); 4434 __ mov(tmp4, wordSize/str2_chr_size); 4435 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4436 __ BIND(L_SMALL_CMP_LOOP); 4437 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4438 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4439 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4440 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4441 __ add(tmp4, tmp4, 1); 4442 __ cmp(tmp4, cnt1); 4443 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4444 __ cmp(first, ch2); 4445 __ br(__ EQ, L_SMALL_CMP_LOOP); 4446 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4447 __ cbz(tmp2, NOMATCH); // no more matches. exit 4448 __ clz(tmp4, tmp2); 4449 __ add(result, result, 1); // advance index 4450 __ add(str2, str2, str2_chr_size); // advance pointer 4451 __ b(L_SMALL_HAS_ZERO_LOOP); 4452 __ align(OptoLoopAlignment); 4453 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4454 __ cmp(first, ch2); 4455 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4456 __ b(DONE); 4457 __ align(OptoLoopAlignment); 4458 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4459 if (str2_isL) { // LL 4460 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4461 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 
4462 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4463 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4464 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4465 } else { 4466 __ mov(ch2, 0xE); // all bits in byte set except last one 4467 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4468 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4469 __ lslv(tmp2, tmp2, tmp4); 4470 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4471 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4472 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4473 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4474 } 4475 __ cmp(ch1, ch2); 4476 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4477 __ b(DONE); 4478 __ align(OptoLoopAlignment); 4479 __ BIND(L_HAS_ZERO); 4480 __ rbit(tmp2, tmp2); 4481 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4482 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4483 // It's fine because both counters are 32bit and are not changed in this 4484 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4485 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4486 __ sub(result, result, 1); 4487 __ BIND(L_HAS_ZERO_LOOP); 4488 __ mov(cnt1, wordSize/str2_chr_size); 4489 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4490 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4491 if (str2_isL) { 4492 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4493 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4494 __ lslv(tmp2, tmp2, tmp4); 4495 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4496 __ add(tmp4, tmp4, 1); 4497 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4498 __ lsl(tmp2, tmp2, 1); 4499 __ mov(tmp4, wordSize/str2_chr_size); 4500 } else { 4501 __ mov(ch2, 0xE); 4502 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4503 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4504 __ lslv(tmp2, tmp2, tmp4); 4505 __ add(tmp4, tmp4, 1); 4506 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4507 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4508 __ lsl(tmp2, tmp2, 1); 4509 __ mov(tmp4, wordSize/str2_chr_size); 4510 __ sub(str2, str2, str2_chr_size); 4511 } 4512 __ cmp(ch1, ch2); 4513 __ mov(tmp4, wordSize/str2_chr_size); 4514 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4515 __ BIND(L_CMP_LOOP); 4516 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4517 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4518 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4519 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4520 __ add(tmp4, tmp4, 1); 4521 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4522 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4523 __ cmp(cnt1, ch2); 4524 __ br(__ EQ, L_CMP_LOOP); 4525 __ BIND(L_CMP_LOOP_NOMATCH); 4526 // here we're not matched 4527 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. 
Proceed to main loop 4528 __ clz(tmp4, tmp2); 4529 __ add(str2, str2, str2_chr_size); // advance pointer 4530 __ b(L_HAS_ZERO_LOOP); 4531 __ align(OptoLoopAlignment); 4532 __ BIND(L_CMP_LOOP_LAST_CMP); 4533 __ cmp(cnt1, ch2); 4534 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4535 __ b(DONE); 4536 __ align(OptoLoopAlignment); 4537 __ BIND(L_CMP_LOOP_LAST_CMP2); 4538 if (str2_isL) { 4539 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4540 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4541 __ lslv(tmp2, tmp2, tmp4); 4542 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4543 __ add(tmp4, tmp4, 1); 4544 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4545 __ lsl(tmp2, tmp2, 1); 4546 } else { 4547 __ mov(ch2, 0xE); 4548 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4549 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4550 __ lslv(tmp2, tmp2, tmp4); 4551 __ add(tmp4, tmp4, 1); 4552 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4553 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4554 __ lsl(tmp2, tmp2, 1); 4555 __ sub(str2, str2, str2_chr_size); 4556 } 4557 __ cmp(ch1, ch2); 4558 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4559 __ b(DONE); 4560 __ align(OptoLoopAlignment); 4561 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4562 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4563 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4564 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4565 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4566 // result by analyzed characters value, so, we can just reset lower bits 4567 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4568 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4569 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4570 // index of last analyzed substring inside current octet. So, str2 in at 4571 // respective start address. 
We need to advance it to next octet 4572 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4573 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4574 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4575 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4576 __ movw(cnt2, cnt2); 4577 __ b(L_LOOP_PROCEED); 4578 __ align(OptoLoopAlignment); 4579 __ BIND(NOMATCH); 4580 __ mov(result, -1); 4581 __ BIND(DONE); 4582 __ pop(spilled_regs, sp); 4583 __ ret(lr); 4584 return entry; 4585 } 4586 4587 void generate_string_indexof_stubs() { 4588 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4589 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4590 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4591 } 4592 4593 void inflate_and_store_2_fp_registers(bool generatePrfm, 4594 FloatRegister src1, FloatRegister src2) { 4595 Register dst = r1; 4596 __ zip1(v1, __ T16B, src1, v0); 4597 __ zip2(v2, __ T16B, src1, v0); 4598 if (generatePrfm) { 4599 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4600 } 4601 __ zip1(v3, __ T16B, src2, v0); 4602 __ zip2(v4, __ T16B, src2, v0); 4603 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4604 } 4605 4606 // R0 = src 4607 // R1 = dst 4608 // R2 = len 4609 // R3 = len >> 3 4610 // V0 = 0 4611 // v1 = loaded 8 bytes 4612 address generate_large_byte_array_inflate() { 4613 __ align(CodeEntryAlignment); 4614 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4615 address entry = __ pc(); 4616 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4617 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4618 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4619 4620 // do one more 8-byte read to have address 16-byte aligned in most cases 4621 // also use single store instruction 4622 __ ldrd(v2, __ post(src, 8)); 4623 __ sub(octetCounter, octetCounter, 2); 4624 __ zip1(v1, __ T16B, v1, v0); 4625 __ zip1(v2, __ T16B, v2, v0); 4626 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4627 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4628 __ subs(rscratch1, octetCounter, large_loop_threshold); 4629 __ br(__ LE, LOOP_START); 4630 __ b(LOOP_PRFM_START); 4631 __ bind(LOOP_PRFM); 4632 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4633 __ bind(LOOP_PRFM_START); 4634 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4635 __ sub(octetCounter, octetCounter, 8); 4636 __ subs(rscratch1, octetCounter, large_loop_threshold); 4637 inflate_and_store_2_fp_registers(true, v3, v4); 4638 inflate_and_store_2_fp_registers(true, v5, v6); 4639 __ br(__ GT, LOOP_PRFM); 4640 __ cmp(octetCounter, (u1)8); 4641 __ br(__ LT, DONE); 4642 __ bind(LOOP); 4643 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4644 __ bind(LOOP_START); 4645 __ sub(octetCounter, octetCounter, 8); 4646 __ cmp(octetCounter, (u1)8); 4647 inflate_and_store_2_fp_registers(false, v3, v4); 4648 inflate_and_store_2_fp_registers(false, v5, v6); 4649 __ br(__ GE, LOOP); 4650 __ bind(DONE); 4651 __ ret(lr); 4652 return entry; 4653 } 4654 4655 /** 4656 * Arguments: 4657 * 4658 * Input: 4659 * c_rarg0 - current state address 4660 * c_rarg1 - H key address 4661 * c_rarg2 - data address 4662 * c_rarg3 - number of blocks 4663 * 4664 * Output: 4665 * Updated state at c_rarg0 4666 */ 4667 address generate_ghash_processBlocks() { 4668 // Bafflingly, GCM uses 
little-endian for the byte order, but 4669 // big-endian for the bit order. For example, the polynomial 1 is 4670 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4671 // 4672 // So, we must either reverse the bytes in each word and do 4673 // everything big-endian or reverse the bits in each byte and do 4674 // it little-endian. On AArch64 it's more idiomatic to reverse 4675 // the bits in each byte (we have an instruction, RBIT, to do 4676 // that) and keep the data in little-endian bit order throught the 4677 // calculation, bit-reversing the inputs and outputs. 4678 4679 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4680 __ align(wordSize * 2); 4681 address p = __ pc(); 4682 __ emit_int64(0x87); // The low-order bits of the field 4683 // polynomial (i.e. p = z^7+z^2+z+1) 4684 // repeated in the low and high parts of a 4685 // 128-bit vector 4686 __ emit_int64(0x87); 4687 4688 __ align(CodeEntryAlignment); 4689 address start = __ pc(); 4690 4691 Register state = c_rarg0; 4692 Register subkeyH = c_rarg1; 4693 Register data = c_rarg2; 4694 Register blocks = c_rarg3; 4695 4696 FloatRegister vzr = v30; 4697 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4698 4699 __ ldrq(v0, Address(state)); 4700 __ ldrq(v1, Address(subkeyH)); 4701 4702 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4703 __ rbit(v0, __ T16B, v0); 4704 __ rev64(v1, __ T16B, v1); 4705 __ rbit(v1, __ T16B, v1); 4706 4707 __ ldrq(v26, p); 4708 4709 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4710 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4711 4712 { 4713 Label L_ghash_loop; 4714 __ bind(L_ghash_loop); 4715 4716 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4717 // reversing each byte 4718 __ rbit(v2, __ T16B, v2); 4719 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4720 4721 // Multiply state in v2 by subkey in v1 4722 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4723 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4724 /*temps*/v6, v20, v18, v21); 4725 // Reduce v7:v5 by the field polynomial 4726 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4727 4728 __ sub(blocks, blocks, 1); 4729 __ cbnz(blocks, L_ghash_loop); 4730 } 4731 4732 // The bit-reversed result is at this point in v0 4733 __ rev64(v1, __ T16B, v0); 4734 __ rbit(v1, __ T16B, v1); 4735 4736 __ st1(v1, __ T16B, state); 4737 __ ret(lr); 4738 4739 return start; 4740 } 4741 4742 // Continuation point for throwing of implicit exceptions that are 4743 // not handled in the current activation. Fabricates an exception 4744 // oop and initiates normal exception dispatching in this 4745 // frame. Since we need to preserve callee-saved values (currently 4746 // only for C2, but done for C1 as well) we need a callee-saved oop 4747 // map and therefore have to make these stubs into RuntimeStubs 4748 // rather than BufferBlobs. If the compiler needs all registers to 4749 // be preserved between the fault point and the exception handler 4750 // then it must assume responsibility for that in 4751 // AbstractCompiler::continuation_for_implicit_null_exception or 4752 // continuation_for_implicit_division_by_zero_exception. All other 4753 // implicit exceptions (e.g., NullPointerException or 4754 // AbstractMethodError on entry) are either at call sites or 4755 // otherwise assume that stack unwinding will be initiated, so 4756 // caller saved registers were assumed volatile in the compiler. 
4757 4758 #undef __ 4759 #define __ masm-> 4760 4761 address generate_throw_exception(const char* name, 4762 address runtime_entry, 4763 Register arg1 = noreg, 4764 Register arg2 = noreg) { 4765 // Information about frame layout at time of blocking runtime call. 4766 // Note that we only have to preserve callee-saved registers since 4767 // the compilers are responsible for supplying a continuation point 4768 // if they expect all registers to be preserved. 4769 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4770 enum layout { 4771 rfp_off = 0, 4772 rfp_off2, 4773 return_off, 4774 return_off2, 4775 framesize // inclusive of return address 4776 }; 4777 4778 int insts_size = 512; 4779 int locs_size = 64; 4780 4781 CodeBuffer code(name, insts_size, locs_size); 4782 OopMapSet* oop_maps = new OopMapSet(); 4783 MacroAssembler* masm = new MacroAssembler(&code); 4784 4785 address start = __ pc(); 4786 4787 // This is an inlined and slightly modified version of call_VM 4788 // which has the ability to fetch the return PC out of 4789 // thread-local storage and also sets up last_Java_sp slightly 4790 // differently than the real call_VM 4791 4792 __ enter(); // Save FP and LR before call 4793 4794 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4795 4796 // lr and fp are already in place 4797 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4798 4799 int frame_complete = __ pc() - start; 4800 4801 // Set up last_Java_sp and last_Java_fp 4802 address the_pc = __ pc(); 4803 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4804 4805 // Call runtime 4806 if (arg1 != noreg) { 4807 assert(arg2 != c_rarg1, "clobbered"); 4808 __ mov(c_rarg1, arg1); 4809 } 4810 if (arg2 != noreg) { 4811 __ mov(c_rarg2, arg2); 4812 } 4813 __ mov(c_rarg0, rthread); 4814 BLOCK_COMMENT("call runtime_entry"); 4815 __ mov(rscratch1, runtime_entry); 4816 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4817 4818 // Generate oop map 4819 OopMap* map = new OopMap(framesize, 0); 4820 4821 oop_maps->add_gc_map(the_pc - start, map); 4822 4823 __ reset_last_Java_frame(true); 4824 __ maybe_isb(); 4825 4826 __ leave(); 4827 4828 // check for pending exceptions 4829 #ifdef ASSERT 4830 Label L; 4831 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4832 __ cbnz(rscratch1, L); 4833 __ should_not_reach_here(); 4834 __ bind(L); 4835 #endif // ASSERT 4836 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4837 4838 4839 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4840 RuntimeStub* stub = 4841 RuntimeStub::new_runtime_stub(name, 4842 &code, 4843 frame_complete, 4844 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4845 oop_maps, false); 4846 return stub->entry_point(); 4847 } 4848 4849 class MontgomeryMultiplyGenerator : public MacroAssembler { 4850 4851 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4852 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4853 4854 RegSet _toSave; 4855 bool _squaring; 4856 4857 public: 4858 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4859 : MacroAssembler(as->code()), _squaring(squaring) { 4860 4861 // Register allocation 4862 4863 Register reg = c_rarg0; 4864 Pa_base = reg; // Argument registers 4865 if (squaring) 4866 Pb_base = Pa_base; 4867 else 4868 Pb_base = ++reg; 4869 Pn_base = ++reg; 4870 Rlen= ++reg; 4871 inv = ++reg; 4872 Pm_base = ++reg; 4873 4874 // Working registers: 4875 Ra = ++reg; // The current digit of a, b, n, and m. 
4876 Rb = ++reg; 4877 Rm = ++reg; 4878 Rn = ++reg; 4879 4880 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4881 Pb = ++reg; 4882 Pm = ++reg; 4883 Pn = ++reg; 4884 4885 t0 = ++reg; // Three registers which form a 4886 t1 = ++reg; // triple-precision accumuator. 4887 t2 = ++reg; 4888 4889 Ri = ++reg; // Inner and outer loop indexes. 4890 Rj = ++reg; 4891 4892 Rhi_ab = ++reg; // Product registers: low and high parts 4893 Rlo_ab = ++reg; // of a*b and m*n. 4894 Rhi_mn = ++reg; 4895 Rlo_mn = ++reg; 4896 4897 // r19 and up are callee-saved. 4898 _toSave = RegSet::range(r19, reg) + Pm_base; 4899 } 4900 4901 private: 4902 void save_regs() { 4903 push(_toSave, sp); 4904 } 4905 4906 void restore_regs() { 4907 pop(_toSave, sp); 4908 } 4909 4910 template <typename T> 4911 void unroll_2(Register count, T block) { 4912 Label loop, end, odd; 4913 tbnz(count, 0, odd); 4914 cbz(count, end); 4915 align(16); 4916 bind(loop); 4917 (this->*block)(); 4918 bind(odd); 4919 (this->*block)(); 4920 subs(count, count, 2); 4921 br(Assembler::GT, loop); 4922 bind(end); 4923 } 4924 4925 template <typename T> 4926 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4927 Label loop, end, odd; 4928 tbnz(count, 0, odd); 4929 cbz(count, end); 4930 align(16); 4931 bind(loop); 4932 (this->*block)(d, s, tmp); 4933 bind(odd); 4934 (this->*block)(d, s, tmp); 4935 subs(count, count, 2); 4936 br(Assembler::GT, loop); 4937 bind(end); 4938 } 4939 4940 void pre1(RegisterOrConstant i) { 4941 block_comment("pre1"); 4942 // Pa = Pa_base; 4943 // Pb = Pb_base + i; 4944 // Pm = Pm_base; 4945 // Pn = Pn_base + i; 4946 // Ra = *Pa; 4947 // Rb = *Pb; 4948 // Rm = *Pm; 4949 // Rn = *Pn; 4950 ldr(Ra, Address(Pa_base)); 4951 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4952 ldr(Rm, Address(Pm_base)); 4953 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4954 lea(Pa, Address(Pa_base)); 4955 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4956 lea(Pm, Address(Pm_base)); 4957 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4958 4959 // Zero the m*n result. 4960 mov(Rhi_mn, zr); 4961 mov(Rlo_mn, zr); 4962 } 4963 4964 // The core multiply-accumulate step of a Montgomery 4965 // multiplication. The idea is to schedule operations as a 4966 // pipeline so that instructions with long latencies (loads and 4967 // multiplies) have time to complete before their results are 4968 // used. This most benefits in-order implementations of the 4969 // architecture but out-of-order ones also benefit. 4970 void step() { 4971 block_comment("step"); 4972 // MACC(Ra, Rb, t0, t1, t2); 4973 // Ra = *++Pa; 4974 // Rb = *--Pb; 4975 umulh(Rhi_ab, Ra, Rb); 4976 mul(Rlo_ab, Ra, Rb); 4977 ldr(Ra, pre(Pa, wordSize)); 4978 ldr(Rb, pre(Pb, -wordSize)); 4979 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4980 // previous iteration. 
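    // (MACC(x, y, t0, t1, t2) in these pseudocode comments denotes a
    //  multiply-accumulate into the triple-word accumulator t2:t1:t0,
    //  roughly: form the 128-bit product x*y and add it into t2:t1:t0
    //  with carry propagation -- which is what each umulh/mul pair
    //  followed by acc() implements.)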
4981 // MACC(Rm, Rn, t0, t1, t2); 4982 // Rm = *++Pm; 4983 // Rn = *--Pn; 4984 umulh(Rhi_mn, Rm, Rn); 4985 mul(Rlo_mn, Rm, Rn); 4986 ldr(Rm, pre(Pm, wordSize)); 4987 ldr(Rn, pre(Pn, -wordSize)); 4988 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4989 } 4990 4991 void post1() { 4992 block_comment("post1"); 4993 4994 // MACC(Ra, Rb, t0, t1, t2); 4995 // Ra = *++Pa; 4996 // Rb = *--Pb; 4997 umulh(Rhi_ab, Ra, Rb); 4998 mul(Rlo_ab, Ra, Rb); 4999 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5000 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5001 5002 // *Pm = Rm = t0 * inv; 5003 mul(Rm, t0, inv); 5004 str(Rm, Address(Pm)); 5005 5006 // MACC(Rm, Rn, t0, t1, t2); 5007 // t0 = t1; t1 = t2; t2 = 0; 5008 umulh(Rhi_mn, Rm, Rn); 5009 5010 #ifndef PRODUCT 5011 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5012 { 5013 mul(Rlo_mn, Rm, Rn); 5014 add(Rlo_mn, t0, Rlo_mn); 5015 Label ok; 5016 cbz(Rlo_mn, ok); { 5017 stop("broken Montgomery multiply"); 5018 } bind(ok); 5019 } 5020 #endif 5021 // We have very carefully set things up so that 5022 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5023 // the lower half of Rm * Rn because we know the result already: 5024 // it must be -t0. t0 + (-t0) must generate a carry iff 5025 // t0 != 0. So, rather than do a mul and an adds we just set 5026 // the carry flag iff t0 is nonzero. 5027 // 5028 // mul(Rlo_mn, Rm, Rn); 5029 // adds(zr, t0, Rlo_mn); 5030 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5031 adcs(t0, t1, Rhi_mn); 5032 adc(t1, t2, zr); 5033 mov(t2, zr); 5034 } 5035 5036 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5037 block_comment("pre2"); 5038 // Pa = Pa_base + i-len; 5039 // Pb = Pb_base + len; 5040 // Pm = Pm_base + i-len; 5041 // Pn = Pn_base + len; 5042 5043 if (i.is_register()) { 5044 sub(Rj, i.as_register(), len); 5045 } else { 5046 mov(Rj, i.as_constant()); 5047 sub(Rj, Rj, len); 5048 } 5049 // Rj == i-len 5050 5051 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5052 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5053 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5054 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5055 5056 // Ra = *++Pa; 5057 // Rb = *--Pb; 5058 // Rm = *++Pm; 5059 // Rn = *--Pn; 5060 ldr(Ra, pre(Pa, wordSize)); 5061 ldr(Rb, pre(Pb, -wordSize)); 5062 ldr(Rm, pre(Pm, wordSize)); 5063 ldr(Rn, pre(Pn, -wordSize)); 5064 5065 mov(Rhi_mn, zr); 5066 mov(Rlo_mn, zr); 5067 } 5068 5069 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5070 block_comment("post2"); 5071 if (i.is_constant()) { 5072 mov(Rj, i.as_constant()-len.as_constant()); 5073 } else { 5074 sub(Rj, i.as_register(), len); 5075 } 5076 5077 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5078 5079 // As soon as we know the least significant digit of our result, 5080 // store it. 5081 // Pm_base[i-len] = t0; 5082 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5083 5084 // t0 = t1; t1 = t2; t2 = 0; 5085 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5086 adc(t1, t2, zr); 5087 mov(t2, zr); 5088 } 5089 5090 // A carry in t0 after Montgomery multiplication means that we 5091 // should subtract multiples of n from our result in m. We'll 5092 // keep doing that until there is no carry. 
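  // In C terms, normalize() behaves roughly like the sketch below, where
  // sub_n() is a hypothetical helper naming the word-by-word borrowing
  // subtract that the loop implements (illustration only):
  //
  //   uint64_t sub_n(uint64_t* m, const uint64_t* n, size_t len) {
  //     uint64_t borrow = 0;
  //     for (size_t i = 0; i < len; i++) {
  //       uint64_t mi = m[i], ni = n[i];
  //       m[i] = mi - ni - borrow;
  //       borrow = (mi < ni) || (mi == ni && borrow);
  //     }
  //     return borrow;                      // 1 if the subtraction wrapped
  //   }
  //
  //   while (t0 != 0)
  //     t0 -= sub_n(Pm_base, Pn_base, len);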
5093 void normalize(RegisterOrConstant len) { 5094 block_comment("normalize"); 5095 // while (t0) 5096 // t0 = sub(Pm_base, Pn_base, t0, len); 5097 Label loop, post, again; 5098 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5099 cbz(t0, post); { 5100 bind(again); { 5101 mov(i, zr); 5102 mov(cnt, len); 5103 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5104 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5105 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5106 align(16); 5107 bind(loop); { 5108 sbcs(Rm, Rm, Rn); 5109 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5110 add(i, i, 1); 5111 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5112 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5113 sub(cnt, cnt, 1); 5114 } cbnz(cnt, loop); 5115 sbc(t0, t0, zr); 5116 } cbnz(t0, again); 5117 } bind(post); 5118 } 5119 5120 // Move memory at s to d, reversing words. 5121 // Increments d to end of copied memory 5122 // Destroys tmp1, tmp2 5123 // Preserves len 5124 // Leaves s pointing to the address which was in d at start 5125 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5126 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5127 5128 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5129 mov(tmp1, len); 5130 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5131 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5132 } 5133 // where 5134 void reverse1(Register d, Register s, Register tmp) { 5135 ldr(tmp, pre(s, -wordSize)); 5136 ror(tmp, tmp, 32); 5137 str(tmp, post(d, wordSize)); 5138 } 5139 5140 void step_squaring() { 5141 // An extra ACC 5142 step(); 5143 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5144 } 5145 5146 void last_squaring(RegisterOrConstant i) { 5147 Label dont; 5148 // if ((i & 1) == 0) { 5149 tbnz(i.as_register(), 0, dont); { 5150 // MACC(Ra, Rb, t0, t1, t2); 5151 // Ra = *++Pa; 5152 // Rb = *--Pb; 5153 umulh(Rhi_ab, Ra, Rb); 5154 mul(Rlo_ab, Ra, Rb); 5155 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5156 } bind(dont); 5157 } 5158 5159 void extra_step_squaring() { 5160 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5161 5162 // MACC(Rm, Rn, t0, t1, t2); 5163 // Rm = *++Pm; 5164 // Rn = *--Pn; 5165 umulh(Rhi_mn, Rm, Rn); 5166 mul(Rlo_mn, Rm, Rn); 5167 ldr(Rm, pre(Pm, wordSize)); 5168 ldr(Rn, pre(Pn, -wordSize)); 5169 } 5170 5171 void post1_squaring() { 5172 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5173 5174 // *Pm = Rm = t0 * inv; 5175 mul(Rm, t0, inv); 5176 str(Rm, Address(Pm)); 5177 5178 // MACC(Rm, Rn, t0, t1, t2); 5179 // t0 = t1; t1 = t2; t2 = 0; 5180 umulh(Rhi_mn, Rm, Rn); 5181 5182 #ifndef PRODUCT 5183 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5184 { 5185 mul(Rlo_mn, Rm, Rn); 5186 add(Rlo_mn, t0, Rlo_mn); 5187 Label ok; 5188 cbz(Rlo_mn, ok); { 5189 stop("broken Montgomery multiply"); 5190 } bind(ok); 5191 } 5192 #endif 5193 // We have very carefully set things up so that 5194 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5195 // the lower half of Rm * Rn because we know the result already: 5196 // it must be -t0. t0 + (-t0) must generate a carry iff 5197 // t0 != 0. So, rather than do a mul and an adds we just set 5198 // the carry flag iff t0 is nonzero. 
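// (Why the subs works: "subs zr, t0, 1" computes t0 - 1 and sets the
// carry flag to NOT(borrow); the subtraction borrows only when t0 == 0,
// so C ends up set exactly when t0 is nonzero. That is the same carry
// the commented-out adds(zr, t0, Rlo_mn) would have produced, since
// Rlo_mn would hold -t0.)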
5199 // 5200 // mul(Rlo_mn, Rm, Rn); 5201 // adds(zr, t0, Rlo_mn); 5202 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5203 adcs(t0, t1, Rhi_mn); 5204 adc(t1, t2, zr); 5205 mov(t2, zr); 5206 } 5207 5208 void acc(Register Rhi, Register Rlo, 5209 Register t0, Register t1, Register t2) { 5210 adds(t0, t0, Rlo); 5211 adcs(t1, t1, Rhi); 5212 adc(t2, t2, zr); 5213 } 5214 5215 public: 5216 /** 5217 * Fast Montgomery multiplication. The derivation of the 5218 * algorithm is in A Cryptographic Library for the Motorola 5219 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5220 * 5221 * Arguments: 5222 * 5223 * Inputs for multiplication: 5224 * c_rarg0 - int array elements a 5225 * c_rarg1 - int array elements b 5226 * c_rarg2 - int array elements n (the modulus) 5227 * c_rarg3 - int length 5228 * c_rarg4 - int inv 5229 * c_rarg5 - int array elements m (the result) 5230 * 5231 * Inputs for squaring: 5232 * c_rarg0 - int array elements a 5233 * c_rarg1 - int array elements n (the modulus) 5234 * c_rarg2 - int length 5235 * c_rarg3 - int inv 5236 * c_rarg4 - int array elements m (the result) 5237 * 5238 */ 5239 address generate_multiply() { 5240 Label argh, nothing; 5241 bind(argh); 5242 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5243 5244 align(CodeEntryAlignment); 5245 address entry = pc(); 5246 5247 cbzw(Rlen, nothing); 5248 5249 enter(); 5250 5251 // Make room. 5252 cmpw(Rlen, 512); 5253 br(Assembler::HI, argh); 5254 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5255 andr(sp, Ra, -2 * wordSize); 5256 5257 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5258 5259 { 5260 // Copy input args, reversing as we go. We use Ra as a 5261 // temporary variable. 5262 reverse(Ra, Pa_base, Rlen, t0, t1); 5263 if (!_squaring) 5264 reverse(Ra, Pb_base, Rlen, t0, t1); 5265 reverse(Ra, Pn_base, Rlen, t0, t1); 5266 } 5267 5268 // Push all call-saved registers and also Pm_base which we'll need 5269 // at the end. 
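// (Pm_base is about to be repointed at the scratch area carved out on
// the stack (mov(Pm_base, Ra) below); the value saved here is what
// lets us recover the caller's result array when the answer is copied
// back out at the end.)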
5270 save_regs(); 5271 5272 #ifndef PRODUCT 5273 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5274 { 5275 ldr(Rn, Address(Pn_base, 0)); 5276 mul(Rlo_mn, Rn, inv); 5277 subs(zr, Rlo_mn, -1); 5278 Label ok; 5279 br(EQ, ok); { 5280 stop("broken inverse in Montgomery multiply"); 5281 } bind(ok); 5282 } 5283 #endif 5284 5285 mov(Pm_base, Ra); 5286 5287 mov(t0, zr); 5288 mov(t1, zr); 5289 mov(t2, zr); 5290 5291 block_comment("for (int i = 0; i < len; i++) {"); 5292 mov(Ri, zr); { 5293 Label loop, end; 5294 cmpw(Ri, Rlen); 5295 br(Assembler::GE, end); 5296 5297 bind(loop); 5298 pre1(Ri); 5299 5300 block_comment(" for (j = i; j; j--) {"); { 5301 movw(Rj, Ri); 5302 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5303 } block_comment(" } // j"); 5304 5305 post1(); 5306 addw(Ri, Ri, 1); 5307 cmpw(Ri, Rlen); 5308 br(Assembler::LT, loop); 5309 bind(end); 5310 block_comment("} // i"); 5311 } 5312 5313 block_comment("for (int i = len; i < 2*len; i++) {"); 5314 mov(Ri, Rlen); { 5315 Label loop, end; 5316 cmpw(Ri, Rlen, Assembler::LSL, 1); 5317 br(Assembler::GE, end); 5318 5319 bind(loop); 5320 pre2(Ri, Rlen); 5321 5322 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5323 lslw(Rj, Rlen, 1); 5324 subw(Rj, Rj, Ri); 5325 subw(Rj, Rj, 1); 5326 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5327 } block_comment(" } // j"); 5328 5329 post2(Ri, Rlen); 5330 addw(Ri, Ri, 1); 5331 cmpw(Ri, Rlen, Assembler::LSL, 1); 5332 br(Assembler::LT, loop); 5333 bind(end); 5334 } 5335 block_comment("} // i"); 5336 5337 normalize(Rlen); 5338 5339 mov(Ra, Pm_base); // Save Pm_base in Ra 5340 restore_regs(); // Restore caller's Pm_base 5341 5342 // Copy our result into caller's Pm_base 5343 reverse(Pm_base, Ra, Rlen, t0, t1); 5344 5345 leave(); 5346 bind(nothing); 5347 ret(lr); 5348 5349 return entry; 5350 } 5351 // In C, approximately: 5352 5353 // void 5354 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5355 // unsigned long Pn_base[], unsigned long Pm_base[], 5356 // unsigned long inv, int len) { 5357 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5358 // unsigned long *Pa, *Pb, *Pn, *Pm; 5359 // unsigned long Ra, Rb, Rn, Rm; 5360 5361 // int i; 5362 5363 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5364 5365 // for (i = 0; i < len; i++) { 5366 // int j; 5367 5368 // Pa = Pa_base; 5369 // Pb = Pb_base + i; 5370 // Pm = Pm_base; 5371 // Pn = Pn_base + i; 5372 5373 // Ra = *Pa; 5374 // Rb = *Pb; 5375 // Rm = *Pm; 5376 // Rn = *Pn; 5377 5378 // int iters = i; 5379 // for (j = 0; iters--; j++) { 5380 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5381 // MACC(Ra, Rb, t0, t1, t2); 5382 // Ra = *++Pa; 5383 // Rb = *--Pb; 5384 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5385 // MACC(Rm, Rn, t0, t1, t2); 5386 // Rm = *++Pm; 5387 // Rn = *--Pn; 5388 // } 5389 5390 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5391 // MACC(Ra, Rb, t0, t1, t2); 5392 // *Pm = Rm = t0 * inv; 5393 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5394 // MACC(Rm, Rn, t0, t1, t2); 5395 5396 // assert(t0 == 0, "broken Montgomery multiply"); 5397 5398 // t0 = t1; t1 = t2; t2 = 0; 5399 // } 5400 5401 // for (i = len; i < 2*len; i++) { 5402 // int j; 5403 5404 // Pa = Pa_base + i-len; 5405 // Pb = Pb_base + len; 5406 // Pm = Pm_base + i-len; 5407 // Pn = Pn_base + len; 5408 5409 // Ra = *++Pa; 5410 // Rb = *--Pb; 5411 // Rm = *++Pm; 5412 // Rn = *--Pn; 5413 5414 // int iters = len*2-i-1; 
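//     /* Here j runs from i-len+1 up to len-1, i.e. len*2-i-1 times:
//        these are the surviving a[j]*b[i-j] (and m[j]*n[i-j]) terms
//        on diagonal i once both indices are restricted to [0, len). */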
5415 // for (j = i-len+1; iters--; j++) { 5416 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5417 // MACC(Ra, Rb, t0, t1, t2); 5418 // Ra = *++Pa; 5419 // Rb = *--Pb; 5420 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5421 // MACC(Rm, Rn, t0, t1, t2); 5422 // Rm = *++Pm; 5423 // Rn = *--Pn; 5424 // } 5425 5426 // Pm_base[i-len] = t0; 5427 // t0 = t1; t1 = t2; t2 = 0; 5428 // } 5429 5430 // while (t0) 5431 // t0 = sub(Pm_base, Pn_base, t0, len); 5432 // } 5433 5434 /** 5435 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5436 * multiplies than Montgomery multiplication so it should be up to 5437 * 25% faster. However, its loop control is more complex and it 5438 * may actually run slower on some machines. 5439 * 5440 * Arguments: 5441 * 5442 * Inputs: 5443 * c_rarg0 - int array elements a 5444 * c_rarg1 - int array elements n (the modulus) 5445 * c_rarg2 - int length 5446 * c_rarg3 - int inv 5447 * c_rarg4 - int array elements m (the result) 5448 * 5449 */ 5450 address generate_square() { 5451 Label argh; 5452 bind(argh); 5453 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5454 5455 align(CodeEntryAlignment); 5456 address entry = pc(); 5457 5458 enter(); 5459 5460 // Make room. 5461 cmpw(Rlen, 512); 5462 br(Assembler::HI, argh); 5463 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5464 andr(sp, Ra, -2 * wordSize); 5465 5466 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5467 5468 { 5469 // Copy input args, reversing as we go. We use Ra as a 5470 // temporary variable. 5471 reverse(Ra, Pa_base, Rlen, t0, t1); 5472 reverse(Ra, Pn_base, Rlen, t0, t1); 5473 } 5474 5475 // Push all call-saved registers and also Pm_base which we'll need 5476 // at the end. 5477 save_regs(); 5478 5479 mov(Pm_base, Ra); 5480 5481 mov(t0, zr); 5482 mov(t1, zr); 5483 mov(t2, zr); 5484 5485 block_comment("for (int i = 0; i < len; i++) {"); 5486 mov(Ri, zr); { 5487 Label loop, end; 5488 bind(loop); 5489 cmp(Ri, Rlen); 5490 br(Assembler::GE, end); 5491 5492 pre1(Ri); 5493 5494 block_comment("for (j = (i+1)/2; j; j--) {"); { 5495 add(Rj, Ri, 1); 5496 lsr(Rj, Rj, 1); 5497 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5498 } block_comment(" } // j"); 5499 5500 last_squaring(Ri); 5501 5502 block_comment(" for (j = i/2; j; j--) {"); { 5503 lsr(Rj, Ri, 1); 5504 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5505 } block_comment(" } // j"); 5506 5507 post1_squaring(); 5508 add(Ri, Ri, 1); 5509 cmp(Ri, Rlen); 5510 br(Assembler::LT, loop); 5511 5512 bind(end); 5513 block_comment("} // i"); 5514 } 5515 5516 block_comment("for (int i = len; i < 2*len; i++) {"); 5517 mov(Ri, Rlen); { 5518 Label loop, end; 5519 bind(loop); 5520 cmp(Ri, Rlen, Assembler::LSL, 1); 5521 br(Assembler::GE, end); 5522 5523 pre2(Ri, Rlen); 5524 5525 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5526 lsl(Rj, Rlen, 1); 5527 sub(Rj, Rj, Ri); 5528 sub(Rj, Rj, 1); 5529 lsr(Rj, Rj, 1); 5530 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5531 } block_comment(" } // j"); 5532 5533 last_squaring(Ri); 5534 5535 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5536 lsl(Rj, Rlen, 1); 5537 sub(Rj, Rj, Ri); 5538 lsr(Rj, Rj, 1); 5539 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5540 } block_comment(" } // j"); 5541 5542 post2(Ri, Rlen); 5543 add(Ri, Ri, 1); 5544 cmp(Ri, Rlen, Assembler::LSL, 1); 5545 5546 br(Assembler::LT, loop); 5547 bind(end); 5548 block_comment("} // i"); 5549 } 5550 5551 normalize(Rlen); 5552 5553 
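// As in generate_multiply(): stash the scratch-area Pm_base in Ra,
// restore the caller's Pm_base, then copy the result out, reversing
// as we go.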
mov(Ra, Pm_base); // Save Pm_base in Ra
5554 restore_regs(); // Restore caller's Pm_base
5555
5556 // Copy our result into caller's Pm_base
5557 reverse(Pm_base, Ra, Rlen, t0, t1);
5558
5559 leave();
5560 ret(lr);
5561
5562 return entry;
5563 }
5564 // In C, approximately:
5565
5566 // void
5567 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5568 // unsigned long Pm_base[], unsigned long inv, int len) {
5569 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5570 // unsigned long *Pa, *Pb, *Pn, *Pm;
5571 // unsigned long Ra, Rb, Rn, Rm;
5572
5573 // int i;
5574
5575 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5576
5577 // for (i = 0; i < len; i++) {
5578 // int j;
5579
5580 // Pa = Pa_base;
5581 // Pb = Pa_base + i;
5582 // Pm = Pm_base;
5583 // Pn = Pn_base + i;
5584
5585 // Ra = *Pa;
5586 // Rb = *Pb;
5587 // Rm = *Pm;
5588 // Rn = *Pn;
5589
5590 // int iters = (i+1)/2;
5591 // for (j = 0; iters--; j++) {
5592 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5593 // MACC2(Ra, Rb, t0, t1, t2);
5594 // Ra = *++Pa;
5595 // Rb = *--Pb;
5596 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5597 // MACC(Rm, Rn, t0, t1, t2);
5598 // Rm = *++Pm;
5599 // Rn = *--Pn;
5600 // }
5601 // if ((i & 1) == 0) {
5602 // assert(Ra == Pa_base[j], "must be");
5603 // MACC(Ra, Ra, t0, t1, t2);
5604 // }
5605 // iters = i/2;
5606 // assert(iters == i-j, "must be");
5607 // for (; iters--; j++) {
5608 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5609 // MACC(Rm, Rn, t0, t1, t2);
5610 // Rm = *++Pm;
5611 // Rn = *--Pn;
5612 // }
5613
5614 // *Pm = Rm = t0 * inv;
5615 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5616 // MACC(Rm, Rn, t0, t1, t2);
5617
5618 // assert(t0 == 0, "broken Montgomery multiply");
5619
5620 // t0 = t1; t1 = t2; t2 = 0;
5621 // }
5622
5623 // for (i = len; i < 2*len; i++) {
5624 // int start = i-len+1;
5625 // int end = start + (len - start)/2;
5626 // int j;
5627
5628 // Pa = Pa_base + i-len;
5629 // Pb = Pa_base + len;
5630 // Pm = Pm_base + i-len;
5631 // Pn = Pn_base + len;
5632
5633 // Ra = *++Pa;
5634 // Rb = *--Pb;
5635 // Rm = *++Pm;
5636 // Rn = *--Pn;
5637
5638 // int iters = (2*len-i-1)/2;
5639 // assert(iters == end-start, "must be");
5640 // for (j = start; iters--; j++) {
5641 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5642 // MACC2(Ra, Rb, t0, t1, t2);
5643 // Ra = *++Pa;
5644 // Rb = *--Pb;
5645 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5646 // MACC(Rm, Rn, t0, t1, t2);
5647 // Rm = *++Pm;
5648 // Rn = *--Pn;
5649 // }
5650 // if ((i & 1) == 0) {
5651 // assert(Ra == Pa_base[j], "must be");
5652 // MACC(Ra, Ra, t0, t1, t2);
5653 // }
5654 // iters = (2*len-i)/2;
5655 // assert(iters == len-j, "must be");
5656 // for (; iters--; j++) {
5657 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5658 // MACC(Rm, Rn, t0, t1, t2);
5659 // Rm = *++Pm;
5660 // Rn = *--Pn;
5661 // }
5662 // Pm_base[i-len] = t0;
5663 // t0 = t1; t1 = t2; t2 = 0;
5664 // }
5665
5666 // while (t0)
5667 // t0 = sub(Pm_base, Pn_base, t0, len);
5668 // }
5669 };
5670
5671
5672 // Initialization
5673 void generate_initial() {
5674 // Generate initial stubs and initialize the entry points
5675
5676 // entry points that exist in all platforms. Note: This is code
5677 // that could be shared among different platforms; however, the
5678 // benefit seems to be smaller than the disadvantage of having a
5679 // much more complicated generator structure. See also comment in
5680 // stubRoutines.hpp.
5681
5682 StubRoutines::_forward_exception_entry = generate_forward_exception();
5683
5684 StubRoutines::_call_stub_entry =
5685 generate_call_stub(StubRoutines::_call_stub_return_address);
5686
5687 // is referenced by megamorphic call
5688 StubRoutines::_catch_exception_entry = generate_catch_exception();
5689
5690 // Build this early so it's available for the interpreter.
5691 StubRoutines::_throw_StackOverflowError_entry =
5692 generate_throw_exception("StackOverflowError throw_exception",
5693 CAST_FROM_FN_PTR(address,
5694 SharedRuntime::throw_StackOverflowError));
5695 StubRoutines::_throw_delayed_StackOverflowError_entry =
5696 generate_throw_exception("delayed StackOverflowError throw_exception",
5697 CAST_FROM_FN_PTR(address,
5698 SharedRuntime::throw_delayed_StackOverflowError));
5699 if (UseCRC32Intrinsics) {
5700 // set the table address before generating the stubs that use it
5701 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5702 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5703 }
5704
5705 if (UseCRC32CIntrinsics) {
5706 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5707 }
5708
5709 // Disabled until JDK-8210858 is fixed
5710 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5711 // StubRoutines::_dlog = generate_dlog();
5712 // }
5713
5714 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5715 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5716 }
5717
5718 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5719 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5720 }
5721 }
5722
5723 void generate_all() {
5724 // support for verify_oop (must happen after universe_init)
5725 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5726 StubRoutines::_throw_AbstractMethodError_entry =
5727 generate_throw_exception("AbstractMethodError throw_exception",
5728 CAST_FROM_FN_PTR(address,
5729 SharedRuntime::
5730 throw_AbstractMethodError));
5731
5732 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5733 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5734 CAST_FROM_FN_PTR(address,
5735 SharedRuntime::
5736 throw_IncompatibleClassChangeError));
5737
5738 StubRoutines::_throw_NullPointerException_at_call_entry =
5739 generate_throw_exception("NullPointerException at call throw_exception",
5740 CAST_FROM_FN_PTR(address,
5741 SharedRuntime::
5742 throw_NullPointerException_at_call));
5743
5744 // arraycopy stubs used by compilers
5745 generate_arraycopy_stubs();
5746
5747 // has negatives stub for large arrays.
5748 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5749
5750 // array equals stub for large arrays.
5751 if (!UseSimpleArrayEquals) {
5752 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5753 }
5754
5755 generate_compare_long_strings();
5756
5757 generate_string_indexof_stubs();
5758
5759 // byte_array_inflate stub for large arrays.
5760 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5761 5762 #ifdef COMPILER2 5763 if (UseMultiplyToLenIntrinsic) { 5764 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5765 } 5766 5767 if (UseSquareToLenIntrinsic) { 5768 StubRoutines::_squareToLen = generate_squareToLen(); 5769 } 5770 5771 if (UseMulAddIntrinsic) { 5772 StubRoutines::_mulAdd = generate_mulAdd(); 5773 } 5774 5775 if (UseMontgomeryMultiplyIntrinsic) { 5776 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5777 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5778 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5779 } 5780 5781 if (UseMontgomerySquareIntrinsic) { 5782 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5783 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5784 // We use generate_multiply() rather than generate_square() 5785 // because it's faster for the sizes of modulus we care about. 5786 StubRoutines::_montgomerySquare = g.generate_multiply(); 5787 } 5788 #endif // COMPILER2 5789 5790 #ifndef BUILTIN_SIM 5791 // generate GHASH intrinsics code 5792 if (UseGHASHIntrinsics) { 5793 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5794 } 5795 5796 if (UseAESIntrinsics) { 5797 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5798 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5799 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5800 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5801 } 5802 5803 if (UseSHA1Intrinsics) { 5804 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5805 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5806 } 5807 if (UseSHA256Intrinsics) { 5808 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5809 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5810 } 5811 5812 // generate Adler32 intrinsics code 5813 if (UseAdler32Intrinsics) { 5814 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5815 } 5816 5817 // Safefetch stubs. 5818 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5819 &StubRoutines::_safefetch32_fault_pc, 5820 &StubRoutines::_safefetch32_continuation_pc); 5821 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5822 &StubRoutines::_safefetchN_fault_pc, 5823 &StubRoutines::_safefetchN_continuation_pc); 5824 #endif 5825 StubRoutines::aarch64::set_completed(); 5826 } 5827 5828 public: 5829 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5830 if (all) { 5831 generate_all(); 5832 } else { 5833 generate_initial(); 5834 } 5835 } 5836 }; // end class declaration 5837 5838 void StubGenerator_generate(CodeBuffer* code, bool all) { 5839 StubGenerator g(code, all); 5840 }
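// For context (a sketch, not code from this file; the buffer names are
// illustrative): the shared code in stubRoutines.cpp drives this entry
// point in two phases, roughly
//
//   StubGenerator_generate(&initial_stubs_buffer, false);  // -> generate_initial()
//   // ... later, once the universe is initialized ...
//   StubGenerator_generate(&all_stubs_buffer, true);       // -> generate_all()
//
// so anything the interpreter needs at VM startup must be produced by
// generate_initial() above.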