1 /* 2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include <sys/types.h> 27 28 #include "precompiled.hpp" 29 #include "jvm.h" 30 #include "asm/assembler.hpp" 31 #include "asm/assembler.inline.hpp" 32 #include "gc/shared/barrierSet.hpp" 33 #include "gc/shared/cardTable.hpp" 34 #include "gc/shared/barrierSetAssembler.hpp" 35 #include "gc/shared/cardTableBarrierSet.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "compiler/disassembler.hpp" 38 #include "memory/resourceArea.hpp" 39 #include "memory/universe.hpp" 40 #include "nativeInst_aarch64.hpp" 41 #include "oops/accessDecorators.hpp" 42 #include "oops/compressedOops.inline.hpp" 43 #include "oops/klass.inline.hpp" 44 #include "runtime/biasedLocking.hpp" 45 #include "runtime/icache.hpp" 46 #include "runtime/interfaceSupport.inline.hpp" 47 #include "runtime/jniHandles.inline.hpp" 48 #include "runtime/sharedRuntime.hpp" 49 #include "runtime/thread.hpp" 50 #ifdef COMPILER1 51 #include "c1/c1_LIRAssembler.hpp" 52 #endif 53 #ifdef COMPILER2 54 #include "oops/oop.hpp" 55 #include "opto/compile.hpp" 56 #include "opto/intrinsicnode.hpp" 57 #include "opto/node.hpp" 58 #endif 59 60 #ifdef PRODUCT 61 #define BLOCK_COMMENT(str) /* nothing */ 62 #define STOP(error) stop(error) 63 #else 64 #define BLOCK_COMMENT(str) block_comment(str) 65 #define STOP(error) block_comment(error); stop(error) 66 #endif 67 68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 69 70 // Patch any kind of instruction; there may be several instructions. 71 // Return the total length (in bytes) of the instructions. 
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Branch-style immediates are encoded in words (hence the >> 2); the
  // PC-relative and move-wide cases below recompute offset/dest themselves.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      // ADRP: operate on 4K pages, then fix up the in-page part in the
      // following instruction (if any).
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        // Recompute the page delta using only the low 32 bits of the
        // target: the movk supplies bits 32..47 directly.
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    // Patch the adrp/adr itself: low 2 bits go in immlo (bits 30:29),
    // the rest in immhi (bits 23:5).
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant: movz + movk + movk, 16 bits per instruction.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

// Patch an oop constant materialized by a movz/movk sequence at insn_addr.
// Narrow oops are encoded in 2 instructions (16 high bits then 16 low bits),
// wide oops in 3. Returns the total length (in bytes) of the patched insns.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

// Decode the target address encoded by the instruction (sequence) starting
// at insn_addr; this is the inverse of pd_patch_instruction_size above and
// must recognize exactly the same instruction forms.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases  we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          // movk supplies bits 32..47 of the target directly.
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      // Plain adr is never used for relocated targets here.
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // Polling page load (ldr to zr): the address is not encoded here.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

// Emit a safepoint check; branches to slow_path when a safepoint is pending.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // ldar (load-acquire) rather than plain ldr -- see the ordering
    // rationale in the comment above.
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    // Global polling needs no acquire semantics; fall back to the
    // ordinary poll.
    safepoint_poll(slow_path);
  }
}

// Clear the thread's last-Java-frame anchor (sp, optionally fp, and pc).
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp, & resp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame in the thread anchor.  last_java_pc may be
// noreg (then only sp/fp are recorded); scratch is clobbered when
// last_java_sp == sp.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

// Variant taking the pc as an immediate code address (materialized via adr
// into scratch, so it must be in adr range of the current pc).
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

// Variant taking the pc as a Label; an unbound label is patched later
// once it is bound.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

// Call a destination anywhere in the code cache: adrp+add+blr when branches
// are far, a single bl otherwise.  tmp is clobbered in the far case.
void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

// Same as far_call but ends with a plain jump (br/b) instead of a call.
void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;

    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
    cmp(sp, rscratch1);
    br(Assembler::LO, no_reserved_zone_enabling);

    enter();   // LR and FP are live.
    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
    mov(c_rarg0, rthread);
    blr(rscratch1);
    leave();

    // We have already removed our own frame.
    // throw_delayed_StackOverflowError will think that it's been
    // called by our caller.
    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    br(rscratch1);
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

// Emit the biased-locking fast path.  On success control reaches `done`
// with the lock held; otherwise it falls through to `cas_label` (the
// normal CAS-based locking that follows this emission) or branches to
// slow_case.  Returns the code offset of the mark-word load usable as an
// implicit null check, or -1 if swap_reg already contained the mark.
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

// Emit the biased-locking unlock fast path: branch to done when the
// header shows the biased pattern (unlock is then a no-op).
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

// pass_arg0..pass_arg3: move an argument into the corresponding C calling
// convention register, skipping the move if it is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

// Common tail for all call_VM variants: set up the last Java frame, call
// entry_point with the thread as first argument, then check for pending
// exceptions and fetch the oop result if requested.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
   // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    // Far target: bl to the trampoline stub; the trampoline relocation
    // recorded above links this call site to the stub.
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

// Emit an inline-cache call: load the IC sentinel into rscratch2, then
// call entry via a trampoline.  Returns the call pc (or NULL on full
// code cache, via trampoline_call).
address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  // Args are marshalled highest-index first so an earlier move cannot
  // clobber a register a later move still needs to read.
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments,
check_exceptions); 896 } 897 898 void MacroAssembler::call_VM(Register oop_result, 899 Register last_java_sp, 900 address entry_point, 901 Register arg_1, 902 bool check_exceptions) { 903 pass_arg1(this, arg_1); 904 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 905 } 906 907 void MacroAssembler::call_VM(Register oop_result, 908 Register last_java_sp, 909 address entry_point, 910 Register arg_1, 911 Register arg_2, 912 bool check_exceptions) { 913 914 assert(arg_1 != c_rarg2, "smashed arg"); 915 pass_arg2(this, arg_2); 916 pass_arg1(this, arg_1); 917 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 918 } 919 920 void MacroAssembler::call_VM(Register oop_result, 921 Register last_java_sp, 922 address entry_point, 923 Register arg_1, 924 Register arg_2, 925 Register arg_3, 926 bool check_exceptions) { 927 assert(arg_1 != c_rarg3, "smashed arg"); 928 assert(arg_2 != c_rarg3, "smashed arg"); 929 pass_arg3(this, arg_3); 930 assert(arg_1 != c_rarg2, "smashed arg"); 931 pass_arg2(this, arg_2); 932 pass_arg1(this, arg_1); 933 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 934 } 935 936 937 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 938 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 939 str(zr, Address(java_thread, JavaThread::vm_result_offset())); 940 verify_oop(oop_result, "broken oop in call_VM_base"); 941 } 942 943 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 944 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 945 str(zr, Address(java_thread, JavaThread::vm_result_2_offset())); 946 } 947 948 void MacroAssembler::align(int modulus) { 949 while (offset() % modulus != 0) nop(); 950 } 951 952 // these are no-ops overridden by InterpreterMacroAssembler 953 954 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { } 955 956 void 
MacroAssembler::check_and_handle_popframe(Register java_thread) { } 957 958 959 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr, 960 Register tmp, 961 int offset) { 962 intptr_t value = *delayed_value_addr; 963 if (value != 0) 964 return RegisterOrConstant(value + offset); 965 966 // load indirectly to solve generation ordering problem 967 ldr(tmp, ExternalAddress((address) delayed_value_addr)); 968 969 if (offset != 0) 970 add(tmp, tmp, offset); 971 972 return RegisterOrConstant(tmp); 973 } 974 975 976 void MacroAssembler:: notify(int type) { 977 if (type == bytecode_start) { 978 // set_last_Java_frame(esp, rfp, (address)NULL); 979 Assembler:: notify(type); 980 // reset_last_Java_frame(true); 981 } 982 else 983 Assembler:: notify(type); 984 } 985 986 // Look up the method for a megamorphic invokeinterface call. 987 // The target method is determined by <intf_klass, itable_index>. 988 // The receiver klass is in recv_klass. 989 // On success, the result will be in method_result, and execution falls through. 990 // On failure, execution transfers to the given label. 
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The scan loop is peeled once: the first iteration branches forward
  // on a hit, later iterations branch backward on a miss.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

// Full subtype check: fast path first, then the slow path.  On success
// control transfers to L_success; on failure it falls through.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


// Fast-path subtype check using the super_check_offset display.  Any of
// the three labels may be NULL, meaning "fall through" (at most one).
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  // NOTE: on exit the condition flags reflect the last compare
  // (EQ on a hit); callers rely on that.
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// Slow-path subtype check: linear scan of the secondary-supers array,
// updating the secondary_super_cache on success.
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


// Call the verify_oop subroutine on reg, preserving r0/rscratch1/
// rscratch2/lr around the call.  No-op unless +VerifyOops.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

// Like verify_oop, but the oop to check lives in memory at addr.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops || VerifyAdapterSharing) {
    // Below address of the code string confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
    return;
  }

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    // Skip over the 4 words pushed by the two stp's above.
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

// Form an Address for an interpreter expression-stack slot, optionally
// materializing the scaled slot index into rscratch1.
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

// Core leaf-call sequence: preserves rscratch1/rmethod across the call
// and optionally binds retaddr at the return point.
void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  // Pass args in reverse order so pass_argN never clobbers a still-live
  // incoming register (the asserts enforce the remaining hazards).
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

// Emit a null check on reg, but only when the eventual access at
// [reg + offset] would not itself fault on a NULL base.
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    ldr(zr, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// Branch to is_value if klass has the JVM_ACC_VALUE access flag set.
void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
  ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
  andr(temp_reg, temp_reg, JVM_ACC_VALUE);
  cbnz(temp_reg, is_value);
}

void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
  (void) temp_reg; // keep signature uniform with x86
  tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
}

void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
  (void) temp_reg; // keep signature uniform with x86
  tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
}

void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
  (void) temp_reg; // keep signature uniform with x86
  tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
}

// Branch to is_flattened_array if oop's storage properties mark it as a
// flattened value array.
void MacroAssembler::test_flattened_array_oop(Register oop, Register temp_reg, Label& is_flattened_array) {
  load_storage_props(temp_reg, oop);
  andr(temp_reg, temp_reg, ArrayStorageProperties::flattened_value);
  cbnz(temp_reg, is_flattened_array);
}

// Branch to is_null_free_array if oop's storage properties mark it as a
// null-free array.
void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
  load_storage_props(temp_reg, oop);
  andr(temp_reg, temp_reg, ArrayStorageProperties::null_free_value);
  cbnz(temp_reg, is_null_free_array);
}

// MacroAssembler protected routines needed to implement
// public methods

// Load the (relocatable) target of dest into r as an immediate.
void MacroAssembler::mov(Register r, Address dest) {
  code_section()->relocate(pc(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}

// Move a constant pointer into r.  In AArch64 mode the virtual
// address space is 48 bits in size, so we only need three
// instructions to create a patchable instruction sequence that can
// reach anywhere.
void MacroAssembler::movptr(Register r, uintptr_t imm64) {
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
  // Always MOVZ + 2x MOVK so the sequence has a fixed, patchable shape.
  movz(r, imm64 & 0xffff);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 16);
  imm64 >>= 16;
  movk(r, imm64 & 0xffff, 32);
}

// Macro to mov replicated immediate to vector register.
//  Vd will get the following values for different arrangements in T
//   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
//   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
//   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
//   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
//   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
//   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
//   T1D/T2D: invalid
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32 & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count non-zero bytes of the value and of its complement, then build
  // the constant with whichever of MOVI/ORRI or MVNI/BICI needs fewer
  // instructions.
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}

// Build an arbitrary 64-bit immediate using the smallest viable
// combination of ORR-immediate or MOVZ/MOVN plus MOVKs, chosen by
// counting zero and all-ones 16-bit halfwords.
void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(false, imm64)) {
    orr(dst, zr, imm64);
  } else {
    // we can use a combination of MOVZ or MOVN with
    // MOVK to build up the constant
    u_int64_t imm_h[4];
    int zero_count = 0;
    int neg_count = 0;
    int i;
    for (i = 0; i < 4; i++) {
      imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
      if (imm_h[i] == 0) {
        zero_count++;
      } else if (imm_h[i] == 0xffffL) {
        neg_count++;
      }
    }
    if (zero_count == 4) {
      // one MOVZ will do
      movz(dst, 0);
    } else if (neg_count == 4) {
      // one MOVN will do
      movn(dst, 0);
    } else if (zero_count == 3) {
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          break;
        }
      }
    } else if (neg_count == 3) {
      // one MOVN will do
      for (int i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          break;
        }
      }
    } else if (zero_count == 2) {
      // one MOVZ and one MOVK will do
      for (i = 0; i < 3; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 2) {
      // one MOVN and one MOVK will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (zero_count == 1) {
      // one MOVZ and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0L) {
          movz(dst, (u_int32_t)imm_h[i], (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0x0L) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else if (neg_count == 1) {
      // one MOVN and two MOVKs will do
      for (i = 0; i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
          i++;
          break;
        }
      }
      for (;i < 4; i++) {
        if (imm_h[i] != 0xffffL) {
          movk(dst, (u_int32_t)imm_h[i], (i << 4));
        }
      }
    } else {
      // use a MOVZ and 3 MOVKs (makes it easier to debug)
      movz(dst, (u_int32_t)imm_h[0], 0);
      for (i = 1; i < 4; i++) {
        movk(dst, (u_int32_t)imm_h[i], (i << 4));
      }
    }
  }
}

// Build an arbitrary 32-bit immediate: ORR-immediate when encodable,
// otherwise one or two of MOVZW/MOVNW/MOVKW.
void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
{
#ifndef PRODUCT
  {
    char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
    block_comment(buffer);
  }
#endif
  if (operand_valid_for_logical_immediate(true, imm32)) {
    orrw(dst, zr, imm32);
  } else {
    // we can use MOVZ, MOVN or two calls to MOVK to build up the
    // constant
    u_int32_t imm_h[2];
    imm_h[0] = imm32 & 0xffff;
    imm_h[1] = ((imm32 >> 16) & 0xffff);
    if (imm_h[0] == 0) {
      movzw(dst, imm_h[1], 16);
    } else if (imm_h[0] == 0xffff) {
      movnw(dst, imm_h[1] ^ 0xffff, 16);
    } else if (imm_h[1] == 0) {
      movzw(dst, imm_h[0], 0);
    } else if (imm_h[1] == 0xffff) {
      movnw(dst, imm_h[0] ^ 0xffff, 0);
    } else {
      // use a MOVZ and MOVK (makes it easier to debug)
      movzw(dst, imm_h[0], 0);
      movkw(dst, imm_h[1], 16);
    }
  }
}

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      // Fold the high 12 bits into the base with an ADD; the remainder
      // fits the load/store's immediate field.
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// Atomically increment the 32-bit counter at counter_addr, using LSE
// LDADD when available, else an LDXR/STXR retry loop.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
  if (UseLSE) {
    mov(tmp, 1);
    ldadd(Assembler::word, tmp, zr, counter_addr);
    return;
  }
  Label retry_load;
  if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
    prfm(Address(counter_addr), PSTL1STRM);
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
  stxrw(tmp2, tmp, counter_addr);
  cbnzw(tmp2, retry_load);
}


int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java idiv and irem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivl_offset = offset();
  if (! want_remainder) {
    sdivw(result, ra, rb);
  } else {
    sdivw(scratch, ra, rb);
    // remainder = dividend - quotient * divisor
    Assembler::msubw(result, scratch, rb, ra);
  }

  return idivl_offset;
}

int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
                                    bool want_remainder, Register scratch)
{
  // Full implementation of Java ldiv and lrem.  The function
  // returns the (pc) offset of the div instruction - may be needed
  // for implicit exceptions.
  //
  // constraint : ra/rb =/= scratch
  //         normal case
  //
  // input : ra: dividend
  //         rb: divisor
  //
  // result: either
  //         quotient  (= ra idiv rb)
  //         remainder (= ra irem rb)

  assert(ra != scratch && rb != scratch, "reg cannot be scratch");

  int idivq_offset = offset();
  if (!
want_remainder) { 1865 sdiv(result, ra, rb); 1866 } else { 1867 sdiv(scratch, ra, rb); 1868 Assembler::msub(result, scratch, rb, ra); 1869 } 1870 1871 return idivq_offset; 1872 } 1873 1874 void MacroAssembler::membar(Membar_mask_bits order_constraint) { 1875 address prev = pc() - NativeMembar::instruction_size; 1876 address last = code()->last_insn(); 1877 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) { 1878 NativeMembar *bar = NativeMembar_at(prev); 1879 // We are merging two memory barrier instructions. On AArch64 we 1880 // can do this simply by ORing them together. 1881 bar->set_kind(bar->get_kind() | order_constraint); 1882 BLOCK_COMMENT("merged membar"); 1883 } else { 1884 code()->set_last_insn(pc()); 1885 dmb(Assembler::barrier(order_constraint)); 1886 } 1887 } 1888 1889 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) { 1890 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) { 1891 merge_ldst(rt, adr, size_in_bytes, is_store); 1892 code()->clear_last_insn(); 1893 return true; 1894 } else { 1895 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported."); 1896 const unsigned mask = size_in_bytes - 1; 1897 if (adr.getMode() == Address::base_plus_offset && 1898 (adr.offset() & mask) == 0) { // only supports base_plus_offset. 1899 code()->set_last_insn(pc()); 1900 } 1901 return false; 1902 } 1903 } 1904 1905 void MacroAssembler::ldr(Register Rx, const Address &adr) { 1906 // We always try to merge two adjacent loads into one ldp. 1907 if (!try_merge_ldst(Rx, adr, 8, false)) { 1908 Assembler::ldr(Rx, adr); 1909 } 1910 } 1911 1912 void MacroAssembler::ldrw(Register Rw, const Address &adr) { 1913 // We always try to merge two adjacent loads into one ldp. 
1914 if (!try_merge_ldst(Rw, adr, 4, false)) { 1915 Assembler::ldrw(Rw, adr); 1916 } 1917 } 1918 1919 void MacroAssembler::str(Register Rx, const Address &adr) { 1920 // We always try to merge two adjacent stores into one stp. 1921 if (!try_merge_ldst(Rx, adr, 8, true)) { 1922 Assembler::str(Rx, adr); 1923 } 1924 } 1925 1926 void MacroAssembler::strw(Register Rw, const Address &adr) { 1927 // We always try to merge two adjacent stores into one stp. 1928 if (!try_merge_ldst(Rw, adr, 4, true)) { 1929 Assembler::strw(Rw, adr); 1930 } 1931 } 1932 1933 // MacroAssembler routines found actually to be needed 1934 1935 void MacroAssembler::push(Register src) 1936 { 1937 str(src, Address(pre(esp, -1 * wordSize))); 1938 } 1939 1940 void MacroAssembler::pop(Register dst) 1941 { 1942 ldr(dst, Address(post(esp, 1 * wordSize))); 1943 } 1944 1945 // Note: load_unsigned_short used to be called load_unsigned_word. 1946 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 1947 int off = offset(); 1948 ldrh(dst, src); 1949 return off; 1950 } 1951 1952 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 1953 int off = offset(); 1954 ldrb(dst, src); 1955 return off; 1956 } 1957 1958 int MacroAssembler::load_signed_short(Register dst, Address src) { 1959 int off = offset(); 1960 ldrsh(dst, src); 1961 return off; 1962 } 1963 1964 int MacroAssembler::load_signed_byte(Register dst, Address src) { 1965 int off = offset(); 1966 ldrsb(dst, src); 1967 return off; 1968 } 1969 1970 int MacroAssembler::load_signed_short32(Register dst, Address src) { 1971 int off = offset(); 1972 ldrshw(dst, src); 1973 return off; 1974 } 1975 1976 int MacroAssembler::load_signed_byte32(Register dst, Address src) { 1977 int off = offset(); 1978 ldrsbw(dst, src); 1979 return off; 1980 } 1981 1982 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 1983 switch (size_in_bytes) { 1984 case 8: ldr(dst, src); break; 1985 
case 4: ldrw(dst, src); break; 1986 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 1987 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 1988 default: ShouldNotReachHere(); 1989 } 1990 } 1991 1992 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 1993 switch (size_in_bytes) { 1994 case 8: str(src, dst); break; 1995 case 4: strw(src, dst); break; 1996 case 2: strh(src, dst); break; 1997 case 1: strb(src, dst); break; 1998 default: ShouldNotReachHere(); 1999 } 2000 } 2001 2002 void MacroAssembler::decrementw(Register reg, int value) 2003 { 2004 if (value < 0) { incrementw(reg, -value); return; } 2005 if (value == 0) { return; } 2006 if (value < (1 << 12)) { subw(reg, reg, value); return; } 2007 /* else */ { 2008 guarantee(reg != rscratch2, "invalid dst for register decrement"); 2009 movw(rscratch2, (unsigned)value); 2010 subw(reg, reg, rscratch2); 2011 } 2012 } 2013 2014 void MacroAssembler::decrement(Register reg, int value) 2015 { 2016 if (value < 0) { increment(reg, -value); return; } 2017 if (value == 0) { return; } 2018 if (value < (1 << 12)) { sub(reg, reg, value); return; } 2019 /* else */ { 2020 assert(reg != rscratch2, "invalid dst for register decrement"); 2021 mov(rscratch2, (unsigned long)value); 2022 sub(reg, reg, rscratch2); 2023 } 2024 } 2025 2026 void MacroAssembler::decrementw(Address dst, int value) 2027 { 2028 assert(!dst.uses(rscratch1), "invalid dst for address decrement"); 2029 if (dst.getMode() == Address::literal) { 2030 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2031 lea(rscratch2, dst); 2032 dst = Address(rscratch2); 2033 } 2034 ldrw(rscratch1, dst); 2035 decrementw(rscratch1, value); 2036 strw(rscratch1, dst); 2037 } 2038 2039 void MacroAssembler::decrement(Address dst, int value) 2040 { 2041 assert(!dst.uses(rscratch1), "invalid address for decrement"); 2042 if 
(dst.getMode() == Address::literal) { 2043 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2044 lea(rscratch2, dst); 2045 dst = Address(rscratch2); 2046 } 2047 ldr(rscratch1, dst); 2048 decrement(rscratch1, value); 2049 str(rscratch1, dst); 2050 } 2051 2052 void MacroAssembler::incrementw(Register reg, int value) 2053 { 2054 if (value < 0) { decrementw(reg, -value); return; } 2055 if (value == 0) { return; } 2056 if (value < (1 << 12)) { addw(reg, reg, value); return; } 2057 /* else */ { 2058 assert(reg != rscratch2, "invalid dst for register increment"); 2059 movw(rscratch2, (unsigned)value); 2060 addw(reg, reg, rscratch2); 2061 } 2062 } 2063 2064 void MacroAssembler::increment(Register reg, int value) 2065 { 2066 if (value < 0) { decrement(reg, -value); return; } 2067 if (value == 0) { return; } 2068 if (value < (1 << 12)) { add(reg, reg, value); return; } 2069 /* else */ { 2070 assert(reg != rscratch2, "invalid dst for register increment"); 2071 movw(rscratch2, (unsigned)value); 2072 add(reg, reg, rscratch2); 2073 } 2074 } 2075 2076 void MacroAssembler::incrementw(Address dst, int value) 2077 { 2078 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2079 if (dst.getMode() == Address::literal) { 2080 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2081 lea(rscratch2, dst); 2082 dst = Address(rscratch2); 2083 } 2084 ldrw(rscratch1, dst); 2085 incrementw(rscratch1, value); 2086 strw(rscratch1, dst); 2087 } 2088 2089 void MacroAssembler::increment(Address dst, int value) 2090 { 2091 assert(!dst.uses(rscratch1), "invalid dst for address increment"); 2092 if (dst.getMode() == Address::literal) { 2093 assert(abs(value) < (1 << 12), "invalid value and address mode combination"); 2094 lea(rscratch2, dst); 2095 dst = Address(rscratch2); 2096 } 2097 ldr(rscratch1, dst); 2098 increment(rscratch1, value); 2099 str(rscratch1, dst); 2100 } 2101 2102 2103 void MacroAssembler::pusha() { 2104 
push(0x7fffffff, sp); 2105 } 2106 2107 void MacroAssembler::popa() { 2108 pop(0x7fffffff, sp); 2109 } 2110 2111 // Push lots of registers in the bit set supplied. Don't push sp. 2112 // Return the number of words pushed 2113 int MacroAssembler::push(unsigned int bitset, Register stack) { 2114 int words_pushed = 0; 2115 2116 // Scan bitset to accumulate register pairs 2117 unsigned char regs[32]; 2118 int count = 0; 2119 for (int reg = 0; reg <= 30; reg++) { 2120 if (1 & bitset) 2121 regs[count++] = reg; 2122 bitset >>= 1; 2123 } 2124 regs[count++] = zr->encoding_nocheck(); 2125 count &= ~1; // Only push an even nuber of regs 2126 2127 if (count) { 2128 stp(as_Register(regs[0]), as_Register(regs[1]), 2129 Address(pre(stack, -count * wordSize))); 2130 words_pushed += 2; 2131 } 2132 for (int i = 2; i < count; i += 2) { 2133 stp(as_Register(regs[i]), as_Register(regs[i+1]), 2134 Address(stack, i * wordSize)); 2135 words_pushed += 2; 2136 } 2137 2138 assert(words_pushed == count, "oops, pushed != count"); 2139 2140 return count; 2141 } 2142 2143 int MacroAssembler::pop(unsigned int bitset, Register stack) { 2144 int words_pushed = 0; 2145 2146 // Scan bitset to accumulate register pairs 2147 unsigned char regs[32]; 2148 int count = 0; 2149 for (int reg = 0; reg <= 30; reg++) { 2150 if (1 & bitset) 2151 regs[count++] = reg; 2152 bitset >>= 1; 2153 } 2154 regs[count++] = zr->encoding_nocheck(); 2155 count &= ~1; 2156 2157 for (int i = 2; i < count; i += 2) { 2158 ldp(as_Register(regs[i]), as_Register(regs[i+1]), 2159 Address(stack, i * wordSize)); 2160 words_pushed += 2; 2161 } 2162 if (count) { 2163 ldp(as_Register(regs[0]), as_Register(regs[1]), 2164 Address(post(stack, count * wordSize))); 2165 words_pushed += 2; 2166 } 2167 2168 assert(words_pushed == count, "oops, pushed != count"); 2169 2170 return count; 2171 } 2172 #ifdef ASSERT 2173 void MacroAssembler::verify_heapbase(const char* msg) { 2174 #if 0 2175 assert (UseCompressedOops || UseCompressedClassPointers, 
"should be compressed"); 2176 assert (Universe::heap() != NULL, "java heap should be initialized"); 2177 if (CheckCompressedOops) { 2178 Label ok; 2179 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1 2180 cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); 2181 br(Assembler::EQ, ok); 2182 stop(msg); 2183 bind(ok); 2184 pop(1 << rscratch1->encoding(), sp); 2185 } 2186 #endif 2187 } 2188 #endif 2189 2190 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { 2191 Label done, not_weak; 2192 cbz(value, done); // Use NULL as-is. 2193 2194 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); 2195 tbz(r0, 0, not_weak); // Test for jweak tag. 2196 2197 // Resolve jweak. 2198 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value, 2199 Address(value, -JNIHandles::weak_tag_value), tmp, thread); 2200 verify_oop(value); 2201 b(done); 2202 2203 bind(not_weak); 2204 // Resolve (untagged) jobject. 2205 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 2206 verify_oop(value); 2207 bind(done); 2208 } 2209 2210 void MacroAssembler::stop(const char* msg) { 2211 address ip = pc(); 2212 pusha(); 2213 mov(c_rarg0, (address)msg); 2214 mov(c_rarg1, (address)ip); 2215 mov(c_rarg2, sp); 2216 mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 2217 // call(c_rarg3); 2218 blrt(c_rarg3, 3, 0, 1); 2219 hlt(0); 2220 } 2221 2222 void MacroAssembler::warn(const char* msg) { 2223 pusha(); 2224 mov(c_rarg0, (address)msg); 2225 mov(lr, CAST_FROM_FN_PTR(address, warning)); 2226 blrt(lr, 1, 0, MacroAssembler::ret_type_void); 2227 popa(); 2228 } 2229 2230 void MacroAssembler::unimplemented(const char* what) { 2231 const char* buf = NULL; 2232 { 2233 ResourceMark rm; 2234 stringStream ss; 2235 ss.print("unimplemented: %s", what); 2236 buf = code_string(ss.as_string()); 2237 } 2238 stop(buf); 2239 } 2240 2241 // If a constant does not fit in an immediate field, generate some 2242 // 
number of MOV instructions and then perform the operation. 2243 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm, 2244 add_sub_imm_insn insn1, 2245 add_sub_reg_insn insn2) { 2246 assert(Rd != zr, "Rd = zr and not setting flags?"); 2247 if (operand_valid_for_add_sub_immediate((int)imm)) { 2248 (this->*insn1)(Rd, Rn, imm); 2249 } else { 2250 if (uabs(imm) < (1 << 24)) { 2251 (this->*insn1)(Rd, Rn, imm & -(1 << 12)); 2252 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1)); 2253 } else { 2254 assert_different_registers(Rd, Rn); 2255 mov(Rd, (uint64_t)imm); 2256 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2257 } 2258 } 2259 } 2260 2261 // Seperate vsn which sets the flags. Optimisations are more restricted 2262 // because we must set the flags correctly. 2263 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm, 2264 add_sub_imm_insn insn1, 2265 add_sub_reg_insn insn2) { 2266 if (operand_valid_for_add_sub_immediate((int)imm)) { 2267 (this->*insn1)(Rd, Rn, imm); 2268 } else { 2269 assert_different_registers(Rd, Rn); 2270 assert(Rd != zr, "overflow in immediate operand"); 2271 mov(Rd, (uint64_t)imm); 2272 (this->*insn2)(Rd, Rn, Rd, LSL, 0); 2273 } 2274 } 2275 2276 2277 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) { 2278 if (increment.is_register()) { 2279 add(Rd, Rn, increment.as_register()); 2280 } else { 2281 add(Rd, Rn, increment.as_constant()); 2282 } 2283 } 2284 2285 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) { 2286 if (increment.is_register()) { 2287 addw(Rd, Rn, increment.as_register()); 2288 } else { 2289 addw(Rd, Rn, increment.as_constant()); 2290 } 2291 } 2292 2293 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) { 2294 if (decrement.is_register()) { 2295 sub(Rd, Rn, decrement.as_register()); 2296 } else { 2297 sub(Rd, Rn, decrement.as_constant()); 2298 } 2299 } 2300 2301 void MacroAssembler::subw(Register 
Rd, Register Rn, RegisterOrConstant decrement) { 2302 if (decrement.is_register()) { 2303 subw(Rd, Rn, decrement.as_register()); 2304 } else { 2305 subw(Rd, Rn, decrement.as_constant()); 2306 } 2307 } 2308 2309 void MacroAssembler::reinit_heapbase() 2310 { 2311 if (UseCompressedOops) { 2312 if (Universe::is_fully_initialized()) { 2313 mov(rheapbase, CompressedOops::ptrs_base()); 2314 } else { 2315 lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); 2316 ldr(rheapbase, Address(rheapbase)); 2317 } 2318 } 2319 } 2320 2321 // this simulates the behaviour of the x86 cmpxchg instruction using a 2322 // load linked/store conditional pair. we use the acquire/release 2323 // versions of these instructions so that we flush pending writes as 2324 // per Java semantics. 2325 2326 // n.b the x86 version assumes the old value to be compared against is 2327 // in rax and updates rax with the value located in memory if the 2328 // cmpxchg fails. we supply a register for the old value explicitly 2329 2330 // the aarch64 load linked/store conditional instructions do not 2331 // accept an offset. so, unlike x86, we must provide a plain register 2332 // to identify the memory word to be compared/exchanged rather than a 2333 // register+offset Address. 
2334 2335 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, 2336 Label &succeed, Label *fail) { 2337 // oldv holds comparison value 2338 // newv holds value to write in exchange 2339 // addr identifies memory word to compare against/update 2340 if (UseLSE) { 2341 mov(tmp, oldv); 2342 casal(Assembler::xword, oldv, newv, addr); 2343 cmp(tmp, oldv); 2344 br(Assembler::EQ, succeed); 2345 membar(AnyAny); 2346 } else { 2347 Label retry_load, nope; 2348 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2349 prfm(Address(addr), PSTL1STRM); 2350 bind(retry_load); 2351 // flush and load exclusive from the memory location 2352 // and fail if it is not what we expect 2353 ldaxr(tmp, addr); 2354 cmp(tmp, oldv); 2355 br(Assembler::NE, nope); 2356 // if we store+flush with no intervening write tmp wil be zero 2357 stlxr(tmp, newv, addr); 2358 cbzw(tmp, succeed); 2359 // retry so we only ever return after a load fails to compare 2360 // ensures we don't return a stale value after a failed write. 
2361 b(retry_load); 2362 // if the memory word differs we return it in oldv and signal a fail 2363 bind(nope); 2364 membar(AnyAny); 2365 mov(oldv, tmp); 2366 } 2367 if (fail) 2368 b(*fail); 2369 } 2370 2371 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, 2372 Label &succeed, Label *fail) { 2373 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); 2374 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); 2375 } 2376 2377 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, 2378 Label &succeed, Label *fail) { 2379 // oldv holds comparison value 2380 // newv holds value to write in exchange 2381 // addr identifies memory word to compare against/update 2382 // tmp returns 0/1 for success/failure 2383 if (UseLSE) { 2384 mov(tmp, oldv); 2385 casal(Assembler::word, oldv, newv, addr); 2386 cmp(tmp, oldv); 2387 br(Assembler::EQ, succeed); 2388 membar(AnyAny); 2389 } else { 2390 Label retry_load, nope; 2391 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2392 prfm(Address(addr), PSTL1STRM); 2393 bind(retry_load); 2394 // flush and load exclusive from the memory location 2395 // and fail if it is not what we expect 2396 ldaxrw(tmp, addr); 2397 cmp(tmp, oldv); 2398 br(Assembler::NE, nope); 2399 // if we store+flush with no intervening write tmp wil be zero 2400 stlxrw(tmp, newv, addr); 2401 cbzw(tmp, succeed); 2402 // retry so we only ever return after a load fails to compare 2403 // ensures we don't return a stale value after a failed write. 2404 b(retry_load); 2405 // if the memory word differs we return it in oldv and signal a fail 2406 bind(nope); 2407 membar(AnyAny); 2408 mov(oldv, tmp); 2409 } 2410 if (fail) 2411 b(*fail); 2412 } 2413 2414 // A generic CAS; success or failure is in the EQ flag. A weak CAS 2415 // doesn't retry and may fail spuriously. If the oldval is wanted, 2416 // Pass a register for the result, otherwise pass noreg. 
2417 2418 // Clobbers rscratch1 2419 void MacroAssembler::cmpxchg(Register addr, Register expected, 2420 Register new_val, 2421 enum operand_size size, 2422 bool acquire, bool release, 2423 bool weak, 2424 Register result) { 2425 if (result == noreg) result = rscratch1; 2426 BLOCK_COMMENT("cmpxchg {"); 2427 if (UseLSE) { 2428 mov(result, expected); 2429 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true); 2430 compare_eq(result, expected, size); 2431 } else { 2432 Label retry_load, done; 2433 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) 2434 prfm(Address(addr), PSTL1STRM); 2435 bind(retry_load); 2436 load_exclusive(result, addr, size, acquire); 2437 compare_eq(result, expected, size); 2438 br(Assembler::NE, done); 2439 store_exclusive(rscratch1, new_val, addr, size, release); 2440 if (weak) { 2441 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller. 2442 } else { 2443 cbnzw(rscratch1, retry_load); 2444 } 2445 bind(done); 2446 } 2447 BLOCK_COMMENT("} cmpxchg"); 2448 } 2449 2450 // A generic comparison. Only compares for equality, clobbers rscratch1. 2451 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) { 2452 if (size == xword) { 2453 cmp(rm, rn); 2454 } else if (size == word) { 2455 cmpw(rm, rn); 2456 } else if (size == halfword) { 2457 eorw(rscratch1, rm, rn); 2458 ands(zr, rscratch1, 0xffff); 2459 } else if (size == byte) { 2460 eorw(rscratch1, rm, rn); 2461 ands(zr, rscratch1, 0xff); 2462 } else { 2463 ShouldNotReachHere(); 2464 } 2465 } 2466 2467 2468 static bool different(Register a, RegisterOrConstant b, Register c) { 2469 if (b.is_constant()) 2470 return a != c; 2471 else 2472 return a != b.as_register() && a != c && b.as_register() != c; 2473 } 2474 2475 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \ 2476 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \ 2477 if (UseLSE) { \ 2478 prev = prev->is_valid() ? 
prev : zr; \ 2479 if (incr.is_register()) { \ 2480 AOP(sz, incr.as_register(), prev, addr); \ 2481 } else { \ 2482 mov(rscratch2, incr.as_constant()); \ 2483 AOP(sz, rscratch2, prev, addr); \ 2484 } \ 2485 return; \ 2486 } \ 2487 Register result = rscratch2; \ 2488 if (prev->is_valid()) \ 2489 result = different(prev, incr, addr) ? prev : rscratch2; \ 2490 \ 2491 Label retry_load; \ 2492 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2493 prfm(Address(addr), PSTL1STRM); \ 2494 bind(retry_load); \ 2495 LDXR(result, addr); \ 2496 OP(rscratch1, result, incr); \ 2497 STXR(rscratch2, rscratch1, addr); \ 2498 cbnzw(rscratch2, retry_load); \ 2499 if (prev->is_valid() && prev != result) { \ 2500 IOP(prev, rscratch1, incr); \ 2501 } \ 2502 } 2503 2504 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword) 2505 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word) 2506 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword) 2507 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word) 2508 2509 #undef ATOMIC_OP 2510 2511 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \ 2512 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \ 2513 if (UseLSE) { \ 2514 prev = prev->is_valid() ? prev : zr; \ 2515 AOP(sz, newv, prev, addr); \ 2516 return; \ 2517 } \ 2518 Register result = rscratch2; \ 2519 if (prev->is_valid()) \ 2520 result = different(prev, newv, addr) ? 
prev : rscratch2; \ 2521 \ 2522 Label retry_load; \ 2523 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \ 2524 prfm(Address(addr), PSTL1STRM); \ 2525 bind(retry_load); \ 2526 LDXR(result, addr); \ 2527 STXR(rscratch1, newv, addr); \ 2528 cbnzw(rscratch1, retry_load); \ 2529 if (prev->is_valid() && prev != result) \ 2530 mov(prev, result); \ 2531 } 2532 2533 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword) 2534 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word) 2535 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword) 2536 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word) 2537 2538 #undef ATOMIC_XCHG 2539 2540 #ifndef PRODUCT 2541 extern "C" void findpc(intptr_t x); 2542 #endif 2543 2544 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) 2545 { 2546 // In order to get locks to work, we need to fake a in_VM state 2547 if (ShowMessageBoxOnError ) { 2548 JavaThread* thread = JavaThread::current(); 2549 JavaThreadState saved_state = thread->thread_state(); 2550 thread->set_thread_state(_thread_in_vm); 2551 #ifndef PRODUCT 2552 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 2553 ttyLocker ttyl; 2554 BytecodeCounter::print(); 2555 } 2556 #endif 2557 if (os::message_box(msg, "Execution stopped, print registers?")) { 2558 ttyLocker ttyl; 2559 tty->print_cr(" pc = 0x%016lx", pc); 2560 #ifndef PRODUCT 2561 tty->cr(); 2562 findpc(pc); 2563 tty->cr(); 2564 #endif 2565 tty->print_cr(" r0 = 0x%016lx", regs[0]); 2566 tty->print_cr(" r1 = 0x%016lx", regs[1]); 2567 tty->print_cr(" r2 = 0x%016lx", regs[2]); 2568 tty->print_cr(" r3 = 0x%016lx", regs[3]); 2569 tty->print_cr(" r4 = 0x%016lx", regs[4]); 2570 tty->print_cr(" r5 = 0x%016lx", regs[5]); 2571 tty->print_cr(" r6 = 0x%016lx", regs[6]); 2572 tty->print_cr(" r7 = 0x%016lx", regs[7]); 2573 tty->print_cr(" r8 = 0x%016lx", regs[8]); 2574 tty->print_cr(" r9 = 0x%016lx", regs[9]); 2575 tty->print_cr("r10 = 0x%016lx", regs[10]); 2576 tty->print_cr("r11 = 0x%016lx", 
regs[11]); 2577 tty->print_cr("r12 = 0x%016lx", regs[12]); 2578 tty->print_cr("r13 = 0x%016lx", regs[13]); 2579 tty->print_cr("r14 = 0x%016lx", regs[14]); 2580 tty->print_cr("r15 = 0x%016lx", regs[15]); 2581 tty->print_cr("r16 = 0x%016lx", regs[16]); 2582 tty->print_cr("r17 = 0x%016lx", regs[17]); 2583 tty->print_cr("r18 = 0x%016lx", regs[18]); 2584 tty->print_cr("r19 = 0x%016lx", regs[19]); 2585 tty->print_cr("r20 = 0x%016lx", regs[20]); 2586 tty->print_cr("r21 = 0x%016lx", regs[21]); 2587 tty->print_cr("r22 = 0x%016lx", regs[22]); 2588 tty->print_cr("r23 = 0x%016lx", regs[23]); 2589 tty->print_cr("r24 = 0x%016lx", regs[24]); 2590 tty->print_cr("r25 = 0x%016lx", regs[25]); 2591 tty->print_cr("r26 = 0x%016lx", regs[26]); 2592 tty->print_cr("r27 = 0x%016lx", regs[27]); 2593 tty->print_cr("r28 = 0x%016lx", regs[28]); 2594 tty->print_cr("r30 = 0x%016lx", regs[30]); 2595 tty->print_cr("r31 = 0x%016lx", regs[31]); 2596 BREAKPOINT; 2597 } 2598 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 2599 } else { 2600 ttyLocker ttyl; 2601 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", 2602 msg); 2603 assert(false, "DEBUG MESSAGE: %s", msg); 2604 } 2605 } 2606 2607 #ifdef BUILTIN_SIM 2608 // routine to generate an x86 prolog for a stub function which 2609 // bootstraps into the generated ARM code which directly follows the 2610 // stub 2611 // 2612 // the argument encodes the number of general and fp registers 2613 // passed by the caller and the callng convention (currently just 2614 // the number of general registers and assumes C argument passing) 2615 2616 extern "C" { 2617 int aarch64_stub_prolog_size(); 2618 void aarch64_stub_prolog(); 2619 void aarch64_prolog(); 2620 } 2621 2622 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, 2623 address *prolog_ptr) 2624 { 2625 int calltype = (((ret_type & 0x3) << 8) | 2626 ((fp_arg_count & 0xf) << 4) | 2627 (gp_arg_count & 0xf)); 2628 2629 // the 
  // addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount = aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3)/ 4;
  // printf("instructionCount = %x\n", instructionCount);
  // Reserve space with nops, then overwrite the reserved range with the
  // prolog code below.
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
#endif

// Save the call-clobbered registers around a runtime call: the integer
// registers r0-r18 minus the two assembler scratch registers, followed by
// the bottom 64 bits (T1D) of the call-clobbered FP/SIMD registers
// v0-v7 and v16-v31.
void MacroAssembler::push_call_clobbered_registers() {
  int step = 4 * wordSize;
  push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
  sub(sp, sp, step);
  mov(rscratch1, -step);
  // Push v0-v7, v16-v31.
  for (int i = 31; i>= 4; i -= 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
          as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
  }
  st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
      as_FloatRegister(3), T1D, Address(sp));
}

// Inverse of push_call_clobbered_registers(): reload v0-v7 and v16-v31
// (bottom 64 bits), then pop the integer registers.  Must mirror the
// layout produced by push_call_clobbered_registers() exactly.
void MacroAssembler::pop_call_clobbered_registers() {
  for (int i = 0; i < 32; i += 4) {
    if (i <= v7->encoding() || i >= v16->encoding())
      ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
          as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
  }

  pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
}

// Save all integer registers (except lr & sp) and all 32 FP/SIMD
// registers.  When save_vectors is true the full 128-bit (T2D) vector
// contents are saved, doubling the per-group step.
void MacroAssembler::push_CPU_state(bool save_vectors) {
  // 128-bit saves need twice the stack space of 64-bit saves.
  int step = (save_vectors ? 8 : 4) * wordSize;
  push(0x3fffffff, sp);                 // integer registers except lr & sp
  mov(rscratch1, -step);
  sub(sp, sp, step);
  for (int i = 28; i >= 4; i -= 4) {
    st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
  }
  st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
}

// Inverse of push_CPU_state(); restore_vectors must match the
// save_vectors flag used when the state was pushed.
void MacroAssembler::pop_CPU_state(bool restore_vectors) {
  int step = (restore_vectors ? 8 : 4) * wordSize;
  for (int i = 0; i <= 28; i += 4)
    ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
        as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
  pop(0x3fffffff, sp);                  // integer registers except lr & sp
}

/**
 * Helpers for multiply_to_len().
 */
// 128-bit accumulate: final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2,
// with the carry out of each 64-bit add propagated into the high word.
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}

// Generate an address from (r + r1 extend offset).  "size" is the
// size of the operand.  The result may be in rscratch2.
Address MacroAssembler::offsetted_address(Register r, Register r1,
                                          Address::extend ext, int offset, int size) {
  // A register-offset form cannot also carry an immediate offset, and a
  // shift that is not a multiple of the operand size is not encodable;
  // in either case materialize the base into rscratch2 first.
  if (offset || (ext.shift() % size != 0)) {
    lea(rscratch2, Address(r, r1, ext));
    return Address(rscratch2, offset);
  } else {
    return Address(r, r1, ext);
  }
}

// Build an sp-relative address for a spill slot, inserting add
// instructions to bring the remaining offset into encodable range.
Address MacroAssembler::spill_address(int size, int offset, Register tmp)
{
  assert(offset >= 0, "spill to negative address?");
  // Offset reachable ?
  //   Not aligned - 9 bits signed offset
  //   Aligned - 12 bits unsigned offset shifted
  Register base = sp;
  if ((offset & (size-1)) && offset >= (1<<8)) {
    // Unaligned and out of the 9-bit signed range: fold the low 12 bits
    // of the offset into the base register.
    add(tmp, base, offset & ((1<<12)-1));
    base = tmp;
    offset &= -1u<<12;
  }

  if (offset >= (1<<12) * size) {
    // Still beyond the scaled 12-bit unsigned range: fold in the next
    // 12 bits as well.
    add(tmp, base, offset & (((1<<12)-1)<<12));
    base = tmp;
    offset &= ~(((1<<12)-1)<<12);
  }

  return Address(base, offset);
}

// Checks whether offset is aligned.
// Returns true if it is, else false.
bool MacroAssembler::merge_alignment_check(Register base,
                                           size_t size,
                                           long cur_offset,
                                           long prev_offset) const {
  if (AvoidUnalignedAccesses) {
    if (base == sp) {
      // Checks whether low offset if aligned to pair of registers.
      long pair_mask = size * 2 - 1;
      long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
      return (offset & pair_mask) == 0;
    } else { // If base is not sp, we can't guarantee the access is aligned.
      return false;
    }
  } else {
    long mask = size - 1;
    // Load/store pair instruction only supports element size aligned offset.
    return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
  }
}

// Checks whether current and previous loads/stores can be merged.
// Returns true if it can be merged, else false.
bool MacroAssembler::ldst_can_merge(Register rt,
                                    const Address &adr,
                                    size_t cur_size_in_bytes,
                                    bool is_store) const {
  address prev = pc() - NativeInstruction::instruction_size;
  address last = code()->last_insn();

  // The previous instruction must be a recorded immediate-offset
  // load/store that is still the last thing in the code buffer.
  if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
    return false;
  }

  if (adr.getMode() != Address::base_plus_offset || prev != last) {
    return false;
  }

  NativeLdSt* prev_ldst = NativeLdSt_at(prev);
  size_t prev_size_in_bytes = prev_ldst->size_in_bytes();

  assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
  assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");

  // Both accesses must have the same operand size and direction.
  if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
    return false;
  }

  // ldp/stp signed 7-bit immediate, scaled by the operand size.
  long max_offset = 63 * prev_size_in_bytes;
  long min_offset = -64 * prev_size_in_bytes;

  assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");

  // Only same base can be merged.
  if (adr.base() != prev_ldst->base()) {
    return false;
  }

  // The two offsets must be exactly one operand apart (adjacent slots).
  long cur_offset = adr.offset();
  long prev_offset = prev_ldst->offset();
  size_t diff = abs(cur_offset - prev_offset);
  if (diff != prev_size_in_bytes) {
    return false;
  }

  // Following cases can not be merged:
  //   ldr x2, [x2, #8]
  //   ldr x3, [x2, #16]
  // or:
  //   ldr x2, [x3, #8]
  //   ldr x2, [x3, #16]
  // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
  if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
    return false;
  }

  long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
  // Offset range must be in ldp/stp instruction's range.
  if (low_offset > max_offset || low_offset < min_offset) {
    return false;
  }

  if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
    return true;
  }

  return false;
}

// Merge current load/store with previous load/store into ldp/stp.
void MacroAssembler::merge_ldst(Register rt,
                                const Address &adr,
                                size_t cur_size_in_bytes,
                                bool is_store) {

  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");

  Register rt_low, rt_high;
  address prev = pc() - NativeInstruction::instruction_size;
  NativeLdSt* prev_ldst = NativeLdSt_at(prev);

  long offset;

  // Order the two targets by ascending offset; the pair instruction
  // addresses the lower of the two slots.
  if (adr.offset() < prev_ldst->offset()) {
    offset = adr.offset();
    rt_low = rt;
    rt_high = prev_ldst->target();
  } else {
    offset = prev_ldst->offset();
    rt_low = prev_ldst->target();
    rt_high = rt;
  }

  Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite previous generated binary.
  code_section()->set_end(prev);

  const int sz = prev_ldst->size_in_bytes();
  assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
  if (!is_store) {
    BLOCK_COMMENT("merged ldr pair");
    if (sz == 8) {
      ldp(rt_low, rt_high, adr_p);
    } else {
      ldpw(rt_low, rt_high, adr_p);
    }
  } else {
    BLOCK_COMMENT("merged str pair");
    if (sz == 8) {
      stp(rt_low, rt_high, adr_p);
    } else {
      stpw(rt_low, rt_high, adr_p);
    }
  }
}

/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);        // only a single 32-bit digit in x

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);        // only a single 32-bit digit left in y
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  ldrw(y_idx, Address(y, 0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x, 0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 128 bit by 128. Unrolled inner loop.
 *
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  lsrw(jdx, idx, 2);             // jdx = number of 4-digit groups

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // Handle the 0-3 remaining 32-bit digits.
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  // NOTE(review): 4-arg call — presumably resolves to an overload of
  // add2_with_carry with final_dest_hi == dest_hi declared in the header;
  // confirm against macroAssembler_aarch64.hpp.
  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 * r10: tmp1
 * r11: tmp2
 * r12: tmp3
 * r13: tmp4
 * r14: tmp5
 * r15: tmp6
 * r16: tmp7
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  // Spill z; the callee-clobbered registers are restored below.
  str(z, Address(pre(sp, -4 * wordSize)));

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);

  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x, 0));
  b(L_third_loop_prologue);

  bind(L_done);
}

// Code for BigInteger::mulAdd intrinsic
// out     = r0
// in      = r1
// offset  = r2  (already out.length-offset)
// len     = r3
// k       = r4
//
// pseudo code from java implementation:
// carry = 0;
// offset = out.length-offset - 1;
// for (int j=len-1; j >= 0; j--) {
//     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;
// }
// return (int)carry;
void MacroAssembler::mul_add(Register out, Register in, Register offset,
      Register len, Register k) {
    Label LOOP, END;
    // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
    csel(out, zr, out, Assembler::EQ);
    br(Assembler::EQ, END);
    add(in, in, len, LSL, 2); // in[j+1] address
    add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // used to keep carry now
    BIND(LOOP);
    ldrw(rscratch1, Address(pre(in, -4)));
    madd(rscratch1, rscratch1, k, out);     // in[j]*k + carry
    ldrw(rscratch2, Address(pre(offset, -4)));
    add(rscratch1, rscratch1, rscratch2);   // + out[offset]
    strw(rscratch1, Address(offset));
    lsr(out, rscratch1, 32);                // carry = product >>> 32
    subs(len, len, 1);
    br(Assembler::NE, LOOP);
    BIND(END);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldrw(val, Address(table, val, Address::lsl(2)));
  eor(crc, val, crc, Assembler::LSR, 8);
}

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // When 'upper' the word to fold is in the high half of v.
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}

// CRC-32 using the hardware CRC32B/W/X instructions.  Processes 64-byte
// chunks in the main loop with 32/4/1-byte tail loops; the pre/post mvnw
// implement the CRC-32 initial/final bit inversion.
void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    mvnw(crc, crc);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32w(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32b(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Software-pipelined prologue: loads are interleaved with the CRC of
    // the previously loaded words to hide load latency.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32x(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32x(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32x(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32x(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32x(crc, crc, tmp2);
    crc32x(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
    mvnw(crc, crc);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

  if (UseCRC32) {
      // Hardware CRC32 instructions available: use them instead of the
      // table-driven code.
      kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
      return;
  }

    mvnw(crc, crc);

    adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
    if (offset) add(table0, table0, offset);
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      cmp(len, (u1)64);
      br(Assembler::LT, L_by16);
      eor(v16, T16B, v16, v16);

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      mov(v16, T4S, 0, crc);

      eor(v0, T16B, v0, v16);
      sub(len, len, 64);

    BIND(L_fold);
      // Carry-less multiply (pmull/pmull2) folds 32 input bytes per
      // iteration into the running 256-bit remainder in v0:v1.
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);

      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v17, T16B, v17, v21);

      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);

      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v28, T16B, v17, v21);

      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);

      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);

      ld1(v0, v1, T2D, post(buf, 32));

      uzp1(v24, T8H, v20, v22);
      uzp2(v25, T8H, v20, v22);
      eor(v20, T16B, v24, v25);

      uzp1(v26, T8H, v16, v18);
      uzp2(v27, T8H, v16, v18);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, T2D, v16, v20);
      uzp2(v21, T2D, v16, v20);
      eor(v16, T16B, v17, v21);

      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);

      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);

      uzp1(v17, T2D, v20, v16);
      uzp2(v21, T2D, v20, v16);
      eor(v20, T16B, v17, v21);

      shl(v16, T2D, v28, 1);
      shl(v17, T2D, v20, 1);

      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      // Reduce the folded 128-bit lanes to a 32-bit CRC with the
      // table-driven word update.
      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp,
                        tmp2, table0, table1, table2, table3, true);

      add(len, len, 32);
  }

  BIND(L_by16);
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    mvnw(crc, crc);
}

// CRC-32C using the hardware CRC32CB/W/X instructions.  Same loop
// structure as kernel_crc32_using_crc32, but CRC-32C performs no
// initial/final bit inversion (no mvnw here).
void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
        Register len, Register tmp0, Register tmp1, Register tmp2,
        Register tmp3) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
    assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);

    subs(len, len, 128);
    br(Assembler::GE, CRC_by64_pre);
  BIND(CRC_less64);
    adds(len, len, 128-32);
    br(Assembler::GE, CRC_by32_loop);
  BIND(CRC_less32);
    adds(len, len, 32-4);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by32_loop);
    ldp(tmp0, tmp1, Address(post(buf, 16)));
    subs(len, len, 32);
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(post(buf, 8)));
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);
    br(Assembler::GE, CRC_by32_loop);
    cmn(len, 32);
    br(Assembler::NE, CRC_less32);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldrw(tmp0, Address(post(buf, 4)));
    subs(len, len, 4);
    crc32cw(crc, crc, tmp0);
    br(Assembler::GE, CRC_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(CRC_by1_loop);
    ldrb(tmp0, Address(post(buf, 1)));
    subs(len, len, 1);
    crc32cb(crc, crc, tmp0);
    br(Assembler::GT, CRC_by1_loop);
    b(L_exit);

  BIND(CRC_by64_pre);
    // Software-pipelined prologue: interleave loads with CRC updates to
    // hide load latency.
    sub(buf, buf, 8);
    ldp(tmp0, tmp1, Address(buf, 8));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));

    b(CRC_by64_loop);

    align(CodeEntryAlignment);
  BIND(CRC_by64_loop);
    subs(len, len, 64);
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 8));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 16));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 24));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(buf, 32));
    crc32cx(crc, crc, tmp2);
    ldr(tmp0, Address(buf, 40));
    crc32cx(crc, crc, tmp3);
    ldr(tmp1, Address(buf, 48));
    crc32cx(crc, crc, tmp0);
    ldr(tmp2, Address(buf, 56));
    crc32cx(crc, crc, tmp1);
    ldr(tmp3, Address(pre(buf, 64)));
    br(Assembler::GE, CRC_by64_loop);

    // post-loop
    crc32cx(crc, crc, tmp2);
    crc32cx(crc, crc, tmp3);

    sub(len, len, 64);
    add(buf, buf, 8);
    cmn(len, 128);
    br(Assembler::NE, CRC_less64);
  BIND(L_exit);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  // CRC32C is always done with the hardware instructions on this port.
  kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
}


// Emits a forward branch over the guarded code when the byte flag at
// flag_addr is zero; the destructor binds the branch target.
// NOTE(review): the 'value' parameter appears unused here — the emitted
// test is always cbzw; confirm against callers/other ports.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  unsigned long offset;
  _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
  _masm->ldrb(rscratch1, Address(rscratch1, offset));
  _masm->cbzw(rscratch1, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// In-memory add: *dst += src.  Clobbers rscratch1/rscratch2.
void MacroAssembler::addptr(const Address &dst, int32_t src) {
  Address adr;
  switch(dst.getMode()) {
  case Address::base_plus_offset:
    // This is the expected mode, although we allow all the other
    // forms below.
    adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
    break;
  default:
    lea(rscratch2, dst);
    adr = Address(rscratch2);
    break;
  }
  ldr(rscratch1, adr);
  add(rscratch1, rscratch1, src);
  str(rscratch1, adr);
}

// Compare src1 against the word stored at the external address src2.
void MacroAssembler::cmpptr(Register src1, Address src2) {
  unsigned long offset;
  adrp(rscratch1, src2, offset);
  ldr(rscratch1, Address(rscratch1, offset));
  cmp(src1, rscratch1);
}

// Oop comparison goes through the GC barrier set so collectors can
// intercept it.
void MacroAssembler::cmpoop(Register obj1, Register obj2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, obj1, obj2);
}

// Load the raw (possibly compressed) klass word of src into dst.
void MacroAssembler::load_metadata(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}

// Load the Klass* of src into dst, stripping the storage-properties
// bits and decoding the compressed form if needed.
void MacroAssembler::load_klass(Register dst, Register src) {
  load_metadata(dst, src);
  if (UseCompressedClassPointers) {
    andr(dst, dst, oopDesc::compressed_klass_mask());
    decode_klass_not_null(dst);
  } else {
    // Clear the upper storage-property bits of the uncompressed word.
    ubfm(dst, dst, 0, 63 - oopDesc::storage_props_nof_bits);
  }
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  // OopHandle::resolve is an indirection.
  access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
}

// Load the java mirror of the current method's holder class into dst.
// NOTE(review): reads rmethod directly; the 'method' parameter appears
// unused here — confirm against callers.
void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(rmethod, Method::const_offset()));
  ldr(dst, Address(dst, ConstMethod::constants_offset()));
  ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
  ldr(dst, Address(dst, mirror_offset));
  resolve_oop_handle(dst, tmp);
}

// Extract the storage-properties bits from the klass word of src.
void MacroAssembler::load_storage_props(Register dst, Register src) {
  load_metadata(dst, src);
  if (UseCompressedClassPointers) {
    asrw(dst, dst, oopDesc::narrow_storage_props_shift);
  } else {
    asr(dst, dst, oopDesc::wide_storage_props_shift);
  }
}

// Compare the klass of oop against trial_klass, using the cheapest
// encoding-aware comparison available.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (CompressedKlassPointers::base() == NULL) {
      // Zero base: compare trial_klass against the shifted narrow klass.
      cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
      return;
    } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
               && CompressedKlassPointers::shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}

// Load the prototype mark word from src's klass into dst (biased locking).
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}

void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  concurrent gcs assumes
  // klass length is valid if klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}

void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}

// Algorithm must match CompressedOops::encode.
void MacroAssembler::encode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(s, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      lsr(d, s, LogMinObjAlignmentInBytes);
    } else {
      mov(d, s);
    }
  } else {
    // Subtract the heap base; NULL (below the base) encodes to zero via
    // the csel.
    subs(d, s, rheapbase);
    csel(d, d, zr, Assembler::HS);
    lsr(d, d, LogMinObjAlignmentInBytes);

    /* Old algorithm: is this any worse?
3834 Label nonnull; 3835 cbnz(r, nonnull); 3836 sub(r, r, rheapbase); 3837 bind(nonnull); 3838 lsr(r, r, LogMinObjAlignmentInBytes); 3839 */ 3840 } 3841 } 3842 3843 void MacroAssembler::encode_heap_oop_not_null(Register r) { 3844 #ifdef ASSERT 3845 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?"); 3846 if (CheckCompressedOops) { 3847 Label ok; 3848 cbnz(r, ok); 3849 stop("null oop passed to encode_heap_oop_not_null"); 3850 bind(ok); 3851 } 3852 #endif 3853 verify_oop(r, "broken oop in encode_heap_oop_not_null"); 3854 if (CompressedOops::base() != NULL) { 3855 sub(r, r, rheapbase); 3856 } 3857 if (CompressedOops::shift() != 0) { 3858 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3859 lsr(r, r, LogMinObjAlignmentInBytes); 3860 } 3861 } 3862 3863 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) { 3864 #ifdef ASSERT 3865 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?"); 3866 if (CheckCompressedOops) { 3867 Label ok; 3868 cbnz(src, ok); 3869 stop("null oop passed to encode_heap_oop_not_null2"); 3870 bind(ok); 3871 } 3872 #endif 3873 verify_oop(src, "broken oop in encode_heap_oop_not_null2"); 3874 3875 Register data = src; 3876 if (CompressedOops::base() != NULL) { 3877 sub(dst, src, rheapbase); 3878 data = dst; 3879 } 3880 if (CompressedOops::shift() != 0) { 3881 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3882 lsr(dst, data, LogMinObjAlignmentInBytes); 3883 data = dst; 3884 } 3885 if (data == src) 3886 mov(dst, src); 3887 } 3888 3889 void MacroAssembler::decode_heap_oop(Register d, Register s) { 3890 #ifdef ASSERT 3891 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?"); 3892 #endif 3893 if (CompressedOops::base() == NULL) { 3894 if (CompressedOops::shift() != 0 || d != s) { 3895 lsl(d, s, CompressedOops::shift()); 3896 } 3897 } else { 3898 Label done; 3899 if (d != s) 3900 mov(d, 
s); 3901 cbz(s, done); 3902 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes); 3903 bind(done); 3904 } 3905 verify_oop(d, "broken oop in decode_heap_oop"); 3906 } 3907 3908 void MacroAssembler::decode_heap_oop_not_null(Register r) { 3909 assert (UseCompressedOops, "should only be used for compressed headers"); 3910 assert (Universe::heap() != NULL, "java heap should be initialized"); 3911 // Cannot assert, unverified entry point counts instructions (see .ad file) 3912 // vtableStubs also counts instructions in pd_code_size_limit. 3913 // Also do not verify_oop as this is called by verify_oop. 3914 if (CompressedOops::shift() != 0) { 3915 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3916 if (CompressedOops::base() != NULL) { 3917 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3918 } else { 3919 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes); 3920 } 3921 } else { 3922 assert (CompressedOops::base() == NULL, "sanity"); 3923 } 3924 } 3925 3926 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) { 3927 assert (UseCompressedOops, "should only be used for compressed headers"); 3928 assert (Universe::heap() != NULL, "java heap should be initialized"); 3929 // Cannot assert, unverified entry point counts instructions (see .ad file) 3930 // vtableStubs also counts instructions in pd_code_size_limit. 3931 // Also do not verify_oop as this is called by verify_oop. 
3932 if (CompressedOops::shift() != 0) { 3933 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong"); 3934 if (CompressedOops::base() != NULL) { 3935 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3936 } else { 3937 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes); 3938 } 3939 } else { 3940 assert (CompressedOops::base() == NULL, "sanity"); 3941 if (dst != src) { 3942 mov(dst, src); 3943 } 3944 } 3945 } 3946 3947 void MacroAssembler::encode_klass_not_null(Register dst, Register src) { 3948 if (CompressedKlassPointers::base() == NULL) { 3949 if (CompressedKlassPointers::shift() != 0) { 3950 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3951 lsr(dst, src, LogKlassAlignmentInBytes); 3952 } else { 3953 if (dst != src) mov(dst, src); 3954 } 3955 return; 3956 } 3957 3958 if (use_XOR_for_compressed_class_base) { 3959 if (CompressedKlassPointers::shift() != 0) { 3960 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3961 lsr(dst, dst, LogKlassAlignmentInBytes); 3962 } else { 3963 eor(dst, src, (uint64_t)CompressedKlassPointers::base()); 3964 } 3965 return; 3966 } 3967 3968 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 3969 && CompressedKlassPointers::shift() == 0) { 3970 movw(dst, src); 3971 return; 3972 } 3973 3974 #ifdef ASSERT 3975 verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?"); 3976 #endif 3977 3978 Register rbase = dst; 3979 if (dst == src) rbase = rheapbase; 3980 mov(rbase, (uint64_t)CompressedKlassPointers::base()); 3981 sub(dst, src, rbase); 3982 if (CompressedKlassPointers::shift() != 0) { 3983 assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong"); 3984 lsr(dst, dst, LogKlassAlignmentInBytes); 3985 } 3986 if (dst == src) reinit_heapbase(); 3987 } 3988 3989 void MacroAssembler::encode_klass_not_null(Register r) { 3990 encode_klass_not_null(r, r); 3991 } 3992 3993 
// dst = decode(src) for a narrow klass known non-NULL.  Mirrors
// encode_klass_not_null: zero-base, XOR, high-base-only, and general
// (rheapbase-borrowing) strategies.
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (CompressedKlassPointers::base() == NULL) {
    // Zero base: shift back up (or plain move).
    if (CompressedKlassPointers::shift() != 0) {
      assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  if (use_XOR_for_compressed_class_base) {
    // XOR with the base restores the high bits (base and klass bits are
    // disjoint in this mode).
    if (CompressedKlassPointers::shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
    } else {
      eor(dst, src, (uint64_t)CompressedKlassPointers::base());
    }
    return;
  }

  if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
      && CompressedKlassPointers::shift() == 0) {
    // Base lives entirely in the high 32 bits: patch them in with movk.
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  // General case: materialize the base and add (with shift).  If dst
  // aliases src, borrow rheapbase as scratch and restore it afterwards.
  if (dst == src) rbase = rheapbase;
  mov(rbase, (uint64_t)CompressedKlassPointers::base());
  if (CompressedKlassPointers::shift() != 0) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}

// In-place decode convenience overload.
void MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}

// Emit a patchable load of a narrow oop constant into dst.  The 0xDEADBEEF
// movz/movk pair is a placeholder; the recorded oop relocation lets the
// runtime patch in the real narrow oop value later.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert (UseCompressedOops, "should only be used for compressed oops");
    assert (Universe::heap() != NULL, "java heap should be initialized");
    assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);
  movk(dst, 0xBEEF);
}

// Emit a patchable load of the narrow klass encoding of k into dst, with a
// metadata relocation recorded at the instruction.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = CompressedKlassPointers::encode(k);
  // movz/movk pair materializes the 32-bit narrow klass: high half first,
  // then the low 16 bits.
  movz(dst, (nk >> 16), 16);
  movk(dst, nk & 0xffff);
}

// GC-barrier-aware load: dispatches to the barrier set assembler unless
// AS_RAW is requested, in which case the base (raw) implementation is used.
void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
                                    Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

// GC-barrier-aware store; see access_load_at for the AS_RAW dispatch.
void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
                                     Address dst, Register src,
                                     Register tmp1, Register thread_tmp, Register tmp3) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3);
  }
}

// Resolve obj through the barrier set for the given access decorators.
void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

// Load a (possibly NULL) heap oop with IN_HEAP semantics.
void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Load a heap oop the caller guarantees is non-NULL.
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

// Store a heap oop with IN_HEAP semantics (barriers applied as configured).
void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register thread_tmp, Register tmp3, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp, tmp3);
}

// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
}

// Build an Address for metadata obj with a metadata relocation attached,
// allocating a new index in the oop recorder.
Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != NULL, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return Address((address)obj, rspec);
}

// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
// Move an oop into a register; see the comment above for the meaning of
// 'immediate'.  A NULL obj gets a fresh oop index so the site stays
// patchable.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
#ifdef ASSERT
    {
      ThreadInVMfromUnknown tiv;
      assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
    }
#endif
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (! immediate) {
    // Patchable form: load from the constant pool instead of encoding the
    // value in move-immediate instructions.
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));
}

// Move a metadata address into a register.
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  int oop_index;
  if (obj == NULL) {
    oop_index = oop_recorder()->allocate_metadata_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
  }
  RelocationHolder rspec = metadata_Relocation::spec(oop_index);
  mov(dst, Address((address)obj, rspec));
}

// Build an Address for an already-recorded oop constant, with an oop
// relocation attached.
Address MacroAssembler::constant_oop_address(jobject obj) {
#ifdef ASSERT
  {
    ThreadInVMfromUnknown tiv;
    assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
  }
#endif
  int oop_index = oop_recorder()->find_index(obj);
  return Address((address)obj, oop_Relocation::spec(oop_index));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// TLAB allocation; delegates to the barrier set assembler.  Branches to
// slow_case when the TLAB cannot satisfy the request.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    br(Assembler::EQ, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Algorithm (a Duff's-device-style unrolled fill):
  //
  //  scratch1 = cnt & 7;
  //  cnt -= scratch1;
  //  p += scratch1;
  //  switch (scratch1) {
  //    do {
  //      cnt -= 8;
  //        p[-8] = 0;
  //    case 7:
  //        p[-7] = 0;
  //    case 6:
  //        p[-6] = 0;
  //      // ...
  //    case 1:
  //        p[-1] = 0;
  //    case 0:
  //        p += 8;
  //     } while (cnt);
  //  }

  const int unroll = 8; // Number of str(zr) instructions we'll unroll

  lsr(len, len, LogBytesPerWord);           // len is now in words
  andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
  sub(len, len, rscratch1);      // cnt -= unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
  // Computed branch into the unrolled store sequence: each str is 4 bytes,
  // so jump (unroll - remainder) instructions before 'entry'.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
  br(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    Assembler::str(zr, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-only sanity check of the current thread's TLAB invariants:
// tlab_start <= tlab_top <= tlab_end.  No-op unless UseTLAB && VerifyOops.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    // Preserve the scratch registers we are about to use.
    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
// Touch one page at a time from sp down through 'size' bytes plus the
// shadow zone, so a stack overflow is detected deterministically.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}


// Move the address of the polling page into dest.
void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Thread-local polling: the page address lives in the thread struct.
    ldr(dest, Address(rthread, Thread::polling_page_offset()));
  } else {
    unsigned long off;
    adrp(dest, Address(page, rtype), off);
    assert(off == 0, "polling page must be page aligned");
  }
}

// Move the address of the polling page into r, then read the polling
// page.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Read the polling page.  The address of the polling page must
// already be in r.  Returns the address of the emitted load so the
// signal handler can identify the poll instruction.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  // Load to zr: the value is irrelevant, only the access (and possible
  // fault on a protected page) matters.
  ldrw(zr, Address(r, 0));
  return inst_mark();
}

// Emit an adrp (plus movk when needed) that reaches dest.target() from
// anywhere in the code cache; returns the residual low 12 bits of the
// target in byte_offset for the caller to fold into the following access.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  // NOTE(review): rtype is computed but not used in this function.
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
  unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
  unsigned long dest_page = (unsigned long)dest.target() >> 12;
  long offset_low = dest_page - low_page;
  long offset_high = dest_page - high_page;

  assert(is_valid_AArch64_address(dest.target()), "bad address");
  assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");

  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach
  if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
    // Within adrp's +/-1M-page range from the whole code cache: single adrp.
    _adrp(reg1, dest.target());
  } else {
    // Out of range: synthesize the low 32 bits via adrp relative to pc's
    // high bits, then patch the high 32 bits with movk.
    unsigned long target = (unsigned long)dest.target();
    unsigned long adrp_target
      = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);

    _adrp(reg1, (address)adrp_target);
    movk(reg1, target >> 32, 32);
  }
  byte_offset = (unsigned long)dest.target() & 0xfff;
}

// Load the card table's byte_map_base into reg, preferring a pc-relative
// adrp when the value happens to look like a reachable address.
void MacroAssembler::load_byte_map_base(Register reg) {
  CardTable::CardValue* byte_map_base =
    ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();

  if (is_valid_AArch64_address((address)byte_map_base)) {
    // Strictly speaking the byte_map_base isn't an address at all,
    // and it might even be negative.
    unsigned long offset;
    adrp(reg, ExternalAddress((address)byte_map_base), offset);
    // We expect offset to be zero with most collectors.
    if (offset != 0) {
      add(reg, reg, offset);
    }
  } else {
    mov(reg, (uint64_t)byte_map_base);
  }
}

// Set up a stack frame of 'framesize' bytes, saving rfp/lr.  Layout depends
// on whether the frame fits the signed-9-bit / 12-bit immediate ranges.
void MacroAssembler::build_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    // Small frame: drop sp once, then store rfp/lr at the top of the frame.
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
  } else {
    // Larger frame: push rfp/lr first, then extend sp for the rest.
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (PreserveFramePointer) mov(rfp, sp);
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      // Too big for an immediate: go through rscratch1.
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}

// Tear down a frame created by build_frame (mirror image of its layout).
void MacroAssembler::remove_frame(int framesize) {
  assert(framesize > 0, "framesize must be > 0");
  if (framesize < ((1 << 9) + 2 * wordSize)) {
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}

#ifdef COMPILER2
// Pointer-to-member type for the single-element load emitters (ldrb/ldrh/...),
// so Latin1 vs UTF-16 loads can be selected once and reused.
typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);

// Search for str1 in str2 and return index or -1
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    Register tmp5, Register tmp6,
                                    int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on specific method version
  Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4447 4448 Register ch1 = rscratch1; 4449 Register ch2 = rscratch2; 4450 Register cnt1tmp = tmp1; 4451 Register cnt2tmp = tmp2; 4452 Register cnt1_neg = cnt1; 4453 Register cnt2_neg = cnt2; 4454 Register result_tmp = tmp4; 4455 4456 bool isL = ae == StrIntrinsicNode::LL; 4457 4458 bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL; 4459 bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU; 4460 int str1_chr_shift = str1_isL ? 0:1; 4461 int str2_chr_shift = str2_isL ? 0:1; 4462 int str1_chr_size = str1_isL ? 1:2; 4463 int str2_chr_size = str2_isL ? 1:2; 4464 chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb : 4465 (chr_insn)&MacroAssembler::ldrh; 4466 chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb : 4467 (chr_insn)&MacroAssembler::ldrh; 4468 chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw; 4469 chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr; 4470 4471 // Note, inline_string_indexOf() generates checks: 4472 // if (substr.count > string.count) return -1; 4473 // if (substr.count == 0) return 0; 4474 4475 // We have two strings, a source string in str2, cnt2 and a pattern string 4476 // in str1, cnt1. Find the 1st occurence of pattern in source or return -1. 4477 4478 // For larger pattern and source we use a simplified Boyer Moore algorithm. 4479 // With a small pattern and source we use linear scan. 4480 4481 if (icnt1 == -1) { 4482 sub(result_tmp, cnt2, cnt1); 4483 cmp(cnt1, (u1)8); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256 4484 br(LT, LINEARSEARCH); 4485 dup(v0, T16B, cnt1); // done in separate FPU pipeline. 
Almost no penalty 4486 subs(zr, cnt1, 256); 4487 lsr(tmp1, cnt2, 2); 4488 ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM 4489 br(GE, LINEARSTUB); 4490 } 4491 4492 // The Boyer Moore alogorithm is based on the description here:- 4493 // 4494 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm 4495 // 4496 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule 4497 // and the 'Good Suffix' rule. 4498 // 4499 // These rules are essentially heuristics for how far we can shift the 4500 // pattern along the search string. 4501 // 4502 // The implementation here uses the 'Bad Character' rule only because of the 4503 // complexity of initialisation for the 'Good Suffix' rule. 4504 // 4505 // This is also known as the Boyer-Moore-Horspool algorithm:- 4506 // 4507 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm 4508 // 4509 // This particular implementation has few java-specific optimizations. 4510 // 4511 // #define ASIZE 256 4512 // 4513 // int bm(unsigned char *x, int m, unsigned char *y, int n) { 4514 // int i, j; 4515 // unsigned c; 4516 // unsigned char bc[ASIZE]; 4517 // 4518 // /* Preprocessing */ 4519 // for (i = 0; i < ASIZE; ++i) 4520 // bc[i] = m; 4521 // for (i = 0; i < m - 1; ) { 4522 // c = x[i]; 4523 // ++i; 4524 // // c < 256 for Latin1 string, so, no need for branch 4525 // #ifdef PATTERN_STRING_IS_LATIN1 4526 // bc[c] = m - i; 4527 // #else 4528 // if (c < ASIZE) bc[c] = m - i; 4529 // #endif 4530 // } 4531 // 4532 // /* Searching */ 4533 // j = 0; 4534 // while (j <= n - m) { 4535 // c = y[i+j]; 4536 // if (x[m-1] == c) 4537 // for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i); 4538 // if (i < 0) return j; 4539 // // c < 256 for Latin1 string, so, no need for branch 4540 // #ifdef SOURCE_STRING_IS_LATIN1 4541 // // LL case: (c< 256) always true. 
Remove branch 4542 // j += bc[y[j+m-1]]; 4543 // #endif 4544 // #ifndef PATTERN_STRING_IS_UTF 4545 // // UU case: need if (c<ASIZE) check. Skip 1 character if not. 4546 // if (c < ASIZE) 4547 // j += bc[y[j+m-1]]; 4548 // else 4549 // j += 1 4550 // #endif 4551 // #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF 4552 // // UL case: need if (c<ASIZE) check. Skip <pattern length> if not. 4553 // if (c < ASIZE) 4554 // j += bc[y[j+m-1]]; 4555 // else 4556 // j += m 4557 // #endif 4558 // } 4559 // } 4560 4561 if (icnt1 == -1) { 4562 Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH, 4563 BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP; 4564 Register cnt1end = tmp2; 4565 Register str2end = cnt2; 4566 Register skipch = tmp2; 4567 4568 // str1 length is >=8, so, we can read at least 1 register for cases when 4569 // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for 4570 // UL case. We'll re-read last character in inner pre-loop code to have 4571 // single outer pre-loop load 4572 const int firstStep = isL ? 
7 : 3; 4573 4574 const int ASIZE = 256; 4575 const int STORED_BYTES = 32; // amount of bytes stored per instruction 4576 sub(sp, sp, ASIZE); 4577 mov(tmp5, ASIZE/STORED_BYTES); // loop iterations 4578 mov(ch1, sp); 4579 BIND(BM_INIT_LOOP); 4580 stpq(v0, v0, Address(post(ch1, STORED_BYTES))); 4581 subs(tmp5, tmp5, 1); 4582 br(GT, BM_INIT_LOOP); 4583 4584 sub(cnt1tmp, cnt1, 1); 4585 mov(tmp5, str2); 4586 add(str2end, str2, result_tmp, LSL, str2_chr_shift); 4587 sub(ch2, cnt1, 1); 4588 mov(tmp3, str1); 4589 BIND(BCLOOP); 4590 (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size))); 4591 if (!str1_isL) { 4592 subs(zr, ch1, ASIZE); 4593 br(HS, BCSKIP); 4594 } 4595 strb(ch2, Address(sp, ch1)); 4596 BIND(BCSKIP); 4597 subs(ch2, ch2, 1); 4598 br(GT, BCLOOP); 4599 4600 add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1 4601 if (str1_isL == str2_isL) { 4602 // load last 8 bytes (8LL/4UU symbols) 4603 ldr(tmp6, Address(tmp6, -wordSize)); 4604 } else { 4605 ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols) 4606 // convert Latin1 to UTF. We'll have to wait until load completed, but 4607 // it's still faster than per-character loads+checks 4608 lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1] 4609 ubfx(ch1, tmp6, 8, 8); // str1[N-2] 4610 ubfx(ch2, tmp6, 16, 8); // str1[N-3] 4611 andr(tmp6, tmp6, 0xFF); // str1[N-4] 4612 orr(ch2, ch1, ch2, LSL, 16); 4613 orr(tmp6, tmp6, tmp3, LSL, 48); 4614 orr(tmp6, tmp6, ch2, LSL, 16); 4615 } 4616 BIND(BMLOOPSTR2); 4617 (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4618 sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8 4619 if (str1_isL == str2_isL) { 4620 // re-init tmp3. It's for free because it's executed in parallel with 4621 // load above. 
Alternative is to initialize it before loop, but it'll 4622 // affect performance on in-order systems with 2 or more ld/st pipelines 4623 lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size)); 4624 } 4625 if (!isL) { // UU/UL case 4626 lsl(ch2, cnt1tmp, 1); // offset in bytes 4627 } 4628 cmp(tmp3, skipch); 4629 br(NE, BMSKIP); 4630 ldr(ch2, Address(str2, isL ? cnt1tmp : ch2)); 4631 mov(ch1, tmp6); 4632 if (isL) { 4633 b(BMLOOPSTR1_AFTER_LOAD); 4634 } else { 4635 sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8 4636 b(BMLOOPSTR1_CMP); 4637 } 4638 BIND(BMLOOPSTR1); 4639 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift))); 4640 (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift))); 4641 BIND(BMLOOPSTR1_AFTER_LOAD); 4642 subs(cnt1tmp, cnt1tmp, 1); 4643 br(LT, BMLOOPSTR1_LASTCMP); 4644 BIND(BMLOOPSTR1_CMP); 4645 cmp(ch1, ch2); 4646 br(EQ, BMLOOPSTR1); 4647 BIND(BMSKIP); 4648 if (!isL) { 4649 // if we've met UTF symbol while searching Latin1 pattern, then we can 4650 // skip cnt1 symbols 4651 if (str1_isL != str2_isL) { 4652 mov(result_tmp, cnt1); 4653 } else { 4654 mov(result_tmp, 1); 4655 } 4656 subs(zr, skipch, ASIZE); 4657 br(HS, BMADV); 4658 } 4659 ldrb(result_tmp, Address(sp, skipch)); // load skip distance 4660 BIND(BMADV); 4661 sub(cnt1tmp, cnt1, 1); 4662 add(str2, str2, result_tmp, LSL, str2_chr_shift); 4663 cmp(str2, str2end); 4664 br(LE, BMLOOPSTR2); 4665 add(sp, sp, ASIZE); 4666 b(NOMATCH); 4667 BIND(BMLOOPSTR1_LASTCMP); 4668 cmp(ch1, ch2); 4669 br(NE, BMSKIP); 4670 BIND(BMMATCH); 4671 sub(result, str2, tmp5); 4672 if (!str2_isL) lsr(result, result, 1); 4673 add(sp, sp, ASIZE); 4674 b(DONE); 4675 4676 BIND(LINEARSTUB); 4677 cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm 4678 br(LT, LINEAR_MEDIUM); 4679 mov(result, zr); 4680 RuntimeAddress stub = NULL; 4681 if (isL) { 4682 stub = 
RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll()); 4683 assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated"); 4684 } else if (str1_isL) { 4685 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul()); 4686 assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated"); 4687 } else { 4688 stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu()); 4689 assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated"); 4690 } 4691 trampoline_call(stub); 4692 b(DONE); 4693 } 4694 4695 BIND(LINEARSEARCH); 4696 { 4697 Label DO1, DO2, DO3; 4698 4699 Register str2tmp = tmp2; 4700 Register first = tmp3; 4701 4702 if (icnt1 == -1) 4703 { 4704 Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT; 4705 4706 cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2)); 4707 br(LT, DOSHORT); 4708 BIND(LINEAR_MEDIUM); 4709 (this->*str1_load_1chr)(first, Address(str1)); 4710 lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift))); 4711 sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift); 4712 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4713 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4714 4715 BIND(FIRST_LOOP); 4716 (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg)); 4717 cmp(first, ch2); 4718 br(EQ, STR1_LOOP); 4719 BIND(STR2_NEXT); 4720 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4721 br(LE, FIRST_LOOP); 4722 b(NOMATCH); 4723 4724 BIND(STR1_LOOP); 4725 adds(cnt1tmp, cnt1_neg, str1_chr_size); 4726 add(cnt2tmp, cnt2_neg, str2_chr_size); 4727 br(GE, MATCH); 4728 4729 BIND(STR1_NEXT); 4730 (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp)); 4731 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4732 cmp(ch1, ch2); 4733 br(NE, STR2_NEXT); 4734 adds(cnt1tmp, cnt1tmp, str1_chr_size); 4735 add(cnt2tmp, cnt2tmp, str2_chr_size); 4736 br(LT, STR1_NEXT); 4737 b(MATCH); 4738 4739 BIND(DOSHORT); 4740 if (str1_isL == str2_isL) { 4741 cmp(cnt1, 
(u1)2); 4742 br(LT, DO1); 4743 br(GT, DO3); 4744 } 4745 } 4746 4747 if (icnt1 == 4) { 4748 Label CH1_LOOP; 4749 4750 (this->*load_4chr)(ch1, str1); 4751 sub(result_tmp, cnt2, 4); 4752 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4753 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4754 4755 BIND(CH1_LOOP); 4756 (this->*load_4chr)(ch2, Address(str2, cnt2_neg)); 4757 cmp(ch1, ch2); 4758 br(EQ, MATCH); 4759 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4760 br(LE, CH1_LOOP); 4761 b(NOMATCH); 4762 } 4763 4764 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) { 4765 Label CH1_LOOP; 4766 4767 BIND(DO2); 4768 (this->*load_2chr)(ch1, str1); 4769 if (icnt1 == 2) { 4770 sub(result_tmp, cnt2, 2); 4771 } 4772 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4773 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4774 BIND(CH1_LOOP); 4775 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4776 cmp(ch1, ch2); 4777 br(EQ, MATCH); 4778 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4779 br(LE, CH1_LOOP); 4780 b(NOMATCH); 4781 } 4782 4783 if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) { 4784 Label FIRST_LOOP, STR2_NEXT, STR1_LOOP; 4785 4786 BIND(DO3); 4787 (this->*load_2chr)(first, str1); 4788 (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size)); 4789 if (icnt1 == 3) { 4790 sub(result_tmp, cnt2, 3); 4791 } 4792 lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift))); 4793 sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift); 4794 BIND(FIRST_LOOP); 4795 (this->*load_2chr)(ch2, Address(str2, cnt2_neg)); 4796 cmpw(first, ch2); 4797 br(EQ, STR1_LOOP); 4798 BIND(STR2_NEXT); 4799 adds(cnt2_neg, cnt2_neg, str2_chr_size); 4800 br(LE, FIRST_LOOP); 4801 b(NOMATCH); 4802 4803 BIND(STR1_LOOP); 4804 add(cnt2tmp, cnt2_neg, 2*str2_chr_size); 4805 (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp)); 4806 cmp(ch1, ch2); 4807 br(NE, STR2_NEXT); 4808 b(MATCH); 4809 } 4810 4811 if (icnt1 == -1 || icnt1 == 1) { 4812 Label CH1_LOOP, 
      HAS_ZERO, DO1_SHORT, DO1_LOOP;   // continuation of the Label list opened above

      BIND(DO1);
        // Pattern is a single character: load it once and scan str2 for it.
        (this->*str1_load_1chr)(ch1, str1);
        cmp(cnt2, (u1)8);
        br(LT, DO1_SHORT);

        sub(result_tmp, cnt2, 8/str2_chr_size);
        sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
        mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
        lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));

        // Replicate the sought character into every byte (Latin1) or
        // halfword (UTF-16) lane so a whole 8-byte word is tested at once.
        if (str2_isL) {
          orr(ch1, ch1, ch1, LSL, 8);
        }
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);
      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        // SWAR zero-lane test: after the eor a lane is zero iff it matched;
        // (x - 0x01..01) & ~x & 0x80..80 (via the orr/bics pair) flags it.
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Tail: run one more (possibly overlapping) iteration over the last
        // 8-byte word, unless the loop just ended exactly on it.
        cmp(cnt2_neg, (u1)8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Locate the first zero lane: byte-reverse, count leading zero bits,
        // then convert bits to bytes (LSR 3) for the byte offset of the match.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        // Fewer than 8 characters: plain one-character-at-a-time scan.
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
        sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
      BIND(DO1_LOOP);
        (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, str2_chr_size);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Match index = scanned-prefix length plus the (negative) remaining
    // byte offset converted back to characters.
    add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
  BIND(DONE);
}

typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);

// Find the first occurrence of the 16-bit character 'ch' in the UTF-16
// sequence at 'str1' holding 'cnt1' characters (ldrh loads, scale-2
// addressing below).
// result receives the character index of the first match, or -1 if absent.
// Clobbers: str1, cnt1 (re-used as the negative byte offset cnt1_neg),
// ch (replicated into all four halfword lanes), tmp1-tmp3 and
// rscratch1/rscratch2.
void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
                                         Register ch, Register result,
                                         Register tmp1, Register tmp2, Register tmp3)
{
  Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
  Register cnt1_neg = cnt1;
  Register ch1 = rscratch1;
  Register result_tmp = rscratch2;

  cmp(cnt1, (u1)4);
  br(LT, DO1_SHORT);

  // Replicate ch into all four halfword lanes for the word-at-a-time scan.
  orr(ch, ch, ch, LSL, 16);
  orr(ch, ch, ch, LSL, 32);

  sub(cnt1, cnt1, 4);
  mov(result_tmp, cnt1);
  lea(str1, Address(str1, cnt1, Address::uxtw(1)));
  sub(cnt1_neg, zr, cnt1, LSL, 1);

  mov(tmp3, 0x0001000100010001);

  BIND(CH1_LOOP);
    ldr(ch1, Address(str1, cnt1_neg));
    // SWAR zero-halfword test: a lane of the eor result is zero iff it
    // equals ch; the sub/orr/bics sequence sets flags when a lane is zero.
    eor(ch1, ch, ch1);
    sub(tmp1, ch1, tmp3);
    orr(tmp2, ch1, 0x7fff7fff7fff7fff);
    bics(tmp1, tmp1, tmp2);
    br(NE, HAS_ZERO);
    adds(cnt1_neg, cnt1_neg, 8);
    br(LT, CH1_LOOP);

    // Tail: process the last (possibly overlapping) 8-byte word, unless
    // the loop just ended exactly on it.
    cmp(cnt1_neg, (u1)8);
    mov(cnt1_neg, 0);
    br(LT, CH1_LOOP);
    b(NOMATCH);

  BIND(HAS_ZERO);
    // Byte-reverse and count leading zeros to find the first matching
    // lane; LSR 3 converts the bit index to a byte offset.
    rev(tmp1, tmp1);
    clz(tmp1, tmp1);
    add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
    b(MATCH);

  BIND(DO1_SHORT);
    // Fewer than 4 characters: plain halfword-at-a-time compare loop.
    mov(result_tmp, cnt1);
    lea(str1, Address(str1, cnt1, Address::uxtw(1)));
    sub(cnt1_neg, zr, cnt1, LSL, 1);
  BIND(DO1_LOOP);
    ldrh(ch1, Address(str1, cnt1_neg));
    cmpw(ch, ch1);
    br(EQ, MATCH);
    adds(cnt1_neg, cnt1_neg, 2);
    br(LT, DO1_LOOP);
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the negative byte offset back to a character index.
    add(result, result_tmp, cnt1_neg, ASR, 1);
  BIND(DONE);
}

// Compare strings.
// Lexicographically compare two strings; 'ae' encodes the operand kinds
// (StrIntrinsicNode::LL/LU/UL/UU, L = Latin1, U = UTF-16).  On return
// 'result' is negative, zero or positive: the difference of the first
// non-matching characters, or (cnt1 - cnt2) when the shorter string is a
// prefix of the longer.  Inputs of STUB_THRESHOLD characters or more are
// delegated to the compare_long_string_* stubs.
void MacroAssembler::string_compare(Register str1, Register str2,
    Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
    FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
  Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
      DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
      SHORT_LOOP_START, TAIL_CHECK;

  const u1 STUB_THRESHOLD = 64 + 8;
  bool isLL = ae == StrIntrinsicNode::LL;
  bool isLU = ae == StrIntrinsicNode::LU;
  bool isUL = ae == StrIntrinsicNode::UL;

  bool str1_isL = isLL || isLU;
  bool str2_isL = isLL || isUL;

  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int minCharsInWord = isLL ? wordSize : wordSize/2;

  FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
  // Per-operand single-character load (byte for Latin1, halfword for UTF-16)
  // and the matching zero-extend used when extracting a differing character.
  chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
                                      (chr_insn)&MacroAssembler::ldrh;
  uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
                            (uxt_insn)&MacroAssembler::uxthw;

  BLOCK_COMMENT("string_compare {");

  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings, however the result is always in characters.
  if (!str1_isL) asrw(cnt1, cnt1, 1);
  if (!str2_isL) asrw(cnt2, cnt2, 1);

  // Compute the minimum of the string lengths and save the difference.
  subsw(result, cnt1, cnt2);
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, minCharsInWord);
  br(Assembler::LE, SHORT_STRING);

  // Compare longwords
  // load first parts of strings and finish initialization while loading
  {
    if (str1_isL == str2_isL) { // LL or UU
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subsw(cnt2, cnt2, minCharsInWord);
      br(EQ, TAIL_CHECK);
      // Point both strings at their ends and run cnt2 up from a negative
      // offset towards zero.
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
    } else if (isLU) {
      // Latin1 str1 is widened to UTF-16 on the fly: zip1 interleaves its
      // bytes with the zero vector vtmpZ.
      ldrs(vtmp, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldr(tmp2, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 4);
      fmovd(tmp1, vtmp);
    } else { // UL case
      // Mirror image of the LU case: str2 is the Latin1 operand.
      ldr(tmp1, Address(str1));
      cmp(str1, str2);
      br(Assembler::EQ, DONE);
      ldrs(vtmp, Address(str2));
      cmp(cnt2, STUB_THRESHOLD);
      br(GE, STUB);
      subw(cnt2, cnt2, 4);
      lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
      eor(vtmpZ, T16B, vtmpZ, vtmpZ);
      lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
      sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
    }
    adds(cnt2, cnt2, isUL ? 4 : 8);
    br(GE, TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // main loop
    bind(NEXT_WORD);
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1, cnt2));
      ldr(tmp2, Address(str2, cnt2));
      adds(cnt2, cnt2, 8);
    } else if (isLU) {
      ldrs(vtmp, Address(str1, cnt1));
      ldr(tmp2, Address(str2, cnt2));
      add(cnt1, cnt1, 4);
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
      adds(cnt2, cnt2, 8);
    } else { // UL
      ldrs(vtmp, Address(str2, cnt2));
      ldr(tmp1, Address(str1, cnt1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      add(cnt1, cnt1, 8);
      fmovd(tmp2, vtmp);
      adds(cnt2, cnt2, 4);
    }
    br(GE, TAIL);

    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, NEXT_WORD);
    b(DIFFERENCE);
    bind(TAIL);
    eor(rscratch2, tmp1, tmp2);
    cbnz(rscratch2, DIFFERENCE);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    if (str1_isL == str2_isL) {
      ldr(tmp1, Address(str1));
      ldr(tmp2, Address(str2));
    } else if (isLU) {
      ldrs(vtmp, Address(str1));
      ldr(tmp2, Address(str2));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp1, vtmp);
    } else { // UL
      ldrs(vtmp, Address(str2));
      ldr(tmp1, Address(str1));
      zip1(vtmp, T8B, vtmp, vtmpZ);
      fmovd(tmp2, vtmp);
    }
    bind(TAIL_CHECK);
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    rev(rscratch2, rscratch2);
    clz(rscratch2, rscratch2);
    // Round the bit index down to a character boundary (8 or 16 bits).
    andr(rscratch2, rscratch2, isLL ? -8 : -16);
    lsrv(tmp1, tmp1, rscratch2);
    (this->*ext_chr)(tmp1, tmp1);
    lsrv(tmp2, tmp2, rscratch2);
    (this->*ext_chr)(tmp2, tmp2);
    subw(result, tmp1, tmp2);
    b(DONE);
  }

  bind(STUB);
  // Long strings: hand off to the pre-generated out-of-line stub for this
  // encoding combination.
  RuntimeAddress stub = NULL;
  switch(ae) {
    case StrIntrinsicNode::LL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
      break;
    case StrIntrinsicNode::UU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
      break;
    case StrIntrinsicNode::LU:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
      break;
    case StrIntrinsicNode::UL:
      stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
      break;
    default:
      ShouldNotReachHere();
  }
  assert(stub.target() != NULL, "compare_long_string stub has not been generated");
  trampoline_call(stub);
  b(DONE);

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, DONE);
  // arrange code to do most branches while loading and loading next characters
  // while comparing previous
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  b(SHORT_LOOP_START);
  bind(SHORT_LOOP);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST);
  bind(SHORT_LOOP_START);
  (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
  cmp(tmp1, cnt1);
  br(NE, SHORT_LOOP_TAIL);
  subs(cnt2, cnt2, 1);
  br(EQ, SHORT_LAST2);
  (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  cmp(tmp2, rscratch1);
  br(EQ, SHORT_LOOP);
  sub(result, tmp2, rscratch1);
  b(DONE);
  bind(SHORT_LOOP_TAIL);
  // The characters in tmp1/cnt1 differed; their difference is the result.
  sub(result, tmp1, cnt1);
  b(DONE);
  bind(SHORT_LAST2);
  cmp(tmp2, rscratch1);
  br(EQ, DONE);
  sub(result, tmp2, rscratch1);

  b(DONE);
  bind(SHORT_LAST_INIT);
  (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
  bind(SHORT_LAST);
  cmp(tmp1, cnt1);
  br(EQ, DONE);
  sub(result, tmp1, cnt1);

  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
#endif // COMPILER2

// This method checks if provided byte array contains byte with highest bit set.
// result <- 1 when such a byte exists, 0 otherwise (set via cset below, or
// by the out-of-line stub on the slow paths).
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
  // Simple and most common case of aligned small array which is not at the
  // end of memory page is placed here. All other cases are in stub.
  Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
  const uint64_t UPPER_BIT_MASK=0x8080808080808080;
  assert_different_registers(ary1, len, result);

  cmpw(len, 0);
  br(LE, SET_RESULT);       // empty array: flags make cset produce 0
  cmpw(len, 4 * wordSize);
  br(GE, STUB_LONG);        // size > 32 then go to stub

  // Would a 32-byte read starting at ary1 cross a page boundary?  If it
  // might, use the stub, which reads more carefully.
  int shift = 64 - exact_log2(os::vm_page_size());
  lsl(rscratch1, ary1, shift);
  mov(rscratch2, (size_t)(4 * wordSize) << shift);
  adds(rscratch2, rscratch1, rscratch2);  // At end of page?
  br(CS, STUB);             // at the end of page then go to stub
  subs(len, len, wordSize);
  br(LT, END);

  BIND(LOOP);
    ldr(rscratch1, Address(post(ary1, wordSize)));
    tst(rscratch1, UPPER_BIT_MASK);
    br(NE, SET_RESULT);
    subs(len, len, wordSize);
    br(GE, LOOP);
    cmpw(len, -wordSize);
    br(EQ, SET_RESULT);     // length was an exact multiple of 8: no tail

  BIND(END);
    // 1..7 trailing bytes: load a word and shift the bytes that belong to
    // the array into the high end before testing the sign bits.
    ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
    lslv(result, result, len);
    tst(result, UPPER_BIT_MASK);
    b(SET_RESULT);

  BIND(STUB);
    RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg);
    b(DONE);

  BIND(STUB_LONG);
    RuntimeAddress has_neg_long = RuntimeAddress(
      StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
    trampoline_call(has_neg_long);
    b(DONE);

  BIND(SET_RESULT);
    cset(result, NE); // set true or false

  BIND(DONE);
}

// Compare two arrays for equality: byte arrays when elem_size == 1, char
// arrays when elem_size == 2.  a1/a2 are array oops (length is read from
// the array header); result is nonzero (true) iff the arrays are equal.
void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
                                   Register tmp4, Register tmp5, Register result,
                                   Register cnt1, int elem_size) {
  Label DONE, SAME;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare
  int elem_per_word = wordSize/elem_size;
  int log_elem_size = exact_log2(elem_size);
  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset
    = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
  int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ?
    0 : 16);

  assert(elem_size == 1 || elem_size == 2, "must be char or byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "array_equals%c{", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  // if (a1 == a2)
  //     return true;
  cmpoop(a1, a2); // May have read barriers for a1 and a2.
  br(EQ, SAME);

  if (UseSimpleArrayEquals) {
    Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
    // if (a1 == null || a2 == null)
    //     return false;
    // a1 & a2 == 0 means (some-pointer is null) or
    // (very-rare-or-even-probably-impossible-pointer-values)
    // so, we can save one branch in most cases
    tst(a1, a2);
    mov(result, false);
    br(EQ, A_MIGHT_BE_NULL);
    // if (a1.length != a2.length)
    //      return false;
    bind(A_IS_NOT_NULL);
    ldrw(cnt1, Address(a1, length_offset));
    ldrw(cnt2, Address(a2, length_offset));
    eorw(tmp5, cnt1, cnt2);
    cbnzw(tmp5, DONE);
    lea(a1, Address(a1, base_offset));
    lea(a2, Address(a2, base_offset));
    // Check for short strings, i.e. smaller than wordSize.
    subs(cnt1, cnt1, elem_per_word);
    br(Assembler::LT, SHORT);
    // Main 8 byte comparison loop.
    bind(NEXT_WORD); {
      ldr(tmp1, Address(post(a1, wordSize)));
      ldr(tmp2, Address(post(a2, wordSize)));
      subs(cnt1, cnt1, elem_per_word);
      eor(tmp5, tmp1, tmp2);
      cbnz(tmp5, DONE);
    } br(GT, NEXT_WORD);
    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.
    // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
    // length == 4.
    if (log_elem_size > 0)
      lsl(cnt1, cnt1, log_elem_size);
    ldr(tmp3, Address(a1, cnt1));
    ldr(tmp4, Address(a2, cnt1));
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    b(SAME);
    bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are not-null, proceed with loads
    cbz(a1, DONE);
    cbz(a2, DONE);
    b(A_IS_NOT_NULL);
    bind(SHORT);

    // Tail of 0-7 remaining bytes, narrowed by successive power-of-two loads.
    tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
    {
      ldrw(tmp1, Address(post(a1, 4)));
      ldrw(tmp2, Address(post(a2, 4)));
      eorw(tmp5, tmp1, tmp2);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL03);
    tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
    {
      ldrh(tmp3, Address(post(a1, 2)));
      ldrh(tmp4, Address(post(a2, 2)));
      eorw(tmp5, tmp3, tmp4);
      cbnzw(tmp5, DONE);
    }
    bind(TAIL01);
    if (elem_size == 1) { // Only needed when comparing byte arrays.
      tbz(cnt1, 0, SAME); // 0-1 bytes left.
      {
        ldrb(tmp1, a1);
        ldrb(tmp2, a2);
        eorw(tmp5, tmp1, tmp2);
        cbnzw(tmp5, DONE);
      }
    }
  } else {
    Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
        CSET_EQ, LAST_CHECK;
    mov(result, false);
    cbz(a1, DONE);
    ldrw(cnt1, Address(a1, length_offset));
    cbz(a2, DONE);
    ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
    // faster to perform another branch before comparing a1 and a2
    cmp(cnt1, (u1)elem_per_word);
    br(LE, SHORT); // short or same
    ldr(tmp3, Address(pre(a1, base_offset)));
    subs(zr, cnt1, stubBytesThreshold);
    br(GE, STUB);
    ldr(tmp4, Address(pre(a2, base_offset)));
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    cmp(cnt2, cnt1);
    br(NE, DONE);

    // Main 16 byte comparison loop with 2 exits
    bind(NEXT_DWORD); {
      ldr(tmp1, Address(pre(a1, wordSize)));
      ldr(tmp2, Address(pre(a2, wordSize)));
      subs(cnt1, cnt1, 2 * elem_per_word);
      br(LE, TAIL);
      eor(tmp4, tmp3, tmp4);
      cbnz(tmp4, DONE);
      ldr(tmp3, Address(pre(a1, wordSize)));
      ldr(tmp4, Address(pre(a2, wordSize)));
      cmp(cnt1, (u1)elem_per_word);
      br(LE, TAIL2);
      cmp(tmp1, tmp2);
    } br(EQ, NEXT_DWORD);
    b(DONE);

    bind(TAIL);
    // Shift out the bits of the last word that lie beyond the array end
    // (tmp5 holds a negative bit count; lslv uses it mod 64) before the
    // final combined equality check.
    eor(tmp4, tmp3, tmp4);
    eor(tmp2, tmp1, tmp2);
    lslv(tmp2, tmp2, tmp5);
    orr(tmp5, tmp4, tmp2);
    cmp(tmp5, zr);
    b(CSET_EQ);

    bind(TAIL2);
    eor(tmp2, tmp1, tmp2);
    cbnz(tmp2, DONE);
    b(LAST_CHECK);

    bind(STUB);
    // Large arrays: compare the first word inline, then delegate to the
    // large_array_equals stub.
    ldr(tmp4, Address(pre(a2, base_offset)));
    cmp(cnt2, cnt1);
    br(NE, DONE);
    if (elem_size == 2) { // convert to byte counter
      lsl(cnt1, cnt1, 1);
    }
    eor(tmp5, tmp3, tmp4);
    cbnz(tmp5, DONE);
    RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
    assert(stub.target() != NULL, "array_equals_long stub has not been generated");
    trampoline_call(stub);
    b(DONE);

    bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false(0), else return true, so we can return a2
    mov(result, a2);
    b(DONE);
    bind(SHORT);
    cmp(cnt2, cnt1);
    br(NE, DONE);
    cbz(cnt1, SAME);
    sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
    ldr(tmp3, Address(a1, base_offset));
    ldr(tmp4, Address(a2, base_offset));
    bind(LAST_CHECK);
    eor(tmp4, tmp3, tmp4);
    lslv(tmp5, tmp4, tmp5);
    cmp(tmp5, zr);
    bind(CSET_EQ);
    cset(result, EQ);
    b(DONE);
  }

  bind(SAME);
  mov(result, true);
  // That's it.
  bind(DONE);

  BLOCK_COMMENT("} array_equals");
}

// Compare Strings

// For Strings we're passed the address of the first characters in a1
// and a2 and the length in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.
// For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// halfword, then a short, and then a byte.

// a1/a2: addresses of the first characters; cnt1: length in BYTES (the
// word loop below subtracts wordSize regardless of elem_size).
// result <- 1 (true) when equal, 0 otherwise.
void MacroAssembler::string_equals(Register a1, Register a2,
                                   Register result, Register cnt1, int elem_size)
{
  Label SAME, DONE, SHORT, NEXT_WORD;
  Register tmp1 = rscratch1;
  Register tmp2 = rscratch2;
  Register cnt2 = tmp2;  // cnt2 only used in array length compare

  assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
  assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);

#ifndef PRODUCT
  {
    const char kind = (elem_size == 2) ? 'U' : 'L';
    char comment[64];
    snprintf(comment, sizeof comment, "{string_equals%c", kind);
    BLOCK_COMMENT(comment);
  }
#endif

  mov(result, false);

  // Check for short strings, i.e. smaller than wordSize.
  subs(cnt1, cnt1, wordSize);
  br(Assembler::LT, SHORT);
  // Main 8 byte comparison loop.
  bind(NEXT_WORD); {
    ldr(tmp1, Address(post(a1, wordSize)));
    ldr(tmp2, Address(post(a2, wordSize)));
    subs(cnt1, cnt1, wordSize);
    eor(tmp1, tmp1, tmp2);
    cbnz(tmp1, DONE);
  } br(GT, NEXT_WORD);
  // Last longword.  In the case where length == 4 we compare the
  // same longword twice, but that's still faster than another
  // conditional branch.
  // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
  // length == 4.
  ldr(tmp1, Address(a1, cnt1));
  ldr(tmp2, Address(a2, cnt1));
  eor(tmp2, tmp1, tmp2);
  cbnz(tmp2, DONE);
  b(SAME);

  bind(SHORT);
  Label TAIL03, TAIL01;

  // 0-7 remaining bytes, narrowed by successive power-of-two loads.
  tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
  {
    ldrw(tmp1, Address(post(a1, 4)));
    ldrw(tmp2, Address(post(a2, 4)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL03);
  tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
  {
    ldrh(tmp1, Address(post(a1, 2)));
    ldrh(tmp2, Address(post(a2, 2)));
    eorw(tmp1, tmp1, tmp2);
    cbnzw(tmp1, DONE);
  }
  bind(TAIL01);
  if (elem_size == 1) { // Only needed when comparing 1-byte elements
    tbz(cnt1, 0, SAME); // 0-1 bytes left.
    {
      ldrb(tmp1, a1);
      ldrb(tmp2, a2);
      eorw(tmp1, tmp1, tmp2);
      cbnzw(tmp1, DONE);
    }
  }
  // Arrays are equal.
  bind(SAME);
  mov(result, true);

  // That's it.
  bind(DONE);
  BLOCK_COMMENT("} string_equals");
}


// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
const int MacroAssembler::zero_words_block_size = 8;

// zero_words() is used by C2 ClearArray patterns.  It is as small as
// possible, handling small word counts locally and delegating
// anything larger to the zero_blocks stub.  It is expanded many times
// in compiled code, so it is important to keep it short.

// ptr:   Address of a buffer to be zeroed.
// cnt:   Count in HeapWords.
//
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
5523 void MacroAssembler::zero_words(Register ptr, Register cnt) 5524 { 5525 assert(is_power_of_2(zero_words_block_size), "adjust this"); 5526 assert(ptr == r10 && cnt == r11, "mismatch in register usage"); 5527 5528 BLOCK_COMMENT("zero_words {"); 5529 cmp(cnt, (u1)zero_words_block_size); 5530 Label around; 5531 br(LO, around); 5532 { 5533 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks()); 5534 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated"); 5535 if (StubRoutines::aarch64::complete()) { 5536 trampoline_call(zero_blocks); 5537 } else { 5538 bl(zero_blocks); 5539 } 5540 } 5541 bind(around); 5542 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) { 5543 Label l; 5544 tbz(cnt, exact_log2(i), l); 5545 for (int j = 0; j < i; j += 2) { 5546 stp(zr, zr, post(ptr, 16)); 5547 } 5548 bind(l); 5549 } 5550 { 5551 Label l; 5552 tbz(cnt, 0, l); 5553 str(zr, Address(ptr)); 5554 bind(l); 5555 } 5556 BLOCK_COMMENT("} zero_words"); 5557 } 5558 5559 // base: Address of a buffer to be zeroed, 8 bytes aligned. 5560 // cnt: Immediate count in HeapWords. 
#define SmallArraySize (18 * BytesPerLong)

// Zero `cnt` words of memory starting at `base`, where `cnt` is a
// compile-time constant (in HeapWords).
//
// For counts up to SmallArraySize/BytesPerLong the stores are fully
// unrolled inline; larger counts use a 4x-unrolled stp loop, which
// clobbers rscratch1 (loop counter) and rscratch2 (running base).
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
{
  BLOCK_COMMENT("zero_words {");
  int i = cnt & 1;  // store any odd word to start
  if (i) str(zr, Address(base));

  if (cnt <= SmallArraySize / BytesPerLong) {
    // Small case: emit one stp(zr, zr) per pair of words, no loop.
    for (; i < (int)cnt; i += 2)
      stp(zr, zr, Address(base, i * wordSize));
  } else {
    const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
    // Peel off the words that don't fill a whole 2*unroll group ...
    int remainder = cnt % (2 * unroll);
    for (; i < remainder; i += 2)
      stp(zr, zr, Address(base, i * wordSize));

    // ... then zero the rest in an unrolled loop.  cnt_reg counts the
    // words still to be zeroed; loop_base is pre-biased so the final
    // stp can use a pre-increment addressing mode to advance it.
    Label loop;
    Register cnt_reg = rscratch1;
    Register loop_base = rscratch2;
    cnt = cnt - remainder;
    mov(cnt_reg, cnt);
    // adjust base and prebias by -2 * wordSize so we can pre-increment
    add(loop_base, base, (remainder - 2) * wordSize);
    bind(loop);
    sub(cnt_reg, cnt_reg, 2 * unroll);
    for (i = 1; i < unroll; i++)
      stp(zr, zr, Address(loop_base, 2 * i * wordSize));
    // This store also advances loop_base by one full group.
    stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
    cbnz(cnt_reg, loop);
  }
  BLOCK_COMMENT("} zero_words");
}

// Zero blocks of memory by using DC ZVA.
//
// Aligns the base address first sufficently for DC ZVA, then uses
// DC ZVA repeatedly for every full block.  cnt is the size to be
// zeroed in HeapWords.  Returns the count of words left to be zeroed
// in cnt.
//
// NOTE: This is intended to be used in the zero_blocks() stub.  If
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
//
// Clobbers rscratch1 and rscratch2.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
  Register tmp = rscratch1;
  Register tmp2 = rscratch2;
  int zva_length = VM_Version::zva_length();
  Label initial_table_end, loop_zva;
  Label fini;

  // Base must be 16 byte aligned. If not just return and let caller handle it
  tst(base, 0x0f);
  br(Assembler::NE, fini);
  // Align base with ZVA length.
  neg(tmp, base);
  andr(tmp, tmp, zva_length - 1);

  // tmp: the number of bytes to be filled to align the base with ZVA length.
  add(base, base, tmp);
  sub(cnt, cnt, tmp, Assembler::ASR, 3);
  // Compute a branch target inside the stp table below: each 16-byte
  // step of the alignment prologue corresponds to one 4-byte stp
  // instruction, hence the LSR #2 scaling of tmp (bytes -> insns).
  adr(tmp2, initial_table_end);
  sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
  br(tmp2);

  // Jump table of zeroing stores; execution enters part-way through,
  // at the entry computed above, and falls out at initial_table_end.
  for (int i = -zva_length + 16; i < 0; i += 16)
    stp(zr, zr, Address(base, i));
  bind(initial_table_end);

  // Main loop: one DC ZVA per zva_length-byte block.
  sub(cnt, cnt, zva_length >> 3);
  bind(loop_zva);
  dc(Assembler::ZVA, base);
  subs(cnt, cnt, zva_length >> 3);
  add(base, base, zva_length);
  br(Assembler::GE, loop_zva);
  add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
  bind(fini);
}

// base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte unit.
// value:  Value to be filled with.
// base will point to the end of the buffer after filling.
//
// Clobbers rscratch1 and rscratch2 (used for the Duff's-device style
// computed entry into the unrolled store loop).
void MacroAssembler::fill_words(Register base, Register cnt, Register value)
{
//  Algorithm:
//
//    scratch1 = cnt & 7;
//    cnt -= scratch1;
//    p += scratch1;
//    switch (scratch1) {
//      do {
//        cnt -= 8;
//          p[-8] = v;
//        case 7:
//          p[-7] = v;
//        case 6:
//          p[-6] = v;
//          // ...
//        case 1:
//          p[-1] = v;
//        case 0:
//          p += 8;
//      } while (cnt);
//    }

  assert_different_registers(base, cnt, value, rscratch1, rscratch2);

  Label fini, skip, entry, loop;
  const int unroll = 8; // Number of stp instructions we'll unroll

  cbz(cnt, fini);
  // If base is only 8-byte (not 16-byte) aligned, store one word first
  // so the stp stores below are 16-byte aligned.
  tbz(base, 3, skip);
  str(value, Address(post(base, 8)));
  sub(cnt, cnt, 1);
  bind(skip);

  // rscratch1 = number of words handled by the partial first pass;
  // branch into the middle of the unrolled loop to store exactly them.
  andr(rscratch1, cnt, (unroll-1) * 2);
  sub(cnt, cnt, rscratch1);
  add(base, base, rscratch1, Assembler::LSL, 3);
  adr(rscratch2, entry);
  // Each pair of words corresponds to one 4-byte stp; scale accordingly.
  sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
  br(rscratch2);

  bind(loop);
  add(base, base, unroll * 16);
  for (int i = -unroll; i < 0; i++)
    stp(value, value, Address(base, i * 16));
  bind(entry);
  subs(cnt, cnt, unroll * 2);
  br(Assembler::GE, loop);

  // Store any final odd word.
  tbz(cnt, 0, fini);
  str(value, Address(post(base, 8)));
  bind(fini);
}

// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
// java/lang/StringUTF16.compress.
// Encode the chars at src into single bytes at dst, stopping at the
// first char that does not fit in a byte (high byte != 0).
//
// src:    source char array (2 bytes per element)
// dst:    destination byte array
// len:    number of chars to encode
// result: set to the number of chars actually encoded, i.e. the index
//         where encoding stopped (== original len if every char fit).
//
// Clobbers rscratch1/rscratch2, the four Vtmp registers and (in the
// SIMD paths) v4/v5.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
        NEXT_32_START, NEXT_32_PRFM_START;
    Register tmp1 = rscratch1, tmp2 = rscratch2;

    mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
    cmp(len, (u1)8); // handle shortest strings first
    br(LT, LOOP_1);
    cmp(len, (u1)32);
    br(LT, NEXT_8);
    // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
    // to convert chars to bytes
    if (SoftwarePrefetchHintDistance >= 0) {
      // Prefetching variant: process 32 chars (64 bytes) per iteration,
      // issuing a software prefetch ahead of the loads.
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
      br(LE, NEXT_32_START);
      b(NEXT_32_PRFM_START);
      BIND(NEXT_32_PRFM);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_PRFM_START);
        prfm(Address(src, SoftwarePrefetchHintDistance));
        // OR together all 32 chars, then extract the high bytes: if any
        // high byte is non-zero, some char doesn't fit and we bail out
        // to the 8-at-a-time path to find the exact stop index.
        orr(v4, T16B, Vtmp1, Vtmp2);
        orr(v5, T16B, Vtmp3, Vtmp4);
        uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
        uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
        uzp2(v5, T16B, v4, v5); // high bytes
        umov(tmp2, v5, D, 1);
        fmovd(tmp1, v5);
        orr(tmp1, tmp1, tmp2);
        cbnz(tmp1, LOOP_8);
        stpq(Vtmp1, Vtmp3, dst);
        sub(len, len, 32);
        add(dst, dst, 32);
        add(src, src, 64);
        subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
        br(GE, NEXT_32_PRFM);
        // Too close to the end to keep prefetching; fall into the
        // non-prefetching 32-char loop below if >= 32 chars remain.
        cmp(len, (u1)32);
        br(LT, LOOP_8);
      BIND(NEXT_32);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      BIND(NEXT_32_START);
    } else {
      BIND(NEXT_32);
        ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
    }
    // Non-prefetching 32-char loop: same high-byte check as above, with
    // the roles of Vtmp1/Vtmp3 and v4/v5 swapped.
    prfm(Address(src, SoftwarePrefetchHintDistance));
    uzp1(v4, T16B, Vtmp1, Vtmp2);
    uzp1(v5, T16B, Vtmp3, Vtmp4);
    orr(Vtmp1, T16B, Vtmp1, Vtmp2);
    orr(Vtmp3, T16B, Vtmp3, Vtmp4);
    uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
    umov(tmp2, Vtmp1, D, 1);
    fmovd(tmp1, Vtmp1);
    orr(tmp1, tmp1, tmp2);
    cbnz(tmp1, LOOP_8);
    stpq(v4, v5, dst);
    sub(len, len, 32);
    add(dst, dst, 32);
    add(src, src, 64);
    cmp(len, (u1)32);
    br(GE, NEXT_32);
    cbz(len, DONE);

  // Handle 8 chars at a time.
  BIND(LOOP_8);
    cmp(len, (u1)8);
    br(LT, LOOP_1);
  BIND(NEXT_8);
    ld1(Vtmp1, T8H, src);
    uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
    uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
    fmovd(tmp1, Vtmp3);
    cbnz(tmp1, NEXT_1);  // a high byte is set: find it one char at a time
    strd(Vtmp2, dst);

    sub(len, len, 8);
    add(dst, dst, 8);
    add(src, src, 16);
    cmp(len, (u1)8);
    br(GE, NEXT_8);

  BIND(LOOP_1);
#endif
  // Scalar tail: one char at a time; stop at the first char whose high
  // byte is non-zero.
  cbz(len, DONE);
  BIND(NEXT_1);
    ldrh(tmp1, Address(post(src, 2)));
    tst(tmp1, 0xff00);
    br(NE, SET_RESULT);
    strb(tmp1, Address(post(dst, 1)));
    subs(len, len, 1);
    br(GT, NEXT_1);

  BIND(SET_RESULT);
    sub(result, result, len); // Return index where we stopped
                              // Return len == 0 if we processed all
                              // characters
  BIND(DONE);
}


// Inflate byte[] array to char[].
//
// src:  source byte array
// dst:  destination char array (2 bytes per element)
// len:  number of bytes to inflate
//
// Zero-extends each byte to a char using SIMD zip1 with a zeroed
// register.  For large arrays (when SoftwarePrefetchHintDistance >= 0)
// delegates to the large_byte_array_inflate stub.
// Clobbers tmp4, rscratch1 and the three vtmp registers.
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
                                        Register tmp4) {
  Label big, done, after_init, to_stub;

  assert_different_registers(src, dst, len, tmp4, rscratch1);

  fmovd(vtmp1, zr);          // vtmp1 = all-zero high halves for zip1
  lsrw(tmp4, len, 3);        // tmp4 = number of full 8-byte groups
  bind(after_init);
  cbnzw(tmp4, big);
  // Short string: less than 8 bytes.
  {
    Label loop, tiny;

    cmpw(len, 4);
    br(LT, tiny);
    // Use SIMD to do 4 bytes.
    ldrs(vtmp2, post(src, 4));
    zip1(vtmp3, T8B, vtmp2, vtmp1);
    subw(len, len, 4);
    strd(vtmp3, post(dst, 8));

    cbzw(len, done);

    // Do the remaining bytes by steam.
    bind(loop);
    ldrb(tmp4, post(src, 1));
    strh(tmp4, post(dst, 2));
    subw(len, len, 1);

    bind(tiny);
    cbnz(len, loop);

    b(done);
  }

  if (SoftwarePrefetchHintDistance >= 0) {
    bind(to_stub);
      // Large input: call out to the stub, then re-enter at after_init
      // to handle whatever the stub left over.
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
      assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
      trampoline_call(stub);
      b(after_init);
  }

  // Unpack the bytes 8 at a time.
  bind(big);
  {
    Label loop, around, loop_last, loop_start;

    if (SoftwarePrefetchHintDistance >= 0) {
      const int large_loop_threshold = (64 + 16)/8;
      ldrd(vtmp2, post(src, 8));
      andw(len, len, 7);
      cmp(tmp4, (u1)large_loop_threshold);
      br(GE, to_stub);
      b(loop_start);

      // Software-pipelined loop: the load for the next group is issued
      // before the previous group is zipped/stored.
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      bind(loop_start);
      subs(tmp4, tmp4, 1);
      br(EQ, loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      ldrd(vtmp3, post(src, 8));
      st1(vtmp2, T8H, post(dst, 16));
      subs(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp3, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      br(NE, loop);
      b(around);
      bind(loop_last);
      zip1(vtmp2, T16B, vtmp2, vtmp1);
      st1(vtmp2, T8H, post(dst, 16));
      bind(around);
      cbz(len, done);
    } else {
      // Simple variant: one 8-byte group per iteration.
      andw(len, len, 7);
      bind(loop);
      ldrd(vtmp2, post(src, 8));
      sub(tmp4, tmp4, 1);
      zip1(vtmp3, T16B, vtmp2, vtmp1);
      st1(vtmp3, T8H, post(dst, 16));
      cbnz(tmp4, loop);
    }
  }

  // Do the tail of up to 8 bytes.  The (possibly overlapping) store of
  // the last 8 source bytes is positioned so it ends exactly at the end
  // of the destination.
  add(src, src, len);
  ldrd(vtmp3, Address(src, -8));
  add(dst, dst, len, ext::uxtw, 1);
  zip1(vtmp3, T16B, vtmp3, vtmp1);
  strq(vtmp3, Address(dst, -16));

  bind(done);
}

// Compress char[] array to byte[].
// Compress the chars at src into single bytes at dst.
//
// result: number of chars compressed (== len) on success, or 0 if some
//         char did not fit in a byte.  Implemented on top of
//         encode_iso_array, which leaves len == 0 iff every char was
//         processed; the csel maps the partial case to 0.
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         FloatRegister tmp1Reg, FloatRegister tmp2Reg,
                                         FloatRegister tmp3Reg, FloatRegister tmp4Reg,
                                         Register result) {
  encode_iso_array(src, dst, len, result,
                   tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
  cmp(len, zr);
  csel(result, result, zr, EQ);
}

// get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
//
// aarch64_get_thread_helper() clobbers only r0, r1, and flags.
//
void MacroAssembler::get_thread(Register dst) {
  // Save r0, r1 and lr, except for whichever of them is the
  // destination register (its old value is dead anyway).
  RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
  push(saved_regs, sp);

  mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
  blrt(lr, 1, 0, 1);
  // Helper returns the JavaThread* in c_rarg0 (r0).
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  pop(saved_regs, sp);
}

// C2 compiled method's prolog code
// Moved here from aarch64.ad to support Valhalla code below
//
// NOTE(review): sp_inc is currently unused here — presumably reserved
// for value-type buffering; confirm against the aarch64.ad caller.
void MacroAssembler::verified_entry(Compile* C, int sp_inc) {

  // n.b. frame size includes space for return pc and rfp
  const long framesize = C->frame_size_in_bytes();
  assert(framesize % (2 * wordSize) == 0, "must preserve 2 * wordSize alignment");

  // insert a nop at the start of the prolog so we can patch in a
  // branch if we need to invalidate the method later
  nop();

  int bangsize = C->bang_size_in_bytes();
  if (C->need_stack_bang(bangsize) && UseStackBanging)
    generate_stack_overflow_check(bangsize);

  build_frame(framesize);

  if (NotifySimulator) {
    notify(Assembler::method_entry);
  }

  if (VerifyStackAtCalls) {
    Unimplemented();
  }
}

// Called from MachVEP node: scalarized value-type argument passing is
// not implemented on aarch64 yet.
void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {
  // Called from MachVEP node
  unimplemented("Support for ValueTypePassFieldsAsArgs and ValueTypeReturnedAsFields is not implemented");
}

// Buffer the fields of a value type being returned in registers by
// calling the store_value_type_fields_to_buf stub.
void MacroAssembler::store_value_type_fields_to_buf(ciValueKlass* vk) {
  super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
}