--- old/src/hotspot/cpu/x86/assembler_x86.cpp 2019-12-10 17:10:24.993015386 -0800
+++ new/src/hotspot/cpu/x86/assembler_x86.cpp 2019-12-10 17:10:24.845015386 -0800
@@ -4257,8 +4257,8 @@
 void Assembler::vpshufd(XMMRegister dst, XMMRegister src, int mode, int vector_len) {
   assert(vector_len == AVX_128bit? VM_Version::supports_avx() :
-         vector_len == AVX_256bit? VM_Version::supports_avx2() :
-         0, "");
+         (vector_len == AVX_256bit? VM_Version::supports_avx2() :
+         (vector_len == AVX_512bit? VM_Version::supports_evex() : 0)), "");
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = simd_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
@@ -4737,6 +4737,36 @@
   emit_int8((unsigned char)(0xE8 | encode));
 }
 
+void Assembler::shldl(Register dst, Register src) {
+  int encode = prefix_and_encode(src->encoding(), dst->encoding());
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xA5);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::shldl(Register dst, Register src, int8_t imm8) {
+  int encode = prefix_and_encode(src->encoding(), dst->encoding());
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xA4);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(imm8);
+}
+
+void Assembler::shrdl(Register dst, Register src) {
+  int encode = prefix_and_encode(src->encoding(), dst->encoding());
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAD);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::shrdl(Register dst, Register src, int8_t imm8) {
+  int encode = prefix_and_encode(src->encoding(), dst->encoding());
+  emit_int8(0x0F);
+  emit_int8((unsigned char)0xAC);
+  emit_int8((unsigned char)(0xC0 | encode));
+  emit_int8(imm8);
+}
+
 // copies a single word from [esi] to [edi]
 void Assembler::smovl() {
   emit_int8((unsigned char)0xA5);
@@ -6513,6 +6543,23 @@
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseVBMI2, "requires vbmi2");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x71);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len) {
+  assert(UseVBMI2, "requires vbmi2");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), src->encoding(), shift->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x73);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
 
 void Assembler::pandn(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
@@ -8109,26 +8156,6 @@
   emit_int8((unsigned char)(0xE0 | dst->encoding()));
 }
 
-void Assembler::shldl(Register dst, Register src) {
-  emit_int8(0x0F);
-  emit_int8((unsigned char)0xA5);
-  emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
-}
-
-// 0F A4 / r ib
-void Assembler::shldl(Register dst, Register src, int8_t imm8) {
-  emit_int8(0x0F);
-  emit_int8((unsigned char)0xA4);
-  emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
-  emit_int8(imm8);
-}
-
-void Assembler::shrdl(Register dst, Register src) {
-  emit_int8(0x0F);
-  emit_int8((unsigned char)0xAD);
-  emit_int8((unsigned char)(0xC0 | src->encoding() << 3 | dst->encoding()));
-}
-
 #else // LP64
 
 void Assembler::set_byte_if_not_zero(Register dst) {
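Note on semantics: the scalar SHLD/SHRD encodings and the EVEX-encoded VPSHLDVD/VPSHRDVD added above all implement the same "concatenate and shift" (double-shift) primitive. A minimal Java model for reference — the helper names are ours, not from the patch, and the model assumes 0 < n < 32 (Java masks shift counts to the low five bits, so n == 0 and n == 32 degenerate):

    // Model of SHLD dst, src (count n in cl): shift dst left,
    // filling the vacated low bits from the high-order bits of src.
    static int shld(int dst, int src, int n) {
        return (dst << n) | (src >>> (32 - n));
    }

    // Model of SHRD dst, src: shift dst right,
    // filling the vacated high bits from the low-order bits of src.
    static int shrd(int dst, int src, int n) {
        return (dst >>> n) | (src << (32 - n));
    }

    // VPSHLDVD/VPSHRDVD apply the same operation independently to each
    // 32-bit lane (16 lanes at AVX_512bit), with a per-lane count mod 32:
    //   dst[i] = shld(dst[i], src[i], shift[i] & 31)
    //   dst[i] = shrd(dst[i], src[i], shift[i] & 31)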
--- old/src/hotspot/cpu/x86/assembler_x86.hpp 2019-12-10 17:10:25.337015384 -0800
+++ new/src/hotspot/cpu/x86/assembler_x86.hpp 2019-12-10 17:10:25.193015385 -0800
@@ -1838,6 +1838,8 @@
 
   void shldl(Register dst, Register src);
   void shldl(Register dst, Register src, int8_t imm8);
+  void shrdl(Register dst, Register src);
+  void shrdl(Register dst, Register src, int8_t imm8);
 
   void shll(Register dst, int imm8);
   void shll(Register dst);
@@ -1845,8 +1847,6 @@
   void shlq(Register dst, int imm8);
   void shlq(Register dst);
 
-  void shrdl(Register dst, Register src);
-
   void shrl(Register dst, int imm8);
   void shrl(Register dst);
 
@@ -2140,6 +2140,9 @@
   void evpsraq(XMMRegister dst, XMMRegister src, int shift, int vector_len);
   void evpsraq(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
 
+  void vpshldvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+  void vpshrdvd(XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len);
+
   // And packed integers
   void pand(XMMRegister dst, XMMRegister src);
   void vpand(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
--- old/src/hotspot/cpu/x86/globals_x86.hpp 2019-12-10 17:10:25.645015383 -0800
+++ new/src/hotspot/cpu/x86/globals_x86.hpp 2019-12-10 17:10:25.497015384 -0800
@@ -207,6 +207,9 @@
   product(bool, UseBMI2Instructions, false,                                 \
           "Use BMI2 instructions")                                          \
                                                                             \
+  product(bool, UseVBMI2, false,                                            \
+          "Use VBMI2 instructions")                                         \
+                                                                            \
   diagnostic(bool, UseLibmIntrinsic, true,                                  \
           "Use Libm Intrinsics")                                            \
                                                                             \
--- old/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp 2019-12-10 17:10:25.941015382 -0800
+++ new/src/hotspot/cpu/x86/stubGenerator_x86_64.cpp 2019-12-10 17:10:25.789015383 -0800
@@ -5694,6 +5694,241 @@
     return start;
   }
 
+  address generate_bigIntegerRightShift() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker");
+
+    address start = __ pc();
+    Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
+    // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
+    const Register newArr = rdi;
+    const Register oldArr = rsi;
+    const Register newIdx = rdx;
+    const Register shiftCount = rcx;  // shiftCount is deliberately placed in rcx, since the shift instructions use cl implicitly.
+    const Register totalNumIter = r8;
+
+    // For Windows, we use r9 and r10 as temps to save rdi and rsi, so we cannot allocate them as our temps.
+    // For everything else, we prefer r9 and r10 since we do not have to save them before use.
+    const Register tmp1 = r11;                    // Caller save.
+    const Register tmp2 = rax;                    // Caller save.
+    const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: callee save. Linux: caller save.
+    const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: callee save. Linux: caller save.
+    const Register tmp5 = r14;                    // Callee save.
+
+    const XMMRegister x0 = xmm0;
+    const XMMRegister x1 = xmm1;
+    const XMMRegister x2 = xmm2;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter();  // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WINDOWS
+    setup_arg_regs(4);
+    // On Windows the last argument is passed on the stack, so move it into the appropriate register.
+    __ movl(totalNumIter, Address(rsp, 6 * wordSize));
+    // Save callee save registers.
+    __ push(tmp3);
+    __ push(tmp4);
+#endif
+    __ push(tmp5);
+
+    // Rename temps used throughout the code.
+    const Register idx = tmp1;
+    const Register nIdx = tmp2;
+
+    __ cmpl(totalNumIter, 1);
+    __ jcc(Assembler::less, Exit);
+
+    // Start the right shift from the end of the array.
+    // For example, if #iterations = 4 and newIdx = 1,
+    // then dest[4] = src[4] >> shiftCount | src[3] << (32 - shiftCount);
+    // if #iterations = 4 and newIdx = 0,
+    // then dest[3] = src[4] >> shiftCount | src[3] << (32 - shiftCount).
+    __ movl(idx, totalNumIter);
+    __ movl(nIdx, idx);
+    __ addl(nIdx, newIdx);
+
+    // If vectorization is enabled, check whether the number of iterations is at least 63.
+    // If not, fall through to ShiftTwo, which processes two iterations at a time.
+    if (UseAVX > 2 && UseVBMI2) {
+      __ cmpl(totalNumIter, 63);
+      __ jcc(Assembler::less, ShiftTwo);
+      __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
+      __ subl(idx, 16);
+      __ subl(nIdx, 16);
+      __ BIND(Shift512Loop);
+      __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 4), Assembler::AVX_512bit);
+      __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
+      __ vpshrdvd(x2, x1, x0, Assembler::AVX_512bit);
+      __ evmovdqul(Address(newArr, nIdx, Address::times_4), x2, Assembler::AVX_512bit);
+      __ subl(nIdx, 16);
+      __ subl(idx, 16);
+      __ jcc(Assembler::greaterEqual, Shift512Loop);
+      __ addl(idx, 16);
+      __ addl(nIdx, 16);
+    }
+    __ BIND(ShiftTwo);
+    __ cmpl(idx, 2);
+    __ jcc(Assembler::less, ShiftOne);
+    __ subl(idx, 2);
+    __ subl(nIdx, 2);
+    __ BIND(ShiftTwoLoop);
+    __ movl(tmp5, Address(oldArr, idx, Address::times_4, 8));
+    __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
+    __ movl(tmp3, Address(oldArr, idx, Address::times_4));
+    __ shrdl(tmp5, tmp4);
+    __ shrdl(tmp4, tmp3);
+    __ movl(Address(newArr, nIdx, Address::times_4, 4), tmp5);
+    __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
+    __ subl(nIdx, 2);
+    __ subl(idx, 2);
+    __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
+    __ addl(idx, 2);
+    __ addl(nIdx, 2);
+
+    // Do the last iteration.
+    __ BIND(ShiftOne);
+    __ cmpl(idx, 1);
+    __ jcc(Assembler::less, Exit);
+    __ subl(idx, 1);
+    __ subl(nIdx, 1);
+    __ movl(tmp4, Address(oldArr, idx, Address::times_4, 4));
+    __ movl(tmp3, Address(oldArr, idx, Address::times_4));
+    __ shrdl(tmp4, tmp3);
+    __ movl(Address(newArr, nIdx, Address::times_4), tmp4);
+    __ BIND(Exit);
+    // Restore callee save registers.
+    __ pop(tmp5);
+#ifdef _WINDOWS
+    __ pop(tmp4);
+    __ pop(tmp3);
+    restore_arg_regs();
+#endif
+    __ leave();  // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
+  /**
+   *  Arguments:
+   *
+   *  Input:
+   *    c_rarg0   - newArr address
+   *    c_rarg1   - oldArr address
+   *    c_rarg2   - newIdx
+   *    c_rarg3   - shiftCount
+   * not Win64
+   *    c_rarg4   - numIter
+   * Win64
+   *    rsp + 40  - numIter
+   */
+  address generate_bigIntegerLeftShift() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker");
+    address start = __ pc();
+    Label Shift512Loop, ShiftTwo, ShiftTwoLoop, ShiftOne, Exit;
+    // For Unix, the arguments are as follows: rdi, rsi, rdx, rcx, r8.
+    const Register newArr = rdi;
+    const Register oldArr = rsi;
+    const Register newIdx = rdx;
+    const Register shiftCount = rcx;  // shiftCount is deliberately placed in rcx, since the shift instructions use cl implicitly.
+    const Register totalNumIter = r8;
+    // For Windows, we use r9 and r10 as temps to save rdi and rsi, so we cannot allocate them as our temps.
+    // For everything else, we prefer r9 and r10 since we do not have to save them before use.
+    const Register tmp1 = r11;                    // Caller save.
+    const Register tmp2 = rax;                    // Caller save.
+    const Register tmp3 = WINDOWS_ONLY(r12) NOT_WINDOWS(r9);   // Windows: callee save. Linux: caller save.
+    const Register tmp4 = WINDOWS_ONLY(r13) NOT_WINDOWS(r10);  // Windows: callee save. Linux: caller save.
+    const Register tmp5 = r14;                    // Callee save.
+
+    const XMMRegister x0 = xmm0;
+    const XMMRegister x1 = xmm1;
+    const XMMRegister x2 = xmm2;
+    BLOCK_COMMENT("Entry:");
+    __ enter();  // required for proper stackwalking of RuntimeStub frame
+
+#ifdef _WINDOWS
+    setup_arg_regs(4);
+    // On Windows the last argument is passed on the stack, so move it into the appropriate register.
+    __ movl(totalNumIter, Address(rsp, 6 * wordSize));
+    // Save callee save registers.
+    __ push(tmp3);
+    __ push(tmp4);
+#endif
+    __ push(tmp5);
+
+    // Rename temps used throughout the code.
+    const Register idx = tmp1;
+    const Register numIterTmp = tmp2;
+
+    __ cmpl(totalNumIter, 1);
+    __ jcc(Assembler::less, Exit);
+
+    // Start idx from zero.
+    __ xorl(idx, idx);
+    // Compute an interior pointer for the new array so that the same index can be used for both the old and new arrays.
+    __ lea(newArr, Address(newArr, newIdx, Address::times_4));
+    __ movl(numIterTmp, totalNumIter);
+
+    // If vectorization is enabled, check whether the number of iterations is at least 63.
+    // If not, fall through to ShiftTwo, which shifts two numbers at a time.
+    if (UseAVX > 2 && UseVBMI2) {
+      __ cmpl(totalNumIter, 63);
+      __ jcc(Assembler::less, ShiftTwo);
+      __ evpbroadcastd(x0, shiftCount, Assembler::AVX_512bit);
+      __ subl(numIterTmp, 16);
+      __ BIND(Shift512Loop);
+      __ evmovdqul(x1, Address(oldArr, idx, Address::times_4), Assembler::AVX_512bit);
+      __ evmovdqul(x2, Address(oldArr, idx, Address::times_4, 0x4), Assembler::AVX_512bit);
+      __ vpshldvd(x1, x2, x0, Assembler::AVX_512bit);
+      __ evmovdqul(Address(newArr, idx, Address::times_4), x1, Assembler::AVX_512bit);
+      __ addl(idx, 16);
+      __ subl(numIterTmp, 16);
+      __ jcc(Assembler::greaterEqual, Shift512Loop);
+      __ addl(numIterTmp, 16);
+    }
+    __ BIND(ShiftTwo);
+    __ movl(tmp3, Address(oldArr, idx, Address::times_4));
+    __ subl(numIterTmp, 2);
+    __ jcc(Assembler::less, ShiftOne);
+
+    __ BIND(ShiftTwoLoop);
+    __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
+    __ movl(tmp5, Address(oldArr, idx, Address::times_4, 0x8));
+    __ shldl(tmp3, tmp4);
+    __ shldl(tmp4, tmp5);
+    __ movl(Address(newArr, idx, Address::times_4), tmp3);
+    __ movl(Address(newArr, idx, Address::times_4, 0x4), tmp4);
+    __ movl(tmp3, tmp5);
+    __ addl(idx, 2);
+    __ subl(numIterTmp, 2);
+    __ jcc(Assembler::greaterEqual, ShiftTwoLoop);
+
+    // Do the last iteration.
+    __ BIND(ShiftOne);
+    __ addl(numIterTmp, 2);
+    __ cmpl(numIterTmp, 1);
+    __ jcc(Assembler::less, Exit);
+    __ movl(tmp4, Address(oldArr, idx, Address::times_4, 0x4));
+    __ shldl(tmp3, tmp4);
+    __ movl(Address(newArr, idx, Address::times_4), tmp3);
+
+    __ BIND(Exit);
+    // Restore callee save registers.
+    __ pop(tmp5);
+#ifdef _WINDOWS
+    __ pop(tmp4);
+    __ pop(tmp3);
+    restore_arg_regs();
+#endif
+    __ leave();  // required for proper stackwalking of RuntimeStub frame
+    __ ret(0);
+    return start;
+  }
+
   address generate_libmExp() {
     StubCodeMark mark(this, "StubRoutines", "libmExp");
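Both stubs, in all three of their loop flavors (16 lanes per VPSHRDVD/VPSHLDVD iteration, two elements per SHRD/SHLD iteration, then a final single element), compute the same per-element recurrence. A compact Java sketch of the two contracts — illustrative only, helper names are ours, and it assumes 0 < shiftCount < 32:

    // bigIntegerLeftShiftWorker: ascending walk, so the in-place case
    // (newArr == oldArr, newIdx == 0) reads each source word before
    // overwriting it.
    static void leftShiftModel(int[] newArr, int[] oldArr, int newIdx,
                               int shiftCount, int numIter) {
        for (int k = 0; k < numIter; k++) {
            newArr[newIdx + k] = (oldArr[k] << shiftCount)
                               | (oldArr[k + 1] >>> (32 - shiftCount));
        }
    }

    // bigIntegerRightShiftWorker: descending walk, mirroring the stub's
    // start at the end of the array, so the in-place case
    // (newArr == oldArr, newIdx == 1) is safe.
    static void rightShiftModel(int[] newArr, int[] oldArr, int newIdx,
                                int shiftCount, int numIter) {
        for (int k = numIter - 1; k >= 0; k--) {
            newArr[newIdx + k] = (oldArr[k + 1] >>> shiftCount)
                               | (oldArr[k] << (32 - shiftCount));
        }
    }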
@@ -6314,6 +6549,8 @@
     if (UseMulAddIntrinsic) {
       StubRoutines::_mulAdd = generate_mulAdd();
     }
+    StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
+    StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
 #ifndef _WINDOWS
     if (UseMontgomeryMultiplyIntrinsic) {
       StubRoutines::_montgomeryMultiply
--- old/src/hotspot/cpu/x86/vm_version_x86.cpp 2019-12-10 17:10:26.265015381 -0800
+++ new/src/hotspot/cpu/x86/vm_version_x86.cpp 2019-12-10 17:10:26.113015381 -0800
@@ -748,7 +748,10 @@
                (supports_adx() ? ", adx" : ""),
                (supports_evex() ? ", evex" : ""),
                (supports_sha() ? ", sha" : ""),
-               (supports_fma() ? ", fma" : ""));
+               (supports_fma() ? ", fma" : ""),
+               (supports_vbmi2() ? ", vbmi2" : ""),
+               (supports_vaes() ? ", vaes" : ""),
+               (supports_vnni() ? ", vnni" : ""));
 
   _features_string = os::strdup(buf);
 
   // UseSSE is set to the smaller of what hardware supports and what
@@ -1429,6 +1432,16 @@
     FLAG_SET_DEFAULT(UseBMI2Instructions, false);
   }
 
+  // VBMI2 instructions are EVEX-encoded, so enabling them requires CPU support
+  // plus EVEX encoding being supported and enabled (UseAVX > 2).
+  if (supports_vbmi2() && UseAVX > 2) {
+    if (FLAG_IS_DEFAULT(UseVBMI2)) {
+      UseVBMI2 = true;
+    }
+  } else if (UseVBMI2) {
+    warning("VBMI2 instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseVBMI2, false);
+  }
+
   // Use population count instruction if available.
   if (supports_popcnt()) {
     if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
--- old/src/hotspot/cpu/x86/vm_version_x86.hpp 2019-12-10 17:10:26.565015380 -0800
+++ new/src/hotspot/cpu/x86/vm_version_x86.hpp 2019-12-10 17:10:26.421015380 -0800
@@ -341,6 +341,7 @@
 #define CPU_AVX512_VPCLMULQDQ ((uint64_t)UCONST64(0x4000000000)) // Vector carryless multiplication
 #define CPU_VAES     ((uint64_t)UCONST64(0x8000000000))   // Vector AES instructions
 #define CPU_VNNI     ((uint64_t)UCONST64(0x10000000000))  // Vector Neural Network Instructions
+#define CPU_VBMI2    ((uint64_t)UCONST64(0x100000000000)) // VBMI2 double-shift instructions (0x20000000000 is already taken by CPU_FLUSH)
 #define CPU_FLUSH    ((uint64_t)UCONST64(0x20000000000))  // flush instruction
 #define CPU_FLUSHOPT ((uint64_t)UCONST64(0x40000000000))  // flushopt instruction
@@ -567,6 +568,8 @@
           result |= CPU_VAES;
         if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vnni != 0)
           result |= CPU_VNNI;
+        if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vbmi2 != 0)
+          result |= CPU_VBMI2;
       }
     }
     if (_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@@ -858,6 +861,7 @@
   static bool supports_avx512_vpclmulqdq() { return (_features & CPU_AVX512_VPCLMULQDQ) != 0; }
   static bool supports_vaes()     { return (_features & CPU_VAES) != 0; }
   static bool supports_vnni()     { return (_features & CPU_VNNI) != 0; }
+  static bool supports_vbmi2()    { return (_features & CPU_VBMI2) != 0; }
 
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&
--- old/src/hotspot/share/classfile/vmSymbols.cpp 2019-12-10 17:10:26.865015379 -0800
+++ new/src/hotspot/share/classfile/vmSymbols.cpp 2019-12-10 17:10:26.721015379 -0800
@@ -837,6 +837,9 @@
   case vmIntrinsics::_montgomerySquare:
     if (!UseMontgomerySquareIntrinsic) return true;
    break;
+  case vmIntrinsics::_bigIntegerRightShiftWorker:
+  case vmIntrinsics::_bigIntegerLeftShiftWorker:
+    break;
   case vmIntrinsics::_addExactI:
   case vmIntrinsics::_addExactL:
   case vmIntrinsics::_decrementExactI:
--- old/src/hotspot/share/classfile/vmSymbols.hpp 2019-12-10 17:10:27.157015378 -0800
+++ new/src/hotspot/share/classfile/vmSymbols.hpp 2019-12-10 17:10:27.017015378 -0800
@@ -565,6 +565,7 @@
   template(char_StringBuffer_signature,        "(C)Ljava/lang/StringBuffer;")              \
   template(int_String_signature,               "(I)Ljava/lang/String;")                    \
   template(boolean_boolean_int_signature,      "(ZZ)I")                                    \
+  template(big_integer_shift_worker_signature, "([I[IIII)V")                               \
   template(reflect_method_signature,           "Ljava/lang/reflect/Method;")               \
   /* signature symbols needed by intrinsics */                                             \
   VM_INTRINSICS_DO(VM_INTRINSIC_IGNORE, VM_SYMBOL_IGNORE, VM_SYMBOL_IGNORE, template, VM_ALIAS_IGNORE) \
@@ -1007,6 +1008,12 @@
    do_name(     montgomerySquare_name,         "implMontgomerySquare")                     \
    do_signature(montgomerySquare_signature,    "([I[IIJ[I)[I")                             \
                                                                                            \
  do_intrinsic(_bigIntegerRightShiftWorker, java_math_BigInteger, rightShift_name, big_integer_shift_worker_signature, F_S) \
   do_name(     rightShift_name,               "shiftRightImplWorker")                      \
                                                                                            \
  do_intrinsic(_bigIntegerLeftShiftWorker, java_math_BigInteger, leftShift_name, big_integer_shift_worker_signature, F_S) \
   do_name(     leftShift_name,                "shiftLeftImplWorker")                       \
                                                                                            \
   do_class(jdk_internal_util_ArraysSupport, "jdk/internal/util/ArraysSupport")             \
   do_intrinsic(_vectorizedMismatch, jdk_internal_util_ArraysSupport, vectorizedMismatch_name, vectorizedMismatch_signature, F_S) \
    do_name(vectorizedMismatch_name, "vectorizedMismatch")                                  \
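For readers decoding the new symbol: the JVM method descriptor ([I[IIII)V is the VM-level spelling of a static void method taking two int arrays and three ints — exactly the shape of the two BigInteger workers bound above. A hypothetical standalone illustration (not part of the patch):

    // ([I[IIII)V decodes as: [I -> int[], I -> int, trailing V -> void return.
    // Both intrinsified workers in java.math.BigInteger share this shape:
    final class ShiftWorkerShape {
        static void worker(int[] newArr, int[] oldArr,
                           int newIdx, int shiftCount, int numIter) {
            // The body is irrelevant to the descriptor.
        }
    }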
--- old/src/hotspot/share/opto/c2compiler.cpp 2019-12-10 17:10:27.477015376 -0800
+++ new/src/hotspot/share/opto/c2compiler.cpp 2019-12-10 17:10:27.329015377 -0800
@@ -628,6 +628,8 @@
   case vmIntrinsics::_mulAdd:
   case vmIntrinsics::_montgomeryMultiply:
   case vmIntrinsics::_montgomerySquare:
+  case vmIntrinsics::_bigIntegerRightShiftWorker:
+  case vmIntrinsics::_bigIntegerLeftShiftWorker:
   case vmIntrinsics::_vectorizedMismatch:
   case vmIntrinsics::_ghash_processBlocks:
   case vmIntrinsics::_base64_encodeBlock:
--- old/src/hotspot/share/opto/escape.cpp 2019-12-10 17:10:27.765015375 -0800
+++ new/src/hotspot/share/opto/escape.cpp 2019-12-10 17:10:27.625015376 -0800
@@ -1006,6 +1006,8 @@
                   strcmp(call->as_CallLeaf()->_name, "mulAdd") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "montgomery_multiply") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "montgomery_square") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "bigIntegerRightShiftWorker") == 0 ||
+                  strcmp(call->as_CallLeaf()->_name, "bigIntegerLeftShiftWorker") == 0 ||
                   strcmp(call->as_CallLeaf()->_name, "vectorizedMismatch") == 0)
                  ))) {
             call->dump();
--- old/src/hotspot/share/opto/library_call.cpp 2019-12-10 17:10:28.077015374 -0800
+++ new/src/hotspot/share/opto/library_call.cpp 2019-12-10 17:10:27.933015375 -0800
@@ -327,6 +327,7 @@
   bool inline_mulAdd();
   bool inline_montgomeryMultiply();
   bool inline_montgomerySquare();
+  bool inline_bigIntegerShift(bool isRightShift);
   bool inline_vectorizedMismatch();
   bool inline_fma(vmIntrinsics::ID id);
   bool inline_character_compare(vmIntrinsics::ID id);
@@ -845,6 +846,11 @@
   case vmIntrinsics::_montgomerySquare:
     return inline_montgomerySquare();
 
+  case vmIntrinsics::_bigIntegerRightShiftWorker:
+    return inline_bigIntegerShift(true);
+  case vmIntrinsics::_bigIntegerLeftShiftWorker:
+    return inline_bigIntegerShift(false);
+
   case vmIntrinsics::_vectorizedMismatch:
     return inline_vectorizedMismatch();
 
@@ -5311,6 +5317,60 @@
   }
   return true;
+}
+
+bool LibraryCallKit::inline_bigIntegerShift(bool isRightShift) {
+  address stubAddr = NULL;
+  const char* stubName = NULL;
+
+  stubAddr = isRightShift ? StubRoutines::bigIntegerRightShift() : StubRoutines::bigIntegerLeftShift();
+  if (stubAddr == NULL) {
+    return false; // Intrinsic's stub is not implemented on this platform
+  }
+
+  stubName = isRightShift ? "bigIntegerRightShiftWorker" : "bigIntegerLeftShiftWorker";
+
+  assert(callee()->signature()->size() == 5, "expected 5 arguments");
+
+  Node* newArr = argument(0);
+  Node* oldArr = argument(1);
+  Node* newIdx = argument(2);
+  Node* shiftCount = argument(3);
+  Node* numIter = argument(4);
+
+  const Type* newArr_type = newArr->Value(&_gvn);
+  const TypeAryPtr* top_newArr = newArr_type->isa_aryptr();
+  const Type* oldArr_type = oldArr->Value(&_gvn);
+  const TypeAryPtr* top_oldArr = oldArr_type->isa_aryptr();
+  if (top_newArr == NULL || top_newArr->klass() == NULL ||
+      top_oldArr == NULL || top_oldArr->klass() == NULL) {
+    return false;
+  }
+
+  BasicType newArr_elem = newArr_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  BasicType oldArr_elem = oldArr_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
+  if (newArr_elem != T_INT || oldArr_elem != T_INT) {
+    return false;
+  }
+
+  // Make the call
+  {
+    Node* newArr_start = array_element_address(newArr, intcon(0), newArr_elem);
+    Node* oldArr_start = array_element_address(oldArr, intcon(0), oldArr_elem);
+
+    Node* call = make_runtime_call(RC_LEAF,
+                                   OptoRuntime::bigIntegerShift_Type(),
+                                   stubAddr,
+                                   stubName,
+                                   TypePtr::BOTTOM,
+                                   newArr_start,
+                                   oldArr_start,
+                                   newIdx,
+                                   shiftCount,
+                                   numIter);
+  }
+
+  return true;
 }
 
 //-------------inline_vectorizedMismatch------------------------------
--- old/src/hotspot/share/opto/runtime.cpp 2019-12-10 17:10:28.413015373 -0800
+++ new/src/hotspot/share/opto/runtime.cpp 2019-12-10 17:10:28.273015373 -0800
@@ -1111,6 +1111,25 @@
   return TypeFunc::make(domain, range);
 }
 
+const TypeFunc* OptoRuntime::bigIntegerShift_Type() {
+  int argcnt = 5;
+  const Type** fields = TypeTuple::fields(argcnt);
+  int argp = TypeFunc::Parms;
+  fields[argp++] = TypePtr::NOTNULL;  // newArr
+  fields[argp++] = TypePtr::NOTNULL;  // oldArr
+  fields[argp++] = TypeInt::INT;      // newIdx
+  fields[argp++] = TypeInt::INT;      // shiftCount
+  fields[argp++] = TypeInt::INT;      // numIter
+  assert(argp == TypeFunc::Parms + argcnt, "correct decoding");
+  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, fields);
+
+  // no result type needed
+  fields = TypeTuple::fields(1);
+  fields[TypeFunc::Parms + 0] = NULL;
+  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields);
+  return TypeFunc::make(domain, range);
+}
+
 const TypeFunc* OptoRuntime::vectorizedMismatch_Type() {
   // create input type (domain)
   int num_args = 4;
--- old/src/hotspot/share/opto/runtime.hpp 2019-12-10 17:10:28.709015372 -0800
+++ new/src/hotspot/share/opto/runtime.hpp 2019-12-10 17:10:28.565015372 -0800
@@ -289,6 +289,8 @@
 
   static const TypeFunc* mulAdd_Type();
 
+  static const TypeFunc* bigIntegerShift_Type();
+
   static const TypeFunc* vectorizedMismatch_Type();
 
   static const TypeFunc* ghash_processBlocks_Type();
--- old/src/hotspot/share/runtime/stubRoutines.cpp 2019-12-10 17:10:29.001015370 -0800
+++ new/src/hotspot/share/runtime/stubRoutines.cpp 2019-12-10 17:10:28.857015371 -0800
@@ -157,6 +157,8 @@
 address StubRoutines::_mulAdd                           = NULL;
 address StubRoutines::_montgomeryMultiply               = NULL;
 address StubRoutines::_montgomerySquare                 = NULL;
+address StubRoutines::_bigIntegerRightShiftWorker       = NULL;
+address StubRoutines::_bigIntegerLeftShiftWorker        = NULL;
 
 address StubRoutines::_vectorizedMismatch               = NULL;
 
--- old/src/hotspot/share/runtime/stubRoutines.hpp 2019-12-10 17:10:29.305015369 -0800
+++ new/src/hotspot/share/runtime/stubRoutines.hpp 2019-12-10 17:10:29.149015370 -0800
@@ -239,6 +239,8 @@
   static address _mulAdd;
   static address _montgomeryMultiply;
   static address _montgomerySquare;
+  static address _bigIntegerRightShiftWorker;
+  static address _bigIntegerLeftShiftWorker;
 
   static address _vectorizedMismatch;
 
@@ -414,6 +416,8 @@
   static address mulAdd()              { return _mulAdd; }
   static address montgomeryMultiply()  { return _montgomeryMultiply; }
   static address montgomerySquare()    { return _montgomerySquare; }
+  static address bigIntegerRightShift() { return _bigIntegerRightShiftWorker; }
+  static address bigIntegerLeftShift()  { return _bigIntegerLeftShiftWorker; }
 
   static address vectorizedMismatch()  { return _vectorizedMismatch; }
 
--- old/src/hotspot/share/runtime/vmStructs.cpp 2019-12-10 17:10:29.609015368 -0800
+++ new/src/hotspot/share/runtime/vmStructs.cpp 2019-12-10 17:10:29.461015369 -0800
@@ -602,6 +602,8 @@
      static_field(StubRoutines, _updateBytesCRC32C,           address) \
      static_field(StubRoutines, _multiplyToLen,               address) \
      static_field(StubRoutines, _squareToLen,                 address) \
+     static_field(StubRoutines, _bigIntegerRightShiftWorker,  address) \
+     static_field(StubRoutines, _bigIntegerLeftShiftWorker,   address) \
      static_field(StubRoutines, _mulAdd,                      address) \
      static_field(StubRoutines, _dexp,                        address) \
      static_field(StubRoutines, _dlog,                        address) \
--- old/src/java.base/share/classes/java/math/BigInteger.java 2019-12-10 17:10:29.929015367 -0800
+++ new/src/java.base/share/classes/java/math/BigInteger.java 2019-12-10 17:10:29.777015368 -0800
@@ -42,6 +42,7 @@
 import jdk.internal.math.FloatConsts;
 import jdk.internal.HotSpotIntrinsicCandidate;
 import jdk.internal.vm.annotation.Stable;
+import jdk.internal.vm.annotation.ForceInline;
 
 /**
  * Immutable arbitrary-precision integers.  All operations behave as if
@@ -2621,12 +2622,8 @@
     // shifts a up to len right n bits assumes no leading zeros, 0<n<32
     static void primitiveRightShift(int[] a, int len, int n) {
-        int n2 = 32 - n;
-        for (int i=len-1, c=a[i]; i > 0; i--) {
-            int b = c;
-            c = a[i-1];
-            a[i] = (c << n2) | (b >>> n);
-        }
+        Objects.checkFromToIndex(0, len, a.length);
+        shiftRightImplWorker(a, a, 1, n, len-1);
         a[0] >>>= n;
     }
 
@@ -2634,13 +2631,8 @@
     static void primitiveLeftShift(int[] a, int len, int n) {
         if (len == 0 || n == 0)
             return;
-
-        int n2 = 32 - n;
-        for (int i=0, c=a[i], m=i+len-1; i < m; i++) {
-            int b = c;
-            c = a[i+1];
-            a[i] = (b << n) | (c >>> n2);
-        }
+        Objects.checkFromToIndex(0, len, a.length);
+        shiftLeftImplWorker(a, a, 0, n, len-1);
         a[len-1] <<= n;
     }
 
@@ -3353,14 +3345,25 @@
             } else {
                 newMag = new int[magLen + nInts];
             }
-            int j=0;
-            while (j < magLen-1)
-                newMag[i++] = mag[j++] << nBits | mag[j] >>> nBits2;
-            newMag[i] = mag[j] << nBits;
+            int numIter = magLen - 1;
+            Objects.checkFromToIndex(0, numIter + 1, mag.length);
+            Objects.checkFromToIndex(i, numIter + i + 1, newMag.length);
+            shiftLeftImplWorker(newMag, mag, i, nBits, numIter);
+            newMag[numIter + i] = mag[numIter] << nBits;
         }
         return newMag;
     }
 
+    @ForceInline
+    @HotSpotIntrinsicCandidate
+    private static void shiftLeftImplWorker(int[] newArr, int[] oldArr, int newIdx, int shiftCount, int numIter) {
+        int shiftCountRight = 32 - shiftCount;
+        int oldIdx = 0;
+        while (oldIdx < numIter) {
+            newArr[newIdx++] = (oldArr[oldIdx++] << shiftCount) | (oldArr[oldIdx] >>> shiftCountRight);
+        }
+    }
+
     /**
      * Returns a BigInteger whose value is {@code (this >> n)}. Sign
      * extension is performed. The shift distance, {@code n}, may be
@@ -3415,11 +3418,10 @@
             } else {
                 newMag = new int[magLen - nInts -1];
             }
-
-            int nBits2 = 32 - nBits;
-            int j=0;
-            while (j < magLen - nInts - 1)
-                newMag[i++] = (mag[j++] << nBits2) | (mag[j] >>> nBits);
+            int numIter = magLen - nInts - 1;
+            Objects.checkFromToIndex(0, numIter + 1, mag.length);
+            Objects.checkFromToIndex(i, numIter + i, newMag.length);
+            shiftRightImplWorker(newMag, mag, i, nBits, numIter);
         }
 
         if (signum < 0) {
@@ -3437,6 +3439,17 @@
         return new BigInteger(newMag, signum);
     }
 
+    @ForceInline
+    @HotSpotIntrinsicCandidate
+    private static void shiftRightImplWorker(int[] newArr, int[] oldArr, int newIdx, int shiftCount, int numIter) {
+        int shiftCountLeft = 32 - shiftCount;
+        int idx = numIter;
+        int nidx = (newIdx == 0) ? numIter - 1 : numIter;
+        while (nidx >= newIdx) {
+            newArr[nidx--] = (oldArr[idx--] >>> shiftCount) | (oldArr[idx] << shiftCountLeft);
+        }
+    }
+
     int[] javaIncrement(int[] val) {
         int lastSum = 0;
         for (int i=val.length-1;  i >= 0 && lastSum == 0; i--)
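Since both workers are private to BigInteger, here is a self-contained copy with a small worked example (a hypothetical harness, not part of the patch); the expected output is derived by hand from the recurrences above:

    public class ShiftWorkerDemo {
        // Copies of the two private BigInteger workers.
        static void shiftLeftImplWorker(int[] newArr, int[] oldArr, int newIdx, int shiftCount, int numIter) {
            int shiftCountRight = 32 - shiftCount;
            int oldIdx = 0;
            while (oldIdx < numIter) {
                newArr[newIdx++] = (oldArr[oldIdx++] << shiftCount) | (oldArr[oldIdx] >>> shiftCountRight);
            }
        }

        static void shiftRightImplWorker(int[] newArr, int[] oldArr, int newIdx, int shiftCount, int numIter) {
            int shiftCountLeft = 32 - shiftCount;
            int idx = numIter;
            int nidx = (newIdx == 0) ? numIter - 1 : numIter;
            while (nidx >= newIdx) {
                newArr[nidx--] = (oldArr[idx--] >>> shiftCount) | (oldArr[idx] << shiftCountLeft);
            }
        }

        public static void main(String[] args) {
            int[] mag = { 0x00000001, 0x80000000, 0xFFFF0000 };

            int[] left = new int[3];
            shiftLeftImplWorker(left, mag, 0, 4, 2);   // left[k] = mag[k]<<4 | mag[k+1]>>>28
            System.out.printf("%08x %08x%n", left[0], left[1]);    // prints: 00000018 0000000f

            int[] right = new int[3];
            shiftRightImplWorker(right, mag, 1, 4, 2); // right[k+1] = mag[k+1]>>>4 | mag[k]<<28
            System.out.printf("%08x %08x%n", right[1], right[2]);  // prints: 18000000 0ffff000
        }
    }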
The shift distance, {@code n}, may be @@ -3415,11 +3418,10 @@ } else { newMag = new int[magLen - nInts -1]; } - - int nBits2 = 32 - nBits; - int j=0; - while (j < magLen - nInts - 1) - newMag[i++] = (mag[j++] << nBits2) | (mag[j] >>> nBits); + int numIter = magLen - nInts - 1; + Objects.checkFromToIndex(0, numIter + 1, mag.length); + Objects.checkFromToIndex(i, numIter + i, newMag.length); + shiftRightImplWorker(newMag, mag, i, nBits, numIter); } if (signum < 0) { @@ -3437,6 +3439,17 @@ return new BigInteger(newMag, signum); } + @ForceInline + @HotSpotIntrinsicCandidate + private static void shiftRightImplWorker(int[] newArr, int[] oldArr, int newIdx, int shiftCount, int numIter) { + int shiftCountLeft = 32 - shiftCount; + int idx = numIter; + int nidx = (newIdx == 0) ? numIter - 1 : numIter; + while (nidx >= newIdx) { + newArr[nidx--] = (oldArr[idx--] >>> shiftCount) | (oldArr[idx] << shiftCountLeft); + } + } + int[] javaIncrement(int[] val) { int lastSum = 0; for (int i=val.length-1; i >= 0 && lastSum == 0; i--) --- old/test/micro/org/openjdk/bench/java/math/BigIntegers.java 2019-12-10 17:10:30.249015366 -0800 +++ new/test/micro/org/openjdk/bench/java/math/BigIntegers.java 2019-12-10 17:10:30.101015366 -0800 @@ -137,4 +137,38 @@ } bh.consume(tmp); } + + /** Invokes the shiftLeft method of BigInteger with different values. */ + @Benchmark + @OperationsPerInvocation(TESTSIZE) + public void testLeftShift(Blackhole bh) { + Random rand = new Random(); + int shift = rand.nextInt(30) + 1; + BigInteger tmp = null; + for (BigInteger s : hugeArray) { + if (tmp == null) { + tmp = s; + continue; + } + tmp = tmp.shiftLeft(shift); + } + bh.consume(tmp); + } + + /** Invokes the shiftRight method of BigInteger with different values. */ + @Benchmark + @OperationsPerInvocation(TESTSIZE) + public void testRightShift(Blackhole bh) { + Random rand = new Random(); + int shift = rand.nextInt(30) + 1; + BigInteger tmp = null; + for (BigInteger s : hugeArray) { + if (tmp == null) { + tmp = s; + continue; + } + tmp = tmp.shiftRight(shift); + } + bh.consume(tmp); + } } --- /dev/null 2019-10-11 13:52:03.380849910 -0700 +++ new/test/hotspot/jtreg/compiler/intrinsics/bigInteger/TestShift.java 2019-12-10 17:10:30.393015365 -0800 @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
--- /dev/null 2019-10-11 13:52:03.380849910 -0700
+++ new/test/hotspot/jtreg/compiler/intrinsics/bigInteger/TestShift.java 2019-12-10 17:10:30.393015365 -0800
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8234692
+ * @summary Add C2 x86 intrinsics for the BigInteger::shiftLeft() and BigInteger::shiftRight() methods
+ *
+ * @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
+ *      -XX:CompileCommand=exclude,compiler.intrinsics.bigInteger.TestShift::main
+ *      -XX:CompileCommand=option,compiler.intrinsics.bigInteger.TestShift::base_left_shift,ccstr,DisableIntrinsic,_bigIntegerLeftShiftWorker
+ *      -XX:CompileCommand=option,compiler.intrinsics.bigInteger.TestShift::base_right_shift,ccstr,DisableIntrinsic,_bigIntegerRightShiftWorker
+ *      -XX:CompileCommand=inline,java.math.BigInteger::shiftLeft
+ *      -XX:CompileCommand=inline,java.math.BigInteger::shiftRight
+ *      compiler.intrinsics.bigInteger.TestShift
+ */
+
+package compiler.intrinsics.bigInteger;
+
+import java.math.BigInteger;
+import java.util.Random;
+
+public class TestShift {
+
+    public static BigInteger base_left_shift(BigInteger op1, int shift) {
+        return op1.shiftLeft(shift);
+    }
+
+    public static BigInteger new_left_shift(BigInteger op1, int shift) {
+        return op1.shiftLeft(shift);
+    }
+
+    public static BigInteger base_right_shift(BigInteger op1, int shift) {
+        return op1.shiftRight(shift);
+    }
+
+    public static BigInteger new_right_shift(BigInteger op1, int shift) {
+        return op1.shiftRight(shift);
+    }
+
+    public static boolean bytecompare(BigInteger b1, BigInteger b2) {
+        byte[] data1 = b1.toByteArray();
+        byte[] data2 = b2.toByteArray();
+        if (data1.length != data2.length)
+            return false;
+        for (int i = 0; i < data1.length; i++) {
+            if (data1[i] != data2[i])
+                return false;
+        }
+        return true;
+    }
+
+    public static String stringify(BigInteger b) {
+        String strout = "";
+        byte[] data = b.toByteArray();
+        for (int i = 0; i < data.length; i++) {
+            strout += String.format("%02x ", data[i]);
+        }
+        return strout;
+    }
+
+    public static void main(String args[]) throws Exception {
+        BigInteger[] inputbuffer = new BigInteger[10];
+        BigInteger[] oldLeftShiftResult = new BigInteger[10];
+        BigInteger[] newLeftShiftResult = new BigInteger[10];
+        BigInteger[] oldRightShiftResult = new BigInteger[10];
+        BigInteger[] newRightShiftResult = new BigInteger[10];
+
+        Random rand = new Random();
+        long seed = System.nanoTime();
+        rand.setSeed(seed);
+        int shiftCount = rand.nextInt(30) + 1;
+
+        for (int i = 0; i < inputbuffer.length; i++) {
+            int numbits = rand.nextInt(4096) + 32;
+            inputbuffer[i] = new BigInteger(numbits, rand);
+        }
+
+        for (int j = 0; j < 100000; j++) {
+            for (int i = 0; i < inputbuffer.length; i++) {
+                oldLeftShiftResult[i] = base_left_shift(inputbuffer[i], shiftCount);
+                newLeftShiftResult[i] = new_left_shift(inputbuffer[i], shiftCount);
+                if (!bytecompare(oldLeftShiftResult[i], newLeftShiftResult[i])) {
+                    System.out.println("mismatch for input:" + stringify(inputbuffer[i]) + "\n" +
+                                       "expected left shift result:" + stringify(oldLeftShiftResult[i]) + "\n" +
+                                       "calculated left shift result:" + stringify(newLeftShiftResult[i]));
+                    throw new Exception("Failed");
+                }
+
+                oldRightShiftResult[i] = base_right_shift(inputbuffer[i], shiftCount);
+                newRightShiftResult[i] = new_right_shift(inputbuffer[i], shiftCount);
+                if (!bytecompare(oldRightShiftResult[i], newRightShiftResult[i])) {
+                    System.out.println("mismatch for input:" + stringify(inputbuffer[i]) + "\n" +
+                                       "expected right shift result:" + stringify(oldRightShiftResult[i]) + "\n" +
+                                       "calculated right shift result:" + stringify(newRightShiftResult[i]));
+                    throw new Exception("Failed");
+                }
+            }
+        }
+    }
+}