--- old/src/cpu/aarch64/vm/vm_version_aarch64.cpp 2015-06-11 10:35:38.775397500 -0700 +++ new/src/cpu/aarch64/vm/vm_version_aarch64.cpp 2015-06-11 10:35:38.711396500 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2015, Red Hat Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * @@ -190,6 +190,11 @@ } } + if (UseGHASHIntrinsics) { + warning("GHASH intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) { UseCRC32Intrinsics = true; } --- old/src/cpu/ppc/vm/vm_version_ppc.cpp 2015-06-11 10:35:39.055401878 -0700 +++ new/src/cpu/ppc/vm/vm_version_ppc.cpp 2015-06-11 10:35:38.967400502 -0700 @@ -176,6 +176,11 @@ FLAG_SET_DEFAULT(UseAESIntrinsics, false); } + if (UseGHASHIntrinsics) { + warning("GHASH intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + if (UseSHA) { warning("SHA instructions are not available on this CPU"); FLAG_SET_DEFAULT(UseSHA, false); --- old/src/cpu/sparc/vm/assembler_sparc.hpp 2015-06-11 10:35:39.359406631 -0700 +++ new/src/cpu/sparc/vm/assembler_sparc.hpp 2015-06-11 10:35:39.275405318 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -129,6 +129,7 @@ flog3_op3 = 0x36, edge_op3 = 0x36, fsrc_op3 = 0x36, + xmulx_op3 = 0x36, impdep2_op3 = 0x37, stpartialf_op3 = 0x37, jmpl_op3 = 0x38, @@ -220,6 +221,8 @@ mdtox_opf = 0x110, mstouw_opf = 0x111, mstosw_opf = 0x113, + xmulx_opf = 0x115, + xmulxhi_opf = 0x116, mxtod_opf = 0x118, mwtos_opf = 0x119, @@ -1212,6 +1215,9 @@ void movwtos( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::S) | op3(mftoi_op3) | opf(mwtos_opf) | rs2(s)); } void movxtod( Register s, FloatRegister d ) { vis3_only(); emit_int32( op(arith_op) | fd(d, FloatRegisterImpl::D) | op3(mftoi_op3) | opf(mxtod_opf) | rs2(s)); } + void xmulx(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulx_opf) | rs2(s2)); } + void xmulxhi(Register s1, Register s2, Register d) { vis3_only(); emit_int32( op(arith_op) | rd(d) | op3(xmulx_op3) | rs1(s1) | opf(xmulxhi_opf) | rs2(s2)); } + // Crypto SHA instructions void sha1() { sha1_only(); emit_int32( op(arith_op) | op3(sha_op3) | opf(sha1_opf)); } --- old/src/cpu/sparc/vm/stubGenerator_sparc.cpp 2015-06-11 10:35:39.639411009 -0700 +++ new/src/cpu/sparc/vm/stubGenerator_sparc.cpp 2015-06-11 10:35:39.579410071 -0700 @@ -4786,6 +4786,130 @@ return start; } + /* Single and multi-block ghash operations */ + address generate_ghash_processBlocks() { + __ align(CodeEntryAlignment); + Label L_ghash_loop, L_aligned, L_main; + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); + address start = __ pc(); + + Register state = I0; + Register subkeyH = I1; + Register data = I2; + Register len = I3; + + __ save_frame(0); + + __ ldx(state, 0, O0); + __ ldx(state, 8, O1); + + // Loop label for multiblock operations + __ BIND(L_ghash_loop); + + // Check if 'data' is unaligned + __ andcc(data, 7, G1); + __ br(Assembler::zero, false, Assembler::pt, L_aligned); + __ 
delayed()->nop(); + + Register left_shift = L1; + Register right_shift = L2; + Register data_ptr = L3; + + // Get left and right shift values in bits + __ sll(G1, LogBitsPerByte, left_shift); + __ mov(64, right_shift); + __ sub(right_shift, left_shift, right_shift); + + // Align to read 'data' + __ sub(data, G1, data_ptr); + + // Load first 8 bytes of 'data' + __ ldx(data_ptr, 0, O4); + __ sllx(O4, left_shift, O4); + __ ldx(data_ptr, 8, O5); + __ srlx(O5, right_shift, G4); + __ bset(G4, O4); + + // Load second 8 bytes of 'data' + __ sllx(O5, left_shift, O5); + __ ldx(data_ptr, 16, G4); + __ srlx(G4, right_shift, G4); + __ ba(L_main); + __ delayed()->bset(G4, O5); + + // If 'data' is aligned, load normally + __ BIND(L_aligned); + __ ldx(data, 0, O4); + __ ldx(data, 8, O5); + + __ BIND(L_main); + __ ldx(subkeyH, 0, O2); + __ ldx(subkeyH, 8, O3); + + __ xor3(O0, O4, O0); + __ xor3(O1, O5, O1); + + __ xmulxhi(O0, O3, G3); + __ xmulx(O0, O2, O5); + __ xmulxhi(O1, O2, G4); + __ xmulxhi(O1, O3, G5); + __ xmulx(O0, O3, G1); + __ xmulx(O1, O3, G2); + __ xmulx(O1, O2, O3); + __ xmulxhi(O0, O2, O4); + + __ mov(0xE1, O0); + __ sllx(O0, 56, O0); + + __ xor3(O5, G3, O5); + __ xor3(O5, G4, O5); + __ xor3(G5, G1, G1); + __ xor3(G1, O3, G1); + __ srlx(G2, 63, O1); + __ srlx(G1, 63, G3); + __ sllx(G2, 63, O3); + __ sllx(G2, 58, O2); + __ xor3(O3, O2, O2); + + __ sllx(G1, 1, G1); + __ or3(G1, O1, G1); + + __ xor3(G1, O2, G1); + + __ sllx(G2, 1, G2); + + __ xmulxhi(G1, O0, O1); + __ xmulx(G1, O0, O2); + __ xmulxhi(G2, O0, O3); + __ xmulx(G2, O0, G1); + + __ xor3(O4, O1, O4); + __ xor3(O5, O2, O5); + __ xor3(O5, O3, O5); + + __ sllx(O4, 1, O2); + __ srlx(O5, 63, O3); + + __ or3(O2, O3, O0); + + __ sllx(O5, 1, O1); + __ srlx(G1, 63, O2); + __ or3(O1, O2, O1); + __ xor3(O1, G3, O1); + + __ deccc(len); + __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop); + __ delayed()->add(data, 16, data); + + __ stx(O0, I0, 0); + __ stx(O1, I0, 8); + + __ ret(); + __ delayed()->restore(); + + 
return start; + } + void generate_initial() { // Generates all stubs and initializes the entry points @@ -4859,6 +4983,10 @@ StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); } + // generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + } // generate SHA1/SHA256/SHA512 intrinsics code if (UseSHA1Intrinsics) { --- old/src/cpu/sparc/vm/vm_version_sparc.cpp 2015-06-11 10:35:39.975416263 -0700 +++ new/src/cpu/sparc/vm/vm_version_sparc.cpp 2015-06-11 10:35:39.911415262 -0700 @@ -300,6 +300,17 @@ } } + // GHASH/GCM intrinsics + if (has_vis3() && (UseVIS > 2)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + UseGHASHIntrinsics = true; + } + } else if (UseGHASHIntrinsics) { + if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) + warning("GHASH intrinsics require VIS3 instruction support. Intrinsics will be disabled"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + // SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times if (has_sha1() || has_sha256() || has_sha512()) { if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions --- old/src/cpu/x86/vm/assembler_x86.cpp 2015-06-11 10:35:40.295421266 -0700 +++ new/src/cpu/x86/vm/assembler_x86.cpp 2015-06-11 10:35:40.211419952 -0700 @@ -3095,8 +3095,16 @@ void Assembler::psrldq(XMMRegister dst, int shift) { // Shift 128 bit value in xmm register by number of bytes.
NOT_LP64(assert(VM_Version::supports_sse2(), "")); - int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, - false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + int encode = simd_prefix_and_encode(xmm3, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); + emit_int8(0x73); + emit_int8((unsigned char)(0xC0 | encode)); + emit_int8(shift); +} + +void Assembler::pslldq(XMMRegister dst, int shift) { + // Shift left 128 bit value in xmm register by number of bytes. + NOT_LP64(assert(VM_Version::supports_sse2(), "")); + int encode = simd_prefix_and_encode(xmm7, dst, dst, VEX_SIMD_66, true, VEX_OPCODE_0F, false, AVX_128bit, (VM_Version::supports_avx512bw() == false)); emit_int8(0x73); emit_int8((unsigned char)(0xC0 | encode)); emit_int8(shift); --- old/src/cpu/x86/vm/assembler_x86.hpp 2015-06-11 10:35:40.603426082 -0700 +++ new/src/cpu/x86/vm/assembler_x86.hpp 2015-06-11 10:35:40.539425081 -0700 @@ -1666,6 +1666,8 @@ // Shift Right by bytes Logical DoubleQuadword Immediate void psrldq(XMMRegister dst, int shift); + // Shift Left by bytes Logical DoubleQuadword Immediate + void pslldq(XMMRegister dst, int shift); // Logical Compare 128bit void ptest(XMMRegister dst, XMMRegister src); --- old/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2015-06-11 10:35:40.879430397 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_32.cpp 2015-06-11 10:35:40.815429396 -0700 @@ -2727,6 +2727,169 @@ return start; } + // byte swap x86 long + address generate_ghash_long_swap_mask() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); + address start = __ pc(); + __ emit_data(0x0b0a0908, relocInfo::none, 0); + __ emit_data(0x0f0e0d0c, relocInfo::none, 0); + __ emit_data(0x03020100, relocInfo::none, 0); + __ emit_data(0x07060504, relocInfo::none, 0); + + return start; + } + + // byte swap x86 byte array + address generate_ghash_byte_swap_mask() { + __
align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask"); + address start = __ pc(); + __ emit_data(0x0c0d0e0f, relocInfo::none, 0); + __ emit_data(0x08090a0b, relocInfo::none, 0); + __ emit_data(0x04050607, relocInfo::none, 0); + __ emit_data(0x00010203, relocInfo::none, 0); + return start; + } + + /* Single and multi-block ghash operations */ + address generate_ghash_processBlocks() { + assert(UseGHASHIntrinsics, "need GHASH intrinsics and CLMUL support"); + __ align(CodeEntryAlignment); + Label L_ghash_loop, L_exit; + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); + address start = __ pc(); + + const Register state = rdi; + const Register subkeyH = rsi; + const Register data = rdx; + const Register blocks = rcx; + + const Address state_param(rbp, 8+0); + const Address subkeyH_param(rbp, 8+4); + const Address data_param(rbp, 8+8); + const Address blocks_param(rbp, 8+12); + + const XMMRegister xmm_temp0 = xmm0; + const XMMRegister xmm_temp1 = xmm1; + const XMMRegister xmm_temp2 = xmm2; + const XMMRegister xmm_temp3 = xmm3; + const XMMRegister xmm_temp4 = xmm4; + const XMMRegister xmm_temp5 = xmm5; + const XMMRegister xmm_temp6 = xmm6; + const XMMRegister xmm_temp7 = xmm7; + + __ enter(); + handleSOERegisters(true /*saving*/); // rsi/rdi hold subkeyH/state below but are callee-saved for the C caller + + __ movptr(state, state_param); + __ movptr(subkeyH, subkeyH_param); + __ movptr(data, data_param); + __ movptr(blocks, blocks_param); + + __ movdqu(xmm_temp0, Address(state, 0)); + __ pshufb(xmm_temp0, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); + + __ movdqu(xmm_temp1, Address(subkeyH, 0)); + __ pshufb(xmm_temp1, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); + + __ BIND(L_ghash_loop); + __ movdqu(xmm_temp2, Address(data, 0)); + __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); + + __ pxor(xmm_temp0, xmm_temp2); + + // + // Multiply with the hash key + // + __ movdqu(xmm_temp3, xmm_temp0); + __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds
a0*b0 + __ movdqu(xmm_temp4, xmm_temp0); + __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1 + + __ movdqu(xmm_temp5, xmm_temp0); + __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0 + __ movdqu(xmm_temp6, xmm_temp0); + __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1 + + __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0 + + __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5 + __ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits + __ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits + __ pxor(xmm_temp3, xmm_temp5); + __ pxor(xmm_temp6, xmm_temp4); // Register pair holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to account for the fact that the bits are reversed. + __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp4, xmm_temp6); + __ pslld (xmm_temp3, 1); + __ pslld(xmm_temp6, 1); + __ psrld(xmm_temp7, 31); + __ psrld(xmm_temp4, 31); + __ movdqu(xmm_temp5, xmm_temp7); + __ pslldq(xmm_temp4, 4); + __ pslldq(xmm_temp7, 4); + __ psrldq(xmm_temp5, 12); + __ por(xmm_temp3, xmm_temp7); + __ por(xmm_temp6, xmm_temp4); + __ por(xmm_temp6, xmm_temp5); + + // + // First phase of the reduction + // + // Move xmm3 into xmm4, xmm5, xmm7 in order to perform the shifts + // independently.
+ __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp4, xmm_temp3); + __ movdqu(xmm_temp5, xmm_temp3); + __ pslld(xmm_temp7, 31); // packed left shift, each dword << 31 + __ pslld(xmm_temp4, 30); // packed left shift, each dword << 30 + __ pslld(xmm_temp5, 25); // packed left shift, each dword << 25 + __ pxor(xmm_temp7, xmm_temp4); // xor the shifted versions + __ pxor(xmm_temp7, xmm_temp5); + __ movdqu(xmm_temp4, xmm_temp7); + __ pslldq(xmm_temp7, 12); + __ psrldq(xmm_temp4, 4); + __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm5, xmm7 for doing these + // shift operations. + __ movdqu(xmm_temp2, xmm_temp3); + __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp5, xmm_temp3); + __ psrld(xmm_temp2, 1); // packed right shift, each dword >> 1 + __ psrld(xmm_temp7, 2); // packed right shift, each dword >> 2 + __ psrld(xmm_temp5, 7); // packed right shift, each dword >> 7 + __ pxor(xmm_temp2, xmm_temp7); // xor the shifted versions + __ pxor(xmm_temp2, xmm_temp5); + __ pxor(xmm_temp2, xmm_temp4); + __ pxor(xmm_temp3, xmm_temp2); + __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6 + + __ decrement(blocks); + __ jcc(Assembler::zero, L_exit); + __ movdqu(xmm_temp0, xmm_temp6); + __ addptr(data, 16); + __ jmp(L_ghash_loop); + + __ BIND(L_exit); + // Byte swap 16-byte result + __ pshufb(xmm_temp6, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); + __ movdqu(Address(state, 0), xmm_temp6); // store the result + + handleSOERegisters(false /*restoring*/); // restore the callee-saved registers saved on entry + __ leave(); + __ ret(0); + return start; + } + /** * Arguments: * @@ -3026,6 +3189,13 @@ StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); } + // Generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); + StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask(); + StubRoutines::_ghash_processBlocks =
generate_ghash_processBlocks(); + } + // Safefetch stubs. generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, --- old/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-06-11 10:35:41.179435087 -0700 +++ new/src/cpu/x86/vm/stubGenerator_x86_64.cpp 2015-06-11 10:35:41.103433899 -0700 @@ -3681,6 +3681,175 @@ return start; } + + // byte swap x86 long + address generate_ghash_long_swap_mask() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "ghash_long_swap_mask"); + address start = __ pc(); + __ emit_data64(0x0f0e0d0c0b0a0908, relocInfo::none ); + __ emit_data64(0x0706050403020100, relocInfo::none ); + return start; + } + + // byte swap x86 byte array + address generate_ghash_byte_swap_mask() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "ghash_byte_swap_mask"); + address start = __ pc(); + __ emit_data64(0x08090a0b0c0d0e0f, relocInfo::none ); + __ emit_data64(0x0001020304050607, relocInfo::none ); + return start; + } + + /* Single and multi-block ghash operations */ + address generate_ghash_processBlocks() { + __ align(CodeEntryAlignment); + Label L_ghash_loop, L_exit; + StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); + address start = __ pc(); + + const Register state = c_rarg0; + const Register subkeyH = c_rarg1; + const Register data = c_rarg2; + const Register blocks = c_rarg3; + +#ifdef _WIN64 + const int XMM_REG_LAST = 10; +#endif + + const XMMRegister xmm_temp0 = xmm0; + const XMMRegister xmm_temp1 = xmm1; + const XMMRegister xmm_temp2 = xmm2; + const XMMRegister xmm_temp3 = xmm3; + const XMMRegister xmm_temp4 = xmm4; + const XMMRegister xmm_temp5 = xmm5; + const XMMRegister xmm_temp6 = xmm6; + const XMMRegister xmm_temp7 = xmm7; + const XMMRegister xmm_temp8 = xmm8; + const XMMRegister xmm_temp9 = xmm9; + const XMMRegister xmm_temp10 = xmm10; + + __ enter(); + +#ifdef _WIN64 + // save the xmm registers which must be preserved 
6-10 + __ subptr(rsp, -rsp_after_call_off * wordSize); + for (int i = 6; i <= XMM_REG_LAST; i++) { + __ movdqu(xmm_save(i), as_XMMRegister(i)); + } +#endif + + __ movdqu(xmm_temp10, ExternalAddress(StubRoutines::x86::ghash_long_swap_mask_addr())); + + __ movdqu(xmm_temp0, Address(state, 0)); + __ pshufb(xmm_temp0, xmm_temp10); + + + __ BIND(L_ghash_loop); + __ movdqu(xmm_temp2, Address(data, 0)); + __ pshufb(xmm_temp2, ExternalAddress(StubRoutines::x86::ghash_byte_swap_mask_addr())); + + __ movdqu(xmm_temp1, Address(subkeyH, 0)); + __ pshufb(xmm_temp1, xmm_temp10); + + __ pxor(xmm_temp0, xmm_temp2); + + // + // Multiply with the hash key + // + __ movdqu(xmm_temp3, xmm_temp0); + __ pclmulqdq(xmm_temp3, xmm_temp1, 0); // xmm3 holds a0*b0 + __ movdqu(xmm_temp4, xmm_temp0); + __ pclmulqdq(xmm_temp4, xmm_temp1, 16); // xmm4 holds a0*b1 + + __ movdqu(xmm_temp5, xmm_temp0); + __ pclmulqdq(xmm_temp5, xmm_temp1, 1); // xmm5 holds a1*b0 + __ movdqu(xmm_temp6, xmm_temp0); + __ pclmulqdq(xmm_temp6, xmm_temp1, 17); // xmm6 holds a1*b1 + + __ pxor(xmm_temp4, xmm_temp5); // xmm4 holds a0*b1 + a1*b0 + + __ movdqu(xmm_temp5, xmm_temp4); // move the contents of xmm4 to xmm5 + __ psrldq(xmm_temp4, 8); // shift xmm4 right by 64 bits + __ pslldq(xmm_temp5, 8); // shift xmm5 left by 64 bits + __ pxor(xmm_temp3, xmm_temp5); + __ pxor(xmm_temp6, xmm_temp4); // Register pair holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to account for the fact that the bits are reversed.
+ __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp8, xmm_temp6); + __ pslld(xmm_temp3, 1); + __ pslld(xmm_temp6, 1); + __ psrld(xmm_temp7, 31); + __ psrld(xmm_temp8, 31); + __ movdqu(xmm_temp9, xmm_temp7); + __ pslldq(xmm_temp8, 4); + __ pslldq(xmm_temp7, 4); + __ psrldq(xmm_temp9, 12); + __ por(xmm_temp3, xmm_temp7); + __ por(xmm_temp6, xmm_temp8); + __ por(xmm_temp6, xmm_temp9); + + // + // First phase of the reduction + // + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts + // independently. + __ movdqu(xmm_temp7, xmm_temp3); + __ movdqu(xmm_temp8, xmm_temp3); + __ movdqu(xmm_temp9, xmm_temp3); + __ pslld(xmm_temp7, 31); // packed left shift, each dword << 31 + __ pslld(xmm_temp8, 30); // packed left shift, each dword << 30 + __ pslld(xmm_temp9, 25); // packed left shift, each dword << 25 + __ pxor(xmm_temp7, xmm_temp8); // xor the shifted versions + __ pxor(xmm_temp7, xmm_temp9); + __ movdqu(xmm_temp8, xmm_temp7); + __ pslldq(xmm_temp7, 12); + __ psrldq(xmm_temp8, 4); + __ pxor(xmm_temp3, xmm_temp7); // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these + // shift operations.
+ __ movdqu(xmm_temp2, xmm_temp3); + __ movdqu(xmm_temp4, xmm_temp3); + __ movdqu(xmm_temp5, xmm_temp3); + __ psrld(xmm_temp2, 1); // packed right shift, each dword >> 1 + __ psrld(xmm_temp4, 2); // packed right shift, each dword >> 2 + __ psrld(xmm_temp5, 7); // packed right shift, each dword >> 7 + __ pxor(xmm_temp2, xmm_temp4); // xor the shifted versions + __ pxor(xmm_temp2, xmm_temp5); + __ pxor(xmm_temp2, xmm_temp8); + __ pxor(xmm_temp3, xmm_temp2); + __ pxor(xmm_temp6, xmm_temp3); // the result is in xmm6 + + __ decrement(blocks); + __ jcc(Assembler::zero, L_exit); + __ movdqu(xmm_temp0, xmm_temp6); + __ addptr(data, 16); + __ jmp(L_ghash_loop); + + __ BIND(L_exit); + __ pshufb(xmm_temp6, xmm_temp10); // Byte swap 16-byte result + __ movdqu(Address(state, 0), xmm_temp6); // store the result + +#ifdef _WIN64 + // restore xmm regs belonging to calling function + for (int i = 6; i <= XMM_REG_LAST; i++) { + __ movdqu(as_XMMRegister(i), xmm_save(i)); + } +#endif + __ leave(); + __ ret(0); + return start; + } + /** * Arguments: * @@ -4120,6 +4289,13 @@ StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel(); } + // Generate GHASH intrinsics code + if (UseGHASHIntrinsics) { + StubRoutines::x86::_ghash_long_swap_mask_addr = generate_ghash_long_swap_mask(); + StubRoutines::x86::_ghash_byte_swap_mask_addr = generate_ghash_byte_swap_mask(); + StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); + } + // Safefetch stubs. generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, &StubRoutines::_safefetch32_fault_pc, --- old/src/cpu/x86/vm/stubRoutines_x86.cpp 2015-06-11 10:35:41.475439715 -0700 +++ new/src/cpu/x86/vm/stubRoutines_x86.cpp 2015-06-11 10:35:41.411438715 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* * This code is free software; you can redistribute it and/or modify it @@ -33,6 +33,8 @@ address StubRoutines::x86::_verify_mxcsr_entry = NULL; address StubRoutines::x86::_key_shuffle_mask_addr = NULL; +address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL; +address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL; uint64_t StubRoutines::x86::_crc_by128_masks[] = { --- old/src/cpu/x86/vm/stubRoutines_x86.hpp 2015-06-11 10:35:41.771444343 -0700 +++ new/src/cpu/x86/vm/stubRoutines_x86.hpp 2015-06-11 10:35:41.679442905 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -36,10 +36,15 @@ // masks and table for CRC32 static uint64_t _crc_by128_masks[]; static juint _crc_table[]; + // swap mask for ghash + static address _ghash_long_swap_mask_addr; + static address _ghash_byte_swap_mask_addr; public: static address verify_mxcsr_entry() { return _verify_mxcsr_entry; } static address key_shuffle_mask_addr() { return _key_shuffle_mask_addr; } static address crc_by128_masks_addr() { return (address)_crc_by128_masks; } + static address ghash_long_swap_mask_addr() { return _ghash_long_swap_mask_addr; } + static address ghash_byte_swap_mask_addr() { return _ghash_byte_swap_mask_addr; } #endif // CPU_X86_VM_STUBROUTINES_X86_32_HPP --- old/src/cpu/x86/vm/vm_version_x86.cpp 2015-06-11 10:35:42.095449410 -0700 +++ new/src/cpu/x86/vm/vm_version_x86.cpp 2015-06-11 10:35:42.019448221 -0700 @@ -677,6 +677,17 @@ FLAG_SET_DEFAULT(UseAESIntrinsics, false); } + // GHASH/GCM intrinsics + if (UseCLMUL && (UseSSE > 2)) { + if (FLAG_IS_DEFAULT(UseGHASHIntrinsics)) { + UseGHASHIntrinsics = true; + } + } else if (UseGHASHIntrinsics) { + if (!FLAG_IS_DEFAULT(UseGHASHIntrinsics)) + warning("GHASH intrinsic requires CLMUL 
and SSE2 instructions on this CPU"); + FLAG_SET_DEFAULT(UseGHASHIntrinsics, false); + } + if (UseSHA) { warning("SHA instructions are not available on this CPU"); FLAG_SET_DEFAULT(UseSHA, false); --- old/src/share/vm/classfile/vmSymbols.hpp 2015-06-11 10:35:42.419454475 -0700 +++ new/src/share/vm/classfile/vmSymbols.hpp 2015-06-11 10:35:42.351453412 -0700 @@ -845,6 +845,12 @@ do_name( implCompressMB_name, "implCompressMultiBlock") \ do_signature(implCompressMB_signature, "([BII)I") \ \ + /* support for com.sun.crypto.provider.GHASH */ \ + do_class(com_sun_crypto_provider_ghash, "com/sun/crypto/provider/GHASH") \ + do_intrinsic(_ghash_processBlocks, com_sun_crypto_provider_ghash, processBlocks_name, ghash_processBlocks_signature, F_R) \ + do_name(processBlocks_name, "processBlocks") \ + do_signature(ghash_processBlocks_signature, "([BII)V") \ + \ /* support for java.util.zip */ \ do_class(java_util_zip_CRC32, "java/util/zip/CRC32") \ do_intrinsic(_updateCRC32, java_util_zip_CRC32, update_name, int2_int_signature, F_SN) \ --- old/src/share/vm/opto/escape.cpp 2015-06-11 10:35:42.707458978 -0700 +++ new/src/share/vm/opto/escape.cpp 2015-06-11 10:35:42.627457727 -0700 @@ -966,6 +966,7 @@ strcmp(call->as_CallLeaf()->_name, "aescrypt_decryptBlock") == 0 || strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_encryptAESCrypt") == 0 || strcmp(call->as_CallLeaf()->_name, "cipherBlockChaining_decryptAESCrypt") == 0 || + strcmp(call->as_CallLeaf()->_name, "ghash_processBlocks") == 0 || strcmp(call->as_CallLeaf()->_name, "sha1_implCompress") == 0 || strcmp(call->as_CallLeaf()->_name, "sha1_implCompressMB") == 0 || strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 || --- old/src/share/vm/opto/library_call.cpp 2015-06-11 10:35:43.007463669 -0700 +++ new/src/share/vm/opto/library_call.cpp 2015-06-11 10:35:42.939462605 -0700 @@ -278,6 +278,8 @@ Node* inline_cipherBlockChaining_AESCrypt_predicate(bool decrypting); Node* get_key_start_from_aescrypt_object(Node* 
aescrypt_object); Node* get_original_key_start_from_aescrypt_object(Node* aescrypt_object); + bool inline_ghash_processBlocks(); + Node* get_vars_from_ghash_object(Node *ghash_object, const char *var_name); bool inline_sha_implCompress(vmIntrinsics::ID id); bool inline_digestBase_implCompressMB(int predicate); bool inline_sha_implCompressMB(Node* digestBaseObj, ciInstanceKlass* instklass_SHA, @@ -528,6 +530,10 @@ predicates = 3; break; + case vmIntrinsics::_ghash_processBlocks: + if (!UseGHASHIntrinsics) return NULL; + break; + case vmIntrinsics::_updateCRC32: case vmIntrinsics::_updateBytesCRC32: case vmIntrinsics::_updateByteBufferCRC32: @@ -929,6 +935,9 @@ case vmIntrinsics::_mulAdd: return inline_mulAdd(); + case vmIntrinsics::_ghash_processBlocks: + return inline_ghash_processBlocks(); + case vmIntrinsics::_encodeISOArray: return inline_encodeISOArray(); @@ -5858,6 +5867,45 @@ return _gvn.transform(region); } +//------------------------------get_vars_from_ghash_object----------------------- +Node * LibraryCallKit::get_vars_from_ghash_object(Node *ghash_object, const char *var_name) { + Node* ghash_var = load_field_from_object(ghash_object, var_name, "[J", /*is_exact*/ false); + assert (ghash_var != NULL, "wrong version of sun.security.provider.GHASH"); + if (ghash_var == NULL) return (Node *) NULL; + + // now have the array, need to get the start address of the array + Node* var = array_element_address(ghash_var, intcon(0), T_LONG); + return var; +} + +//------------------------------inline_ghash_processBlocks +bool LibraryCallKit::inline_ghash_processBlocks() { + address stubAddr; + const char *stubName; + assert(UseGHASHIntrinsics, "need GHASH intrinsics support"); + + stubAddr = StubRoutines::ghash_processBlocks(); + stubName = "ghash_processBlocks"; + + Node* ghash_object = argument(0); + Node* data = argument(1); + Node* offset = argument(2); + Node* len = argument(3); + + Node* state_start = get_vars_from_ghash_object(ghash_object, "state"); + 
assert(state_start, "Unable to load GHASH state"); + Node* subkeyH_start = get_vars_from_ghash_object(ghash_object, "subkeyH"); + assert(subkeyH_start, "Unable to load GHASH subkeyH"); + Node* data_start = array_element_address(data, offset, T_BYTE); + assert(data_start, "data is NULL"); + + Node* ghash = make_runtime_call(RC_LEAF|RC_NO_FP, + OptoRuntime::ghash_processBlocks_Type(), + stubAddr, stubName, TypePtr::BOTTOM, + state_start, subkeyH_start, data_start, len); + return true; +} + //------------------------------inline_sha_implCompress----------------------- // // Calculate SHA (i.e., SHA-1) for single-block byte[] array. --- old/src/share/vm/opto/runtime.cpp 2015-06-11 10:35:43.359469172 -0700 +++ new/src/share/vm/opto/runtime.cpp 2015-06-11 10:35:43.275467859 -0700 @@ -987,7 +987,25 @@ return TypeFunc::make(domain, range); } +// GHASH block processing +const TypeFunc* OptoRuntime::ghash_processBlocks_Type() { + int argcnt = 4; + const Type** fields = TypeTuple::fields(argcnt); + int argp = TypeFunc::Parms; + fields[argp++] = TypePtr::NOTNULL; // state + fields[argp++] = TypePtr::NOTNULL; // subkeyH + fields[argp++] = TypePtr::NOTNULL; // data + fields[argp++] = TypeInt::INT; // blocks + assert(argp == TypeFunc::Parms+argcnt, "correct decoding"); + const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms+argcnt, fields); + + // result type needed + fields = TypeTuple::fields(1); + fields[TypeFunc::Parms+0] = NULL; // void + const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, fields); + return TypeFunc::make(domain, range); +} //------------- Interpreter state access for on stack replacement const TypeFunc* OptoRuntime::osr_end_Type() { --- old/src/share/vm/opto/runtime.hpp 2015-06-11 10:35:43.707474614 -0700 +++ new/src/share/vm/opto/runtime.hpp 2015-06-11 10:35:43.635473488 -0700 @@ -316,6 +316,8 @@ static const TypeFunc* mulAdd_Type(); + static const TypeFunc* ghash_processBlocks_Type(); + static const TypeFunc* updateBytesCRC32_Type(); // leaf on 
stack replacement interpreter accessor types --- old/src/share/vm/runtime/globals.hpp 2015-06-11 10:35:43.983478929 -0700 +++ new/src/share/vm/runtime/globals.hpp 2015-06-11 10:35:43.911477803 -0700 @@ -641,6 +641,9 @@ product(bool, UseSHA, false, \ "Control whether SHA instructions can be used on SPARC") \ \ + product(bool, UseGHASHIntrinsics, false, \ + "Use intrinsics for GHASH versions of crypto") \ + \ product(size_t, LargePageSizeInBytes, 0, \ "Large page size (0 to let VM choose the page size)") \ \ --- old/src/share/vm/runtime/stubRoutines.cpp 2015-06-11 10:35:44.419485746 -0700 +++ new/src/share/vm/runtime/stubRoutines.cpp 2015-06-11 10:35:44.327484307 -0700 @@ -125,6 +125,7 @@ address StubRoutines::_aescrypt_decryptBlock = NULL; address StubRoutines::_cipherBlockChaining_encryptAESCrypt = NULL; address StubRoutines::_cipherBlockChaining_decryptAESCrypt = NULL; +address StubRoutines::_ghash_processBlocks = NULL; address StubRoutines::_sha1_implCompress = NULL; address StubRoutines::_sha1_implCompressMB = NULL; --- old/src/share/vm/runtime/stubRoutines.hpp 2015-06-11 10:35:44.727490562 -0700 +++ new/src/share/vm/runtime/stubRoutines.hpp 2015-06-11 10:35:44.659489498 -0700 @@ -185,6 +185,7 @@ static address _aescrypt_decryptBlock; static address _cipherBlockChaining_encryptAESCrypt; static address _cipherBlockChaining_decryptAESCrypt; + static address _ghash_processBlocks; static address _sha1_implCompress; static address _sha1_implCompressMB; @@ -346,6 +347,7 @@ static address aescrypt_decryptBlock() { return _aescrypt_decryptBlock; } static address cipherBlockChaining_encryptAESCrypt() { return _cipherBlockChaining_encryptAESCrypt; } static address cipherBlockChaining_decryptAESCrypt() { return _cipherBlockChaining_decryptAESCrypt; } + static address ghash_processBlocks() { return _ghash_processBlocks; } static address sha1_implCompress() { return _sha1_implCompress; } static address sha1_implCompressMB() { return _sha1_implCompressMB; } --- 
old/src/share/vm/runtime/vmStructs.cpp 2015-06-11 10:35:45.027495252 -0700 +++ new/src/share/vm/runtime/vmStructs.cpp 2015-06-11 10:35:44.939493876 -0700 @@ -828,6 +828,7 @@ static_field(StubRoutines, _aescrypt_decryptBlock, address) \ static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \ static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \ + static_field(StubRoutines, _ghash_processBlocks, address) \ static_field(StubRoutines, _updateBytesCRC32, address) \ static_field(StubRoutines, _crc_table_adr, address) \ static_field(StubRoutines, _multiplyToLen, address) \ --- old/test/compiler/codegen/7184394/TestAESBase.java 2015-06-11 10:35:45.335500068 -0700 +++ new/test/compiler/codegen/7184394/TestAESBase.java 2015-06-11 10:35:45.275499129 -0700 @@ -31,6 +31,7 @@ import java.util.Random; import javax.crypto.Cipher; import javax.crypto.SecretKey; +import javax.crypto.spec.GCMParameterSpec; import javax.crypto.spec.IvParameterSpec; import javax.crypto.spec.SecretKeySpec; @@ -62,6 +63,10 @@ Cipher dCipher; AlgorithmParameters algParams; SecretKey key; + GCMParameterSpec gcm_spec; + byte[] aad; + int tlen = 12; + byte[] iv; static int numThreads = 0; int threadId; @@ -100,6 +105,12 @@ int ivLen = (algorithm.equals("AES") ? 16 : algorithm.equals("DES") ? 
8 : 0); IvParameterSpec initVector = new IvParameterSpec(new byte[ivLen]); cipher.init(Cipher.ENCRYPT_MODE, key, initVector); + } else if (mode.equals("GCM")) { + iv = new byte[64]; + random.nextBytes(iv); + aad = new byte[5]; + random.nextBytes(aad); + gcm_init(); } else { algParams = cipher.getParameters(); cipher.init(Cipher.ENCRYPT_MODE, key, algParams); @@ -186,4 +197,12 @@ } abstract void childShowCipher(); + + void gcm_init() throws Exception { + tlen = 12; + gcm_spec = new GCMParameterSpec(tlen * 8, iv); + cipher = Cipher.getInstance(algorithm + "/" + mode + "/" + paddingStr, "SunJCE"); + cipher.init(Cipher.ENCRYPT_MODE, key, gcm_spec); + cipher.update(aad); + } } --- old/test/compiler/codegen/7184394/TestAESEncode.java 2015-06-11 10:35:45.607504321 -0700 +++ new/test/compiler/codegen/7184394/TestAESEncode.java 2015-06-11 10:35:45.547503382 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -32,7 +32,11 @@ @Override public void run() { try { - if (!noReinit) cipher.init(Cipher.ENCRYPT_MODE, key, algParams); + if (mode.equals("GCM")) { + gcm_init(); + } else if (!noReinit) { + cipher.init(Cipher.ENCRYPT_MODE, key, algParams); + } encode = new byte[encodeLength]; if (testingMisalignment) { int tempSize = cipher.update(input, encInputOffset, (msgSize - lastChunkSize), encode, encOutputOffset); --- old/test/compiler/codegen/7184394/TestAESMain.java 2015-06-11 10:35:45.863508323 -0700 +++ new/test/compiler/codegen/7184394/TestAESMain.java 2015-06-11 10:35:45.803507384 -0700 @@ -44,6 +44,13 @@ * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 TestAESMain * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=ECB -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DdecOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 TestAESMain + * @run main/othervm/timeout=600 -Xbatch -DcheckOutput=true -Dmode=GCM -DencInputOffset=1 -DencOutputOffset=1 -DdecOutputOffset=1 -DpaddingStr=NoPadding -DmsgSize=640 TestAESMain * * @author Tom Deneau */