< prev index next >

src/cpu/x86/vm/sharedRuntime_x86_64.cpp

Print this page

        

*** 23,32 **** --- 23,34 ---- */ #include "precompiled.hpp" #ifndef _WINDOWS #include "alloca.h" + #else //WINDOWS + #include <intrin.h> #endif #include "asm/macroAssembler.hpp" #include "asm/macroAssembler.inline.hpp" #include "code/debugInfoRec.hpp" #include "code/icBuffer.hpp"
*** 3969,3986 **** //------------------------------Montgomery multiplication------------------------ // #ifndef _WINDOWS - #define ASM_SUBTRACT - - #ifdef ASM_SUBTRACT // Subtract 0:b from carry:a. Return carry. ! static unsigned long ! sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) { ! long i = 0, cnt = len; ! unsigned long tmp; asm volatile("clc; " "0: ; " "mov (%[b], %[i], 8), %[tmp]; " "sbb %[tmp], (%[a], %[i], 8); " "inc %[i]; dec %[cnt]; " --- 3971,3985 ---- //------------------------------Montgomery multiplication------------------------ // #ifndef _WINDOWS // Subtract 0:b from carry:a. Return carry. ! static julong ! sub(julong a[], julong b[], julong carry, long len) { ! long long i = 0, cnt = len; ! julong tmp; asm volatile("clc; " "0: ; " "mov (%[b], %[i], 8), %[tmp]; " "sbb %[tmp], (%[a], %[i], 8); " "inc %[i]; dec %[cnt]; "
*** 3989,4016 **** : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp) : [a]"r"(a), [b]"r"(b), [carry]"r"(carry) : "memory"); return tmp; } - #else // ASM_SUBTRACT - typedef int __attribute__((mode(TI))) int128; - - // Subtract 0:b from carry:a. Return carry. - static unsigned long - sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) { - int128 tmp = 0; - int i; - for (i = 0; i < len; i++) { - tmp += a[i]; - tmp -= b[i]; - a[i] = tmp; - tmp >>= 64; - assert(-1 <= tmp && tmp <= 0, "invariant"); - } - return tmp + carry; - } - #endif // ! ASM_SUBTRACT // Multiply (unsigned) Long A by Long B, accumulating the double- // length result into the accumulator formed of T0, T1, and T2. #define MACC(A, B, T0, T1, T2) \ do { \ --- 3988,3997 ----
*** 4029,4049 **** "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ : "r"(A), "a"(B) : "cc"); \ } while(0) // Fast Montgomery multiplication. The derivation of the algorithm is // in A Cryptographic Library for the Motorola DSP56000, // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. ! static void __attribute__((noinline)) ! montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[], ! unsigned long m[], unsigned long inv, int len) { ! unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator int i; ! assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); for (i = 0; i < len; i++) { int j; for (j = 0; j < i; j++) { MACC(a[j], b[i-j], t0, t1, t2); --- 4010,4078 ---- "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ : "r"(A), "a"(B) : "cc"); \ } while(0) + #else //_WINDOWS + + // Visual Studio 2010 does not have _addcarry_u64 instrinsic + // (TBD: does 2015?) + #if defined(_WINDOWS) && _MSC_VER >= 1910 + static julong + sub(julong a[], julong b[], julong carry, long len) { + long i; + julong tmp; + unsigned char c = 1; + for (i = 0; i < len; i++) { + c = _addcarry_u64(c, a[i], ~b[i], &tmp); + a[i] = tmp; + } + c = _addcarry_u64(c, carry, ~0, &tmp); + return tmp; + } + + // Multiply (unsigned) Long A by Long B, accumulating the double- + // length result into the accumulator formed of T0, T1, and T2. + #define MACC(A, B, T0, T1, T2) \ + do { \ + julong hi, lo; \ + lo = _umul128(A, B, &hi); \ + unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ + c = _addcarry_u64(c, hi, T1, &T1); \ + _addcarry_u64(c, T2, 0, &T2); \ + } while(0) + + // As above, but add twice the double-length result into the + // accumulator. + #define MACC2(A, B, T0, T1, T2) \ + do { \ + julong hi, lo; \ + lo = _umul128(A, B, &hi); \ + unsigned char c = _addcarry_u64(0, lo, T0, &T0); \ + c = _addcarry_u64(c, hi, T1, &T1); \ + _addcarry_u64(c, T2, 0, &T2); \ + c = _addcarry_u64(0, lo, T0, &T0); \ + c = _addcarry_u64(c, hi, T1, &T1); \ + _addcarry_u64(c, T2, 0, &T2); \ + } while(0) + + #endif // defined(_WINDOWS) && _MSC_VER >= 1910 + #endif // defined(_WINDOWS) + + #if !(defined(_WINDOWS) && _MSC_VER < 1910) + // Fast Montgomery multiplication. The derivation of the algorithm is // in A Cryptographic Library for the Motorola DSP56000, // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. ! static void NOINLINE ! montgomery_multiply(julong a[], julong b[], julong n[], ! julong m[], julong inv, int len) { ! julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator int i; ! assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); for (i = 0; i < len; i++) { int j; for (j = 0; j < i; j++) { MACC(a[j], b[i-j], t0, t1, t2);
*** 4075,4091 **** // Fast Montgomery squaring. This uses asymptotically 25% fewer // multiplies so it should be up to 25% faster than Montgomery // multiplication. However, its loop control is more complex and it // may actually run slower on some machines. ! static void __attribute__((noinline)) ! montgomery_square(unsigned long a[], unsigned long n[], ! unsigned long m[], unsigned long inv, int len) { ! unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator int i; ! assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); for (i = 0; i < len; i++) { int j; int end = (i+1)/2; for (j = 0; j < end; j++) { --- 4104,4120 ---- // Fast Montgomery squaring. This uses asymptotically 25% fewer // multiplies so it should be up to 25% faster than Montgomery // multiplication. However, its loop control is more complex and it // may actually run slower on some machines. ! static void NOINLINE ! montgomery_square(julong a[], julong n[], ! julong m[], julong inv, int len) { ! julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator int i; ! assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); for (i = 0; i < len; i++) { int j; int end = (i+1)/2; for (j = 0; j < end; j++) {
*** 4127,4143 **** while (t0) t0 = sub(m, n, t0, len); } // Swap words in a longword. ! static unsigned long swap(unsigned long x) { return (x << 32) | (x >> 32); } // Copy len longwords from s to d, word-swapping as we go. The // destination array is reversed. ! static void reverse_words(unsigned long *s, unsigned long *d, int len) { d += len; while(len-- > 0) { d--; *d = swap(*s); s++; --- 4156,4172 ---- while (t0) t0 = sub(m, n, t0, len); } // Swap words in a longword. ! static julong swap(julong x) { return (x << 32) | (x >> 32); } // Copy len longwords from s to d, word-swapping as we go. The // destination array is reversed. ! static void reverse_words(julong *s, julong *d, int len) { d += len; while(len-- > 0) { d--; *d = swap(*s); s++;
*** 4155,4182 **** int longwords = len/2; // Make very sure we don't use so much space that the stack might // overflow. 512 jints corresponds to an 16384-bit integer and // will use here a total of 8k bytes of stack space. ! int total_allocation = longwords * sizeof (unsigned long) * 4; guarantee(total_allocation <= 8192, "must be"); ! unsigned long *scratch = (unsigned long *)alloca(total_allocation); // Local scratch arrays ! unsigned long *a = scratch + 0 * longwords, *b = scratch + 1 * longwords, *n = scratch + 2 * longwords, *m = scratch + 3 * longwords; ! reverse_words((unsigned long *)a_ints, a, longwords); ! reverse_words((unsigned long *)b_ints, b, longwords); ! reverse_words((unsigned long *)n_ints, n, longwords); ! ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords); ! reverse_words(m, (unsigned long *)m_ints, longwords); } void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, jint len, jlong inv, jint *m_ints) { --- 4184,4211 ---- int longwords = len/2; // Make very sure we don't use so much space that the stack might // overflow. 512 jints corresponds to an 16384-bit integer and // will use here a total of 8k bytes of stack space. ! int total_allocation = longwords * sizeof (julong) * 4; guarantee(total_allocation <= 8192, "must be"); ! julong *scratch = (julong *)alloca(total_allocation); // Local scratch arrays ! julong *a = scratch + 0 * longwords, *b = scratch + 1 * longwords, *n = scratch + 2 * longwords, *m = scratch + 3 * longwords; ! reverse_words((julong *)a_ints, a, longwords); ! reverse_words((julong *)b_ints, b, longwords); ! reverse_words((julong *)n_ints, n, longwords); ! ::montgomery_multiply(a, b, n, m, (julong)inv, longwords); ! reverse_words(m, (julong *)m_ints, longwords); } void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, jint len, jlong inv, jint *m_ints) {
*** 4184,4222 **** int longwords = len/2; // Make very sure we don't use so much space that the stack might // overflow. 512 jints corresponds to an 16384-bit integer and // will use here a total of 6k bytes of stack space. ! int total_allocation = longwords * sizeof (unsigned long) * 3; guarantee(total_allocation <= 8192, "must be"); ! unsigned long *scratch = (unsigned long *)alloca(total_allocation); // Local scratch arrays ! unsigned long *a = scratch + 0 * longwords, *n = scratch + 1 * longwords, *m = scratch + 2 * longwords; ! reverse_words((unsigned long *)a_ints, a, longwords); ! reverse_words((unsigned long *)n_ints, n, longwords); //montgomery_square fails to pass BigIntegerTest on solaris amd64 //on jdk7 and jdk8. #ifndef SOLARIS if (len >= MONTGOMERY_SQUARING_THRESHOLD) { #else if (0) { #endif ! ::montgomery_square(a, n, m, (unsigned long)inv, longwords); } else { ! ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords); } ! reverse_words(m, (unsigned long *)m_ints, longwords); } ! #endif // WINDOWS #ifdef COMPILER2 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame // //------------------------------generate_exception_blob--------------------------- --- 4213,4251 ---- int longwords = len/2; // Make very sure we don't use so much space that the stack might // overflow. 512 jints corresponds to an 16384-bit integer and // will use here a total of 6k bytes of stack space. ! int total_allocation = longwords * sizeof (julong) * 3; guarantee(total_allocation <= 8192, "must be"); ! julong *scratch = (julong *)alloca(total_allocation); // Local scratch arrays ! julong *a = scratch + 0 * longwords, *n = scratch + 1 * longwords, *m = scratch + 2 * longwords; ! reverse_words((julong *)a_ints, a, longwords); ! reverse_words((julong *)n_ints, n, longwords); //montgomery_square fails to pass BigIntegerTest on solaris amd64 //on jdk7 and jdk8. #ifndef SOLARIS if (len >= MONTGOMERY_SQUARING_THRESHOLD) { #else if (0) { #endif ! ::montgomery_square(a, n, m, (julong)inv, longwords); } else { ! ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); } ! reverse_words(m, (julong *)m_ints, longwords); } ! #endif // !(defined(_WINDOWS) && MSVC < VS2017) #ifdef COMPILER2 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame // //------------------------------generate_exception_blob---------------------------
< prev index next >