< prev index next >
src/cpu/x86/vm/sharedRuntime_x86_64.cpp
Print this page
*** 23,32 ****
--- 23,34 ----
*/
#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
+ #else //WINDOWS
+ #include <intrin.h>
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
*** 3969,3986 ****
//------------------------------Montgomery multiplication------------------------
//
#ifndef _WINDOWS
- #define ASM_SUBTRACT
-
- #ifdef ASM_SUBTRACT
// Subtract 0:b from carry:a. Return carry.
! static unsigned long
! sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
! long i = 0, cnt = len;
! unsigned long tmp;
asm volatile("clc; "
"0: ; "
"mov (%[b], %[i], 8), %[tmp]; "
"sbb %[tmp], (%[a], %[i], 8); "
"inc %[i]; dec %[cnt]; "
--- 3971,3985 ----
//------------------------------Montgomery multiplication------------------------
//
#ifndef _WINDOWS
// Subtract 0:b from carry:a. Return carry.
! static julong
! sub(julong a[], julong b[], julong carry, long len) {
! long long i = 0, cnt = len;
! julong tmp;
asm volatile("clc; "
"0: ; "
"mov (%[b], %[i], 8), %[tmp]; "
"sbb %[tmp], (%[a], %[i], 8); "
"inc %[i]; dec %[cnt]; "
*** 3989,4016 ****
: [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
: [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
: "memory");
return tmp;
}
- #else // ASM_SUBTRACT
- typedef int __attribute__((mode(TI))) int128;
-
- // Subtract 0:b from carry:a. Return carry.
- static unsigned long
- sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
- int128 tmp = 0;
- int i;
- for (i = 0; i < len; i++) {
- tmp += a[i];
- tmp -= b[i];
- a[i] = tmp;
- tmp >>= 64;
- assert(-1 <= tmp && tmp <= 0, "invariant");
- }
- return tmp + carry;
- }
- #endif // ! ASM_SUBTRACT
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2) \
do { \
--- 3988,3997 ----
*** 4029,4049 ****
"add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
: "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
: "r"(A), "a"(B) : "cc"); \
} while(0)
// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
! static void __attribute__((noinline))
! montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
! unsigned long m[], unsigned long inv, int len) {
! unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
! assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
for (j = 0; j < i; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
--- 4010,4078 ----
"add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
: "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
: "r"(A), "a"(B) : "cc"); \
} while(0)
+ #else //_WINDOWS
+
+ // Visual Studio 2010 does not have _addcarry_u64 instrinsic
+ // (TBD: does 2015?)
+ #if defined(_WINDOWS) && _MSC_VER >= 1910
+ static julong
+ sub(julong a[], julong b[], julong carry, long len) {
+ long i;
+ julong tmp;
+ unsigned char c = 1;
+ for (i = 0; i < len; i++) {
+ c = _addcarry_u64(c, a[i], ~b[i], &tmp);
+ a[i] = tmp;
+ }
+ c = _addcarry_u64(c, carry, ~0, &tmp);
+ return tmp;
+ }
+
+ // Multiply (unsigned) Long A by Long B, accumulating the double-
+ // length result into the accumulator formed of T0, T1, and T2.
+ #define MACC(A, B, T0, T1, T2) \
+ do { \
+ julong hi, lo; \
+ lo = _umul128(A, B, &hi); \
+ unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
+ c = _addcarry_u64(c, hi, T1, &T1); \
+ _addcarry_u64(c, T2, 0, &T2); \
+ } while(0)
+
+ // As above, but add twice the double-length result into the
+ // accumulator.
+ #define MACC2(A, B, T0, T1, T2) \
+ do { \
+ julong hi, lo; \
+ lo = _umul128(A, B, &hi); \
+ unsigned char c = _addcarry_u64(0, lo, T0, &T0); \
+ c = _addcarry_u64(c, hi, T1, &T1); \
+ _addcarry_u64(c, T2, 0, &T2); \
+ c = _addcarry_u64(0, lo, T0, &T0); \
+ c = _addcarry_u64(c, hi, T1, &T1); \
+ _addcarry_u64(c, T2, 0, &T2); \
+ } while(0)
+
+ #endif // defined(_WINDOWS) && _MSC_VER >= 1910
+ #endif // defined(_WINDOWS)
+
+ #if !(defined(_WINDOWS) && _MSC_VER < 1910)
+
// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
! static void NOINLINE
! montgomery_multiply(julong a[], julong b[], julong n[],
! julong m[], julong inv, int len) {
! julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
! assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
for (j = 0; j < i; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
*** 4075,4091 ****
// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
! static void __attribute__((noinline))
! montgomery_square(unsigned long a[], unsigned long n[],
! unsigned long m[], unsigned long inv, int len) {
! unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
! assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
int end = (i+1)/2;
for (j = 0; j < end; j++) {
--- 4104,4120 ----
// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
! static void NOINLINE
! montgomery_square(julong a[], julong n[],
! julong m[], julong inv, int len) {
! julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
! assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
for (i = 0; i < len; i++) {
int j;
int end = (i+1)/2;
for (j = 0; j < end; j++) {
*** 4127,4143 ****
while (t0)
t0 = sub(m, n, t0, len);
}
// Swap words in a longword.
! static unsigned long swap(unsigned long x) {
return (x << 32) | (x >> 32);
}
// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
! static void reverse_words(unsigned long *s, unsigned long *d, int len) {
d += len;
while(len-- > 0) {
d--;
*d = swap(*s);
s++;
--- 4156,4172 ----
while (t0)
t0 = sub(m, n, t0, len);
}
// Swap words in a longword.
! static julong swap(julong x) {
return (x << 32) | (x >> 32);
}
// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
! static void reverse_words(julong *s, julong *d, int len) {
d += len;
while(len-- > 0) {
d--;
*d = swap(*s);
s++;
*** 4155,4182 ****
int longwords = len/2;
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints corresponds to an 16384-bit integer and
// will use here a total of 8k bytes of stack space.
! int total_allocation = longwords * sizeof (unsigned long) * 4;
guarantee(total_allocation <= 8192, "must be");
! unsigned long *scratch = (unsigned long *)alloca(total_allocation);
// Local scratch arrays
! unsigned long
*a = scratch + 0 * longwords,
*b = scratch + 1 * longwords,
*n = scratch + 2 * longwords,
*m = scratch + 3 * longwords;
! reverse_words((unsigned long *)a_ints, a, longwords);
! reverse_words((unsigned long *)b_ints, b, longwords);
! reverse_words((unsigned long *)n_ints, n, longwords);
! ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
! reverse_words(m, (unsigned long *)m_ints, longwords);
}
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
jint len, jlong inv,
jint *m_ints) {
--- 4184,4211 ----
int longwords = len/2;
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints corresponds to an 16384-bit integer and
// will use here a total of 8k bytes of stack space.
! int total_allocation = longwords * sizeof (julong) * 4;
guarantee(total_allocation <= 8192, "must be");
! julong *scratch = (julong *)alloca(total_allocation);
// Local scratch arrays
! julong
*a = scratch + 0 * longwords,
*b = scratch + 1 * longwords,
*n = scratch + 2 * longwords,
*m = scratch + 3 * longwords;
! reverse_words((julong *)a_ints, a, longwords);
! reverse_words((julong *)b_ints, b, longwords);
! reverse_words((julong *)n_ints, n, longwords);
! ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
! reverse_words(m, (julong *)m_ints, longwords);
}
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
jint len, jlong inv,
jint *m_ints) {
*** 4184,4222 ****
int longwords = len/2;
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints corresponds to an 16384-bit integer and
// will use here a total of 6k bytes of stack space.
! int total_allocation = longwords * sizeof (unsigned long) * 3;
guarantee(total_allocation <= 8192, "must be");
! unsigned long *scratch = (unsigned long *)alloca(total_allocation);
// Local scratch arrays
! unsigned long
*a = scratch + 0 * longwords,
*n = scratch + 1 * longwords,
*m = scratch + 2 * longwords;
! reverse_words((unsigned long *)a_ints, a, longwords);
! reverse_words((unsigned long *)n_ints, n, longwords);
//montgomery_square fails to pass BigIntegerTest on solaris amd64
//on jdk7 and jdk8.
#ifndef SOLARIS
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
#else
if (0) {
#endif
! ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
} else {
! ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
}
! reverse_words(m, (unsigned long *)m_ints, longwords);
}
! #endif // WINDOWS
#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
--- 4213,4251 ----
int longwords = len/2;
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints corresponds to an 16384-bit integer and
// will use here a total of 6k bytes of stack space.
! int total_allocation = longwords * sizeof (julong) * 3;
guarantee(total_allocation <= 8192, "must be");
! julong *scratch = (julong *)alloca(total_allocation);
// Local scratch arrays
! julong
*a = scratch + 0 * longwords,
*n = scratch + 1 * longwords,
*m = scratch + 2 * longwords;
! reverse_words((julong *)a_ints, a, longwords);
! reverse_words((julong *)n_ints, n, longwords);
//montgomery_square fails to pass BigIntegerTest on solaris amd64
//on jdk7 and jdk8.
#ifndef SOLARIS
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
#else
if (0) {
#endif
! ::montgomery_square(a, n, m, (julong)inv, longwords);
} else {
! ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
}
! reverse_words(m, (julong *)m_ints, longwords);
}
! #endif // !(defined(_WINDOWS) && MSVC < VS2017)
#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
< prev index next >