< prev index next >
src/cpu/x86/vm/sharedRuntime_x86_64.cpp
Print this page
@@ -23,10 +23,12 @@
*/
#include "precompiled.hpp"
#ifndef _WINDOWS
#include "alloca.h"
+#else //WINDOWS
+#include <intrin.h>
#endif
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/debugInfoRec.hpp"
#include "code/icBuffer.hpp"
@@ -3969,18 +3971,15 @@
//------------------------------Montgomery multiplication------------------------
//
#ifndef _WINDOWS
-#define ASM_SUBTRACT
-
-#ifdef ASM_SUBTRACT
// Subtract 0:b from carry:a. Return carry.
-static unsigned long
-sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
- long i = 0, cnt = len;
- unsigned long tmp;
+static julong
+sub(julong a[], julong b[], julong carry, long len) {
+ long long i = 0, cnt = len;
+ julong tmp;
asm volatile("clc; "
"0: ; "
"mov (%[b], %[i], 8), %[tmp]; "
"sbb %[tmp], (%[a], %[i], 8); "
"inc %[i]; dec %[cnt]; "
@@ -3989,28 +3988,10 @@
: [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
: [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
: "memory");
return tmp;
}
-#else // ASM_SUBTRACT
-typedef int __attribute__((mode(TI))) int128;
-
-// Subtract 0:b from carry:a. Return carry.
-static unsigned long
-sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
- int128 tmp = 0;
- int i;
- for (i = 0; i < len; i++) {
- tmp += a[i];
- tmp -= b[i];
- a[i] = tmp;
- tmp >>= 64;
- assert(-1 <= tmp && tmp <= 0, "invariant");
- }
- return tmp + carry;
-}
-#endif // ! ASM_SUBTRACT
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
#define MACC(A, B, T0, T1, T2) \
do { \
@@ -4029,21 +4010,69 @@
"add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
: "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
: "r"(A), "a"(B) : "cc"); \
} while(0)
+#else //_WINDOWS
+
+// Visual Studio 2010 does not have the _addcarry_u64 intrinsic
+// (TBD: does Visual Studio 2015 have it?)
+#if defined(_WINDOWS) && _MSC_VER >= 1910
+static julong // Subtract 0:b from carry:a, updating a[] in place; returns the resulting top (carry) word.
+sub(julong a[], julong b[], julong carry, long len) { // len = number of words processed in a[] and b[]
+ long i;
+ julong tmp;
+ unsigned char c = 1; // carry-in of 1 so that a + ~b + 1 == a - b (two's complement subtraction)
+ for (i = 0; i < len; i++) {
+ c = _addcarry_u64(c, a[i], ~b[i], &tmp); // tmp = a[i] + ~b[i] + c; c receives the carry-out (0 means a borrow occurred)
+ a[i] = tmp;
+ }
+ c = _addcarry_u64(c, carry, ~0, &tmp); // ~0 widens to an all-ones operand: tmp = carry - 1 + c, i.e. carry minus the final borrow
+ return tmp;
+}
+
+// MACC(A, B, T0, T1, T2): multiply (unsigned) Long A by Long B and add the double-length
+// product into the accumulator T2:T1:T0 (T0 least significant). T0..T2 must be lvalues.
+#define MACC(A, B, T0, T1, T2) \
+do { \
+ julong hi, lo; \
+ lo = _umul128(A, B, &hi); /* hi:lo = full double-length product of A and B */ \
+ unsigned char c = _addcarry_u64(0, lo, T0, &T0); /* T0 += lo; c = carry-out */ \
+ c = _addcarry_u64(c, hi, T1, &T1); /* T1 += hi + carry from T0 */ \
+ _addcarry_u64(c, T2, 0, &T2); /* T2 += carry from T1; the last carry-out is discarded */ \
+ } while(0)
+
+// MACC2(A, B, T0, T1, T2): as MACC above, but add twice the double-length product of A and B
+// into the accumulator T2:T1:T0, by running the same three-word accumulation two times.
+#define MACC2(A, B, T0, T1, T2) \
+do { \
+ julong hi, lo; \
+ lo = _umul128(A, B, &hi); /* hi:lo = full double-length product of A and B */ \
+ unsigned char c = _addcarry_u64(0, lo, T0, &T0); /* first pass: T0 += lo */ \
+ c = _addcarry_u64(c, hi, T1, &T1); /* T1 += hi + carry */ \
+ _addcarry_u64(c, T2, 0, &T2); /* T2 += carry */ \
+ c = _addcarry_u64(0, lo, T0, &T0); /* second pass: add the same product again */ \
+ c = _addcarry_u64(c, hi, T1, &T1); /* T1 += hi + carry */ \
+ _addcarry_u64(c, T2, 0, &T2); /* T2 += carry; the last carry-out is discarded */ \
+ } while(0)
+
+#endif // defined(_WINDOWS) && _MSC_VER >= 1910
+#endif // defined(_WINDOWS)
+
+#if !(defined(_WINDOWS) && _MSC_VER < 1910)
+
// Fast Montgomery multiplication. The derivation of the algorithm is
// in A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
-static void __attribute__((noinline))
-montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
- unsigned long m[], unsigned long inv, int len) {
- unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+static void NOINLINE
+montgomery_multiply(julong a[], julong b[], julong n[],
+ julong m[], julong inv, int len) {
+ julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
- assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+ assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
for (i = 0; i < len; i++) {
int j;
for (j = 0; j < i; j++) {
MACC(a[j], b[i-j], t0, t1, t2);
@@ -4075,17 +4104,17 @@
// Fast Montgomery squaring. This uses asymptotically 25% fewer
// multiplies so it should be up to 25% faster than Montgomery
// multiplication. However, its loop control is more complex and it
// may actually run slower on some machines.
-static void __attribute__((noinline))
-montgomery_square(unsigned long a[], unsigned long n[],
- unsigned long m[], unsigned long inv, int len) {
- unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
+static void NOINLINE
+montgomery_square(julong a[], julong n[],
+ julong m[], julong inv, int len) {
+ julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
int i;
- assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
+ assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
for (i = 0; i < len; i++) {
int j;
int end = (i+1)/2;
for (j = 0; j < end; j++) {
@@ -4127,17 +4156,17 @@
while (t0)
t0 = sub(m, n, t0, len);
}
// Swap the two 32-bit halves of a longword (assumes julong is 64 bits wide -- TODO confirm).
-static unsigned long swap(unsigned long x) {
+static julong swap(julong x) {
return (x << 32) | (x >> 32); // low half moves to the top, high half moves to the bottom
}
// Copy len longwords from s to d, word-swapping as we go. The
// destination array is reversed.
-static void reverse_words(unsigned long *s, unsigned long *d, int len) {
+static void reverse_words(julong *s, julong *d, int len) {
d += len;
while(len-- > 0) {
d--;
*d = swap(*s);
s++;
@@ -4155,28 +4184,28 @@
int longwords = len/2;
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints corresponds to a 16384-bit integer and
// will use here a total of 8k bytes of stack space.
- int total_allocation = longwords * sizeof (unsigned long) * 4;
+ int total_allocation = longwords * sizeof (julong) * 4;
guarantee(total_allocation <= 8192, "must be");
- unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+ julong *scratch = (julong *)alloca(total_allocation);
// Local scratch arrays
- unsigned long
+ julong
*a = scratch + 0 * longwords,
*b = scratch + 1 * longwords,
*n = scratch + 2 * longwords,
*m = scratch + 3 * longwords;
- reverse_words((unsigned long *)a_ints, a, longwords);
- reverse_words((unsigned long *)b_ints, b, longwords);
- reverse_words((unsigned long *)n_ints, n, longwords);
+ reverse_words((julong *)a_ints, a, longwords);
+ reverse_words((julong *)b_ints, b, longwords);
+ reverse_words((julong *)n_ints, n, longwords);
- ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
+ ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
- reverse_words(m, (unsigned long *)m_ints, longwords);
+ reverse_words(m, (julong *)m_ints, longwords);
}
void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
jint len, jlong inv,
jint *m_ints) {
@@ -4184,39 +4213,39 @@
int longwords = len/2;
// Make very sure we don't use so much space that the stack might
// overflow. 512 jints corresponds to a 16384-bit integer and
// will use here a total of 6k bytes of stack space.
- int total_allocation = longwords * sizeof (unsigned long) * 3;
+ int total_allocation = longwords * sizeof (julong) * 3;
guarantee(total_allocation <= 8192, "must be");
- unsigned long *scratch = (unsigned long *)alloca(total_allocation);
+ julong *scratch = (julong *)alloca(total_allocation);
// Local scratch arrays
- unsigned long
+ julong
*a = scratch + 0 * longwords,
*n = scratch + 1 * longwords,
*m = scratch + 2 * longwords;
- reverse_words((unsigned long *)a_ints, a, longwords);
- reverse_words((unsigned long *)n_ints, n, longwords);
+ reverse_words((julong *)a_ints, a, longwords);
+ reverse_words((julong *)n_ints, n, longwords);
//montgomery_square fails to pass BigIntegerTest on solaris amd64
//on jdk7 and jdk8.
#ifndef SOLARIS
if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
#else
if (0) {
#endif
- ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
+ ::montgomery_square(a, n, m, (julong)inv, longwords);
} else {
- ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
+ ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
}
- reverse_words(m, (unsigned long *)m_ints, longwords);
+ reverse_words(m, (julong *)m_ints, longwords);
}
-#endif // WINDOWS
+#endif // !(defined(_WINDOWS) && MSVC < VS2017)
#ifdef COMPILER2
// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
//
//------------------------------generate_exception_blob---------------------------
< prev index next >