< prev index next >

src/cpu/x86/vm/sharedRuntime_x86_64.cpp

Print this page




   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"


  28 #endif
  29 #include "asm/macroAssembler.hpp"
  30 #include "asm/macroAssembler.inline.hpp"
  31 #include "code/debugInfoRec.hpp"
  32 #include "code/icBuffer.hpp"
  33 #include "code/vtableStubs.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "oops/compiledICHolder.hpp"
  36 #include "prims/jvmtiRedefineClassesTrace.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/vframeArray.hpp"
  39 #include "vmreg_x86.inline.hpp"
  40 #ifdef COMPILER1
  41 #include "c1/c1_Runtime1.hpp"
  42 #endif
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #define __ masm->


3954   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3955 
3956   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3957   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3958 
3959   // -------------
3960   // make sure all code is generated
3961   masm->flush();
3962 
3963   // return the  blob
3964   // frame_size_words or bytes??
3965   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3966 }
3967 
3968 
3969 //------------------------------Montgomery multiplication------------------------
3970 //
3971 
3972 #ifndef _WINDOWS
3973 
3974 #define ASM_SUBTRACT
3975 
3976 #ifdef ASM_SUBTRACT
// Subtract 0:b from carry:a.  Return carry.
//
// a and b are arrays of len 64-bit words, least significant word
// first.  b is subtracted from a in place, the borrow rippling
// upwards through the words; the final borrow is then subtracted
// from carry and that value is returned.
//
// The loop relies on INC and DEC leaving the x86 carry flag alone, so
// the borrow produced by one SBB is still live for the next one.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  // The asm loop only tests its counter after the first iteration, so
  // a non-positive length would run off the end of both arrays.  With
  // nothing to subtract there is no borrow and carry is unchanged.
  if (len <= 0) {
    return carry;
  }

  long i = 0, cnt = len;
  unsigned long tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory", "cc");
  return tmp;
}
3994 #else // ASM_SUBTRACT
3995 typedef int __attribute__((mode(TI))) int128;
3996 
3997 // Subtract 0:b from carry:a.  Return carry.
3998 static unsigned long
3999 sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
4000   int128 tmp = 0;
4001   int i;
4002   for (i = 0; i < len; i++) {
4003     tmp += a[i];
4004     tmp -= b[i];
4005     a[i] = tmp;
4006     tmp >>= 64;
4007     assert(-1 <= tmp && tmp <= 0, "invariant");
4008   }
4009   return tmp + carry;
4010 }
4011 #endif // ! ASM_SUBTRACT
4012 
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// B is loaded into rax and multiplied by A; the 128-bit product in
// rdx:rax is added to T1:T0 and the carry out of T1 is added to T2.
// T0, T1 and T2 must be 64-bit lvalues.
//
// T2 is constrained "+g" and may therefore be a memory operand.  The
// final add-with-carry has an immediate source, so it carries an
// explicit q suffix: without it the operand size of a memory
// destination is not determined by the instruction.
#define MACC(A, B, T0, T1, T2)                                      \
do {                                                                \
  unsigned long hi, lo;                                             \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adcq $0, %4"  \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
           : "r"(A), "a"(B) : "cc");                                \
 } while(0)
4022 
// As above, but add twice the double-length result into the
// accumulator.
//
// The product A * B is formed once in rdx:rax and then added into
// T2:T1:T0 two times.  T0, T1 and T2 must be 64-bit lvalues.
//
// T2 is constrained "+g" and may therefore be a memory operand.  Both
// add-with-carry instructions that target it have an immediate
// source, so they carry an explicit q suffix: without it the operand
// size of a memory destination is not determined by the instruction.
#define MACC2(A, B, T0, T1, T2)                                     \
do {                                                                \
  unsigned long hi, lo;                                             \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adcq $0, %4;" \
           "add %%rax, %2; adc %%rdx, %3; adcq $0, %4"              \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
           : "r"(A), "a"(B) : "cc");                                \
 } while(0)
4033 
















































4034 // Fast Montgomery multiplication.  The derivation of the algorithm is
4035 // in  A Cryptographic Library for the Motorola DSP56000,
4036 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4037 
4038 static void __attribute__((noinline))
4039 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[],
4040                     unsigned long m[], unsigned long inv, int len) {
4041   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4042   int i;
4043 
4044   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4045 
4046   for (i = 0; i < len; i++) {
4047     int j;
4048     for (j = 0; j < i; j++) {
4049       MACC(a[j], b[i-j], t0, t1, t2);
4050       MACC(m[j], n[i-j], t0, t1, t2);
4051     }
4052     MACC(a[i], b[0], t0, t1, t2);
4053     m[i] = t0 * inv;
4054     MACC(m[i], n[0], t0, t1, t2);
4055 
4056     assert(t0 == 0, "broken Montgomery multiply");
4057 
4058     t0 = t1; t1 = t2; t2 = 0;
4059   }
4060 
4061   for (i = len; i < 2*len; i++) {
4062     int j;
4063     for (j = i-len+1; j < len; j++) {
4064       MACC(a[j], b[i-j], t0, t1, t2);
4065       MACC(m[j], n[i-j], t0, t1, t2);
4066     }
4067     m[i-len] = t0;
4068     t0 = t1; t1 = t2; t2 = 0;
4069   }
4070 
4071   while (t0)
4072     t0 = sub(m, n, t0, len);
4073 }
4074 
4075 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
4076 // multiplies so it should be up to 25% faster than Montgomery
4077 // multiplication.  However, its loop control is more complex and it
4078 // may actually run slower on some machines.
4079 
4080 static void __attribute__((noinline))
4081 montgomery_square(unsigned long a[], unsigned long n[],
4082                   unsigned long m[], unsigned long inv, int len) {
4083   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4084   int i;
4085 
4086   assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4087 
4088   for (i = 0; i < len; i++) {
4089     int j;
4090     int end = (i+1)/2;
4091     for (j = 0; j < end; j++) {
4092       MACC2(a[j], a[i-j], t0, t1, t2);
4093       MACC(m[j], n[i-j], t0, t1, t2);
4094     }
4095     if ((i & 1) == 0) {
4096       MACC(a[j], a[j], t0, t1, t2);
4097     }
4098     for (; j < i; j++) {
4099       MACC(m[j], n[i-j], t0, t1, t2);
4100     }
4101     m[i] = t0 * inv;
4102     MACC(m[i], n[0], t0, t1, t2);
4103 
4104     assert(t0 == 0, "broken Montgomery square");
4105 
4106     t0 = t1; t1 = t2; t2 = 0;


4112     int j;
4113     for (j = start; j < end; j++) {
4114       MACC2(a[j], a[i-j], t0, t1, t2);
4115       MACC(m[j], n[i-j], t0, t1, t2);
4116     }
4117     if ((i & 1) == 0) {
4118       MACC(a[j], a[j], t0, t1, t2);
4119     }
4120     for (; j < len; j++) {
4121       MACC(m[j], n[i-j], t0, t1, t2);
4122     }
4123     m[i-len] = t0;
4124     t0 = t1; t1 = t2; t2 = 0;
4125   }
4126 
4127   while (t0)
4128     t0 = sub(m, n, t0, len);
4129 }
4130 
// Exchange the upper and lower 32-bit halves of a 64-bit longword.
static unsigned long swap(unsigned long x) {
  const unsigned long low_to_high = x << 32;
  const unsigned long high_to_low = x >> 32;
  return low_to_high | high_to_low;
}
4135 
4136 // Copy len longwords from s to d, word-swapping as we go.  The
4137 // destination array is reversed.
4138 static void reverse_words(unsigned long *s, unsigned long *d, int len) {
4139   d += len;
4140   while(len-- > 0) {
4141     d--;
4142     *d = swap(*s);
4143     s++;
4144   }
4145 }
4146 
4147 // The threshold at which squaring is advantageous was determined
4148 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
4149 #define MONTGOMERY_SQUARING_THRESHOLD 64
4150 
4151 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
4152                                         jint len, jlong inv,
4153                                         jint *m_ints) {
4154   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
4155   int longwords = len/2;
4156 
4157   // Make very sure we don't use so much space that the stack might
4158   // overflow.  512 jints corresponds to an 16384-bit integer and
4159   // will use here a total of 8k bytes of stack space.
4160   int total_allocation = longwords * sizeof (unsigned long) * 4;
4161   guarantee(total_allocation <= 8192, "must be");
4162   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
4163 
4164   // Local scratch arrays
4165   unsigned long
4166     *a = scratch + 0 * longwords,
4167     *b = scratch + 1 * longwords,
4168     *n = scratch + 2 * longwords,
4169     *m = scratch + 3 * longwords;
4170 
4171   reverse_words((unsigned long *)a_ints, a, longwords);
4172   reverse_words((unsigned long *)b_ints, b, longwords);
4173   reverse_words((unsigned long *)n_ints, n, longwords);
4174 
4175   ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords);
4176 
4177   reverse_words(m, (unsigned long *)m_ints, longwords);
4178 }
4179 
4180 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
4181                                       jint len, jlong inv,
4182                                       jint *m_ints) {
4183   assert(len % 2 == 0, "array length in montgomery_square must be even");
4184   int longwords = len/2;
4185 
4186   // Make very sure we don't use so much space that the stack might
4187   // overflow.  512 jints corresponds to an 16384-bit integer and
4188   // will use here a total of 6k bytes of stack space.
4189   int total_allocation = longwords * sizeof (unsigned long) * 3;
4190   guarantee(total_allocation <= 8192, "must be");
4191   unsigned long *scratch = (unsigned long *)alloca(total_allocation);
4192 
4193   // Local scratch arrays
4194   unsigned long
4195     *a = scratch + 0 * longwords,
4196     *n = scratch + 1 * longwords,
4197     *m = scratch + 2 * longwords;
4198 
4199   reverse_words((unsigned long *)a_ints, a, longwords);
4200   reverse_words((unsigned long *)n_ints, n, longwords);
4201 
4202   //montgomery_square fails to pass BigIntegerTest on solaris amd64
4203   //on jdk7 and jdk8.
4204 #ifndef SOLARIS
4205   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
4206 #else
4207   if (0) {
4208 #endif
4209     ::montgomery_square(a, n, m, (unsigned long)inv, longwords);
4210   } else {
4211     ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords);
4212   }
4213 
4214   reverse_words(m, (unsigned long *)m_ints, longwords);
4215 }
4216 
4217 #endif // WINDOWS
4218 
4219 #ifdef COMPILER2
4220 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
4221 //
4222 //------------------------------generate_exception_blob---------------------------
4223 // creates exception blob at the end
4224 // Using exception blob, this code is jumped from a compiled method.
4225 // (see emit_exception_handler in x86_64.ad file)
4226 //
4227 // Given an exception pc at a call we call into the runtime for the
4228 // handler in this method. This handler might merely restore state
4229 // (i.e. callee save registers) unwind the frame and jump to the
4230 // exception handler for the nmethod if there is no Java level handler
4231 // for the nmethod.
4232 //
4233 // This code is entered with a jmp.
4234 //
4235 // Arguments:
4236 //   rax: exception oop
4237 //   rdx: exception pc




   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #ifndef _WINDOWS
  27 #include "alloca.h"
  28 #else //WINDOWS
  29 #include <intrin.h>
  30 #endif
  31 #include "asm/macroAssembler.hpp"
  32 #include "asm/macroAssembler.inline.hpp"
  33 #include "code/debugInfoRec.hpp"
  34 #include "code/icBuffer.hpp"
  35 #include "code/vtableStubs.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "oops/compiledICHolder.hpp"
  38 #include "prims/jvmtiRedefineClassesTrace.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/vframeArray.hpp"
  41 #include "vmreg_x86.inline.hpp"
  42 #ifdef COMPILER1
  43 #include "c1/c1_Runtime1.hpp"
  44 #endif
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #define __ masm->


3956   __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD);
3957 
3958   __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset()));
3959   __ jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3960 
3961   // -------------
3962   // make sure all code is generated
3963   masm->flush();
3964 
3965   // return the  blob
3966   // frame_size_words or bytes??
3967   return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
3968 }
3969 
3970 
3971 //------------------------------Montgomery multiplication------------------------
3972 //
3973 
3974 #ifndef _WINDOWS
3975 



3976 // Subtract 0:b from carry:a.  Return carry.
3977 static julong
3978 sub(julong a[], julong b[], julong carry, long len) {
3979   long long i = 0, cnt = len;
3980   julong tmp;
3981   asm volatile("clc; "
3982                "0: ; "
3983                "mov (%[b], %[i], 8), %[tmp]; "
3984                "sbb %[tmp], (%[a], %[i], 8); "
3985                "inc %[i]; dec %[cnt]; "
3986                "jne 0b; "
3987                "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
3988                : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
3989                : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
3990                : "memory");
3991   return tmp;
3992 }


















3993 
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// B is loaded into rax and multiplied by A; the 128-bit product in
// rdx:rax is added to T1:T0 and the carry out of T1 is added to T2.
// T0, T1 and T2 must be 64-bit lvalues.
//
// T2 is constrained "+g" and may therefore be a memory operand.  The
// final add-with-carry has an immediate source, so it carries an
// explicit q suffix: without it the operand size of a memory
// destination is not determined by the instruction.
#define MACC(A, B, T0, T1, T2)                                      \
do {                                                                \
  unsigned long hi, lo;                                             \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adcq $0, %4"  \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
           : "r"(A), "a"(B) : "cc");                                \
 } while(0)
4003 
// As above, but add twice the double-length result into the
// accumulator.
//
// The product A * B is formed once in rdx:rax and then added into
// T2:T1:T0 two times.  T0, T1 and T2 must be 64-bit lvalues.
//
// T2 is constrained "+g" and may therefore be a memory operand.  Both
// add-with-carry instructions that target it have an immediate
// source, so they carry an explicit q suffix: without it the operand
// size of a memory destination is not determined by the instruction.
#define MACC2(A, B, T0, T1, T2)                                     \
do {                                                                \
  unsigned long hi, lo;                                             \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adcq $0, %4;" \
           "add %%rax, %2; adc %%rdx, %3; adcq $0, %4"              \
           : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2)      \
           : "r"(A), "a"(B) : "cc");                                \
 } while(0)
4014 
4015 #else //_WINDOWS
4016 
4017 // Visual Studio 2010 does not have the _addcarry_u64 intrinsic
4018 // (TBD: does 2015?)
4019 #if defined(_WINDOWS) && _MSC_VER >= 1910
4020 static julong
4021 sub(julong a[], julong b[], julong carry, long len) {
4022   long i;
4023   julong tmp;
4024   unsigned char c = 1;
4025   for (i = 0; i < len; i++) {
4026     c = _addcarry_u64(c, a[i], ~b[i], &tmp);
4027     a[i] = tmp;
4028   }
4029   c = _addcarry_u64(c, carry, ~0, &tmp);
4030   return tmp;
4031 }
4032 
// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// _umul128 produces the 128-bit product; its low half is added to T0,
// its high half plus the resulting carry to T1, and the carry out of
// T1 to T2.  T0, T1 and T2 must be julong lvalues.
#define MACC(A, B, T0, T1, T2)                                      \
do {                                                                \
  julong macc_hi;                                                   \
  const julong macc_lo = _umul128(A, B, &macc_hi);                  \
  unsigned char macc_cf = _addcarry_u64(0, macc_lo, T0, &T0);       \
  macc_cf = _addcarry_u64(macc_cf, macc_hi, T1, &T1);               \
  _addcarry_u64(macc_cf, T2, 0, &T2);                               \
 } while(0)
4043 
// As above, but add twice the double-length result into the
// accumulator.
//
// The product is computed once with _umul128 and then added into
// T2:T1:T0 two times, each time with its own carry chain.  T0, T1 and
// T2 must be julong lvalues.
#define MACC2(A, B, T0, T1, T2)                                     \
do {                                                                \
  julong macc_hi;                                                   \
  const julong macc_lo = _umul128(A, B, &macc_hi);                  \
  unsigned char macc_cf = _addcarry_u64(0, macc_lo, T0, &T0);       \
  macc_cf = _addcarry_u64(macc_cf, macc_hi, T1, &T1);               \
  _addcarry_u64(macc_cf, T2, 0, &T2);                               \
  macc_cf = _addcarry_u64(0, macc_lo, T0, &T0);                     \
  macc_cf = _addcarry_u64(macc_cf, macc_hi, T1, &T1);               \
  _addcarry_u64(macc_cf, T2, 0, &T2);                               \
 } while(0)
4057 
4058 #endif // defined(_WINDOWS) && _MSC_VER >= 1910
4059 #endif // defined(_WINDOWS)
4060 
4061 #if !(defined(_WINDOWS) && _MSC_VER < 1910)
4062 
4063 // Fast Montgomery multiplication.  The derivation of the algorithm is
4064 // in  A Cryptographic Library for the Motorola DSP56000,
4065 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4066 
4067 static void NOINLINE
4068 montgomery_multiply(julong a[], julong b[], julong n[],
4069                     julong m[], julong inv, int len) {
4070   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4071   int i;
4072 
4073   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply");
4074 
4075   for (i = 0; i < len; i++) {
4076     int j;
4077     for (j = 0; j < i; j++) {
4078       MACC(a[j], b[i-j], t0, t1, t2);
4079       MACC(m[j], n[i-j], t0, t1, t2);
4080     }
4081     MACC(a[i], b[0], t0, t1, t2);
4082     m[i] = t0 * inv;
4083     MACC(m[i], n[0], t0, t1, t2);
4084 
4085     assert(t0 == 0, "broken Montgomery multiply");
4086 
4087     t0 = t1; t1 = t2; t2 = 0;
4088   }
4089 
4090   for (i = len; i < 2*len; i++) {
4091     int j;
4092     for (j = i-len+1; j < len; j++) {
4093       MACC(a[j], b[i-j], t0, t1, t2);
4094       MACC(m[j], n[i-j], t0, t1, t2);
4095     }
4096     m[i-len] = t0;
4097     t0 = t1; t1 = t2; t2 = 0;
4098   }
4099 
4100   while (t0)
4101     t0 = sub(m, n, t0, len);
4102 }
4103 
4104 // Fast Montgomery squaring.  This uses asymptotically 25% fewer
4105 // multiplies so it should be up to 25% faster than Montgomery
4106 // multiplication.  However, its loop control is more complex and it
4107 // may actually run slower on some machines.
4108 
4109 static void NOINLINE
4110 montgomery_square(julong a[], julong n[],
4111                   julong m[], julong inv, int len) {
4112   julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4113   int i;
4114 
4115   assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square");
4116 
4117   for (i = 0; i < len; i++) {
4118     int j;
4119     int end = (i+1)/2;
4120     for (j = 0; j < end; j++) {
4121       MACC2(a[j], a[i-j], t0, t1, t2);
4122       MACC(m[j], n[i-j], t0, t1, t2);
4123     }
4124     if ((i & 1) == 0) {
4125       MACC(a[j], a[j], t0, t1, t2);
4126     }
4127     for (; j < i; j++) {
4128       MACC(m[j], n[i-j], t0, t1, t2);
4129     }
4130     m[i] = t0 * inv;
4131     MACC(m[i], n[0], t0, t1, t2);
4132 
4133     assert(t0 == 0, "broken Montgomery square");
4134 
4135     t0 = t1; t1 = t2; t2 = 0;


4141     int j;
4142     for (j = start; j < end; j++) {
4143       MACC2(a[j], a[i-j], t0, t1, t2);
4144       MACC(m[j], n[i-j], t0, t1, t2);
4145     }
4146     if ((i & 1) == 0) {
4147       MACC(a[j], a[j], t0, t1, t2);
4148     }
4149     for (; j < len; j++) {
4150       MACC(m[j], n[i-j], t0, t1, t2);
4151     }
4152     m[i-len] = t0;
4153     t0 = t1; t1 = t2; t2 = 0;
4154   }
4155 
4156   while (t0)
4157     t0 = sub(m, n, t0, len);
4158 }
4159 
4160 // Swap words in a longword.
4161 static julong swap(julong x) {
4162   return (x << 32) | (x >> 32);
4163 }
4164 
4165 // Copy len longwords from s to d, word-swapping as we go.  The
4166 // destination array is reversed.
4167 static void reverse_words(julong *s, julong *d, int len) {
4168   d += len;
4169   while(len-- > 0) {
4170     d--;
4171     *d = swap(*s);
4172     s++;
4173   }
4174 }
4175 
4176 // The threshold at which squaring is advantageous was determined
4177 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz.
4178 #define MONTGOMERY_SQUARING_THRESHOLD 64
4179 
4180 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints,
4181                                         jint len, jlong inv,
4182                                         jint *m_ints) {
4183   assert(len % 2 == 0, "array length in montgomery_multiply must be even");
4184   int longwords = len/2;
4185 
4186   // Make very sure we don't use so much space that the stack might
4187   // overflow.  512 jints corresponds to an 16384-bit integer and
4188   // will use here a total of 8k bytes of stack space.
4189   int total_allocation = longwords * sizeof (julong) * 4;
4190   guarantee(total_allocation <= 8192, "must be");
4191   julong *scratch = (julong *)alloca(total_allocation);
4192 
4193   // Local scratch arrays
4194   julong
4195     *a = scratch + 0 * longwords,
4196     *b = scratch + 1 * longwords,
4197     *n = scratch + 2 * longwords,
4198     *m = scratch + 3 * longwords;
4199 
4200   reverse_words((julong *)a_ints, a, longwords);
4201   reverse_words((julong *)b_ints, b, longwords);
4202   reverse_words((julong *)n_ints, n, longwords);
4203 
4204   ::montgomery_multiply(a, b, n, m, (julong)inv, longwords);
4205 
4206   reverse_words(m, (julong *)m_ints, longwords);
4207 }
4208 
4209 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints,
4210                                       jint len, jlong inv,
4211                                       jint *m_ints) {
4212   assert(len % 2 == 0, "array length in montgomery_square must be even");
4213   int longwords = len/2;
4214 
4215   // Make very sure we don't use so much space that the stack might
4216   // overflow.  512 jints corresponds to an 16384-bit integer and
4217   // will use here a total of 6k bytes of stack space.
4218   int total_allocation = longwords * sizeof (julong) * 3;
4219   guarantee(total_allocation <= 8192, "must be");
4220   julong *scratch = (julong *)alloca(total_allocation);
4221 
4222   // Local scratch arrays
4223   julong
4224     *a = scratch + 0 * longwords,
4225     *n = scratch + 1 * longwords,
4226     *m = scratch + 2 * longwords;
4227 
4228   reverse_words((julong *)a_ints, a, longwords);
4229   reverse_words((julong *)n_ints, n, longwords);
4230 
4231   //montgomery_square fails to pass BigIntegerTest on solaris amd64
4232   //on jdk7 and jdk8.
4233 #ifndef SOLARIS
4234   if (len >= MONTGOMERY_SQUARING_THRESHOLD) {
4235 #else
4236   if (0) {
4237 #endif
4238     ::montgomery_square(a, n, m, (julong)inv, longwords);
4239   } else {
4240     ::montgomery_multiply(a, a, n, m, (julong)inv, longwords);
4241   }
4242 
4243   reverse_words(m, (julong *)m_ints, longwords);
4244 }
4245 
4246 #endif // !(defined(_WINDOWS) && MSVC < VS2017)
4247 
4248 #ifdef COMPILER2
4249 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame
4250 //
4251 //------------------------------generate_exception_blob---------------------------
4252 // creates exception blob at the end
4253 // Using exception blob, this code is jumped from a compiled method.
4254 // (see emit_exception_handler in x86_64.ad file)
4255 //
4256 // Given an exception pc at a call we call into the runtime for the
4257 // handler in this method. This handler might merely restore state
4258 // (i.e. callee save registers) unwind the frame and jump to the
4259 // exception handler for the nmethod if there is no Java level handler
4260 // for the nmethod.
4261 //
4262 // This code is entered with a jmp.
4263 //
4264 // Arguments:
4265 //   rax: exception oop
4266 //   rdx: exception pc


< prev index next >