8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #ifndef _WINDOWS 27 #include "alloca.h" 28 #endif 29 #include "asm/macroAssembler.hpp" 30 #include "asm/macroAssembler.inline.hpp" 31 #include "code/debugInfoRec.hpp" 32 #include "code/icBuffer.hpp" 33 #include "code/vtableStubs.hpp" 34 #include "interpreter/interpreter.hpp" 35 #include "oops/compiledICHolder.hpp" 36 #include "prims/jvmtiRedefineClassesTrace.hpp" 37 #include "runtime/sharedRuntime.hpp" 38 #include "runtime/vframeArray.hpp" 39 #include "vmreg_x86.inline.hpp" 40 #ifdef COMPILER1 41 #include "c1/c1_Runtime1.hpp" 42 #endif 43 #ifdef COMPILER2 44 #include "opto/runtime.hpp" 45 #endif 46 47 #define __ masm-> 3954 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD); 3955 3956 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3957 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3958 3959 // ------------- 3960 // make sure all code is generated 3961 masm->flush(); 3962 3963 // return the blob 3964 // frame_size_words or bytes?? 
return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
}


//------------------------------Montgomery multiplication------------------------
//

#ifndef _WINDOWS

// ASM_SUBTRACT is defined unconditionally here, so as written only the
// inline-assembly sub() below is compiled; the int128 variant in the
// #else branch is a fallback that is currently never selected.
#define ASM_SUBTRACT

#ifdef ASM_SUBTRACT
// Subtract 0:b from carry:a.  Return carry.
//
// a[] and b[] are len-word numbers, least significant word first (the
// borrow travels from index 0 upwards).  a[] is updated in place with
// a - b; the borrow out of the top word is then subtracted from carry
// and the result is returned.
//
// How the assembly works:
//   clc                  start with no incoming borrow
//   mov/sbb              a[i] -= b[i] + borrow, one word per iteration
//   inc/dec              advance i and count down cnt; these instructions
//                        leave the carry flag alone, so the borrow survives
//                        from one sbb to the next
//   jne 0b               repeat until cnt reaches zero
//   mov/sbb $0           tmp = carry - final borrow
//
// The loop condition is tested after the body, so the body runs at least
// once: len must be >= 1.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, long len) {
  long i = 0, cnt = len;
  unsigned long tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}
#else // !ASM_SUBTRACT
// Signed 128-bit integer used to carry the borrow between words.
typedef int __attribute__((mode(TI))) int128;

// Subtract 0:b from carry:a.  Return carry.
//
// Portable counterpart of the assembly version above: a[] -= b[] in
// place, word by word, and the final borrow is folded into carry.
// NOTE(review): len is an int here but a long in the assembly variant.
static unsigned long
sub(unsigned long a[], unsigned long b[], unsigned long carry, int len) {
  int128 tmp = 0;
  int i;
  for (i = 0; i < len; i++) {
    tmp += a[i];
    tmp -= b[i];
    a[i] = tmp;   // store the low 64 bits of the difference
    tmp >>= 64;   // keep only the borrow: 0 or -1, as asserted below
    assert(-1 <= tmp && tmp <= 0, "invariant");
  }
  return tmp + carry;
}
#endif // ! ASM_SUBTRACT

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// B is placed in rax ("a" constraint) and "mul %5" multiplies it by A,
// leaving the 128-bit product in rdx:rax.  The low half is added to T0,
// the high half plus carry to T1, and the remaining carry to T2.
// hi and lo are never read; they exist only so the output constraints
// tell the compiler that rdx and rax are overwritten.
// NOTE(review): T2 uses the "g" constraint and so may be a memory
// operand, in which case "adc $0, %4" has no register operand to fix
// its size -- confirm the assembler emits the 64-bit form.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
               : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
               : "r"(A), "a"(B) : "cc");                        \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.
4025 #define MACC2(A, B, T0, T1, T2) \ 4026 do { \ 4027 unsigned long hi, lo; \ 4028 asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4;" \ 4029 "add %%rax, %2; adc %%rdx, %3; adc $0, %4" \ 4030 : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \ 4031 : "r"(A), "a"(B) : "cc"); \ 4032 } while(0) 4033 4034 // Fast Montgomery multiplication. The derivation of the algorithm is 4035 // in A Cryptographic Library for the Motorola DSP56000, 4036 // Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4037 4038 static void __attribute__((noinline)) 4039 montgomery_multiply(unsigned long a[], unsigned long b[], unsigned long n[], 4040 unsigned long m[], unsigned long inv, int len) { 4041 unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4042 int i; 4043 4044 assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4045 4046 for (i = 0; i < len; i++) { 4047 int j; 4048 for (j = 0; j < i; j++) { 4049 MACC(a[j], b[i-j], t0, t1, t2); 4050 MACC(m[j], n[i-j], t0, t1, t2); 4051 } 4052 MACC(a[i], b[0], t0, t1, t2); 4053 m[i] = t0 * inv; 4054 MACC(m[i], n[0], t0, t1, t2); 4055 4056 assert(t0 == 0, "broken Montgomery multiply"); 4057 4058 t0 = t1; t1 = t2; t2 = 0; 4059 } 4060 4061 for (i = len; i < 2*len; i++) { 4062 int j; 4063 for (j = i-len+1; j < len; j++) { 4064 MACC(a[j], b[i-j], t0, t1, t2); 4065 MACC(m[j], n[i-j], t0, t1, t2); 4066 } 4067 m[i-len] = t0; 4068 t0 = t1; t1 = t2; t2 = 0; 4069 } 4070 4071 while (t0) 4072 t0 = sub(m, n, t0, len); 4073 } 4074 4075 // Fast Montgomery squaring. This uses asymptotically 25% fewer 4076 // multiplies so it should be up to 25% faster than Montgomery 4077 // multiplication. However, its loop control is more complex and it 4078 // may actually run slower on some machines. 
4079 4080 static void __attribute__((noinline)) 4081 montgomery_square(unsigned long a[], unsigned long n[], 4082 unsigned long m[], unsigned long inv, int len) { 4083 unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4084 int i; 4085 4086 assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4087 4088 for (i = 0; i < len; i++) { 4089 int j; 4090 int end = (i+1)/2; 4091 for (j = 0; j < end; j++) { 4092 MACC2(a[j], a[i-j], t0, t1, t2); 4093 MACC(m[j], n[i-j], t0, t1, t2); 4094 } 4095 if ((i & 1) == 0) { 4096 MACC(a[j], a[j], t0, t1, t2); 4097 } 4098 for (; j < i; j++) { 4099 MACC(m[j], n[i-j], t0, t1, t2); 4100 } 4101 m[i] = t0 * inv; 4102 MACC(m[i], n[0], t0, t1, t2); 4103 4104 assert(t0 == 0, "broken Montgomery square"); 4105 4106 t0 = t1; t1 = t2; t2 = 0; 4112 int j; 4113 for (j = start; j < end; j++) { 4114 MACC2(a[j], a[i-j], t0, t1, t2); 4115 MACC(m[j], n[i-j], t0, t1, t2); 4116 } 4117 if ((i & 1) == 0) { 4118 MACC(a[j], a[j], t0, t1, t2); 4119 } 4120 for (; j < len; j++) { 4121 MACC(m[j], n[i-j], t0, t1, t2); 4122 } 4123 m[i-len] = t0; 4124 t0 = t1; t1 = t2; t2 = 0; 4125 } 4126 4127 while (t0) 4128 t0 = sub(m, n, t0, len); 4129 } 4130 4131 // Swap words in a longword. 4132 static unsigned long swap(unsigned long x) { 4133 return (x << 32) | (x >> 32); 4134 } 4135 4136 // Copy len longwords from s to d, word-swapping as we go. The 4137 // destination array is reversed. 4138 static void reverse_words(unsigned long *s, unsigned long *d, int len) { 4139 d += len; 4140 while(len-- > 0) { 4141 d--; 4142 *d = swap(*s); 4143 s++; 4144 } 4145 } 4146 4147 // The threshold at which squaring is advantageous was determined 4148 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 
4149 #define MONTGOMERY_SQUARING_THRESHOLD 64 4150 4151 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 4152 jint len, jlong inv, 4153 jint *m_ints) { 4154 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 4155 int longwords = len/2; 4156 4157 // Make very sure we don't use so much space that the stack might 4158 // overflow. 512 jints corresponds to an 16384-bit integer and 4159 // will use here a total of 8k bytes of stack space. 4160 int total_allocation = longwords * sizeof (unsigned long) * 4; 4161 guarantee(total_allocation <= 8192, "must be"); 4162 unsigned long *scratch = (unsigned long *)alloca(total_allocation); 4163 4164 // Local scratch arrays 4165 unsigned long 4166 *a = scratch + 0 * longwords, 4167 *b = scratch + 1 * longwords, 4168 *n = scratch + 2 * longwords, 4169 *m = scratch + 3 * longwords; 4170 4171 reverse_words((unsigned long *)a_ints, a, longwords); 4172 reverse_words((unsigned long *)b_ints, b, longwords); 4173 reverse_words((unsigned long *)n_ints, n, longwords); 4174 4175 ::montgomery_multiply(a, b, n, m, (unsigned long)inv, longwords); 4176 4177 reverse_words(m, (unsigned long *)m_ints, longwords); 4178 } 4179 4180 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 4181 jint len, jlong inv, 4182 jint *m_ints) { 4183 assert(len % 2 == 0, "array length in montgomery_square must be even"); 4184 int longwords = len/2; 4185 4186 // Make very sure we don't use so much space that the stack might 4187 // overflow. 512 jints corresponds to an 16384-bit integer and 4188 // will use here a total of 6k bytes of stack space. 
4189 int total_allocation = longwords * sizeof (unsigned long) * 3; 4190 guarantee(total_allocation <= 8192, "must be"); 4191 unsigned long *scratch = (unsigned long *)alloca(total_allocation); 4192 4193 // Local scratch arrays 4194 unsigned long 4195 *a = scratch + 0 * longwords, 4196 *n = scratch + 1 * longwords, 4197 *m = scratch + 2 * longwords; 4198 4199 reverse_words((unsigned long *)a_ints, a, longwords); 4200 reverse_words((unsigned long *)n_ints, n, longwords); 4201 4202 //montgomery_square fails to pass BigIntegerTest on solaris amd64 4203 //on jdk7 and jdk8. 4204 #ifndef SOLARIS 4205 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 4206 #else 4207 if (0) { 4208 #endif 4209 ::montgomery_square(a, n, m, (unsigned long)inv, longwords); 4210 } else { 4211 ::montgomery_multiply(a, a, n, m, (unsigned long)inv, longwords); 4212 } 4213 4214 reverse_words(m, (unsigned long *)m_ints, longwords); 4215 } 4216 4217 #endif // WINDOWS 4218 4219 #ifdef COMPILER2 4220 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame 4221 // 4222 //------------------------------generate_exception_blob--------------------------- 4223 // creates exception blob at the end 4224 // Using exception blob, this code is jumped from a compiled method. 4225 // (see emit_exception_handler in x86_64.ad file) 4226 // 4227 // Given an exception pc at a call we call into the runtime for the 4228 // handler in this method. This handler might merely restore state 4229 // (i.e. callee save registers) unwind the frame and jump to the 4230 // exception handler for the nmethod if there is no Java level handler 4231 // for the nmethod. 4232 // 4233 // This code is entered with a jmp. 4234 // 4235 // Arguments: 4236 // rax: exception oop 4237 // rdx: exception pc | 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #ifndef _WINDOWS 27 #include "alloca.h" 28 #else //WINDOWS 29 #include <intrin.h> 30 #endif 31 #include "asm/macroAssembler.hpp" 32 #include "asm/macroAssembler.inline.hpp" 33 #include "code/debugInfoRec.hpp" 34 #include "code/icBuffer.hpp" 35 #include "code/vtableStubs.hpp" 36 #include "interpreter/interpreter.hpp" 37 #include "oops/compiledICHolder.hpp" 38 #include "prims/jvmtiRedefineClassesTrace.hpp" 39 #include "runtime/sharedRuntime.hpp" 40 #include "runtime/vframeArray.hpp" 41 #include "vmreg_x86.inline.hpp" 42 #ifdef COMPILER1 43 #include "c1/c1_Runtime1.hpp" 44 #endif 45 #ifdef COMPILER2 46 #include "opto/runtime.hpp" 47 #endif 48 49 #define __ masm-> 3956 __ movptr(Address(r15_thread, JavaThread::vm_result_offset()), (int)NULL_WORD); 3957 3958 __ movptr(rax, Address(r15_thread, Thread::pending_exception_offset())); 3959 __ jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3960 3961 // ------------- 3962 // make sure all code is generated 3963 masm->flush(); 3964 3965 // return the blob 3966 // frame_size_words or bytes?? 3967 return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); 3968 } 3969 3970 3971 //------------------------------Montgomery multiplication------------------------ 3972 // 3973 3974 #ifndef _WINDOWS 3975 3976 // Subtract 0:b from carry:a. Return carry. 
// Multi-word subtract used by the Montgomery routines (non-Windows,
// inline-assembly version).
//
// a[] and b[] are len-word numbers, least significant word first.  a[]
// is updated in place with a - b; the borrow out of the top word is
// then subtracted from carry and the result is returned.
//
// How the assembly works:
//   clc                  start with no incoming borrow
//   mov/sbb              a[i] -= b[i] + borrow, one word per iteration
//   inc/dec              advance i and count down cnt; these instructions
//                        leave the carry flag alone, so the borrow survives
//                        from one sbb to the next
//   jne 0b               repeat until cnt reaches zero
//   mov/sbb $0           tmp = carry - final borrow
//
// The loop condition is tested after the body, so the body runs at least
// once: len must be >= 1.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long long i = 0, cnt = len;
  julong tmp;
  asm volatile("clc; "
               "0: ; "
               "mov (%[b], %[i], 8), %[tmp]; "
               "sbb %[tmp], (%[a], %[i], 8); "
               "inc %[i]; dec %[cnt]; "
               "jne 0b; "
               "mov %[carry], %[tmp]; sbb $0, %[tmp]; "
               : [i]"+r"(i), [cnt]"+r"(cnt), [tmp]"=&r"(tmp)
               : [a]"r"(a), [b]"r"(b), [carry]"r"(carry)
               : "memory");
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// B is placed in rax ("a" constraint) and "mul %5" multiplies it by A,
// leaving the 128-bit product in rdx:rax.  The low half is added to T0,
// the high half plus carry to T1, and the remaining carry to T2.
// hi and lo are never read; they exist only so the output constraints
// tell the compiler that rdx and rax are overwritten.
// NOTE(review): T2 uses the "g" constraint and so may be a memory
// operand, in which case "adc $0, %4" has no register operand to fix
// its size -- confirm the assembler emits the 64-bit form.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  unsigned long hi, lo;                                         \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4" \
               : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
               : "r"(A), "a"(B) : "cc");                        \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.  The product is computed once and the add/adc/adc
// sequence is simply issued a second time.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  unsigned long hi, lo;                                         \
  asm volatile("mul %5; add %%rax, %2; adc %%rdx, %3; adc $0, %4;" \
               "add %%rax, %2; adc %%rdx, %3; adc $0, %4"       \
               : "=&d"(hi), "=a"(lo), "+r"(T0), "+r"(T1), "+g"(T2) \
               : "r"(A), "a"(B) : "cc");                        \
 } while(0)

#else //_WINDOWS

// Visual Studio 2010 does not have the _addcarry_u64 intrinsic
// (TBD: does 2015?)
// When _MSC_VER is below 1910 nothing in this branch is compiled, so
// neither sub() nor MACC/MACC2 exist on Windows in that configuration.
#if defined(_WINDOWS) && _MSC_VER >= 1910
// Windows version of sub(), written with the _addcarry_u64 intrinsic.
//
// a - b is computed as a + ~b + 1: the carry c is seeded with 1 and each
// word adds a[i] + ~b[i] + c.  A carry out of 1 therefore means "no
// borrow".  The last step forms carry + ~0 + c, i.e. carry - 1 + c,
// which leaves carry unchanged when there was no borrow and decrements
// it otherwise.  The carry out of that last addition is not used.
static julong
sub(julong a[], julong b[], julong carry, long len) {
  long i;
  julong tmp;
  unsigned char c = 1;
  for (i = 0; i < len; i++) {
    c = _addcarry_u64(c, a[i], ~b[i], &tmp);
    a[i] = tmp;
  }
  c = _addcarry_u64(c, carry, ~0, &tmp);
  return tmp;
}

// Multiply (unsigned) Long A by Long B, accumulating the double-
// length result into the accumulator formed of T0, T1, and T2.
//
// _umul128 returns the low half of the product and stores the high half
// in hi; three chained _addcarry_u64 calls then add lo to T0, hi to T1
// and the final carry to T2.
#define MACC(A, B, T0, T1, T2)                                  \
do {                                                            \
  julong hi, lo;                                                \
  lo = _umul128(A, B, &hi);                                     \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);              \
  c = _addcarry_u64(c, hi, T1, &T1);                            \
  _addcarry_u64(c, T2, 0, &T2);                                 \
 } while(0)

// As above, but add twice the double-length result into the
// accumulator.  The product is computed once and the three-step
// addition is performed twice.
#define MACC2(A, B, T0, T1, T2)                                 \
do {                                                            \
  julong hi, lo;                                                \
  lo = _umul128(A, B, &hi);                                     \
  unsigned char c = _addcarry_u64(0, lo, T0, &T0);              \
  c = _addcarry_u64(c, hi, T1, &T1);                            \
  _addcarry_u64(c, T2, 0, &T2);                                 \
  c = _addcarry_u64(0, lo, T0, &T0);                            \
  c = _addcarry_u64(c, hi, T1, &T1);                            \
  _addcarry_u64(c, T2, 0, &T2);                                 \
 } while(0)

#endif // defined(_WINDOWS) && _MSC_VER >= 1910
#endif // !_WINDOWS ... else _WINDOWS

// Everything from here on needs sub(), MACC and MACC2, so it is left
// out on Windows toolchains older than _MSC_VER 1910.
#if !(defined(_WINDOWS) && _MSC_VER < 1910)

// Fast Montgomery multiplication.  The derivation of the algorithm is
// in  A Cryptographic Library for the Motorola DSP56000,
// Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4066 4067 static void NOINLINE 4068 montgomery_multiply(julong a[], julong b[], julong n[], 4069 julong m[], julong inv, int len) { 4070 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4071 int i; 4072 4073 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery multiply"); 4074 4075 for (i = 0; i < len; i++) { 4076 int j; 4077 for (j = 0; j < i; j++) { 4078 MACC(a[j], b[i-j], t0, t1, t2); 4079 MACC(m[j], n[i-j], t0, t1, t2); 4080 } 4081 MACC(a[i], b[0], t0, t1, t2); 4082 m[i] = t0 * inv; 4083 MACC(m[i], n[0], t0, t1, t2); 4084 4085 assert(t0 == 0, "broken Montgomery multiply"); 4086 4087 t0 = t1; t1 = t2; t2 = 0; 4088 } 4089 4090 for (i = len; i < 2*len; i++) { 4091 int j; 4092 for (j = i-len+1; j < len; j++) { 4093 MACC(a[j], b[i-j], t0, t1, t2); 4094 MACC(m[j], n[i-j], t0, t1, t2); 4095 } 4096 m[i-len] = t0; 4097 t0 = t1; t1 = t2; t2 = 0; 4098 } 4099 4100 while (t0) 4101 t0 = sub(m, n, t0, len); 4102 } 4103 4104 // Fast Montgomery squaring. This uses asymptotically 25% fewer 4105 // multiplies so it should be up to 25% faster than Montgomery 4106 // multiplication. However, its loop control is more complex and it 4107 // may actually run slower on some machines. 
4108 4109 static void NOINLINE 4110 montgomery_square(julong a[], julong n[], 4111 julong m[], julong inv, int len) { 4112 julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4113 int i; 4114 4115 assert(inv * n[0] == ULLONG_MAX, "broken inverse in Montgomery square"); 4116 4117 for (i = 0; i < len; i++) { 4118 int j; 4119 int end = (i+1)/2; 4120 for (j = 0; j < end; j++) { 4121 MACC2(a[j], a[i-j], t0, t1, t2); 4122 MACC(m[j], n[i-j], t0, t1, t2); 4123 } 4124 if ((i & 1) == 0) { 4125 MACC(a[j], a[j], t0, t1, t2); 4126 } 4127 for (; j < i; j++) { 4128 MACC(m[j], n[i-j], t0, t1, t2); 4129 } 4130 m[i] = t0 * inv; 4131 MACC(m[i], n[0], t0, t1, t2); 4132 4133 assert(t0 == 0, "broken Montgomery square"); 4134 4135 t0 = t1; t1 = t2; t2 = 0; 4141 int j; 4142 for (j = start; j < end; j++) { 4143 MACC2(a[j], a[i-j], t0, t1, t2); 4144 MACC(m[j], n[i-j], t0, t1, t2); 4145 } 4146 if ((i & 1) == 0) { 4147 MACC(a[j], a[j], t0, t1, t2); 4148 } 4149 for (; j < len; j++) { 4150 MACC(m[j], n[i-j], t0, t1, t2); 4151 } 4152 m[i-len] = t0; 4153 t0 = t1; t1 = t2; t2 = 0; 4154 } 4155 4156 while (t0) 4157 t0 = sub(m, n, t0, len); 4158 } 4159 4160 // Swap words in a longword. 4161 static julong swap(julong x) { 4162 return (x << 32) | (x >> 32); 4163 } 4164 4165 // Copy len longwords from s to d, word-swapping as we go. The 4166 // destination array is reversed. 4167 static void reverse_words(julong *s, julong *d, int len) { 4168 d += len; 4169 while(len-- > 0) { 4170 d--; 4171 *d = swap(*s); 4172 s++; 4173 } 4174 } 4175 4176 // The threshold at which squaring is advantageous was determined 4177 // experimentally on an i7-3930K (Ivy Bridge) CPU @ 3.5GHz. 
4178 #define MONTGOMERY_SQUARING_THRESHOLD 64 4179 4180 void SharedRuntime::montgomery_multiply(jint *a_ints, jint *b_ints, jint *n_ints, 4181 jint len, jlong inv, 4182 jint *m_ints) { 4183 assert(len % 2 == 0, "array length in montgomery_multiply must be even"); 4184 int longwords = len/2; 4185 4186 // Make very sure we don't use so much space that the stack might 4187 // overflow. 512 jints corresponds to an 16384-bit integer and 4188 // will use here a total of 8k bytes of stack space. 4189 int total_allocation = longwords * sizeof (julong) * 4; 4190 guarantee(total_allocation <= 8192, "must be"); 4191 julong *scratch = (julong *)alloca(total_allocation); 4192 4193 // Local scratch arrays 4194 julong 4195 *a = scratch + 0 * longwords, 4196 *b = scratch + 1 * longwords, 4197 *n = scratch + 2 * longwords, 4198 *m = scratch + 3 * longwords; 4199 4200 reverse_words((julong *)a_ints, a, longwords); 4201 reverse_words((julong *)b_ints, b, longwords); 4202 reverse_words((julong *)n_ints, n, longwords); 4203 4204 ::montgomery_multiply(a, b, n, m, (julong)inv, longwords); 4205 4206 reverse_words(m, (julong *)m_ints, longwords); 4207 } 4208 4209 void SharedRuntime::montgomery_square(jint *a_ints, jint *n_ints, 4210 jint len, jlong inv, 4211 jint *m_ints) { 4212 assert(len % 2 == 0, "array length in montgomery_square must be even"); 4213 int longwords = len/2; 4214 4215 // Make very sure we don't use so much space that the stack might 4216 // overflow. 512 jints corresponds to an 16384-bit integer and 4217 // will use here a total of 6k bytes of stack space. 
4218 int total_allocation = longwords * sizeof (julong) * 3; 4219 guarantee(total_allocation <= 8192, "must be"); 4220 julong *scratch = (julong *)alloca(total_allocation); 4221 4222 // Local scratch arrays 4223 julong 4224 *a = scratch + 0 * longwords, 4225 *n = scratch + 1 * longwords, 4226 *m = scratch + 2 * longwords; 4227 4228 reverse_words((julong *)a_ints, a, longwords); 4229 reverse_words((julong *)n_ints, n, longwords); 4230 4231 //montgomery_square fails to pass BigIntegerTest on solaris amd64 4232 //on jdk7 and jdk8. 4233 #ifndef SOLARIS 4234 if (len >= MONTGOMERY_SQUARING_THRESHOLD) { 4235 #else 4236 if (0) { 4237 #endif 4238 ::montgomery_square(a, n, m, (julong)inv, longwords); 4239 } else { 4240 ::montgomery_multiply(a, a, n, m, (julong)inv, longwords); 4241 } 4242 4243 reverse_words(m, (julong *)m_ints, longwords); 4244 } 4245 4246 #endif // !(defined(_WINDOWS) && MSVC < VS2017) 4247 4248 #ifdef COMPILER2 4249 // This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame 4250 // 4251 //------------------------------generate_exception_blob--------------------------- 4252 // creates exception blob at the end 4253 // Using exception blob, this code is jumped from a compiled method. 4254 // (see emit_exception_handler in x86_64.ad file) 4255 // 4256 // Given an exception pc at a call we call into the runtime for the 4257 // handler in this method. This handler might merely restore state 4258 // (i.e. callee save registers) unwind the frame and jump to the 4259 // exception handler for the nmethod if there is no Java level handler 4260 // for the nmethod. 4261 // 4262 // This code is entered with a jmp. 4263 // 4264 // Arguments: 4265 // rax: exception oop 4266 // rdx: exception pc |