37 * is commonly used in cloud frameworks like Kubernetes[1],
38 * AWS[2] and Mesos[3] in a similar way. They spawn containers with
39 * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do
40 * the inverse for determining the number of possible available
41 * CPUs to the JVM inside a container. See JDK-8216366.
42 *
43 * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu
44 * In particular:
45 * When using Docker:
46 * The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially
47 * fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the
48 * --cpu-shares flag in the docker run command.
49 * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
50 * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648
51 * https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30
52 */
53 #define PER_CPU_SHARES 1024
54
// Definitions of OSContainer's static state flags; both start out false.
// NOTE(review): presumably flipped during container initialization, which
// is outside this excerpt -- confirm.
55 bool OSContainer::_is_initialized = false;
56 bool OSContainer::_is_containerized = false;
// Threshold used by read_memory_limit_in_bytes(): a limit read from the
// cgroup files that is >= this value is treated as "Unlimited".
// No initializer is given here; NOTE(review): presumably assigned during
// initialization elsewhere in the file -- confirm.
57 julong _unlimited_memory;
58
59 class CgroupSubsystem: CHeapObj<mtInternal> {
60 friend class OSContainer;
61
62 private:
63 /* mountinfo contents */
64 char *_root;
65 char *_mount_point;
66
67 /* Constructed subsystem directory */
68 char *_path;
69
70 public:
71 CgroupSubsystem(char *root, char *mountpoint) {
72 _root = os::strdup(root);
73 _mount_point = os::strdup(mountpoint);
74 _path = NULL;
75 }
76
77 /*
78 * Set directory to subsystem specific files based
79 * on the contents of the mountinfo and cgroup files.
80 */
81 void set_subsystem_path(char *cgroup_path) {
82 char buf[MAXPATHLEN+1];
83 if (_root != NULL && cgroup_path != NULL) {
84 if (strcmp(_root, "/") == 0) {
85 int buflen;
86 strncpy(buf, _mount_point, MAXPATHLEN);
87 buf[MAXPATHLEN-1] = '\0';
88 if (strcmp(cgroup_path,"/") != 0) {
89 buflen = strlen(buf);
90 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
91 return;
92 }
93 strncat(buf, cgroup_path, MAXPATHLEN-buflen);
94 buf[MAXPATHLEN-1] = '\0';
104 if (p != NULL && p == _root) {
105 if (strlen(cgroup_path) > strlen(_root)) {
106 int buflen;
107 strncpy(buf, _mount_point, MAXPATHLEN);
108 buf[MAXPATHLEN-1] = '\0';
109 buflen = strlen(buf);
110 if ((buflen + strlen(cgroup_path) - strlen(_root)) > (MAXPATHLEN-1)) {
111 return;
112 }
113 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
114 buf[MAXPATHLEN-1] = '\0';
115 _path = os::strdup(buf);
116 }
117 }
118 }
119 }
120 }
121 }
122
123 char *subsystem_path() { return _path; }
124 };
125
126 class CgroupMemorySubsystem: CgroupSubsystem {
127 friend class OSContainer;
128
129 private:
130 /* Some container runtimes set limits via cgroup
131 * hierarchy. If set to true consider also memory.stat
132 * file if everything else seems unlimited */
133 bool _uses_mem_hierarchy;
134 volatile jlong _memory_limit_in_bytes;
135 volatile jlong _next_check_counter;
136
137 public:
138 CgroupMemorySubsystem(char *root, char *mountpoint) : CgroupSubsystem::CgroupSubsystem(root, mountpoint) {
139 _uses_mem_hierarchy = false;
140 _memory_limit_in_bytes = -1;
141 _next_check_counter = min_jlong;
142
143 }
144
145 bool is_hierarchical() { return _uses_mem_hierarchy; }
146 void set_hierarchical(bool value) { _uses_mem_hierarchy = value; }
147
148 bool should_check_memory_limit() {
149 return os::elapsed_counter() > _next_check_counter;
150 }
151 jlong memory_limit_in_bytes() { return _memory_limit_in_bytes; }
152 void set_memory_limit_in_bytes(jlong value) {
153 _memory_limit_in_bytes = value;
154 // max memory limit is unlikely to change, but we want to remain
155 // responsive to configuration changes. A very short (20ms) grace time
156 // between re-read avoids excessive overhead during startup without
157 // significantly reducing the VMs ability to promptly react to reduced
158 // memory availability
159 _next_check_counter = os::elapsed_counter() + (NANOSECS_PER_SEC/50);
160 }
161
162 };
163
// File-level handles, one per cgroup controller referenced in this file.
// All of them start out as NULL.
// NOTE(review): presumably populated during container initialization,
// which is outside this excerpt -- confirm.
164 CgroupMemorySubsystem* memory = NULL;
165 CgroupSubsystem* cpuset = NULL;
166 CgroupSubsystem* cpu = NULL;
167 CgroupSubsystem* cpuacct = NULL;
168
// Alias for a pointer to char; passed as the type argument to
// GET_CONTAINER_INFO_CPTR by the cpuset accessors.
169 typedef char * cptr;
170
171 PRAGMA_DIAG_PUSH
172 PRAGMA_FORMAT_NONLITERAL_IGNORED
173 template <typename T> int subsystem_file_line_contents(CgroupSubsystem* c,
174 const char *filename,
175 const char *matchline,
176 const char *scan_fmt,
177 T returnval) {
178 FILE *fp = NULL;
179 char *p;
464 * A number > 0 if true, or
465 * OSCONTAINER_ERROR for not supported
466 */
467 jlong OSContainer::uses_mem_hierarchy() {
// Query "/memory.use_hierarchy" of the memory subsystem.
// NOTE(review): GET_CONTAINER_INFO is defined outside this excerpt; it
// presumably declares `use_hierarchy`, fills it using the given scan
// format, logs it and returns early on a read failure -- confirm.
468 GET_CONTAINER_INFO(jlong, memory, "/memory.use_hierarchy",
469 "Use Hierarchy is: " JLONG_FORMAT, JLONG_FORMAT, use_hierarchy);
470 return use_hierarchy;
471 }
472
473
474 /* memory_limit_in_bytes
475 *
476 * Return the limit of available memory for this process.
477 *
478 * return:
479 * memory limit in bytes or
480 * -1 for unlimited
481 * OSCONTAINER_ERROR for not supported
482 */
483 jlong OSContainer::memory_limit_in_bytes() {
484 if (!memory->should_check_memory_limit()) {
485 return memory->memory_limit_in_bytes();
486 }
487 jlong memory_limit = read_memory_limit_in_bytes();
488 // Update CgroupMemorySubsystem to avoid re-reading container settings too often
489 memory->set_memory_limit_in_bytes(memory_limit);
490 return memory_limit;
491 }
492
493 jlong OSContainer::read_memory_limit_in_bytes() {
494 GET_CONTAINER_INFO(julong, memory, "/memory.limit_in_bytes",
495 "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit);
496
497 if (memlimit >= _unlimited_memory) {
498 log_trace(os, container)("Non-Hierarchical Memory Limit is: Unlimited");
499 if (memory->is_hierarchical()) {
500 const char* matchline = "hierarchical_memory_limit";
501 const char* format = "%s " JULONG_FORMAT;
502 GET_CONTAINER_INFO_LINE(julong, memory, "/memory.stat", matchline,
503 "Hierarchical Memory Limit is: " JULONG_FORMAT, format, hier_memlimit)
504 if (hier_memlimit >= _unlimited_memory) {
600 *
601 * If neither shares nor quotas have been specified, return the
602 * number of active processors in the system.
603 *
604 * If both shares and quotas have been specified, the results are
605 * based on the flag PreferContainerQuotaForCPUCount. If true,
606 * return the quota value. If false, return the smaller of the
607 * shares-based and quota-based values.
608 *
609 * If shares and/or quotas have been specified, the resulting number
610 * returned will never exceed the number of active processors.
611 *
612 * return:
613 * number of CPUs
614 */
615 int OSContainer::active_processor_count() {
616 int quota_count = 0, share_count = 0;
617 int cpu_count, limit_count;
618 int result;
619
620 cpu_count = limit_count = os::Linux::active_processor_count();
621 int quota = cpu_quota();
622 int period = cpu_period();
623 int share = cpu_shares();
624
625 if (quota > -1 && period > 0) {
626 quota_count = ceilf((float)quota / (float)period);
627 log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);
628 }
629 if (share > -1) {
630 share_count = ceilf((float)share / (float)PER_CPU_SHARES);
631 log_trace(os, container)("CPU Share count based on shares: %d", share_count);
632 }
633
634 // If both shares and quotas are setup results depend
635 // on flag PreferContainerQuotaForCPUCount.
636 // If true, limit CPU count to quota
637 // If false, use minimum of shares and quotas
638 if (quota_count !=0 && share_count != 0) {
639 if (PreferContainerQuotaForCPUCount) {
640 limit_count = quota_count;
641 } else {
642 limit_count = MIN2(quota_count, share_count);
643 }
644 } else if (quota_count != 0) {
645 limit_count = quota_count;
646 } else if (share_count != 0) {
647 limit_count = share_count;
648 }
649
650 result = MIN2(cpu_count, limit_count);
651 log_trace(os, container)("OSContainer::active_processor_count: %d", result);
652 return result;
653 }
654
655 char * OSContainer::cpu_cpuset_cpus() {
// Query "/cpuset.cpus" of the cpuset subsystem as a string.
// NOTE(review): GET_CONTAINER_INFO_CPTR is defined outside this excerpt; it
// presumably declares a 1024-byte buffer named `cpus`, fills it using the
// "%1023s" scan format, logs it and returns early on failure -- confirm.
656 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus",
657 "cpuset.cpus is: %s", "%1023s", cpus, 1024);
// The result is a copy made with os::strdup.
// NOTE(review): the caller presumably has to free it -- confirm.
658 return os::strdup(cpus);
659 }
660
661 char * OSContainer::cpu_cpuset_memory_nodes() {
// Query "/cpuset.mems" of the cpuset subsystem as a string.
// NOTE(review): GET_CONTAINER_INFO_CPTR is defined outside this excerpt; it
// presumably declares a 1024-byte buffer named `mems`, fills it using the
// "%1023s" scan format, logs it and returns early on failure -- confirm.
662 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems",
663 "cpuset.mems is: %s", "%1023s", mems, 1024);
// The result is a copy made with os::strdup.
// NOTE(review): the caller presumably has to free it -- confirm.
664 return os::strdup(mems);
665 }
666
667 /* cpu_quota
668 *
669 * Return the number of milliseconds per period
670 * process is guaranteed to run.
671 *
|
37 * is commonly used in cloud frameworks like Kubernetes[1],
38 * AWS[2] and Mesos[3] in a similar way. They spawn containers with
39 * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do
40 * the inverse for determining the number of possible available
41 * CPUs to the JVM inside a container. See JDK-8216366.
42 *
43 * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu
44 * In particular:
45 * When using Docker:
46 * The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially
47 * fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the
48 * --cpu-shares flag in the docker run command.
49 * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html
50 * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648
51 * https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30
52 */
53 #define PER_CPU_SHARES 1024
54
// Definitions of OSContainer's static state flags; both start out false.
// NOTE(review): presumably flipped during container initialization, which
// is outside this excerpt -- confirm.
55 bool OSContainer::_is_initialized = false;
56 bool OSContainer::_is_containerized = false;
// Last result stored by active_processor_count(); it is returned again
// while the cpu subsystem's cache timeout has not expired. Starts at 1.
57 int OSContainer::_active_processor_count = 1;
// Threshold used by read_memory_limit_in_bytes(): a limit read from the
// cgroup files that is >= this value is treated as "Unlimited".
// No initializer is given here; NOTE(review): presumably assigned during
// initialization elsewhere in the file -- confirm.
58 julong _unlimited_memory;
59
60 class CgroupSubsystem: CHeapObj<mtInternal> {
61 friend class OSContainer;
62
63
64 private:
65 volatile jlong _next_check_counter;
66
67 /* mountinfo contents */
68 char *_root;
69 char *_mount_point;
70
71 /* Constructed subsystem directory */
72 char *_path;
73
74 public:
75 CgroupSubsystem(char *root, char *mountpoint) {
76 _root = os::strdup(root);
77 _mount_point = os::strdup(mountpoint);
78 _path = NULL;
79 _next_check_counter = min_jlong;
80 }
81
82 /*
83 * Set directory to subsystem specific files based
84 * on the contents of the mountinfo and cgroup files.
85 */
86 void set_subsystem_path(char *cgroup_path) {
87 char buf[MAXPATHLEN+1];
88 if (_root != NULL && cgroup_path != NULL) {
89 if (strcmp(_root, "/") == 0) {
90 int buflen;
91 strncpy(buf, _mount_point, MAXPATHLEN);
92 buf[MAXPATHLEN-1] = '\0';
93 if (strcmp(cgroup_path,"/") != 0) {
94 buflen = strlen(buf);
95 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
96 return;
97 }
98 strncat(buf, cgroup_path, MAXPATHLEN-buflen);
99 buf[MAXPATHLEN-1] = '\0';
109 if (p != NULL && p == _root) {
110 if (strlen(cgroup_path) > strlen(_root)) {
111 int buflen;
112 strncpy(buf, _mount_point, MAXPATHLEN);
113 buf[MAXPATHLEN-1] = '\0';
114 buflen = strlen(buf);
115 if ((buflen + strlen(cgroup_path) - strlen(_root)) > (MAXPATHLEN-1)) {
116 return;
117 }
118 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
119 buf[MAXPATHLEN-1] = '\0';
120 _path = os::strdup(buf);
121 }
122 }
123 }
124 }
125 }
126 }
127
128 char *subsystem_path() { return _path; }
129
130 bool check_cache_timeout() {
131 return os::elapsed_counter() > _next_check_counter;
132 }
133
134 void set_cache_timeout(jlong timeout) {
135 _next_check_counter = os::elapsed_counter() + timeout;
136 }
137
138 };
139
140 class CgroupMemorySubsystem: CgroupSubsystem {
141 friend class OSContainer;
142
143 private:
144 /* Some container runtimes set limits via cgroup
145 * hierarchy. If set to true consider also memory.stat
146 * file if everything else seems unlimited */
147 bool _uses_mem_hierarchy;
148 volatile jlong _memory_limit_in_bytes;
149
150 public:
151 CgroupMemorySubsystem(char *root, char *mountpoint) : CgroupSubsystem::CgroupSubsystem(root, mountpoint) {
152 _uses_mem_hierarchy = false;
153 _memory_limit_in_bytes = -1;
154
155 }
156
157 bool is_hierarchical() { return _uses_mem_hierarchy; }
158 void set_hierarchical(bool value) { _uses_mem_hierarchy = value; }
159
160 jlong memory_limit_in_bytes() { return _memory_limit_in_bytes; }
161 void set_memory_limit_in_bytes(jlong value) {
162 _memory_limit_in_bytes = value;
163 // max memory limit is unlikely to change, but we want to remain
164 // responsive to configuration changes. A very short (20ms) grace time
165 // between re-read avoids excessive overhead during startup without
166 // significantly reducing the VMs ability to promptly react to reduced
167 // memory availability
168 set_cache_timeout(OSCONTAINER_CACHE_TIMEOUT);
169 }
170
171 };
172
// File-level handles, one per cgroup controller referenced in this file.
// All of them start out as NULL.
// NOTE(review): presumably populated during container initialization,
// which is outside this excerpt -- confirm.
173 CgroupMemorySubsystem* memory = NULL;
174 CgroupSubsystem* cpuset = NULL;
175 CgroupSubsystem* cpu = NULL;
176 CgroupSubsystem* cpuacct = NULL;
177
// Alias for a pointer to char; passed as the type argument to
// GET_CONTAINER_INFO_CPTR by the cpuset accessors.
178 typedef char * cptr;
179
180 PRAGMA_DIAG_PUSH
181 PRAGMA_FORMAT_NONLITERAL_IGNORED
182 template <typename T> int subsystem_file_line_contents(CgroupSubsystem* c,
183 const char *filename,
184 const char *matchline,
185 const char *scan_fmt,
186 T returnval) {
187 FILE *fp = NULL;
188 char *p;
473 * A number > 0 if true, or
474 * OSCONTAINER_ERROR for not supported
475 */
476 jlong OSContainer::uses_mem_hierarchy() {
// Query "/memory.use_hierarchy" of the memory subsystem.
// NOTE(review): GET_CONTAINER_INFO is defined outside this excerpt; it
// presumably declares `use_hierarchy`, fills it using the given scan
// format, logs it and returns early on a read failure -- confirm.
477 GET_CONTAINER_INFO(jlong, memory, "/memory.use_hierarchy",
478 "Use Hierarchy is: " JLONG_FORMAT, JLONG_FORMAT, use_hierarchy);
479 return use_hierarchy;
480 }
481
482
483 /* memory_limit_in_bytes
484 *
485 * Return the limit of available memory for this process.
486 *
487 * return:
488 * memory limit in bytes or
489 * -1 for unlimited
490 * OSCONTAINER_ERROR for not supported
491 */
492 jlong OSContainer::memory_limit_in_bytes() {
493 if (!memory->check_cache_timeout()) {
494 return memory->memory_limit_in_bytes();
495 }
496 jlong memory_limit = read_memory_limit_in_bytes();
497 // Update CgroupMemorySubsystem to avoid re-reading container settings too often
498 memory->set_memory_limit_in_bytes(memory_limit);
499 return memory_limit;
500 }
501
502 jlong OSContainer::read_memory_limit_in_bytes() {
503 GET_CONTAINER_INFO(julong, memory, "/memory.limit_in_bytes",
504 "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit);
505
506 if (memlimit >= _unlimited_memory) {
507 log_trace(os, container)("Non-Hierarchical Memory Limit is: Unlimited");
508 if (memory->is_hierarchical()) {
509 const char* matchline = "hierarchical_memory_limit";
510 const char* format = "%s " JULONG_FORMAT;
511 GET_CONTAINER_INFO_LINE(julong, memory, "/memory.stat", matchline,
512 "Hierarchical Memory Limit is: " JULONG_FORMAT, format, hier_memlimit)
513 if (hier_memlimit >= _unlimited_memory) {
609 *
610 * If neither shares nor quotas have been specified, return the
611 * number of active processors in the system.
612 *
613 * If both shares and quotas have been specified, the results are
614 * based on the flag PreferContainerQuotaForCPUCount. If true,
615 * return the quota value. If false, return the smaller of the
616 * shares-based and quota-based values.
617 *
618 * If shares and/or quotas have been specified, the resulting number
619 * returned will never exceed the number of active processors.
620 *
621 * return:
622 * number of CPUs
623 */
624 int OSContainer::active_processor_count() {
625 int quota_count = 0, share_count = 0;
626 int cpu_count, limit_count;
627 int result;
628
629 // We use a cache with a timeout to avoid performing expensive
630 // computations in the event this function is called frequently.
631 // [See 8227006].
632 if (!cpu->check_cache_timeout()) {
633 log_trace(os, container)("OSContainer::active_processor_count (cached): %d", OSContainer::_active_processor_count);
634 return OSContainer::_active_processor_count;
635 }
636
637 cpu_count = limit_count = os::Linux::active_processor_count();
638 int quota = cpu_quota();
639 int period = cpu_period();
640 int share = cpu_shares();
641
642 if (quota > -1 && period > 0) {
643 quota_count = ceilf((float)quota / (float)period);
644 log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count);
645 }
646 if (share > -1) {
647 share_count = ceilf((float)share / (float)PER_CPU_SHARES);
648 log_trace(os, container)("CPU Share count based on shares: %d", share_count);
649 }
650
651 // If both shares and quotas are setup results depend
652 // on flag PreferContainerQuotaForCPUCount.
653 // If true, limit CPU count to quota
654 // If false, use minimum of shares and quotas
655 if (quota_count !=0 && share_count != 0) {
656 if (PreferContainerQuotaForCPUCount) {
657 limit_count = quota_count;
658 } else {
659 limit_count = MIN2(quota_count, share_count);
660 }
661 } else if (quota_count != 0) {
662 limit_count = quota_count;
663 } else if (share_count != 0) {
664 limit_count = share_count;
665 }
666
667 result = MIN2(cpu_count, limit_count);
668 log_trace(os, container)("OSContainer::active_processor_count: %d", result);
669
670 // Update the value and set the cache timeout to 20ms.
671 OSContainer::_active_processor_count = result;
672 cpu->set_cache_timeout(OSCONTAINER_CACHE_TIMEOUT);
673
674 return result;
675 }
676
677 char * OSContainer::cpu_cpuset_cpus() {
// Query "/cpuset.cpus" of the cpuset subsystem as a string.
// NOTE(review): GET_CONTAINER_INFO_CPTR is defined outside this excerpt; it
// presumably declares a 1024-byte buffer named `cpus`, fills it using the
// "%1023s" scan format, logs it and returns early on failure -- confirm.
678 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus",
679 "cpuset.cpus is: %s", "%1023s", cpus, 1024);
// The result is a copy made with os::strdup.
// NOTE(review): the caller presumably has to free it -- confirm.
680 return os::strdup(cpus);
681 }
682
683 char * OSContainer::cpu_cpuset_memory_nodes() {
// Query "/cpuset.mems" of the cpuset subsystem as a string.
// NOTE(review): GET_CONTAINER_INFO_CPTR is defined outside this excerpt; it
// presumably declares a 1024-byte buffer named `mems`, fills it using the
// "%1023s" scan format, logs it and returns early on failure -- confirm.
684 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems",
685 "cpuset.mems is: %s", "%1023s", mems, 1024);
// The result is a copy made with os::strdup.
// NOTE(review): the caller presumably has to free it -- confirm.
686 return os::strdup(mems);
687 }
688
689 /* cpu_quota
690 *
691 * Return the number of milliseconds per period
692 * process is guaranteed to run.
693 *
|