@ rev 53302 : [Containers] Also consider hierarchical memory limits. | ~
/*
 * Copyright (c) 2017, 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <string.h>
#include <math.h>
#include <errno.h>
#include "utilities/globalDefinitions.hpp"
#include "memory/allocation.hpp"
#include "runtime/os.hpp"
#include "logging/log.hpp"
#include "osContainer_linux.hpp"

/*
 * PER_CPU_SHARES has been set to 1024 because CPU shares' quota
 * is commonly used in cloud frameworks like Kubernetes[1],
 * AWS[2] and Mesos[3] in a similar way. They spawn containers with
 * --cpu-shares option values scaled by PER_CPU_SHARES. Thus, we do
 * the inverse for determining the number of possible available
 * CPUs to the JVM inside a container. See JDK-8216366.
41 * 42 * [1] https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu 43 * In particular: 44 * When using Docker: 45 * The spec.containers[].resources.requests.cpu is converted to its core value, which is potentially 46 * fractional, and multiplied by 1024. The greater of this number or 2 is used as the value of the 47 * --cpu-shares flag in the docker run command. 48 * [2] https://docs.aws.amazon.com/AmazonECS/latest/APIReference/API_ContainerDefinition.html 49 * [3] https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/docker/docker.cpp#L648 50 * https://github.com/apache/mesos/blob/3478e344fb77d931f6122980c6e94cd3913c441d/src/slave/containerizer/mesos/isolators/cgroups/constants.hpp#L30 51 */ 52 #define PER_CPU_SHARES 1024 53 54 bool OSContainer::_is_initialized = false; 55 bool OSContainer::_is_containerized = false; 56 julong _unlimited_memory; 57 58 class CgroupSubsystem: CHeapObj<mtInternal> { 59 friend class OSContainer; 60 61 private: 62 /* mountinfo contents */ 63 char *_root; 64 char *_mount_point; 65 66 /* Constructed subsystem directory */ 67 char *_path; 68 69 public: 70 CgroupSubsystem(char *root, char *mountpoint) { 71 _root = os::strdup(root); 72 _mount_point = os::strdup(mountpoint); 73 _path = NULL; 74 } 75 76 /* 77 * Set directory to subsystem specific files based 78 * on the contents of the mountinfo and cgroup files. 
79 */ 80 void set_subsystem_path(char *cgroup_path) { 81 char buf[MAXPATHLEN+1]; 82 if (_root != NULL && cgroup_path != NULL) { 83 if (strcmp(_root, "/") == 0) { 84 int buflen; 85 strncpy(buf, _mount_point, MAXPATHLEN); 86 buf[MAXPATHLEN-1] = '\0'; 87 if (strcmp(cgroup_path,"/") != 0) { 88 buflen = strlen(buf); 89 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) { 90 return; 91 } 92 strncat(buf, cgroup_path, MAXPATHLEN-buflen); 93 buf[MAXPATHLEN-1] = '\0'; 94 } 95 _path = os::strdup(buf); 96 } else { 97 if (strcmp(_root, cgroup_path) == 0) { 98 strncpy(buf, _mount_point, MAXPATHLEN); 99 buf[MAXPATHLEN-1] = '\0'; 100 _path = os::strdup(buf); 101 } else { 102 char *p = strstr(_root, cgroup_path); 103 if (p != NULL && p == _root) { 104 if (strlen(cgroup_path) > strlen(_root)) { 105 int buflen; 106 strncpy(buf, _mount_point, MAXPATHLEN); 107 buf[MAXPATHLEN-1] = '\0'; 108 buflen = strlen(buf); 109 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) { 110 return; 111 } 112 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen); 113 buf[MAXPATHLEN-1] = '\0'; 114 _path = os::strdup(buf); 115 } 116 } 117 } 118 } 119 } 120 } 121 122 char *subsystem_path() { return _path; } 123 }; 124 125 CgroupSubsystem* memory = NULL; 126 CgroupSubsystem* cpuset = NULL; 127 CgroupSubsystem* cpu = NULL; 128 CgroupSubsystem* cpuacct = NULL; 129 130 typedef char * cptr; 131 132 PRAGMA_DIAG_PUSH 133 PRAGMA_FORMAT_NONLITERAL_IGNORED 134 template <typename T> int subsystem_file_contents(CgroupSubsystem* c, 135 const char *filename, 136 const char *scan_fmt, 137 T returnval) { 138 FILE *fp = NULL; 139 char *p; 140 char file[MAXPATHLEN+1]; 141 char buf[MAXPATHLEN+1]; 142 143 if (c == NULL) { 144 log_debug(os, container)("subsystem_file_contents: CgroupSubsytem* is NULL"); 145 return OSCONTAINER_ERROR; 146 } 147 if (c->subsystem_path() == NULL) { 148 log_debug(os, container)("subsystem_file_contents: subsystem path is NULL"); 149 return OSCONTAINER_ERROR; 150 } 151 152 strncpy(file, 
c->subsystem_path(), MAXPATHLEN); 153 file[MAXPATHLEN-1] = '\0'; 154 int filelen = strlen(file); 155 if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) { 156 log_debug(os, container)("File path too long %s, %s", file, filename); 157 return OSCONTAINER_ERROR; 158 } 159 strncat(file, filename, MAXPATHLEN-filelen); 160 log_trace(os, container)("Path to %s is %s", filename, file); 161 fp = fopen(file, "r"); 162 if (fp != NULL) { 163 p = fgets(buf, MAXPATHLEN, fp); 164 if (p != NULL) { 165 int matched = sscanf(p, scan_fmt, returnval); 166 if (matched == 1) { 167 fclose(fp); 168 return 0; 169 } else { 170 log_debug(os, container)("Type %s not found in file %s", scan_fmt, file); 171 } 172 } else { 173 log_debug(os, container)("Empty file %s", file); 174 } 175 } else { 176 log_debug(os, container)("Open of file %s failed, %s", file, os::strerror(errno)); 177 } 178 if (fp != NULL) 179 fclose(fp); 180 return OSCONTAINER_ERROR; 181 } 182 PRAGMA_DIAG_POP 183 184 #define GET_CONTAINER_INFO(return_type, subsystem, filename, \ 185 logstring, scan_fmt, variable) \ 186 return_type variable; \ 187 { \ 188 int err; \ 189 err = subsystem_file_contents(subsystem, \ 190 filename, \ 191 scan_fmt, \ 192 &variable); \ 193 if (err != 0) \ 194 return (return_type) OSCONTAINER_ERROR; \ 195 \ 196 log_trace(os, container)(logstring, variable); \ 197 } 198 199 #define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename, \ 200 logstring, scan_fmt, variable, bufsize) \ 201 char variable[bufsize]; \ 202 { \ 203 int err; \ 204 err = subsystem_file_contents(subsystem, \ 205 filename, \ 206 scan_fmt, \ 207 variable); \ 208 if (err != 0) \ 209 return (return_type) NULL; \ 210 \ 211 log_trace(os, container)(logstring, variable); \ 212 } 213 214 /* init 215 * 216 * Initialize the container support and determine if 217 * we are running under cgroup control. 
218 */ 219 void OSContainer::init() { 220 int mountid; 221 int parentid; 222 int major; 223 int minor; 224 FILE *mntinfo = NULL; 225 FILE *cgroup = NULL; 226 char buf[MAXPATHLEN+1]; 227 char tmproot[MAXPATHLEN+1]; 228 char tmpmount[MAXPATHLEN+1]; 229 char tmpbase[MAXPATHLEN+1]; 230 char *p; 231 jlong mem_limit; 232 233 assert(!_is_initialized, "Initializing OSContainer more than once"); 234 235 _is_initialized = true; 236 _is_containerized = false; 237 238 _unlimited_memory = (LONG_MAX / os::vm_page_size()) * os::vm_page_size(); 239 240 log_trace(os, container)("OSContainer::init: Initializing Container Support"); 241 if (!UseContainerSupport) { 242 log_trace(os, container)("Container Support not enabled"); 243 return; 244 } 245 246 /* 247 * Find the cgroup mount point for memory and cpuset 248 * by reading /proc/self/mountinfo 249 * 250 * Example for docker: 251 * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory 252 * 253 * Example for host: 254 * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory 255 */ 256 mntinfo = fopen("/proc/self/mountinfo", "r"); 257 if (mntinfo == NULL) { 258 log_debug(os, container)("Can't open /proc/self/mountinfo, %s", 259 os::strerror(errno)); 260 return; 261 } 262 263 while ( (p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) { 264 // Look for the filesystem type and see if it's cgroup 265 char fstype[MAXPATHLEN+1]; 266 fstype[0] = '\0'; 267 char *s = strstr(p, " - "); 268 if (s != NULL && 269 sscanf(s, " - %s", fstype) == 1 && 270 strcmp(fstype, "cgroup") == 0) { 271 272 if (strstr(p, "memory") != NULL) { 273 int matched = sscanf(p, "%d %d %d:%d %s %s", 274 &mountid, 275 &parentid, 276 &major, 277 &minor, 278 tmproot, 279 tmpmount); 280 if (matched == 6) { 281 memory = new CgroupSubsystem(tmproot, tmpmount); 282 } 283 else 284 log_debug(os, container)("Incompatible str 
containing cgroup and memory: %s", p); 285 } else if (strstr(p, "cpuset") != NULL) { 286 int matched = sscanf(p, "%d %d %d:%d %s %s", 287 &mountid, 288 &parentid, 289 &major, 290 &minor, 291 tmproot, 292 tmpmount); 293 if (matched == 6) { 294 cpuset = new CgroupSubsystem(tmproot, tmpmount); 295 } 296 else { 297 log_debug(os, container)("Incompatible str containing cgroup and cpuset: %s", p); 298 } 299 } else if (strstr(p, "cpu,cpuacct") != NULL || strstr(p, "cpuacct,cpu") != NULL) { 300 int matched = sscanf(p, "%d %d %d:%d %s %s", 301 &mountid, 302 &parentid, 303 &major, 304 &minor, 305 tmproot, 306 tmpmount); 307 if (matched == 6) { 308 cpu = new CgroupSubsystem(tmproot, tmpmount); 309 cpuacct = new CgroupSubsystem(tmproot, tmpmount); 310 } 311 else { 312 log_debug(os, container)("Incompatible str containing cgroup and cpu,cpuacct: %s", p); 313 } 314 } else if (strstr(p, "cpuacct") != NULL) { 315 int matched = sscanf(p, "%d %d %d:%d %s %s", 316 &mountid, 317 &parentid, 318 &major, 319 &minor, 320 tmproot, 321 tmpmount); 322 if (matched == 6) { 323 cpuacct = new CgroupSubsystem(tmproot, tmpmount); 324 } 325 else { 326 log_debug(os, container)("Incompatible str containing cgroup and cpuacct: %s", p); 327 } 328 } else if (strstr(p, "cpu") != NULL) { 329 int matched = sscanf(p, "%d %d %d:%d %s %s", 330 &mountid, 331 &parentid, 332 &major, 333 &minor, 334 tmproot, 335 tmpmount); 336 if (matched == 6) { 337 cpu = new CgroupSubsystem(tmproot, tmpmount); 338 } 339 else { 340 log_debug(os, container)("Incompatible str containing cgroup and cpu: %s", p); 341 } 342 } 343 } 344 } 345 346 fclose(mntinfo); 347 348 if (memory == NULL) { 349 log_debug(os, container)("Required cgroup memory subsystem not found"); 350 return; 351 } 352 if (cpuset == NULL) { 353 log_debug(os, container)("Required cgroup cpuset subsystem not found"); 354 return; 355 } 356 if (cpu == NULL) { 357 log_debug(os, container)("Required cgroup cpu subsystem not found"); 358 return; 359 } 360 if (cpuacct == 
NULL) { 361 log_debug(os, container)("Required cgroup cpuacct subsystem not found"); 362 return; 363 } 364 365 /* 366 * Read /proc/self/cgroup and map host mount point to 367 * local one via /proc/self/mountinfo content above 368 * 369 * Docker example: 370 * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044 371 * 372 * Host example: 373 * 5:memory:/user.slice 374 * 375 * Construct a path to the process specific memory and cpuset 376 * cgroup directory. 377 * 378 * For a container running under Docker from memory example above 379 * the paths would be: 380 * 381 * /sys/fs/cgroup/memory 382 * 383 * For a Host from memory example above the path would be: 384 * 385 * /sys/fs/cgroup/memory/user.slice 386 * 387 */ 388 cgroup = fopen("/proc/self/cgroup", "r"); 389 if (cgroup == NULL) { 390 log_debug(os, container)("Can't open /proc/self/cgroup, %s", 391 os::strerror(errno)); 392 return; 393 } 394 395 while ( (p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) { 396 int cgno; 397 int matched; 398 char *controller; 399 char *base; 400 401 /* Skip cgroup number */ 402 strsep(&p, ":"); 403 /* Get controller and base */ 404 controller = strsep(&p, ":"); 405 base = strsep(&p, "\n"); 406 407 if (controller != NULL) { 408 if (strstr(controller, "memory") != NULL) { 409 memory->set_subsystem_path(base); 410 } else if (strstr(controller, "cpuset") != NULL) { 411 cpuset->set_subsystem_path(base); 412 } else if (strstr(controller, "cpu,cpuacct") != NULL || strstr(controller, "cpuacct,cpu") != NULL) { 413 cpu->set_subsystem_path(base); 414 cpuacct->set_subsystem_path(base); 415 } else if (strstr(controller, "cpuacct") != NULL) { 416 cpuacct->set_subsystem_path(base); 417 } else if (strstr(controller, "cpu") != NULL) { 418 cpu->set_subsystem_path(base); 419 } 420 } 421 } 422 423 fclose(cgroup); 424 425 // We need to update the amount of physical memory now that 426 // command line arguments have been processed. 
427 if ((mem_limit = memory_limit_in_bytes()) > 0) { 428 os::Linux::set_physical_memory(mem_limit); 429 } 430 431 _is_containerized = true; 432 433 } 434 435 const char * OSContainer::container_type() { 436 if (is_containerized()) { 437 return "cgroupv1"; 438 } else { 439 return NULL; 440 } 441 } 442 443 444 /* memory_limit_in_bytes 445 * 446 * Return the limit of available memory for this process. 447 * 448 * return: 449 * memory limit in bytes or 450 * -1 for unlimited 451 * OSCONTAINER_ERROR for not supported 452 */ 453 jlong OSContainer::memory_limit_in_bytes() { 454 GET_CONTAINER_INFO(julong, memory, "/memory.limit_in_bytes", 455 "Memory Limit is: " JULONG_FORMAT, JULONG_FORMAT, memlimit); 456 457 if (memlimit >= _unlimited_memory) { 458 log_trace(os, container)("Memory Limit is: Unlimited"); 459 return (jlong)-1; 460 } 461 else { 462 return (jlong)memlimit; 463 } 464 } 465 466 jlong OSContainer::memory_and_swap_limit_in_bytes() { 467 GET_CONTAINER_INFO(julong, memory, "/memory.memsw.limit_in_bytes", 468 "Memory and Swap Limit is: " JULONG_FORMAT, JULONG_FORMAT, memswlimit); 469 if (memswlimit >= _unlimited_memory) { 470 log_trace(os, container)("Memory and Swap Limit is: Unlimited"); 471 return (jlong)-1; 472 } else { 473 return (jlong)memswlimit; 474 } 475 } 476 477 jlong OSContainer::memory_soft_limit_in_bytes() { 478 GET_CONTAINER_INFO(julong, memory, "/memory.soft_limit_in_bytes", 479 "Memory Soft Limit is: " JULONG_FORMAT, JULONG_FORMAT, memsoftlimit); 480 if (memsoftlimit >= _unlimited_memory) { 481 log_trace(os, container)("Memory Soft Limit is: Unlimited"); 482 return (jlong)-1; 483 } else { 484 return (jlong)memsoftlimit; 485 } 486 } 487 488 /* memory_usage_in_bytes 489 * 490 * Return the amount of used memory for this process. 
491 * 492 * return: 493 * memory usage in bytes or 494 * -1 for unlimited 495 * OSCONTAINER_ERROR for not supported 496 */ 497 jlong OSContainer::memory_usage_in_bytes() { 498 GET_CONTAINER_INFO(jlong, memory, "/memory.usage_in_bytes", 499 "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage); 500 return memusage; 501 } 502 503 /* memory_max_usage_in_bytes 504 * 505 * Return the maximum amount of used memory for this process. 506 * 507 * return: 508 * max memory usage in bytes or 509 * OSCONTAINER_ERROR for not supported 510 */ 511 jlong OSContainer::memory_max_usage_in_bytes() { 512 GET_CONTAINER_INFO(jlong, memory, "/memory.max_usage_in_bytes", 513 "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage); 514 return memmaxusage; 515 } 516 517 /* active_processor_count 518 * 519 * Calculate an appropriate number of active processors for the 520 * VM to use based on these three inputs. 521 * 522 * cpu affinity 523 * cgroup cpu quota & cpu period 524 * cgroup cpu shares 525 * 526 * Algorithm: 527 * 528 * Determine the number of available CPUs from sched_getaffinity 529 * 530 * If user specified a quota (quota != -1), calculate the number of 531 * required CPUs by dividing quota by period. 532 * 533 * If shares are in effect (shares != -1), calculate the number 534 * of CPUs required for the shares by dividing the share value 535 * by PER_CPU_SHARES. 536 * 537 * All results of division are rounded up to the next whole number. 538 * 539 * If neither shares or quotas have been specified, return the 540 * number of active processors in the system. 541 * 542 * If both shares and quotas have been specified, the results are 543 * based on the flag PreferContainerQuotaForCPUCount. If true, 544 * return the quota value. If false return the smallest value 545 * between shares or quotas. 546 * 547 * If shares and/or quotas have been specified, the resulting number 548 * returned will never exceed the number of active processors. 
549 * 550 * return: 551 * number of CPUs 552 */ 553 int OSContainer::active_processor_count() { 554 int quota_count = 0, share_count = 0; 555 int cpu_count, limit_count; 556 int result; 557 558 cpu_count = limit_count = os::Linux::active_processor_count(); 559 int quota = cpu_quota(); 560 int period = cpu_period(); 561 int share = cpu_shares(); 562 563 if (quota > -1 && period > 0) { 564 quota_count = ceilf((float)quota / (float)period); 565 log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count); 566 } 567 if (share > -1) { 568 share_count = ceilf((float)share / (float)PER_CPU_SHARES); 569 log_trace(os, container)("CPU Share count based on shares: %d", share_count); 570 } 571 572 // If both shares and quotas are setup results depend 573 // on flag PreferContainerQuotaForCPUCount. 574 // If true, limit CPU count to quota 575 // If false, use minimum of shares and quotas 576 if (quota_count !=0 && share_count != 0) { 577 if (PreferContainerQuotaForCPUCount) { 578 limit_count = quota_count; 579 } else { 580 limit_count = MIN2(quota_count, share_count); 581 } 582 } else if (quota_count != 0) { 583 limit_count = quota_count; 584 } else if (share_count != 0) { 585 limit_count = share_count; 586 } 587 588 result = MIN2(cpu_count, limit_count); 589 log_trace(os, container)("OSContainer::active_processor_count: %d", result); 590 return result; 591 } 592 593 char * OSContainer::cpu_cpuset_cpus() { 594 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus", 595 "cpuset.cpus is: %s", "%1023s", cpus, 1024); 596 return os::strdup(cpus); 597 } 598 599 char * OSContainer::cpu_cpuset_memory_nodes() { 600 GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems", 601 "cpuset.mems is: %s", "%1023s", mems, 1024); 602 return os::strdup(mems); 603 } 604 605 /* cpu_quota 606 * 607 * Return the number of milliseconds per period 608 * process is guaranteed to run. 
609 * 610 * return: 611 * quota time in milliseconds 612 * -1 for no quota 613 * OSCONTAINER_ERROR for not supported 614 */ 615 int OSContainer::cpu_quota() { 616 GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_quota_us", 617 "CPU Quota is: %d", "%d", quota); 618 return quota; 619 } 620 621 int OSContainer::cpu_period() { 622 GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_period_us", 623 "CPU Period is: %d", "%d", period); 624 return period; 625 } 626 627 /* cpu_shares 628 * 629 * Return the amount of cpu shares available to the process 630 * 631 * return: 632 * Share number (typically a number relative to 1024) 633 * (2048 typically expresses 2 CPUs worth of processing) 634 * -1 for no share setup 635 * OSCONTAINER_ERROR for not supported 636 */ 637 int OSContainer::cpu_shares() { 638 GET_CONTAINER_INFO(int, cpu, "/cpu.shares", 639 "CPU Shares is: %d", "%d", shares); 640 // Convert 1024 to no shares setup 641 if (shares == 1024) return -1; 642 643 return shares; 644 } 645 --- EOF ---