1 /*
   2  * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include <string.h>
  26 #include <math.h>
  27 #include <errno.h>
  28 #include "utilities/globalDefinitions.hpp"
  29 #include "memory/allocation.hpp"
  30 #include "runtime/os.hpp"
  31 #include "logging/log.hpp"
  32 #include "osContainer_linux.hpp"
  33 
/*
 * Warning: Some linux distros use 0x7FFFFFFFFFFFF000
 * and others use 0x7FFFFFFFFFFFFFFF for unlimited.
 *
 * The memory limit getters in this file compare with ">= UNLIMITED_MEM",
 * so both of the values above are reported as unlimited.
 */
#define UNLIMITED_MEM CONST64(0x7FFFFFFFFFFFF000)

/*
 * Divisor used by active_processor_count() to convert a cpu.shares value
 * into a number of CPUs.
 */
#define PER_CPU_SHARES 1024

/* Set to true at the start of init(), before any cgroup probing is done. */
bool  OSContainer::_is_initialized   = false;
/* Set to true at the end of init(), once all required subsystems were found. */
bool  OSContainer::_is_containerized = false;
  44 
  45 class CgroupSubsystem: CHeapObj<mtInternal> {
  46  friend class OSContainer;
  47 
  48  private:
  49     /* mountinfo contents */
  50     char *_root;
  51     char *_mount_point;
  52 
  53     /* Constructed subsystem directory */
  54     char *_path;
  55 
  56  public:
  57     CgroupSubsystem(char *root, char *mountpoint) {
  58       _root = os::strdup(root);
  59       _mount_point = os::strdup(mountpoint);
  60       _path = NULL;
  61     }
  62 
  63     /*
  64      * Set directory to subsystem specific files based
  65      * on the contents of the mountinfo and cgroup files.
  66      */
  67     void set_subsystem_path(char *cgroup_path) {
  68       char buf[MAXPATHLEN+1];
  69       if (_root != NULL && cgroup_path != NULL) {
  70         if (strcmp(_root, "/") == 0) {
  71           int buflen;
  72           strncpy(buf, _mount_point, MAXPATHLEN);
  73           buf[MAXPATHLEN-1] = '\0';
  74           if (strcmp(cgroup_path,"/") != 0) {
  75             buflen = strlen(buf);
  76             if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
  77               return;
  78             }
  79             strncat(buf, cgroup_path, MAXPATHLEN-buflen);
  80             buf[MAXPATHLEN-1] = '\0';
  81           }
  82           _path = os::strdup(buf);
  83         } else {
  84           if (strcmp(_root, cgroup_path) == 0) {
  85             strncpy(buf, _mount_point, MAXPATHLEN);
  86             buf[MAXPATHLEN-1] = '\0';
  87             _path = os::strdup(buf);
  88           } else {
  89             char *p = strstr(_root, cgroup_path);
  90             if (p != NULL && p == _root) {
  91               if (strlen(cgroup_path) > strlen(_root)) {
  92                 int buflen;
  93                 strncpy(buf, _mount_point, MAXPATHLEN);
  94                 buf[MAXPATHLEN-1] = '\0';
  95                 buflen = strlen(buf);
  96                 if ((buflen + strlen(cgroup_path)) > (MAXPATHLEN-1)) {
  97                   return;
  98                 }
  99                 strncat(buf, cgroup_path + strlen(_root), MAXPATHLEN-buflen);
 100                 buf[MAXPATHLEN-1] = '\0';
 101                 _path = os::strdup(buf);
 102               }
 103             }
 104           }
 105         }
 106       }
 107     }
 108 
 109     char *subsystem_path() { return _path; }
 110 };
 111 
/*
 * One descriptor per cgroup controller used by OSContainer. All of them
 * start out as NULL and are assigned by OSContainer::init() while it
 * scans /proc/self/mountinfo.
 */
CgroupSubsystem* memory = NULL;
CgroupSubsystem* cpuset = NULL;
CgroupSubsystem* cpu = NULL;
CgroupSubsystem* cpuacct = NULL;

/* Passed as the return_type argument of GET_CONTAINER_INFO_CPTR. */
typedef char * cptr;
 118 
 119 PRAGMA_DIAG_PUSH
 120 PRAGMA_FORMAT_NONLITERAL_IGNORED
 121 template <typename T> int subsystem_file_contents(CgroupSubsystem* c,
 122                                               const char *filename,
 123                                               const char *scan_fmt,
 124                                               T returnval) {
 125   FILE *fp = NULL;
 126   char *p;
 127   char file[MAXPATHLEN+1];
 128   char buf[MAXPATHLEN+1];
 129 
 130   if (c != NULL && c->subsystem_path() != NULL) {
 131     strncpy(file, c->subsystem_path(), MAXPATHLEN);
 132     file[MAXPATHLEN-1] = '\0';
 133     int filelen = strlen(file);
 134     if ((filelen + strlen(filename)) > (MAXPATHLEN-1)) {
 135        log_debug(os, container)("File path too long %s, %s", file, filename);
 136        return OSCONTAINER_ERROR;
 137     }
 138     strncat(file, filename, MAXPATHLEN-filelen);
 139     log_trace(os, container)("Path to %s is %s", filename, file);
 140     fp = fopen(file, "r");
 141     if (fp != NULL) {
 142       p = fgets(buf, MAXPATHLEN, fp);
 143       if (p != NULL) {
 144         int matched = sscanf(p, scan_fmt, returnval);
 145         if (matched == 1) {
 146           fclose(fp);
 147           return 0;
 148         } else {
 149           log_debug(os, container)("Type %s not found in file %s",
 150                                      scan_fmt , file);
 151         }
 152       } else {
 153         log_debug(os, container)("Empty file %s", file);
 154       }
 155     } else {
 156       log_debug(os, container)("Open of file %s failed, %s", file,
 157                                os::strerror(errno));
 158     }
 159   }
 160   if (fp != NULL)
 161     fclose(fp);
 162   return OSCONTAINER_ERROR;
 163 }
 164 PRAGMA_DIAG_POP
 165 
/*
 * GET_CONTAINER_INFO
 *
 * Declares a local named 'variable' of type 'return_type' in the calling
 * function and fills it from 'filename' of 'subsystem' by means of
 * subsystem_file_contents() and 'scan_fmt'.
 *
 * NOTE: the macro contains a return statement. If the value cannot be
 * read, the calling function returns (return_type) OSCONTAINER_ERROR.
 * Otherwise the value is trace logged with 'logstring' and execution
 * continues after the macro with 'variable' in scope.
 */
#define GET_CONTAINER_INFO(return_type, subsystem, filename,              \
                           logstring, scan_fmt, variable)                 \
  return_type variable;                                                   \
{                                                                         \
  int err;                                                                \
  err = subsystem_file_contents(subsystem,                                \
                                filename,                                 \
                                scan_fmt,                                 \
                                &variable);                               \
  if (err != 0)                                                           \
    return (return_type) OSCONTAINER_ERROR;                               \
                                                                          \
  log_trace(os, container)(logstring, variable);                          \
}
 180 
/*
 * GET_CONTAINER_INFO_CPTR
 *
 * String flavour of GET_CONTAINER_INFO. Declares a local char array named
 * 'variable' with 'bufsize' elements in the calling function and lets
 * subsystem_file_contents() scan into it using 'scan_fmt'. The macro does
 * not check 'scan_fmt' against 'bufsize'; the field width in 'scan_fmt'
 * has to keep the conversion within the array.
 *
 * NOTE: the macro contains a return statement. If the value cannot be
 * read, the calling function returns (return_type) NULL. Otherwise the
 * string is trace logged with 'logstring' and execution continues after
 * the macro with 'variable' in scope.
 */
#define GET_CONTAINER_INFO_CPTR(return_type, subsystem, filename,         \
                               logstring, scan_fmt, variable, bufsize)    \
  char variable[bufsize];                                                 \
{                                                                         \
  int err;                                                                \
  err = subsystem_file_contents(subsystem,                                \
                                filename,                                 \
                                scan_fmt,                                 \
                                variable);                                \
  if (err != 0)                                                           \
    return (return_type) NULL;                                            \
                                                                          \
  log_trace(os, container)(logstring, variable);                          \
}
 195 
 196 /* init
 197  *
 198  * Initialize the container support and determine if
 199  * we are running under cgroup control.
 200  */
 201 void OSContainer::init() {
 202   int mountid;
 203   int parentid;
 204   int major;
 205   int minor;
 206   FILE *mntinfo = NULL;
 207   FILE *cgroup = NULL;
 208   char buf[MAXPATHLEN+1];
 209   char tmproot[MAXPATHLEN+1];
 210   char tmpmount[MAXPATHLEN+1];
 211   char tmpbase[MAXPATHLEN+1];
 212   char *p;
 213   jlong mem_limit;
 214 
 215   assert(!_is_initialized, "Initializing OSContainer more than once");
 216 
 217   _is_initialized = true;
 218   _is_containerized = false;
 219 
 220   log_trace(os, container)("OSContainer::init: Initializing Container Support");
 221   if (!UseContainerSupport) {
 222     log_trace(os, container)("Container Support not enabled");
 223     return;
 224   }
 225 
 226   /*
 227    * Find the cgroup mount point for memory and cpuset
 228    * by reading /proc/self/mountinfo
 229    *
 230    * Example for docker:
 231    * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory
 232    *
 233    * Example for host:
 234    * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory
 235    */
 236   mntinfo = fopen("/proc/self/mountinfo", "r");
 237   if (mntinfo == NULL) {
 238       log_debug(os, container)("Can't open /proc/self/mountinfo, %s",
 239                                os::strerror(errno));
 240       return;
 241   }
 242 
 243   while ( (p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) {
 244     // Look for the filesystem type and see if it's cgroup
 245     char fstype[MAXPATHLEN+1];
 246     fstype[0] = '\0';
 247     char *s =  strstr(p, " - ");
 248     if (s != NULL &&
 249         sscanf(s, " - %s", fstype) == 1 &&
 250         strcmp(fstype, "cgroup") == 0) {
 251 
 252       if (strstr(p, "memory") != NULL) {
 253         int matched = sscanf(p, "%d %d %d:%d %s %s",
 254                              &mountid,
 255                              &parentid,
 256                              &major,
 257                              &minor,
 258                              tmproot,
 259                              tmpmount);
 260         if (matched == 6) {
 261           memory = new CgroupSubsystem(tmproot, tmpmount);
 262         }
 263         else
 264           log_debug(os, container)("Incompatible str containing cgroup and memory: %s", p);
 265       } else if (strstr(p, "cpuset") != NULL) {
 266         int matched = sscanf(p, "%d %d %d:%d %s %s",
 267                              &mountid,
 268                              &parentid,
 269                              &major,
 270                              &minor,
 271                              tmproot,
 272                              tmpmount);
 273         if (matched == 6) {
 274           cpuset = new CgroupSubsystem(tmproot, tmpmount);
 275         }
 276         else {
 277           log_debug(os, container)("Incompatible str containing cgroup and cpuset: %s", p);
 278         }
 279       } else if (strstr(p, "cpu,cpuacct") != NULL) {
 280         int matched = sscanf(p, "%d %d %d:%d %s %s",
 281                              &mountid,
 282                              &parentid,
 283                              &major,
 284                              &minor,
 285                              tmproot,
 286                              tmpmount);
 287         if (matched == 6) {
 288           cpu = new CgroupSubsystem(tmproot, tmpmount);
 289           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
 290         }
 291         else {
 292           log_debug(os, container)("Incompatible str containing cgroup and cpu,cpuacct: %s", p);
 293         }
 294       } else if (strstr(p, "cpuacct") != NULL) {
 295         int matched = sscanf(p, "%d %d %d:%d %s %s",
 296                              &mountid,
 297                              &parentid,
 298                              &major,
 299                              &minor,
 300                              tmproot,
 301                              tmpmount);
 302         if (matched == 6) {
 303           cpuacct = new CgroupSubsystem(tmproot, tmpmount);
 304         }
 305         else {
 306           log_debug(os, container)("Incompatible str containing cgroup and cpuacct: %s", p);
 307         }
 308       } else if (strstr(p, "cpu") != NULL) {
 309         int matched = sscanf(p, "%d %d %d:%d %s %s",
 310                              &mountid,
 311                              &parentid,
 312                              &major,
 313                              &minor,
 314                              tmproot,
 315                              tmpmount);
 316         if (matched == 6) {
 317           cpu = new CgroupSubsystem(tmproot, tmpmount);
 318         }
 319         else {
 320           log_debug(os, container)("Incompatible str containing cgroup and cpu: %s", p);
 321         }
 322       }
 323     }
 324   }
 325 
 326   fclose(mntinfo);
 327 
 328   if (memory == NULL || cpuset == NULL || cpu == NULL || cpuacct == NULL) {
 329     log_debug(os, container)("Required cgroup subsystems not found");
 330     return;
 331   }
 332 
 333   /*
 334    * Read /proc/self/cgroup and map host mount point to
 335    * local one via /proc/self/mountinfo content above
 336    *
 337    * Docker example:
 338    * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044
 339    *
 340    * Host example:
 341    * 5:memory:/user.slice
 342    *
 343    * Construct a path to the process specific memory and cpuset
 344    * cgroup directory.
 345    *
 346    * For a container running under Docker from memory example above
 347    * the paths would be:
 348    *
 349    * /sys/fs/cgroup/memory
 350    *
 351    * For a Host from memory example above the path would be:
 352    *
 353    * /sys/fs/cgroup/memory/user.slice
 354    *
 355    */
 356   cgroup = fopen("/proc/self/cgroup", "r");
 357   if (cgroup == NULL) {
 358     log_debug(os, container)("Can't open /proc/self/cgroup, %s",
 359                              os::strerror(errno));
 360     return;
 361   }
 362 
 363   while ( (p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) {
 364     int cgno;
 365     int matched;
 366     char *controller;
 367     char *base;
 368 
 369     /* Skip cgroup number */
 370     strsep(&p, ":");
 371     /* Get controller and base */
 372     controller = strsep(&p, ":");
 373     base = strsep(&p, "\n");
 374 
 375     if (controller != NULL) {
 376       if (strstr(controller, "memory") != NULL) {
 377         memory->set_subsystem_path(base);
 378       } else if (strstr(controller, "cpuset") != NULL) {
 379         cpuset->set_subsystem_path(base);
 380       } else if (strstr(controller, "cpu,cpuacct") != NULL) {
 381         cpu->set_subsystem_path(base);
 382         cpuacct->set_subsystem_path(base);
 383       } else if (strstr(controller, "cpuacct") != NULL) {
 384         cpuacct->set_subsystem_path(base);
 385       } else if (strstr(controller, "cpu") != NULL) {
 386         cpu->set_subsystem_path(base);
 387       }
 388     }
 389   }
 390 
 391   fclose(cgroup);
 392 
 393   // We need to update the amount of physical memory now that
 394   // command line arguments have been processed.
 395   if ((mem_limit = memory_limit_in_bytes()) > 0) {
 396     os::Linux::set_physical_memory(mem_limit);
 397   }
 398 
 399   _is_containerized = true;
 400 
 401 }
 402 
 403 char * OSContainer::container_type() {
 404   if (is_containerized()) {
 405     return (char *)"cgroupv1";
 406   } else {
 407     return NULL;
 408   }
 409 }
 410 
 411 
 412 /* memory_limit_in_bytes
 413  *
 414  * Return the limit of available memory for this process.
 415  *
 416  * return:
 417  *    memory limit in bytes or
 418  *    -1 for unlimited
 419  *    OSCONTAINER_ERROR for not supported
 420  */
 421 jlong OSContainer::memory_limit_in_bytes() {
 422   GET_CONTAINER_INFO(jlong, memory, "/memory.limit_in_bytes",
 423                      "Memory Limit is: " JLONG_FORMAT, JLONG_FORMAT, memlimit);
 424 
 425   if (memlimit >= UNLIMITED_MEM) {
 426     log_trace(os, container)("Memory Limit is: Unlimited");
 427     return (jlong)-1;
 428   }
 429   else {
 430     return memlimit;
 431   }
 432 }
 433 
 434 jlong OSContainer::memory_and_swap_limit_in_bytes() {
 435   GET_CONTAINER_INFO(jlong, memory, "/memory.memsw.limit_in_bytes",
 436                      "Memory and Swap Limit is: " JLONG_FORMAT, JLONG_FORMAT, memswlimit);
 437   if (memswlimit >= UNLIMITED_MEM) {
 438     log_trace(os, container)("Memory and Swap Limit is: Unlimited");
 439     return (jlong)-1;
 440   } else {
 441     return memswlimit;
 442   }
 443 }
 444 
 445 jlong OSContainer::memory_soft_limit_in_bytes() {
 446   GET_CONTAINER_INFO(jlong, memory, "/memory.soft_limit_in_bytes",
 447                      "Memory Soft Limit is: " JLONG_FORMAT, JLONG_FORMAT, memsoftlimit);
 448   if (memsoftlimit >= UNLIMITED_MEM) {
 449     log_trace(os, container)("Memory Soft Limit is: Unlimited");
 450     return (jlong)-1;
 451   } else {
 452     return memsoftlimit;
 453   }
 454 }
 455 
 456 /* memory_usage_in_bytes
 457  *
 458  * Return the amount of used memory for this process.
 459  *
 460  * return:
 461  *    memory usage in bytes or
 462  *    -1 for unlimited
 463  *    OSCONTAINER_ERROR for not supported
 464  */
 465 jlong OSContainer::memory_usage_in_bytes() {
 466   GET_CONTAINER_INFO(jlong, memory, "/memory.usage_in_bytes",
 467                      "Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memusage);
 468   return memusage;
 469 }
 470 
 471 /* memory_max_usage_in_bytes
 472  *
 473  * Return the maximum amount of used memory for this process.
 474  *
 475  * return:
 476  *    max memory usage in bytes or
 477  *    OSCONTAINER_ERROR for not supported
 478  */
 479 jlong OSContainer::memory_max_usage_in_bytes() {
 480   GET_CONTAINER_INFO(jlong, memory, "/memory.max_usage_in_bytes",
 481                      "Maximum Memory Usage is: " JLONG_FORMAT, JLONG_FORMAT, memmaxusage);
 482   return memmaxusage;
 483 }
 484 
 485 /* active_processor_count
 486  *
 487  * Calculate an appropriate number of active processors for the
 488  * VM to use based on these three cgroup options.
 489  *
 490  * cpu affinity
 491  * cpu quota & cpu period
 492  * cpu shares
 493  *
 494  * Algorithm:
 495  *
 496  * Determine the number of available CPUs from sched_getaffinity
 497  *
 498  * If user specified a quota (quota != -1), calculate the number of
 499  * required CPUs by dividing quota by period.
 500  *
 501  * If shares are in effect (shares != -1), calculate the number
 502  * of cpus required for the shares by dividing the share value
 503  * by PER_CPU_SHARES.
 504  *
 505  * All results of division are rounded up to the next whole number.
 506  *
 507  * Return the smaller number from the three different settings.
 508  *
 509  * return:
 510  *    number of cpus
 511  *    OSCONTAINER_ERROR if failure occured during extract of cpuset info
 512  */
 513 int OSContainer::active_processor_count() {
 514   int cpu_count, share_count, quota_count;
 515   int share, quota, period;
 516   int result;
 517 
 518   cpu_count = os::Linux::active_processor_count();
 519 
 520   share = cpu_shares();
 521   if (share > -1) {
 522     share_count = ceilf((float)share / (float)PER_CPU_SHARES);
 523     log_trace(os, container)("cpu_share count: %d", share_count);
 524   } else {
 525     share_count = cpu_count;
 526   }
 527 
 528   quota = cpu_quota();
 529   period = cpu_period();
 530   if (quota > -1 && period > 0) {
 531     quota_count = ceilf((float)quota / (float)period);
 532     log_trace(os, container)("quota_count: %d", quota_count);
 533   } else {
 534     quota_count = cpu_count;
 535   }
 536 
 537   result = MIN2(cpu_count, MIN2(share_count, quota_count));
 538   log_trace(os, container)("OSContainer::active_processor_count: %d", result);
 539   return result;
 540 }
 541 
 542 char * OSContainer::cpu_cpuset_cpus() {
 543   GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.cpus",
 544                      "cpuset.cpus is: %s", "%1023s", cpus, 1024);
 545   return os::strdup(cpus);
 546 }
 547 
 548 char * OSContainer::cpu_cpuset_memory_nodes() {
 549   GET_CONTAINER_INFO_CPTR(cptr, cpuset, "/cpuset.mems",
 550                      "cpuset.mems is: %s", "%1023s", mems, 1024);
 551   return os::strdup(mems);
 552 }
 553 
 554 /* cpu_quota
 555  *
 556  * Return the number of milliseconds per period
 557  * process is guaranteed to run.
 558  *
 559  * return:
 560  *    quota time in milliseconds
 561  *    -1 for no quota
 562  *    OSCONTAINER_ERROR for not supported
 563  */
 564 int OSContainer::cpu_quota() {
 565   GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_quota_us",
 566                      "CPU Quota is: %d", "%d", quota);
 567   return quota;
 568 }
 569 
 570 int OSContainer::cpu_period() {
 571   GET_CONTAINER_INFO(int, cpu, "/cpu.cfs_period_us",
 572                      "CPU Period is: %d", "%d", period);
 573   return period;
 574 }
 575 
 576 /* cpu_shares
 577  *
 578  * Return the amount of cpu shares available to the process
 579  *
 580  * return:
 581  *    Share number (typically a number relative to 1024)
 582  *                 (2048 typically expresses 2 CPUs worth of processing)
 583  *    -1 for no share setup
 584  *    OSCONTAINER_ERROR for not supported
 585  */
 586 int OSContainer::cpu_shares() {
 587   GET_CONTAINER_INFO(int, cpu, "/cpu.shares",
 588                      "CPU Shares is: %d", "%d", shares);
 589   // Convert 1024 to no shares setup
 590   if (shares == 1024) return -1;
 591 
 592   return shares;
 593 }
 594