1 /* 2 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include <string.h> 26 #include <math.h> 27 #include <errno.h> 28 #include "cgroupSubsystem_linux.hpp" 29 #include "cgroupV1Subsystem_linux.hpp" 30 #include "cgroupV2Subsystem_linux.hpp" 31 #include "logging/log.hpp" 32 #include "memory/allocation.hpp" 33 #include "runtime/globals.hpp" 34 #include "runtime/os.hpp" 35 #include "utilities/globalDefinitions.hpp" 36 37 CgroupSubsystem* CgroupSubsystemFactory::create() { 38 CgroupV1MemoryController* memory = NULL; 39 CgroupV1Controller* cpuset = NULL; 40 CgroupV1Controller* cpu = NULL; 41 CgroupV1Controller* cpuacct = NULL; 42 FILE *mntinfo = NULL; 43 FILE *cgroups = NULL; 44 FILE *cgroup = NULL; 45 char buf[MAXPATHLEN+1]; 46 char tmproot[MAXPATHLEN+1]; 47 char tmpmount[MAXPATHLEN+1]; 48 char *p; 49 bool is_cgroupsV2; 50 // true iff all controllers, memory, cpu, cpuset, cpuacct are enabled 51 // at the kernel level. 52 bool all_controllers_enabled; 53 54 CgroupInfo cg_infos[CG_INFO_LENGTH]; 55 int cpuset_idx = 0; 56 int cpu_idx = 1; 57 int cpuacct_idx = 2; 58 int memory_idx = 3; 59 60 /* 61 * Read /proc/cgroups so as to be able to distinguish cgroups v2 vs cgroups v1. 62 * 63 * For cgroups v1 unified hierarchy, cpu, cpuacct, cpuset, memory controllers 64 * must have non-zero for the hierarchy ID field. 65 */ 66 cgroups = fopen("/proc/cgroups", "r"); 67 if (cgroups == NULL) { 68 log_debug(os, container)("Can't open /proc/cgroups, %s", 69 os::strerror(errno)); 70 return NULL; 71 } 72 73 while ((p = fgets(buf, MAXPATHLEN, cgroups)) != NULL) { 74 char name[MAXPATHLEN+1]; 75 int hierarchy_id; 76 int enabled; 77 78 // Format of /proc/cgroups documented via man 7 cgroups 79 if (sscanf(p, "%s %d %*d %d", name, &hierarchy_id, &enabled) != 3) { 80 continue; 81 } 82 if (strcmp(name, "memory") == 0) { 83 cg_infos[memory_idx]._name = os::strdup(name); 84 cg_infos[memory_idx]._hierarchy_id = hierarchy_id; 85 cg_infos[memory_idx]._enabled = (enabled == 1); 86 } else if (strcmp(name, "cpuset") == 0) { 87 cg_infos[cpuset_idx]._name = os::strdup(name); 88 cg_infos[cpuset_idx]._hierarchy_id = hierarchy_id; 89 cg_infos[cpuset_idx]._enabled = (enabled == 1); 90 } else if (strcmp(name, "cpu") == 0) { 91 cg_infos[cpu_idx]._name = os::strdup(name); 92 cg_infos[cpu_idx]._hierarchy_id = hierarchy_id; 93 cg_infos[cpu_idx]._enabled = (enabled == 1); 94 } else if (strcmp(name, "cpuacct") == 0) { 95 cg_infos[cpuacct_idx]._name = os::strdup(name); 96 cg_infos[cpuacct_idx]._hierarchy_id = hierarchy_id; 97 cg_infos[cpuacct_idx]._enabled = (enabled == 1); 98 } 99 } 100 fclose(cgroups); 101 102 is_cgroupsV2 = true; 103 all_controllers_enabled = true; 104 for (int i = 0; i < CG_INFO_LENGTH; i++) { 105 is_cgroupsV2 = is_cgroupsV2 && cg_infos[i]._hierarchy_id == 0; 106 all_controllers_enabled = all_controllers_enabled && cg_infos[i]._enabled; 107 } 108 109 if (!all_controllers_enabled) { 110 // one or more controllers disabled, disable container support 111 log_debug(os, container)("One or more required controllers disabled at kernel level."); 112 return NULL; 113 } 114 115 /* 116 * Read /proc/self/cgroup and determine: 117 * - the cgroup path for cgroups v2 or 118 * - on a cgroups v1 system, collect info for mapping 119 * the host mount point to the local one via /proc/self/mountinfo below. 120 */ 121 cgroup = fopen("/proc/self/cgroup", "r"); 122 if (cgroup == NULL) { 123 log_debug(os, container)("Can't open /proc/self/cgroup, %s", 124 os::strerror(errno)); 125 return NULL; 126 } 127 128 while ((p = fgets(buf, MAXPATHLEN, cgroup)) != NULL) { 129 char *controllers; 130 char *token; 131 char *hierarchy_id_str; 132 int hierarchy_id; 133 char *cgroup_path; 134 135 hierarchy_id_str = strsep(&p, ":"); 136 hierarchy_id = atoi(hierarchy_id_str); 137 /* Get controllers and base */ 138 controllers = strsep(&p, ":"); 139 cgroup_path = strsep(&p, "\n"); 140 141 if (controllers == NULL) { 142 continue; 143 } 144 145 while (!is_cgroupsV2 && (token = strsep(&controllers, ",")) != NULL) { 146 if (strcmp(token, "memory") == 0) { 147 assert(hierarchy_id == cg_infos[memory_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch"); 148 cg_infos[memory_idx]._cgroup_path = os::strdup(cgroup_path); 149 } else if (strcmp(token, "cpuset") == 0) { 150 assert(hierarchy_id == cg_infos[cpuset_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch"); 151 cg_infos[cpuset_idx]._cgroup_path = os::strdup(cgroup_path); 152 } else if (strcmp(token, "cpu") == 0) { 153 assert(hierarchy_id == cg_infos[cpu_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch"); 154 cg_infos[cpu_idx]._cgroup_path = os::strdup(cgroup_path); 155 } else if (strcmp(token, "cpuacct") == 0) { 156 assert(hierarchy_id == cg_infos[cpuacct_idx]._hierarchy_id, "/proc/cgroups and /proc/self/cgroup hierarchy mismatch"); 157 cg_infos[cpuacct_idx]._cgroup_path = os::strdup(cgroup_path); 158 } 159 } 160 if (is_cgroupsV2) { 161 for (int i = 0; i < CG_INFO_LENGTH; i++) { 162 cg_infos[i]._cgroup_path = os::strdup(cgroup_path); 163 } 164 } 165 } 166 fclose(cgroup); 167 168 if (is_cgroupsV2) { 169 // Find the cgroup2 mount point by reading /proc/self/mountinfo 170 mntinfo = fopen("/proc/self/mountinfo", "r"); 171 if (mntinfo == NULL) { 172 log_debug(os, container)("Can't open /proc/self/mountinfo, %s", 173 os::strerror(errno)); 174 return NULL; 175 } 176 177 char cgroupv2_mount[MAXPATHLEN+1]; 178 char fstype[MAXPATHLEN+1]; 179 bool mount_point_found = false; 180 while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) { 181 char *tmp_mount_point = cgroupv2_mount; 182 char *tmp_fs_type = fstype; 183 184 // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt 185 if (sscanf(p, "%*d %*d %*d:%*d %*s %s %*[^-]- %s cgroup2 %*s", tmp_mount_point, tmp_fs_type) == 2) { 186 // we likely have an early match return, be sure we have cgroup2 as fstype 187 if (strcmp("cgroup2", tmp_fs_type) == 0) { 188 mount_point_found = true; 189 break; 190 } 191 } 192 } 193 fclose(mntinfo); 194 if (!mount_point_found) { 195 log_trace(os, container)("Mount point for cgroupv2 not found in /proc/self/mountinfo"); 196 return NULL; 197 } 198 // Cgroups v2 case, we have all the info we need. 199 // Construct the subsystem, free resources and return 200 // Note: any index in cg_infos will do as the path is the same for 201 // all controllers. 202 CgroupController* unified = new CgroupV2Controller(cgroupv2_mount, cg_infos[memory_idx]._cgroup_path); 203 for (int i = 0; i < CG_INFO_LENGTH; i++) { 204 os::free(cg_infos[i]._name); 205 os::free(cg_infos[i]._cgroup_path); 206 } 207 log_debug(os, container)("Detected cgroups v2 unified hierarchy"); 208 return new CgroupV2Subsystem(unified); 209 } 210 211 // What follows is cgroups v1 212 log_debug(os, container)("Detected cgroups hybrid or legacy hierarchy, using cgroups v1 controllers"); 213 214 /* 215 * Find the cgroup mount point for memory and cpuset 216 * by reading /proc/self/mountinfo 217 * 218 * Example for docker: 219 * 219 214 0:29 /docker/7208cebd00fa5f2e342b1094f7bed87fa25661471a4637118e65f1c995be8a34 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory 220 * 221 * Example for host: 222 * 34 28 0:29 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,memory 223 */ 224 mntinfo = fopen("/proc/self/mountinfo", "r"); 225 if (mntinfo == NULL) { 226 log_debug(os, container)("Can't open /proc/self/mountinfo, %s", 227 os::strerror(errno)); 228 return NULL; 229 } 230 231 while ((p = fgets(buf, MAXPATHLEN, mntinfo)) != NULL) { 232 char tmpcgroups[MAXPATHLEN+1]; 233 char *cptr = tmpcgroups; 234 char *token; 235 236 // mountinfo format is documented at https://www.kernel.org/doc/Documentation/filesystems/proc.txt 237 if (sscanf(p, "%*d %*d %*d:%*d %s %s %*[^-]- cgroup %*s %s", tmproot, tmpmount, tmpcgroups) != 3) { 238 continue; 239 } 240 while ((token = strsep(&cptr, ",")) != NULL) { 241 if (strcmp(token, "memory") == 0) { 242 memory = new CgroupV1MemoryController(tmproot, tmpmount); 243 } else if (strcmp(token, "cpuset") == 0) { 244 cpuset = new CgroupV1Controller(tmproot, tmpmount); 245 } else if (strcmp(token, "cpu") == 0) { 246 cpu = new CgroupV1Controller(tmproot, tmpmount); 247 } else if (strcmp(token, "cpuacct") == 0) { 248 cpuacct= new CgroupV1Controller(tmproot, tmpmount); 249 } 250 } 251 } 252 253 fclose(mntinfo); 254 255 if (memory == NULL) { 256 log_debug(os, container)("Required cgroup v1 memory subsystem not found"); 257 return NULL; 258 } 259 if (cpuset == NULL) { 260 log_debug(os, container)("Required cgroup v1 cpuset subsystem not found"); 261 return NULL; 262 } 263 if (cpu == NULL) { 264 log_debug(os, container)("Required cgroup v1 cpu subsystem not found"); 265 return NULL; 266 } 267 if (cpuacct == NULL) { 268 log_debug(os, container)("Required cgroup v1 cpuacct subsystem not found"); 269 return NULL; 270 } 271 272 /* 273 * Use info gathered previously from /proc/self/cgroup 274 * and map host mount point to 275 * local one via /proc/self/mountinfo content above 276 * 277 * Docker example: 278 * 5:memory:/docker/6558aed8fc662b194323ceab5b964f69cf36b3e8af877a14b80256e93aecb044 279 * 280 * Host example: 281 * 5:memory:/user.slice 282 * 283 * Construct a path to the process specific memory and cpuset 284 * cgroup directory. 285 * 286 * For a container running under Docker from memory example above 287 * the paths would be: 288 * 289 * /sys/fs/cgroup/memory 290 * 291 * For a Host from memory example above the path would be: 292 * 293 * /sys/fs/cgroup/memory/user.slice 294 * 295 */ 296 for (int i = 0; i < CG_INFO_LENGTH; i++) { 297 CgroupInfo info = cg_infos[i]; 298 if (strcmp(info._name, "memory") == 0) { 299 memory->set_subsystem_path(info._cgroup_path); 300 } else if (strcmp(info._name, "cpuset") == 0) { 301 cpuset->set_subsystem_path(info._cgroup_path); 302 } else if (strcmp(info._name, "cpu") == 0) { 303 cpu->set_subsystem_path(info._cgroup_path); 304 } else if (strcmp(info._name, "cpuacct") == 0) { 305 cpuacct->set_subsystem_path(info._cgroup_path); 306 } 307 } 308 return new CgroupV1Subsystem(cpuset, cpu, cpuacct, memory); 309 } 310 311 /* active_processor_count 312 * 313 * Calculate an appropriate number of active processors for the 314 * VM to use based on these three inputs. 315 * 316 * cpu affinity 317 * cgroup cpu quota & cpu period 318 * cgroup cpu shares 319 * 320 * Algorithm: 321 * 322 * Determine the number of available CPUs from sched_getaffinity 323 * 324 * If user specified a quota (quota != -1), calculate the number of 325 * required CPUs by dividing quota by period. 326 * 327 * If shares are in effect (shares != -1), calculate the number 328 * of CPUs required for the shares by dividing the share value 329 * by PER_CPU_SHARES. 330 * 331 * All results of division are rounded up to the next whole number. 332 * 333 * If neither shares or quotas have been specified, return the 334 * number of active processors in the system. 335 * 336 * If both shares and quotas have been specified, the results are 337 * based on the flag PreferContainerQuotaForCPUCount. If true, 338 * return the quota value. If false return the smallest value 339 * between shares or quotas. 340 * 341 * If shares and/or quotas have been specified, the resulting number 342 * returned will never exceed the number of active processors. 343 * 344 * return: 345 * number of CPUs 346 */ 347 int CgroupSubsystem::active_processor_count(int physical_proc_count) { 348 int quota_count = 0, share_count = 0; 349 int cpu_count, limit_count; 350 int result; 351 352 cpu_count = limit_count = physical_proc_count; 353 int quota = cpu_quota(); 354 int period = cpu_period(); 355 int share = cpu_shares(); 356 357 if (quota > -1 && period > 0) { 358 quota_count = ceilf((float)quota / (float)period); 359 log_trace(os, container)("CPU Quota count based on quota/period: %d", quota_count); 360 } 361 if (share > -1) { 362 share_count = ceilf((float)share / (float)PER_CPU_SHARES); 363 log_trace(os, container)("CPU Share count based on shares: %d", share_count); 364 } 365 366 // If both shares and quotas are setup results depend 367 // on flag PreferContainerQuotaForCPUCount. 368 // If true, limit CPU count to quota 369 // If false, use minimum of shares and quotas 370 if (quota_count !=0 && share_count != 0) { 371 if (PreferContainerQuotaForCPUCount) { 372 limit_count = quota_count; 373 } else { 374 limit_count = MIN2(quota_count, share_count); 375 } 376 } else if (quota_count != 0) { 377 limit_count = quota_count; 378 } else if (share_count != 0) { 379 limit_count = share_count; 380 } 381 382 result = MIN2(cpu_count, limit_count); 383 log_trace(os, container)("OSContainer::active_processor_count: %d", result); 384 return result; 385 } 386 387 /* memory_limit_in_bytes 388 * 389 * Return the limit of available memory for this process. 390 * 391 * return: 392 * memory limit in bytes or 393 * -1 for unlimited 394 * OSCONTAINER_ERROR for not supported 395 */ 396 jlong CgroupSubsystem::memory_limit_in_bytes() { 397 CachingCgroupController* contrl = memory_controller(); 398 CachedMetric* memory_limit = contrl->metrics_cache(); 399 if (!memory_limit->should_check_metric()) { 400 return memory_limit->value(); 401 } 402 jlong mem_limit = read_memory_limit_in_bytes(); 403 // Update cached metric to avoid re-reading container settings too often 404 memory_limit->set_value(mem_limit); 405 return mem_limit; 406 }