Skip to main content

背景

内存使用率告警的promeql如下:

100*(sum (container_memory_working_set_bytes{namespace=~"argo|khaos|obs|kube-system"}) by (khaos_product,khaos_cluster,namespace,app_name,pod,container)/sum (container_spec_memory_limit_bytes{namespace=~"argo|khaos|obs|kube-system"}) by (khaos_product,khaos_cluster,namespace,app_name,pod,container) <= 1)

其中container_memory_working_set_bytes包含了pagecache内存,如果容器使用了较多的pagecache,计算出来的内存使用率会比较偏高。那么我们需要关心的指标应该是哪些呢?这需要解答linux内核oom killer依赖哪些具体的cgroup指标项来执行oom kill

container_memory_working_set_bytes的具体实现

cadvisor中的源码计算如下:

func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.Usage = s.MemoryStats.Usage.Usage
ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage

if cgroups.IsCgroup2UnifiedMode() {
ret.Memory.Cache = s.MemoryStats.Stats["file"]
ret.Memory.RSS = s.MemoryStats.Stats["anon"]
ret.Memory.Swap = s.MemoryStats.SwapUsage.Usage - s.MemoryStats.Usage.Usage
ret.Memory.MappedFile = s.MemoryStats.Stats["file_mapped"]
} else if s.MemoryStats.UseHierarchy {
ret.Memory.Cache = s.MemoryStats.Stats["total_cache"]
ret.Memory.RSS = s.MemoryStats.Stats["total_rss"]
ret.Memory.Swap = s.MemoryStats.Stats["total_swap"]
ret.Memory.MappedFile = s.MemoryStats.Stats["total_mapped_file"]
} else {
ret.Memory.Cache = s.MemoryStats.Stats["cache"]
ret.Memory.RSS = s.MemoryStats.Stats["rss"]
ret.Memory.Swap = s.MemoryStats.Stats["swap"]
ret.Memory.MappedFile = s.MemoryStats.Stats["mapped_file"]
}
if v, ok := s.MemoryStats.Stats["pgfault"]; ok {
ret.Memory.ContainerData.Pgfault = v
ret.Memory.HierarchicalData.Pgfault = v
}
if v, ok := s.MemoryStats.Stats["pgmajfault"]; ok {
ret.Memory.ContainerData.Pgmajfault = v
ret.Memory.HierarchicalData.Pgmajfault = v
}

inactiveFileKeyName := "total_inactive_file"
if cgroups.IsCgroup2UnifiedMode() {
inactiveFileKeyName = "inactive_file"
}

workingSet := ret.Memory.Usage
if v, ok := s.MemoryStats.Stats[inactiveFileKeyName]; ok {
if workingSet < v {
workingSet = 0
} else {
workingSet -= v
}
}
ret.Memory.WorkingSet = workingSet
}

 其中的s.MemoryStats.Usage.Usage来源于github.com/opencontainers/runc库:

func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}

moduleName := "memory"
if name != "" {
moduleName = "memory." + name
}
var (
usage = moduleName + ".usage_in_bytes"
maxUsage = moduleName + ".max_usage_in_bytes"
failcnt = moduleName + ".failcnt"
limit = moduleName + ".limit_in_bytes"
)

value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as swap and kmem controllers
// are optional in the kernel.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if name == "kmem" && os.IsNotExist(err) {
// Ignore ENOENT as kmem.limit_in_bytes has
// been removed in newer kernels.
return memoryData, nil
}

return cgroups.MemoryData{}, err
}
memoryData.Limit = value

return memoryData, nil
}

可以看到container_memory_working_set_bytes=memory.usage_in_bytes - memory.stat[total_inactive_file],其中的memory.usage_in_bytes就包含了pagecache内容。

OOM Killer机制

linux oom-killer的源码中,主要使用的是rss+swap+pagetable来计算oom分值 https://elixir.bootlin.com/linux/v5.4.58/source/mm/oom_kill.c#L227 因此在pagecache比较大的场景下可以使用底层已经提供的两个指标container_memory_rss+container_memory_swap来替代container_memory_working_set_bytes计算内存使用率。

参考资料