背景
内存使用率告警的PromQL如下:
100*(sum (container_memory_working_set_bytes{namespace=~"argo|khaos|obs|kube-system"}) by (khaos_product,khaos_cluster,namespace,app_name,pod,container)/sum (container_spec_memory_limit_bytes{namespace=~"argo|khaos|obs|kube-system"}) by (khaos_product,khaos_cluster,namespace,app_name,pod,container) <= 1)
其中container_memory_working_set_bytes只扣除了inactive_file,仍然包含其余的pagecache内存(例如active_file),如果容器使用了较多的pagecache,计算出来的内存使用率会偏高。那么我们真正需要关心的指标是哪些呢?这需要先弄清楚Linux内核的OOM killer依赖哪些具体的cgroup指标项来执行oom kill。
container_memory_working_set_bytes的具体实现
cAdvisor中的计算源码如下:
// setMemoryStats copies the memory section of the cgroup statistics in s into
// the container statistics in ret and derives ret.Memory.WorkingSet from them.
//
// Cache, RSS, Swap and MappedFile are read from s.MemoryStats.Stats under keys
// that depend on the cgroup mode:
//   - unified (cgroup v2) mode: "file", "anon" and "file_mapped"; Swap is
//     computed as SwapUsage.Usage - Usage.Usage instead of being read by key
//   - UseHierarchy set: the "total_"-prefixed keys
//   - otherwise: the unprefixed keys
//
// WorkingSet is Usage minus the inactive-file counter, floored at zero. When
// the counter is not present in the stats map, WorkingSet equals Usage.
func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
    // Plain copies of the usage counters.
    ret.Memory.Usage = s.MemoryStats.Usage.Usage
    ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
    ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
    ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage
    if cgroups.IsCgroup2UnifiedMode() {
        ret.Memory.Cache = s.MemoryStats.Stats["file"]
        ret.Memory.RSS = s.MemoryStats.Stats["anon"]
        // NOTE(review): presumably SwapUsage.Usage is reported as memory+swap
        // combined here, so the difference is swap only. If these counters are
        // unsigned and that assumption does not hold, the subtraction wraps
        // around; confirm against the runc version in use.
        ret.Memory.Swap = s.MemoryStats.SwapUsage.Usage - s.MemoryStats.Usage.Usage
        ret.Memory.MappedFile = s.MemoryStats.Stats["file_mapped"]
    } else if s.MemoryStats.UseHierarchy {
        ret.Memory.Cache = s.MemoryStats.Stats["total_cache"]
        ret.Memory.RSS = s.MemoryStats.Stats["total_rss"]
        ret.Memory.Swap = s.MemoryStats.Stats["total_swap"]
        ret.Memory.MappedFile = s.MemoryStats.Stats["total_mapped_file"]
    } else {
        ret.Memory.Cache = s.MemoryStats.Stats["cache"]
        ret.Memory.RSS = s.MemoryStats.Stats["rss"]
        ret.Memory.Swap = s.MemoryStats.Stats["swap"]
        ret.Memory.MappedFile = s.MemoryStats.Stats["mapped_file"]
    }
    // Page-fault counters are copied only when present, and the same value is
    // written to both the container-level and the hierarchical fields.
    if v, ok := s.MemoryStats.Stats["pgfault"]; ok {
        ret.Memory.ContainerData.Pgfault = v
        ret.Memory.HierarchicalData.Pgfault = v
    }
    if v, ok := s.MemoryStats.Stats["pgmajfault"]; ok {
        ret.Memory.ContainerData.Pgmajfault = v
        ret.Memory.HierarchicalData.Pgmajfault = v
    }
    // The inactive-file key defaults to the "total_" form and switches to the
    // unprefixed form only in unified mode; UseHierarchy is not consulted here.
    inactiveFileKeyName := "total_inactive_file"
    if cgroups.IsCgroup2UnifiedMode() {
        inactiveFileKeyName = "inactive_file"
    }
    // Working set = usage - inactive file, clamped so it never goes below zero.
    workingSet := ret.Memory.Usage
    if v, ok := s.MemoryStats.Stats[inactiveFileKeyName]; ok {
        if workingSet < v {
            workingSet = 0
        } else {
            workingSet -= v
        }
    }
    ret.Memory.WorkingSet = workingSet
}
其中的s.MemoryStats.Usage.Usage来源于github.com/opencontainers/runc库的getMemoryData函数:
// getMemoryData reads one family of memory counters from the cgroup directory
// path and returns them as a cgroups.MemoryData.
//
// The file-name prefix is "memory" when name is empty and "memory.<name>"
// otherwise. Four files are read in order through fscommon.GetCgroupParamUint:
// <prefix>.usage_in_bytes, <prefix>.max_usage_in_bytes, <prefix>.failcnt and
// <prefix>.limit_in_bytes.
//
// Two "file does not exist" cases are tolerated:
//   - usage file missing and name is non-empty: an empty MemoryData and a nil
//     error are returned
//   - limit file missing and name is "kmem": the data read so far is returned
//     with Limit left at its zero value
//
// Any other read error is returned together with an empty MemoryData.
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
    memoryData := cgroups.MemoryData{}
    moduleName := "memory"
    if name != "" {
        moduleName = "memory." + name
    }
    // File names of the four counters for this controller family.
    var (
        usage    = moduleName + ".usage_in_bytes"
        maxUsage = moduleName + ".max_usage_in_bytes"
        failcnt  = moduleName + ".failcnt"
        limit    = moduleName + ".limit_in_bytes"
    )
    value, err := fscommon.GetCgroupParamUint(path, usage)
    if err != nil {
        if name != "" && os.IsNotExist(err) {
            // Ignore ENOENT as swap and kmem controllers
            // are optional in the kernel.
            return cgroups.MemoryData{}, nil
        }
        return cgroups.MemoryData{}, err
    }
    memoryData.Usage = value
    value, err = fscommon.GetCgroupParamUint(path, maxUsage)
    if err != nil {
        return cgroups.MemoryData{}, err
    }
    memoryData.MaxUsage = value
    value, err = fscommon.GetCgroupParamUint(path, failcnt)
    if err != nil {
        return cgroups.MemoryData{}, err
    }
    memoryData.Failcnt = value
    value, err = fscommon.GetCgroupParamUint(path, limit)
    if err != nil {
        if name == "kmem" && os.IsNotExist(err) {
            // Ignore ENOENT as kmem.limit_in_bytes has
            // been removed in newer kernels.
            // Usage, MaxUsage and Failcnt are still returned.
            return memoryData, nil
        }
        return cgroups.MemoryData{}, err
    }
    memoryData.Limit = value
    return memoryData, nil
}
可以看到,在cgroup v1下container_memory_working_set_bytes = memory.usage_in_bytes - memory.stat[total_inactive_file](cgroup v2下减去的是memory.stat中的inactive_file;若差值为负则取0),其中的memory.usage_in_bytes就包含了pagecache内容。
OOM Killer机制
在Linux OOM killer的源码中(oom_badness函数,见 https://elixir.bootlin.com/linux/v5.4.58/source/mm/oom_kill.c#L227 ),主要使用的是rss+swap+pagetable来计算进程的oom分值。因此在pagecache比较大的场景下,可以使用底层已经提供的两个指标container_memory_rss + container_memory_swap来替代container_memory_working_set_bytes计算内存使用率。