背景

内存使用率告警的PromQL如下:

100*(sum (container_memory_working_set_bytes{namespace=~"argo|khaos|obs|kube-system"}) by (khaos_product,khaos_cluster,namespace,app_name,pod,container)/sum (container_spec_memory_limit_bytes{namespace=~"argo|khaos|obs|kube-system"}) by (khaos_product,khaos_cluster,namespace,app_name,pod,container) <= 1)

其中container_memory_working_set_bytes包含了page cache内存(只扣除了其中inactive_file的部分),如果容器使用了较多的page cache,计算出来的内存使用率会偏高。那么我们应该关心哪些指标呢?这需要先弄清楚Linux内核的OOM killer依赖哪些具体的cgroup指标项来执行OOM kill。

container_memory_working_set_bytes的具体实现

cadvisor中的源码计算如下:

// setMemoryStats fills ret.Memory from the cgroup memory statistics in s.
//
// Usage, MaxUsage, Failcnt and KernelUsage are copied directly. Cache, RSS,
// Swap and MappedFile are taken from s.MemoryStats.Stats using key names that
// depend on the cgroup mode. WorkingSet is derived as Usage minus the
// inactive-file counter, floored at zero.
func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
	ret.Memory.Usage = s.MemoryStats.Usage.Usage
	ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage
	ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt
	ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage

	// Pick the stat keys for the active cgroup mode: unified (v2) names,
	// hierarchical "total_*" names, or the plain per-cgroup names.
	if cgroups.IsCgroup2UnifiedMode() {
		ret.Memory.Cache = s.MemoryStats.Stats["file"]
		ret.Memory.RSS = s.MemoryStats.Stats["anon"]
		// NOTE(review): presumably SwapUsage.Usage is reported as memory+swap
		// combined in this mode, so the subtraction leaves swap only. The
		// subtraction is not guarded against SwapUsage.Usage being smaller
		// than Usage.Usage; confirm against the runc version in use.
		ret.Memory.Swap = s.MemoryStats.SwapUsage.Usage - s.MemoryStats.Usage.Usage
		ret.Memory.MappedFile = s.MemoryStats.Stats["file_mapped"]
	} else if s.MemoryStats.UseHierarchy {
		ret.Memory.Cache = s.MemoryStats.Stats["total_cache"]
		ret.Memory.RSS = s.MemoryStats.Stats["total_rss"]
		ret.Memory.Swap = s.MemoryStats.Stats["total_swap"]
		ret.Memory.MappedFile = s.MemoryStats.Stats["total_mapped_file"]
	} else {
		ret.Memory.Cache = s.MemoryStats.Stats["cache"]
		ret.Memory.RSS = s.MemoryStats.Stats["rss"]
		ret.Memory.Swap = s.MemoryStats.Stats["swap"]
		ret.Memory.MappedFile = s.MemoryStats.Stats["mapped_file"]
	}
	// Page-fault counters are optional; when present, the same value is
	// written to both the container-level and the hierarchical record.
	if v, ok := s.MemoryStats.Stats["pgfault"]; ok {
		ret.Memory.ContainerData.Pgfault = v
		ret.Memory.HierarchicalData.Pgfault = v
	}
	if v, ok := s.MemoryStats.Stats["pgmajfault"]; ok {
		ret.Memory.ContainerData.Pgmajfault = v
		ret.Memory.HierarchicalData.Pgmajfault = v
	}

	// Outside unified mode the key is "total_inactive_file" whether or not
	// UseHierarchy is set.
	inactiveFileKeyName := "total_inactive_file"
	if cgroups.IsCgroup2UnifiedMode() {
		inactiveFileKeyName = "inactive_file"
	}

	// Working set = usage minus inactive file pages. If the key is missing,
	// the working set stays equal to Usage; if the inactive-file counter
	// exceeds Usage, the result is clamped to zero instead of subtracting
	// past it.
	workingSet := ret.Memory.Usage
	if v, ok := s.MemoryStats.Stats[inactiveFileKeyName]; ok {
		if workingSet < v {
			workingSet = 0
		} else {
			workingSet -= v
		}
	}
	ret.Memory.WorkingSet = workingSet
}

其中的s.MemoryStats.Usage.Usage来源于github.com/opencontainers/runc库(以下为cgroup v1的实现):

// getMemoryData reads the usage, max-usage, failcnt and limit counters for one
// memory accounting group from the cgroup directory path.
//
// name selects the file prefix: "" reads memory.*, any other value reads
// memory.<name>.* (for example memory.kmem.*).
//
// It returns a zero MemoryData and a nil error when name is non-empty and the
// usage file does not exist. When name is "kmem" and only the limit file is
// missing, it returns the counters read so far with a nil error. Any other
// read failure returns a zero MemoryData together with the error.
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
	memoryData := cgroups.MemoryData{}

	// Build the four file names from the common prefix.
	moduleName := "memory"
	if name != "" {
		moduleName = "memory." + name
	}
	var (
		usage    = moduleName + ".usage_in_bytes"
		maxUsage = moduleName + ".max_usage_in_bytes"
		failcnt  = moduleName + ".failcnt"
		limit    = moduleName + ".limit_in_bytes"
	)

	// The usage file doubles as the existence probe for optional groups.
	value, err := fscommon.GetCgroupParamUint(path, usage)
	if err != nil {
		if name != "" && os.IsNotExist(err) {
			// Ignore ENOENT as swap and kmem controllers
			// are optional in the kernel.
			return cgroups.MemoryData{}, nil
		}
		return cgroups.MemoryData{}, err
	}
	memoryData.Usage = value
	// From here on a missing file is an error, except for kmem's limit below.
	value, err = fscommon.GetCgroupParamUint(path, maxUsage)
	if err != nil {
		return cgroups.MemoryData{}, err
	}
	memoryData.MaxUsage = value
	value, err = fscommon.GetCgroupParamUint(path, failcnt)
	if err != nil {
		return cgroups.MemoryData{}, err
	}
	memoryData.Failcnt = value
	value, err = fscommon.GetCgroupParamUint(path, limit)
	if err != nil {
		if name == "kmem" && os.IsNotExist(err) {
			// Ignore ENOENT as kmem.limit_in_bytes has
			// been removed in newer kernels.
			// The counters already read are returned; Limit keeps its zero value.
			return memoryData, nil
		}

		return cgroups.MemoryData{}, err
	}
	memoryData.Limit = value

	return memoryData, nil
}

可以看到,在cgroup v1下container_memory_working_set_bytes = memory.usage_in_bytes - memory.stat[total_inactive_file](结果小于0时取0),其中的memory.usage_in_bytes包含了page cache,因此working set中仍然包含active_file部分的page cache。

OOM Killer机制

Linux OOM killer的源码中,主要使用rss+swap+pagetable来计算OOM分值(见 https://elixir.bootlin.com/linux/v5.4.58/source/mm/oom_kill.c#L227 )。该分值用于挑选被kill的进程;而page cache中可回收的部分会在触发OOM之前被内核回收,不应计入真实的内存压力。因此在page cache比较大的场景下,可以使用底层已经提供的两个指标container_memory_rss+container_memory_swap来替代container_memory_working_set_bytes计算内存使用率。

参考资料




Content Menu

  • No labels