
Understanding How Linux Cgroups Account for Memory


Purpose

The reason for digging into Linux memory accounting: a service team recently reported that, according to our monitoring, a container's memory usage had reached 150% before it was OOM-killed. The metric we monitor is container_memory_usage_bytes - container_memory_cache, yet the kernel OOM logs showed that the memory actually in use never exceeded the limit. Understanding how the kernel charges memory to a cgroup helps us build more accurate monitoring. The kernel analyzed here is 5.4.
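For reference, the monitored value can be reproduced directly from the cgroup v1 control files: cAdvisor's container_memory_usage_bytes is read from memory.usage_in_bytes, and container_memory_cache roughly corresponds to the total_cache line of memory.stat. Below is a minimal user-space sketch of that computation; the cgroup path is an assumption and should be adjusted to your environment.

// Sketch: recompute "usage - cache" the way the monitoring formula does,
// from the cgroup v1 files. CGROUP_DIR is a hypothetical path.
#include <stdio.h>
#include <string.h>

#define CGROUP_DIR "/sys/fs/cgroup/memory/mygroup"   /* assumption */

static unsigned long long read_ull(const char *path)
{
    FILE *f = fopen(path, "r");
    unsigned long long v = 0;
    if (f) {
        fscanf(f, "%llu", &v);
        fclose(f);
    }
    return v;
}

static unsigned long long stat_field(const char *key)
{
    FILE *f = fopen(CGROUP_DIR "/memory.stat", "r");
    char name[64];
    unsigned long long v = 0, found = 0;
    if (!f)
        return 0;
    while (fscanf(f, "%63s %llu", name, &v) == 2) {
        if (strcmp(name, key) == 0) {
            found = v;
            break;
        }
    }
    fclose(f);
    return found;
}

int main(void)
{
    unsigned long long usage = read_ull(CGROUP_DIR "/memory.usage_in_bytes");
    unsigned long long cache = stat_field("total_cache");

    printf("usage_in_bytes = %llu\n", usage);
    printf("total_cache    = %llu\n", cache);
    printf("usage - cache  = %llu\n", usage - cache);
    return 0;
}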

Starting from the page fault

Linux uses virtual memory management: every process has its own virtual address space, and what gets allocated and freed is virtual memory. When the CPU reads or writes virtual memory and finds no corresponding physical page, a page fault is triggered and a series of functions is called to allocate physical memory. Linux memory management cannot be covered in one or two articles, so we start directly from the page fault and mainly analyze the user-space anonymous-page path. The key flow is shown in the figure below:


(Figure: page fault handling flow)
  1. A page fault triggers the fault handler, which calls __do_page_fault. That function determines whether the fault happened in kernel or user space. This article only analyzes faults in user space, since inside a cgroup most memory is consumed by user space, so the path goes to do_user_addr_fault.
  2. do_user_addr_fault calls find_vma to look up the vm_area_struct containing the faulting address (these areas are the familiar divisions of the Linux virtual address space: text, data, bss, heap, stack, etc.), and then calls handle_mm_fault to map that area.
  3. Current kernels use 5-level page tables: PGD, P4D, PUD, PMD, PTE. __handle_mm_fault first checks whether the PGD, P4D, PUD and PMD entries exist; if not, it creates the corresponding page directory entries. If creation fails it goes straight to OOM; if it succeeds, handle_pte_fault is called to create the page table entry.
  4. In handle_pte_fault, if the page table entry (pte) is null, no mapping has ever been made, and the path splits by the type of memory the user requested: anonymous pages (do_anonymous_page) and file-mapped pages (do_fault; file mappings end up with the same charging logic as anonymous pages, so below we focus on anonymous pages). If the pte existed before, the page was swapped out to disk and do_swap_page swaps it back in; since swap is disabled, we do not care about that path either.
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
    pte_t entry;

    if (unlikely(pmd_none(*vmf->pmd))) {
......
        vmf->pte = NULL;
    } else {
        /* See comment in pte_alloc_one_map() */
        if (pmd_devmap_trans_unstable(vmf->pmd))
            return 0;
......
        vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
        vmf->orig_pte = *vmf->pte;
......
        barrier();
        if (pte_none(vmf->orig_pte)) {
            pte_unmap(vmf->pte);
            vmf->pte = NULL;
        }
    }

    if (!vmf->pte) {
        if (vma_is_anonymous(vmf->vma))
            return do_anonymous_page(vmf);
        else
            return do_fault(vmf);
    }

    if (!pte_present(vmf->orig_pte))
        return do_swap_page(vmf);

    if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
        return do_numa_page(vmf);

    vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
    spin_lock(vmf->ptl);
    entry = vmf->orig_pte;
    if (unlikely(!pte_same(*vmf->pte, entry)))
        goto unlock;
    if (vmf->flags & FAULT_FLAG_WRITE) {
        if (!pte_write(entry))
            return do_wp_page(vmf);
        entry = pte_mkdirty(entry);
    }
......
    return 0;
}
  5. do_anonymous_page first allocates the page table entry; if that fails, it goes straight to OOM. Otherwise it calls alloc_zeroed_user_highpage_movable to allocate a physical page; failure again means OOM. On success it calls mem_cgroup_try_charge_delay to charge the page to the cgroup; if that exceeds the cgroup's limit, the page allocated above is released first and then OOM is triggered (this part is analyzed in detail in the next section). It then calls mk_pte to build the mapping between the page table entry and the physical page, page_add_new_anon_rmap to set up the reverse mapping from the pte to the anonymous page, mem_cgroup_commit_charge to commit the cgroup charge and update the per-type statistics, lru_cache_add_active_or_unevictable to add the page to the active (or unevictable) LRU list, and set_pte_at to install the entry into the page table. (A small user-space demo follows the kernel code below.)
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
......
    if (pte_alloc(vma->vm_mm, vmf->pmd))
        return VM_FAULT_OOM;
......
    /* Allocate our own private page. */
    if (unlikely(anon_vma_prepare(vma)))
        goto oom;
    page = alloc_zeroed_user_highpage_movable(vma, vmf->address);
    if (!page)
        goto oom;

    if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
                    false))
        goto oom_free_page;
......
    entry = mk_pte(page, vma->vm_page_prot);
    if (vma->vm_flags & VM_WRITE)
        entry = pte_mkwrite(pte_mkdirty(entry));

    vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
            &vmf->ptl);
    if (!pte_none(*vmf->pte))
        goto release;

    ret = check_stable_address_space(vma->vm_mm);
    if (ret)
        goto release;

    /* Deliver the page fault to userland, check inside PT lock */
    if (userfaultfd_missing(vma)) {
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        mem_cgroup_cancel_charge(page, memcg, false);
        put_page(page);
        return handle_userfault(vmf, VM_UFFD_MISSING);
    }

    inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
    page_add_new_anon_rmap(page, vma, vmf->address, false);
    mem_cgroup_commit_charge(page, memcg, false, false);
    lru_cache_add_active_or_unevictable(page, vma);
setpte:
    set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);

    /* No need to invalidate - it was non-present before */
    update_mmu_cache(vma, vmf->address, vmf->pte);
unlock:
    pte_unmap_unlock(vmf->pte, vmf->ptl);
    return ret;
release:
    mem_cgroup_cancel_charge(page, memcg, false);
    put_page(page);
    goto unlock;
oom_free_page:
    put_page(page);
oom:
    return VM_FAULT_OOM;
}
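To watch this path from user space, it is enough to write to freshly mmap'ed anonymous memory: each first write to a page faults, lands in do_anonymous_page, and the page is charged to the current cgroup, showing up in memory.usage_in_bytes and in the rss line of memory.stat. A minimal sketch follows; the 256 MiB size is arbitrary and the program is assumed to run inside the target cgroup.

// Sketch: touch anonymous memory so that every page goes through
// do_anonymous_page and is charged to the current memory cgroup.
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 256UL << 20;          /* 256 MiB, arbitrary */
    char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    /* mmap only creates the vm_area_struct; no physical pages exist yet.
     * Each write below faults in one page and charges it to the memcg. */
    for (size_t off = 0; off < len; off += 4096)
        p[off] = 1;

    printf("touched %zu MiB; check memory.usage_in_bytes and memory.stat rss\n",
           len >> 20);
    pause();                            /* keep the pages mapped */
    return 0;
}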

Cgroup charging

The kernel's memcg documentation gives the following relationship between the main resource objects:


(Figure: from the kernel documentation)
  1. struct mem_cgroup: everything about a memory cgroup lives in this structure. Charging is tracked with the page_counter objects (the kernel comment further below mentions statistics in the style developed by Rik van Riel for clock-pro).
struct mem_cgroup {
    struct cgroup_subsys_state css;

    /* Private memcg ID. Used to ID objects that outlive the cgroup */
    struct mem_cgroup_id id;

    /* Accounted resources */
    struct page_counter memory;
    struct page_counter swap;

    /* Legacy consumer-oriented counters */
    struct page_counter memsw;
    struct page_counter kmem;
    struct page_counter tcpmem;

    /* Upper bound of normal memory consumption range */
    unsigned long high;

    /* Range enforcement for interrupt charges */
    struct work_struct high_work;

    unsigned long soft_limit;

    /* vmpressure notifications */
    struct vmpressure vmpressure;

    /*
     * Should the accounting and control be hierarchical, per subtree?
     */
    bool use_hierarchy;

    /*
     * Should the OOM killer kill all belonging tasks, had it kill one?
     */
    bool oom_group;
......
};
  2. The page_counter structure is shown below (a sketch reading the matching cgroup control files follows it):
struct page_counter {
    atomic_long_t usage;  // memory charged to the cgroup (in pages)
    unsigned long min;
    unsigned long low;
    unsigned long max;   // the memory limit set for the cgroup (limit_in_bytes, in pages)
    struct page_counter *parent; // parent counter, reflecting the cgroup hierarchy

    /* effective memory.min and memory.min usage tracking */
    unsigned long emin;
    atomic_long_t min_usage;
    atomic_long_t children_min_usage;

    /* effective memory.low and memory.low usage tracking */
    unsigned long elow;
    atomic_long_t low_usage;
    atomic_long_t children_low_usage;

    /* legacy */
    unsigned long watermark;  // historical peak usage (max_usage_in_bytes)
    unsigned long failcnt;  // how many times the limit was hit
};
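These fields back the familiar cgroup v1 control files: usage is reported through memory.usage_in_bytes, max through memory.limit_in_bytes, watermark through memory.max_usage_in_bytes and failcnt through memory.failcnt (the counter works in pages, the files report bytes). A small sketch that prints them side by side; the cgroup path is an assumption.

// Sketch: read the cgroup v1 files backed by the page_counter fields.
// CGROUP_DIR is a hypothetical path; adjust it to your environment.
#include <stdio.h>

#define CGROUP_DIR "/sys/fs/cgroup/memory/mygroup"   /* assumption */

int main(void)
{
    const char *files[] = {
        CGROUP_DIR "/memory.usage_in_bytes",     /* usage * PAGE_SIZE     */
        CGROUP_DIR "/memory.limit_in_bytes",     /* max * PAGE_SIZE       */
        CGROUP_DIR "/memory.max_usage_in_bytes", /* watermark * PAGE_SIZE */
        CGROUP_DIR "/memory.failcnt",            /* failcnt               */
    };

    for (unsigned i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
        FILE *f = fopen(files[i], "r");
        unsigned long long v = 0;
        if (f) {
            fscanf(f, "%llu", &v);
            fclose(f);
        }
        printf("%-60s %llu\n", files[i], v);
    }
    return 0;
}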
  3. mm_struct, mem_cgroup, page and page_cgroup can all be linked to one another through the corresponding structures.


    (Figure: object relationship diagram, from Zhang Weikang's blog)
  4. The memory cgroup only accounts for RSS and page cache:

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 */
struct mem_cgroup {
  5. The main page_counter charging functions:
  • Charging is simply usage += nr_pages (usage is kept in pages; usage_in_bytes is usage * PAGE_SIZE)
  • mem_cgroup_try_charge(): charge and check whether the memory limit is exceeded
  • mem_cgroup_uncharge(): cancel a charge; e.g. when the limit is exceeded, the earlier charge must be undone before going to OOM
  • mem_cgroup_commit_charge(): commit the charge and update the RSS / page cache statistics
  • add_to_page_cache_locked(): charging for page cache
  • __memcg_kmem_charge_memcg(): charging for memory allocated by the kernel
int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
                struct mem_cgroup *memcg)
{
    unsigned int nr_pages = 1 << order;
    struct page_counter *counter;
    int ret;

    ret = try_charge(memcg, gfp, nr_pages);  // kernel memory is also charged via try_charge, so it is added to memcg->memory as well
    if (ret)
        return ret;
    if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
        !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { // kernel memory is additionally accounted separately in memcg->kmem
......
    return 0;
}

Cgroup memory control

We take anonymous pages as the example; file-mapped pages are charged the same way, and swap is not involved (it is disabled).

  1. Setting limit_in_bytes
    Setting limit_in_bytes is implemented via mem_cgroup_write -> mem_cgroup_resize_max -> page_counter_set_max (a user-space sketch follows the kernel code below).

static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
                 unsigned long max, bool memsw)
{
    struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

    do {
 ......
        limits_invariant = memsw ? max >= memcg->memory.max :
                       max <= memcg->memsw.max;
        if (!limits_invariant) {
            mutex_unlock(&memcg_max_mutex);
            ret = -EINVAL;
            break;
        }
        if (max > counter->max)
            enlarge = true;
        ret = page_counter_set_max(counter, max);  // update limit_in_bytes (counter->max)
 ......
    return ret;
} 
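From user space the whole operation is a single write to the control file, which lands in the kernel path above. A minimal sketch; the cgroup path and the 512 MiB value are assumptions.

// Sketch: write memory.limit_in_bytes, which the kernel handles via
// mem_cgroup_write -> mem_cgroup_resize_max -> page_counter_set_max.
#include <stdio.h>

int main(void)
{
    /* hypothetical cgroup path and limit */
    const char *path = "/sys/fs/cgroup/memory/mygroup/memory.limit_in_bytes";
    unsigned long long limit = 512ULL << 20;   /* 512 MiB */

    FILE *f = fopen(path, "w");
    if (!f) {
        perror("fopen");
        return 1;
    }
    /* the kernel rejects the write (e.g. -EINVAL) if it violates the
     * memsw invariant checked in mem_cgroup_resize_max */
    fprintf(f, "%llu\n", limit);
    if (fclose(f) != 0) {
        perror("write limit");
        return 1;
    }
    return 0;
}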
  2. Reading a mem cgroup's memory usage always goes through memory_stat_show -> memory_stat_format. When memory is requested, whether it is a kernel or user-space allocation, an anonymous page or a file mapping, reclaimable or unreclaimable, it is accounted under a different category according to its type.
static char *memory_stat_format(struct mem_cgroup *memcg)
{
    struct seq_buf s;
    int i;

    seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
    if (!s.buffer)
        return NULL;

    /*
     * Provide statistics on the state of the memory subsystem as
     * well as cumulative event counters that show past behavior.
     *
     * This list is ordered following a combination of these gradients:
     * 1) generic big picture -> specifics and details
     * 2) reflecting userspace activity -> reflecting kernel heuristics
     *
     * Current memory state:
     */

    seq_buf_printf(&s, "anon %llu\n",
               (u64)memcg_page_state(memcg, MEMCG_RSS) *
               PAGE_SIZE);
    seq_buf_printf(&s, "file %llu\n",
               (u64)memcg_page_state(memcg, MEMCG_CACHE) *
               PAGE_SIZE);
    seq_buf_printf(&s, "kernel_stack %llu\n",
               (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
               1024);
    seq_buf_printf(&s, "slab %llu\n",
               (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
                 memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
               PAGE_SIZE);
    seq_buf_printf(&s, "sock %llu\n",
               (u64)memcg_page_state(memcg, MEMCG_SOCK) *
               PAGE_SIZE);

    seq_buf_printf(&s, "shmem %llu\n",
               (u64)memcg_page_state(memcg, NR_SHMEM) *
               PAGE_SIZE);
    seq_buf_printf(&s, "file_mapped %llu\n",
               (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
               PAGE_SIZE);
    seq_buf_printf(&s, "file_dirty %llu\n",
               (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
               PAGE_SIZE);
    seq_buf_printf(&s, "file_writeback %llu\n",
               (u64)memcg_page_state(memcg, NR_WRITEBACK) *
               PAGE_SIZE);

    /*
     * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
     * with the NR_ANON_THP vm counter, but right now it's a pain in the
     * arse because it requires migrating the work out of rmap to a place
     * where the page->mem_cgroup is set up and stable.
     */
    seq_buf_printf(&s, "anon_thp %llu\n",
               (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
               PAGE_SIZE);

    for (i = 0; i < NR_LRU_LISTS; i++)
        seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
                   (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
                   PAGE_SIZE);

    seq_buf_printf(&s, "slab_reclaimable %llu\n",
               (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
               PAGE_SIZE);
    seq_buf_printf(&s, "slab_unreclaimable %llu\n",
               (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
               PAGE_SIZE);

    /* Accumulated memory events */

    seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
    seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));

    seq_buf_printf(&s, "workingset_refault %lu\n",
               memcg_page_state(memcg, WORKINGSET_REFAULT));
    seq_buf_printf(&s, "workingset_activate %lu\n",
               memcg_page_state(memcg, WORKINGSET_ACTIVATE));
    seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
               memcg_page_state(memcg, WORKINGSET_NODERECLAIM));

    seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
    seq_buf_printf(&s, "pgscan %lu\n",
               memcg_events(memcg, PGSCAN_KSWAPD) +
               memcg_events(memcg, PGSCAN_DIRECT));
    seq_buf_printf(&s, "pgsteal %lu\n",
               memcg_events(memcg, PGSTEAL_KSWAPD) +
               memcg_events(memcg, PGSTEAL_DIRECT));
    seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
    seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
    seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
    seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    seq_buf_printf(&s, "thp_fault_alloc %lu\n",
               memcg_events(memcg, THP_FAULT_ALLOC));
    seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
               memcg_events(memcg, THP_COLLAPSE_ALLOC));
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

    /* The above should easily fit into one page */
    WARN_ON_ONCE(seq_buf_has_overflowed(&s));

    return s.buffer;
} 
  3. As described earlier, once do_anonymous_page has successfully allocated a page it calls mem_cgroup_try_charge_delay to charge it: mem_cgroup_try_charge_delay -> mem_cgroup_try_charge -> try_charge -> page_counter_try_charge. The main logic: if the memory limit is exceeded, up to 5 rounds of memory reclaim are attempted; if all 5 fail, the function bails out and returns an error code. (A simplified model of the batching behaviour follows the code below.)
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
              unsigned int nr_pages)
{
    unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);  // charge at least 32 pages (MEMCG_CHARGE_BATCH) at a time
    int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

    if (mem_cgroup_is_root(memcg))   // the root cgroup is not charged
        return 0;
retry:
    if (consume_stock(memcg, nr_pages)) // if the stock left over from the previous batch covers this request, return immediately
        return 0;

    if (!do_memsw_account() ||
        page_counter_try_charge(&memcg->memsw, batch, &counter)) { // swap is disabled in our setup
        if (page_counter_try_charge(&memcg->memory, batch, &counter))  // charge the memory usage counter
            goto done_restock;
        if (do_memsw_account())
            page_counter_uncharge(&memcg->memsw, batch);
        mem_over_limit = mem_cgroup_from_counter(counter, memory);
    } else {
        mem_over_limit = mem_cgroup_from_counter(counter, memsw);
        may_swap = false;
    }

    if (batch > nr_pages) {
        batch = nr_pages;
        goto retry;
    }

    if (gfp_mask & __GFP_ATOMIC)
        goto force;

    /*
     * Unlike in global OOM situations, memcg is not in a physical
     * memory shortage.  Allow dying and OOM-killed tasks to
     * bypass the last charges so that they can exit quickly and
     * free their memory.
     */
    if (unlikely(should_force_charge()))  // if the task is being OOM-killed or is exiting, force the charge and bail out
        goto force;
......

    nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                            gfp_mask, may_swap); // over the limit: try to reclaim memory
......
    if (mem_cgroup_wait_acct_move(mem_over_limit))
        goto retry;

    if (nr_retries--)     // up to 5 reclaim attempts (MEM_CGROUP_RECLAIM_RETRIES)
        goto retry;

    if (gfp_mask & __GFP_RETRY_MAYFAIL)
        goto nomem;

    if (gfp_mask & __GFP_NOFAIL)
        goto force;

    if (fatal_signal_pending(current))
        goto force;
 ......
nomem:
    if (!(gfp_mask & __GFP_NOFAIL))
        return -ENOMEM;
force:
    /*
     * The allocation either can't fail or will lead to more memory
     * being freed very soon.  Allow memory usage go over the limit
     * temporarily by force charging it.
     */
    page_counter_charge(&memcg->memory, nr_pages);
    if (do_memsw_account())
        page_counter_charge(&memcg->memsw, nr_pages);
    css_get_many(&memcg->css, nr_pages);

    return 0;

done_restock:
    css_get_many(&memcg->css, batch);
    if (batch > nr_pages)
        refill_stock(memcg, batch - nr_pages);
......
    do {
        if (page_counter_read(&memcg->memory) > memcg->high) {
            /* Don't bother a random interrupted task */
            if (in_interrupt()) {
                schedule_work(&memcg->high_work);
                break;
            }
            current->memcg_nr_pages_over_high += batch;
            set_notify_resume(current);
            break;
        }
    } while ((memcg = parent_mem_cgroup(memcg)));

    return 0;
}
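One detail of try_charge worth noting for monitoring: charges are made in batches of MEMCG_CHARGE_BATCH (32) pages and the surplus is parked in a per-cpu stock (consume_stock / refill_stock), so memory.usage_in_bytes can briefly run a little ahead of the pages actually faulted in. The sketch below is a deliberately simplified user-space model of just this batch/stock behaviour, not kernel code; all names are local to the sketch.

// Simplified model of try_charge's batching: charge in blocks of 32 pages,
// keep the surplus as "stock", and satisfy later charges from the stock.
// Illustration only; the real kernel keeps one stock per cpu.
#include <stdio.h>

#define CHARGE_BATCH 32

static long counter_usage;   /* models page_counter.usage (in pages) */
static long stock;           /* models the per-cpu stock (in pages)  */

static void charge(long nr_pages)
{
    if (stock >= nr_pages) {          /* consume_stock()            */
        stock -= nr_pages;
        return;
    }
    long batch = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;
    counter_usage += batch;           /* page_counter_try_charge()  */
    stock += batch - nr_pages;        /* refill_stock()             */
}

int main(void)
{
    for (int i = 0; i < 5; i++) {     /* five single-page faults    */
        charge(1);
        printf("after fault %d: usage=%ld pages, stock=%ld pages\n",
               i + 1, counter_usage, stock);
    }
    /* usage stays at 32 pages even though only 5 pages are in use */
    return 0;
}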

The logic of page_counter_try_charge is clear: if usage would exceed c->max (limit_in_bytes), it increments the failure counter failcnt, rolls the charge back, and returns false; if the limit is not exceeded it returns true.

bool page_counter_try_charge(struct page_counter *counter,
                 unsigned long nr_pages,
                 struct page_counter **fail)
{
    struct page_counter *c;

    for (c = counter; c; c = c->parent) {
        long new;
        new = atomic_long_add_return(nr_pages, &c->usage);
        if (new > c->max) {
            atomic_long_sub(nr_pages, &c->usage);
            propagate_protected_usage(counter, new);
 ......
            c->failcnt++;
            *fail = c;
            goto failed;
        }
        propagate_protected_usage(counter, new);
......
        if (new > c->watermark)
            c->watermark = new;
    }
    return true;

failed:
    for (c = counter; c != *fail; c = c->parent)
        page_counter_cancel(c, nr_pages);

    return false;
}
  4. Setting the page type (which LRU list it joins: inactive_anon or active_anon)
    mem_cgroup_commit_charge -> commit_charge -> unlock_page_lru -> add_page_to_lru_list
static void unlock_page_lru(struct page *page, int isolated)
{
  ......
        add_page_to_lru_list(page, lruvec, page_lru(page));
    }
    spin_unlock_irq(&pgdat->lru_lock);
}

/**
 * page_lru - which LRU list should a page be on?
 * @page: the page to test
 *
 * Returns the LRU list a page should be on, as an index
 * into the array of LRU lists.
 */
static __always_inline enum lru_list page_lru(struct page *page) //  pick the LRU list index for this page's type so it is added to the right list
{
    enum lru_list lru;

    if (PageUnevictable(page))
        lru = LRU_UNEVICTABLE;
    else {
        lru = page_lru_base_type(page);
        if (PageActive(page))
            lru += LRU_ACTIVE;
    }
    return lru;
}
  5. Cgroup memory reclaim
    Proactive reclaim: this is the job of the kernel thread kswapd. When memory is tight, kswapd calls shrink_node to reclaim memory; this works the same way for a cgroup as for global memory. Slab is reclaimed proactively through the per-cpu reap_work work queues, which eventually call shrink_slab.
    Reactive reclaim: when the cgroup memory limit is exceeded, reclaim goes through try_to_free_mem_cgroup_pages -> do_try_to_free_pages.
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                      struct scan_control *sc)
{
    int initial_priority = sc->priority;
    pg_data_t *last_pgdat;
    struct zoneref *z;
    struct zone *zone;
retry:
    delayacct_freepages_start();

    if (global_reclaim(sc))
        __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);

    do {
        vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
                sc->priority);
        sc->nr_scanned = 0;
        shrink_zones(zonelist, sc); // the main reclaim logic

        if (sc->nr_reclaimed >= sc->nr_to_reclaim)  // return once the number of reclaimed pages reaches the reclaim target
            break;

        if (sc->compaction_ready)
            break;

        /*
         * If we're getting trouble reclaiming, start doing
         * writepage even in laptop mode.
         */
        if (sc->priority < DEF_PRIORITY - 2)
            sc->may_writepage = 1;
    } while (--sc->priority >= 0);
......
    return 0;
}

What gets reclaimed

  1. The LRU lists hold five kinds of pages: active and inactive anonymous, active and inactive file, and unevictable. Because swap is disabled, anonymous pages are not reclaimed, so the LRUs only yield active and inactive file cache. The main call path is do_try_to_free_pages -> shrink_zones -> shrink_node -> shrink_node_memcg (the cgroup's per-NUMA-node reclaim function).
static const char *const mem_cgroup_lru_names[] = {
    "inactive_anon",
    "active_anon",
    "inactive_file",
    "active_file",
    "unevictable",
};

/*
 * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
 */
static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
                  struct scan_control *sc, unsigned long *lru_pages)
{
......
    get_scan_count(lruvec, memcg, sc, nr, lru_pages); // work out how many pages on each LRU can be reclaimed

    /* Record the original scan target for proportional adjustments later */
    memcpy(targets, nr, sizeof(nr));

    scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
             sc->priority == DEF_PRIORITY);

    blk_start_plug(&plug);
    while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                    nr[LRU_INACTIVE_FILE]) {
        unsigned long nr_anon, nr_file, percentage;
        unsigned long nr_scanned;

        for_each_evictable_lru(lru) {
            if (nr[lru]) {
                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
                nr[lru] -= nr_to_scan;

                nr_reclaimed += shrink_list(lru, nr_to_scan,  // this is where pages are actually reclaimed
                                lruvec, sc);
            }
        }

        cond_resched();

    }
......
}


static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
               struct scan_control *sc, unsigned long *nr,
               unsigned long *lru_pages)
{
   ......
        // with swap disabled, anonymous pages are not scanned
    /* If we have no swap space, do not bother scanning anon pages. */
    if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
        scan_balance = SCAN_FILE;
        goto out;
    }
......
}
  2. Slab reclaim mainly targets the dentry and inode caches. The main call path is do_try_to_free_pages -> shrink_zones -> shrink_node -> shrink_slab -> shrink_slab_memcg.
    On 3.10.* kernels we previously ran into a problem where the cgroup did not reclaim slab when a container ran out of memory. Now, when an allocation hits the memcg limit, the memcg's own slab can be reclaimed without affecting the global slab.
static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                 struct mem_cgroup *memcg,
                 int priority)
{
    unsigned long ret, freed = 0;
    struct shrinker *shrinker;

    /*
     * The root memcg might be allocated even though memcg is disabled
     * via "cgroup_disable=memory" boot parameter.  This could make
     * mem_cgroup_is_root() return false, then just run memcg slab
     * shrink, but skip global shrink.  This may result in premature
     * oom.
     */
    if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
        return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
......
}

Conclusion

  1. From the analysis above, the memory genuinely in use should be (a sketch of this computation follows the list):
    real_used = memory.usage_in_bytes - memory.stat.(total_inactive_file + total_active_file) - memory.kmem.slabinfo.(inode_cache + xfs_inode)
  2. The logic for deciding whether memory has hit the limit is clear-cut, so usage reaching 150% of the limit is impossible; the next step is to check whether the metric collection itself is at fault.
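As a rough cross-check, the formula above can be evaluated straight from the cgroup v1 files. The sketch below assumes a hypothetical cgroup path and approximates each slab cache's size as num_objs * objsize from memory.kmem.slabinfo, which ignores per-slab overhead.

// Sketch: evaluate real_used from the conclusion against the cgroup v1 files.
// CGROUP_DIR is a hypothetical path; slab sizes are approximated.
#include <stdio.h>
#include <string.h>

#define CGROUP_DIR "/sys/fs/cgroup/memory/mygroup"   /* assumption */

static unsigned long long read_ull(const char *path)
{
    FILE *f = fopen(path, "r");
    unsigned long long v = 0;
    if (f) {
        fscanf(f, "%llu", &v);
        fclose(f);
    }
    return v;
}

static unsigned long long stat_field(const char *key)
{
    FILE *f = fopen(CGROUP_DIR "/memory.stat", "r");
    char name[64];
    unsigned long long v = 0, found = 0;
    if (!f)
        return 0;
    while (fscanf(f, "%63s %llu", name, &v) == 2) {
        if (strcmp(name, key) == 0) {
            found = v;
            break;
        }
    }
    fclose(f);
    return found;
}

static unsigned long long slab_bytes(const char *cache)
{
    FILE *f = fopen(CGROUP_DIR "/memory.kmem.slabinfo", "r");
    char line[512], name[64];
    unsigned long long active, num, objsize, bytes = 0;
    if (!f)
        return 0;
    while (fgets(line, sizeof(line), f)) {
        if (sscanf(line, "%63s %llu %llu %llu",
                   name, &active, &num, &objsize) == 4 &&
            strcmp(name, cache) == 0) {
            bytes = num * objsize;   /* rough approximation */
            break;
        }
    }
    fclose(f);
    return bytes;
}

int main(void)
{
    unsigned long long usage = read_ull(CGROUP_DIR "/memory.usage_in_bytes");
    unsigned long long file_cache = stat_field("total_inactive_file") +
                                    stat_field("total_active_file");
    unsigned long long slab = slab_bytes("inode_cache") + slab_bytes("xfs_inode");

    printf("real_used ~= %llu bytes\n", usage - file_cache - slab);
    return 0;
}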