Notes on Linux Kernel Memory Allocation

Preface

Memory allocation in the Linux kernel is complicated enough to deserve its own set of notes, recorded here piece by piece.

SLAB

Reference: linux内存源码分析 - SLAB分配器概述 - tolimit - 博客园 (cnblogs.com)

Call chain:

kmem_cache_alloc()->slab_alloc()->__do_cache_alloc()->____cache_alloc()

The actual allocation finally happens in ____cache_alloc(), and that is where the allocation path forks.
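
For context, a hypothetical sketch of the public kmem_cache API that funnels into this path. The calls are the standard kernel interface, but "struct foo" and the cache name are invented for illustration:

// Hypothetical usage sketch: the standard kmem_cache API that ends up
// in the allocation paths below. "struct foo" and the cache name are
// made up for illustration.
#include <linux/module.h>
#include <linux/slab.h>

struct foo {
    int id;
    char name[32];
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
    /* one cache for one object size; objects come back cache-line aligned */
    foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                   0, SLAB_HWCACHE_ALIGN, NULL);
    return foo_cachep ? 0 : -ENOMEM;
}

static void __exit foo_exit(void)
{
    kmem_cache_destroy(foo_cachep);
}

static struct foo *foo_new(void)
{
    /* this is the call that lands in ____cache_alloc() (SLAB)
     * or slab_alloc_node() (SLUB) */
    return kmem_cache_alloc(foo_cachep, GFP_KERNEL);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");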

//v5.8 /mm/slab.c
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    /* the per-CPU object cache of this slab descriptor */
    struct array_cache *ac;
    //...
    /* get the per-CPU cache; effectively
     * CPU_addr + kmalloc_caches[xx][xx].cpu_cache, where
     * kmalloc_caches[xx][xx] is the slab descriptor for this size */
    ac = cpu_cache_get(cachep);
    /* fast path: a free object is available, allocate freelist-style */
    if (likely(ac->avail)) {
        ac->touched = 1;
        /* avail doubles as an index into entry[] */
        objp = ac->entry[--ac->avail];
        STATS_INC_ALLOCHIT(cachep);
        goto out;
    }

    STATS_INC_ALLOCMISS(cachep);
    /* nothing here: refill from the other caches */
    objp = cache_alloc_refill(cachep, flags);
    /*
     * the 'ac' may be updated by cache_alloc_refill(),
     * and kmemleak_erase() requires its correct value.
     */
    ac = cpu_cache_get(cachep);

out:
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}

1. Allocation from the local cache

Not much to say here: the array at CPU_addr + kmalloc_caches[xx][xx].cpu_cache.entry is treated as an array_cache, and each allocation pops one object and decrements array_cache.avail by one.
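
The hit path is nothing more than a LIFO stack pop. A compilable user-space model of it (field names mirror the kernel's struct array_cache, but this is a toy, not kernel code):

// User-space model of the per-CPU array_cache fast path (not kernel
// code; field names mirror v5.8 struct array_cache for readability).
#include <stdio.h>
#include <stdlib.h>

struct array_cache {
    unsigned int avail;     /* number of free objects in entry[] */
    unsigned int limit;
    unsigned int touched;
    void *entry[16];        /* cached object pointers, used as a stack */
};

/* the ____cache_alloc() hit path: pop the most recently freed object */
static void *ac_alloc(struct array_cache *ac)
{
    if (ac->avail) {
        ac->touched = 1;
        return ac->entry[--ac->avail];  /* avail doubles as an index */
    }
    return NULL;  /* miss: the kernel would call cache_alloc_refill() */
}

/* the matching free fast path: push the object back */
static void ac_free(struct array_cache *ac, void *objp)
{
    if (ac->avail < ac->limit)
        ac->entry[ac->avail++] = objp;
}

int main(void)
{
    struct array_cache ac = { .limit = 16 };
    ac_free(&ac, malloc(32));
    ac_free(&ac, malloc(32));
    /* LIFO: the most recently freed object comes back first */
    void *obj = ac_alloc(&ac);
    printf("alloc -> %p, avail now %u\n", obj, ac.avail);
    return 0;
}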

2. Refilling from other caches

Allocation enters cache_alloc_refill(), where the path forks again:

//v5.9 /mm/slab.c
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
    int batchcount;
    struct kmem_cache_node *n;
    struct array_cache *ac, *shared;
    int node;
    void *list = NULL;
    struct page *page;

    check_irq_off();
    /* get the NUMA node id and the per-CPU array_cache */
    node = numa_mem_id();
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    /* cap the refill size for rarely used caches (see comment below) */
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill. Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    /* the actual kmem_cache_node pointer */
    n = get_node(cachep, node);

    BUG_ON(ac->avail > 0 || !n);
    /* try to allocate from the shared object cache (shared->entry) */
    shared = READ_ONCE(n->shared);
    if (!n->free_objects && (!shared || !shared->avail))
        goto direct_grow;

    spin_lock(&n->list_lock);
    shared = READ_ONCE(n->shared);
    /* See if we can refill from the shared array */
    /* transfer_objects() moves up to batchcount free objects from the
     * shared cache into the per-CPU cache */
    if (shared && transfer_objects(ac, shared, batchcount)) {
        /* mark the shared array as recently used so the periodic
         * reaper will not shrink it */
        shared->touched = 1;
        goto alloc_done;
    }

    /* the shared cache has no free objects: walk the slabs_partial
     * (partially free) and slabs_free (fully free) lists */
    while (batchcount > 0) {
        /* Get slab alloc is to come from. */
        page = get_first_slab(n, false);
        /* if both slabs_partial and slabs_free are empty, a new slab
         * (and its backing pages) must be allocated */
        if (!page)
            goto must_grow;

        check_spinlock_acquired(cachep);

        batchcount = alloc_block(cachep, ac, page, batchcount);
        fixup_slab_list(cachep, n, page, &list);
    }

must_grow:
    n->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&n->list_lock);
    fixup_objfreelist_debug(cachep, &list);

direct_grow:
    if (unlikely(!ac->avail)) {
        /* Check if we can use obj in pfmemalloc slab */
        if (sk_memalloc_socks()) {
            void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
            if (obj)
                return obj;
        }

        /* grow the cache: allocate a fresh slab */
        page = cache_grow_begin(cachep, gfp_exact_node(flags), node);

        /*
         * cache_grow_begin() can reenable interrupts,
         * then ac could change.
         */
        ac = cpu_cache_get(cachep);
        if (!ac->avail && page)
            alloc_block(cachep, ac, page, batchcount);
        cache_grow_end(cachep, page);

        if (!ac->avail)
            return NULL;
    }
    ac->touched = 1;

    return ac->entry[--ac->avail];
}

① Allocation from the shared cache

kmalloc_caches[xx][xx].node.shared.entry is treated as another array_cache: if it holds free objects, a batch is transferred into the local cache and the allocation is served from there, as modeled in the sketch below.

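A compilable user-space model of what transfer_objects() does, simplified from v5.9 mm/slab.c (not kernel code; the 16-entry capacity and the fake object pointers are arbitrary):

// User-space model of transfer_objects(): move up to `max` free
// object pointers from the node's shared array_cache into the
// per-CPU one, as a single block copy.
#include <stdio.h>
#include <string.h>

struct array_cache {
    unsigned int avail;
    unsigned int limit;
    void *entry[16];
};

static int transfer_objects(struct array_cache *to,
                            struct array_cache *from, unsigned int max)
{
    /* nr = min3(from->avail, max, free room in `to`), as in the kernel */
    unsigned int nr = from->avail < max ? from->avail : max;
    if (to->limit - to->avail < nr)
        nr = to->limit - to->avail;
    if (!nr)
        return 0;
    memcpy(to->entry + to->avail, from->entry + from->avail - nr,
           nr * sizeof(void *));
    from->avail -= nr;
    to->avail += nr;
    return nr;
}

int main(void)
{
    struct array_cache shared = { .avail = 3, .limit = 16,
                                  .entry = { (void *)1, (void *)2, (void *)3 } };
    struct array_cache local = { .limit = 16 };

    int moved = transfer_objects(&local, &shared, 2);
    printf("moved %d, local.avail=%u, shared.avail=%u\n",
           moved, local.avail, shared.avail);
    return 0;
}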

② Allocation from slabs_partial and slabs_free

Allocation goes through get_first_slab(), which fetches a slab page descriptor from these lists; object chunks (obj) are then extracted from it by index, starting from page->mapping, and moved one by one into the array_cache of the corresponding CPU's kmalloc-xx (i.e. into the local cache). A simplified model of the per-index extraction follows the listing below.

//v5.9 /mm/slab.c
static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
    struct page *page;

    assert_spin_locked(&n->list_lock);
    page = list_first_entry_or_null(&n->slabs_partial, struct page,
                                    slab_list);
    if (!page) {
        n->free_touched = 1;
        page = list_first_entry_or_null(&n->slabs_free, struct page,
                                        slab_list);
        if (page)
            n->free_slabs--;
    }
    if (sk_memalloc_socks())
        page = get_valid_first_slab(n, page, pfmemalloc);

    return page;
}
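
As promised above, a simplified user-space model of the per-index object extraction performed on a slab page (the shape mirrors v5.9's index_to_obj()/alloc_block(); struct slab_page is a toy stand-in for the slab fields of struct page):

// User-space model of SLAB's per-page object indexing: the page keeps
// an array of free-object indices plus a count of objects already
// handed out (active); object addresses are s_mem + idx * size.
#include <stdio.h>
#include <stddef.h>

typedef unsigned char freelist_idx_t;

struct slab_page {
    void *s_mem;              /* address of the first object */
    freelist_idx_t *freelist; /* indices of free objects */
    unsigned int active;      /* objects already allocated */
    unsigned int total;       /* objects per slab */
    unsigned int size;        /* object size */
};

/* mirrors index_to_obj(): compute an object's address from its index */
static void *index_to_obj(struct slab_page *pg, unsigned int idx)
{
    return (char *)pg->s_mem + (size_t)pg->size * idx;
}

/* the core of alloc_block(): take the next free index, bump active */
static void *slab_get_obj(struct slab_page *pg)
{
    if (pg->active == pg->total)
        return NULL;          /* slab fully allocated */
    return index_to_obj(pg, pg->freelist[pg->active++]);
}

int main(void)
{
    char objs[4 * 32];                            /* 4 objects of 32 bytes */
    freelist_idx_t free_idx[4] = { 2, 0, 3, 1 };  /* randomized free order */
    struct slab_page pg = { objs, free_idx, 0, 4, 32 };

    printf("obj0=%p obj1=%p\n", slab_get_obj(&pg), slab_get_obj(&pg));
    return 0;
}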

3. Allocating a new slab

As noted above, reaching must_grow means a brand-new slab has to be allocated. That path is genuinely involved (it touches the ZONE and NODE data structures and the NUMA machinery), and I haven't worked through it yet.

SLUB

Reference: Linux内存管理:slub分配器 - 知乎 (zhihu.com)

Call chain:

kmem_cache_alloc()->slab_alloc()->slab_alloc_node()

Allocation starts in slab_alloc_node():

//v5.9 /mm/slub.c
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
        gfp_t gfpflags, int node, unsigned long addr)
{
    void *object;
    struct kmem_cache_cpu *c;
    struct page *page;
    unsigned long tid;
    struct obj_cgroup *objcg = NULL;

    /* resolve the kmem_cache descriptor to allocate from */
    s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
    if (!s)
        return NULL;
redo:
    //.....
    /* fast path: try the per-CPU freelist */
    object = c->freelist;
    page = c->page;
    if (unlikely(!object || !node_match(page, node))) {
        /* the per-CPU freelist is exhausted (or the node does not
         * match): fall into the slow path __slab_alloc() */
        object = __slab_alloc(s, gfpflags, node, addr, c);
        stat(s, ALLOC_SLOWPATH);
    } else {
        /* hardened (SLUB-protected) free-pointer arithmetic */
        void *next_object = get_freepointer_safe(s, object);
        //... this_cpu_cmpxchg_double() then atomically installs
        //    next_object as the new freelist and advances tid
    }
    //... various debug/hook checks follow

    return object;
}

1. Allocation from the local cache

Not much to say here either: objects come straight off the freelist of the local kmem_cache_cpu (cpu_slab), as reflected in slab_alloc_node() above; a toy model of the free-pointer chase follows.
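
A user-space model of that pointer chase. Each free object stores the address of the next free object inside itself at s->offset; with CONFIG_SLAB_FREELIST_HARDENED the stored value is additionally obfuscated with a per-cache random key (the real kernel also mixes in the swabbed pointer address, which this toy omits):

// User-space model of the SLUB fast path: allocation pops the head of
// a linked list whose links are stored inside the free objects.
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct toy_cache {
    void *freelist;        /* c->freelist: head of the free list */
    size_t offset;         /* s->offset: where the next-pointer lives */
    uintptr_t random;      /* s->random: hardening key */
};

static void *get_freepointer(struct toy_cache *c, void *object)
{
    uintptr_t stored = *(uintptr_t *)((char *)object + c->offset);
    /* hardened decode; the kernel's version also XORs in the swabbed
     * address of the pointer itself, omitted here */
    return (void *)(stored ^ c->random);
}

static void *toy_alloc(struct toy_cache *c)
{
    void *object = c->freelist;
    if (!object)
        return NULL;   /* slow path: __slab_alloc() in the kernel */
    c->freelist = get_freepointer(c, object);
    return object;
}

int main(void)
{
    unsigned char obj_a[32], obj_b[32];
    struct toy_cache c = { obj_a, 0, 0xdeadbeef };

    /* encode: obj_a's next pointer is obj_b, obj_b ends the list */
    *(uintptr_t *)obj_a = (uintptr_t)obj_b ^ c.random;
    *(uintptr_t *)obj_b = (uintptr_t)NULL ^ c.random;

    printf("first : %p\n", toy_alloc(&c));   /* obj_a */
    printf("second: %p\n", toy_alloc(&c));   /* obj_b */
    printf("third : %p\n", toy_alloc(&c));   /* NULL: refill needed */
    return 0;
}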

When the fast path fails, __slab_alloc() runs a couple of checks and then enters ___slab_alloc(), which handles the remaining cases. The CONFIG_PREEMPTION part below reloads the per-CPU pointer because the task may have been preempted and rescheduled onto a different CPU before interrupts were disabled.

//v5.9 /mm/slub.c
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                          unsigned long addr, struct kmem_cache_cpu *c)
{
    void *p;
    unsigned long flags;

    local_irq_save(flags);
#ifdef CONFIG_PREEMPTION
    /*
     * We may have been preempted and rescheduled on a different
     * cpu before disabling interrupts. Need to reload cpu area
     * pointer.
     */
    c = this_cpu_ptr(s->cpu_slab);
#endif

    p = ___slab_alloc(s, gfpflags, node, addr, c);
    local_irq_restore(flags);
    return p;
}

Once cpu_slab->freelist has been handed out completely, cpu_slab->page is cleared as well.

2. Allocation from partial lists

This path only exists when the kernel is configured with CONFIG_SLUB_CPU_PARTIAL=y.

Inside ___slab_alloc(), several cases are distinguished:

static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                           unsigned long addr, struct kmem_cache_cpu *c)
{
    void *freelist;
    struct page *page;
    /* c is the cpu_slab of the corresponding kmalloc-xx.
     * If c->page is NULL, the per-CPU freelist has been fully
     * consumed, so jump to new_slab */
    page = c->page;
    if (!page) {
        /*
         * if the node is not online or has no normal memory, just
         * ignore the node constraint
         */
        if (unlikely(node != NUMA_NO_NODE &&
                     !node_state(node, N_NORMAL_MEMORY)))
            node = NUMA_NO_NODE;
        goto new_slab;
    }
redo:
    /* check that the page's NUMA node matches the requested node; on
     * a mismatch the slab is deactivated and a new one is taken
     * (mismatching partials are quite common in practice) */
    if (unlikely(!node_match(page, node))) {
        /*
         * same as above but node_match() being false already
         * implies node != NUMA_NO_NODE
         */
        if (!node_state(node, N_NORMAL_MEMORY)) {
            node = NUMA_NO_NODE;
            goto redo;
        } else {
            stat(s, ALLOC_NODE_MISMATCH);
            deactivate_slab(s, page, c->freelist, c);
            goto new_slab;
        }
    }

    //.....

    /* must check again c->freelist in case of cpu migration or IRQ */
    freelist = c->freelist;
    if (freelist)
        goto load_freelist;

    freelist = get_freelist(s, page);
    if (!freelist) {
        c->page = NULL;
        stat(s, DEACTIVATE_BYPASS);
        goto new_slab;
    }
    stat(s, ALLOC_REFILL);

    /* with c->page assigned, load the freelist from it */
load_freelist:
    /*
     * freelist is pointing to the list of objects to be used.
     * page is pointing to the page from which the objects are obtained.
     * That page must be frozen for per cpu allocations to work.
     */
    VM_BUG_ON(!c->page->frozen);
    c->freelist = get_freepointer(s, freelist);
    c->tid = next_tid(c->tid);
    return freelist;

    /* here the per-CPU partial list is tried, and failing that a new
     * slab descriptor is allocated from the buddy system */
new_slab:
    /* does the local CPU have a partial list? */
    if (slub_percpu_partial(c)) {
        /* promote the first per-CPU partial page to c->page,
         * then jump to redo to load its freelist */
        page = c->page = slub_percpu_partial(c);
        /* advance the per-CPU partial list: c->partial = page->next */
        slub_set_percpu_partial(c, page);
        stat(s, CPU_PARTIAL_ALLOC);
        /* back to redo: reload the freelist from the new page */
        goto redo;
    }
    /* new_slab_objects() checks whether the node of this kmalloc-xx
     * has a partial slab to hand out; if not, a slab page descriptor
     * is allocated from the buddy system */
    freelist = new_slab_objects(s, gfpflags, node, &c);
    /* the rest is sanity checking plus loading the freelist */
    if (unlikely(!freelist)) {
        slab_out_of_memory(s, gfpflags, node);
        return NULL;
    }
    page = c->page;
    if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
        goto load_freelist;
    //............
    deactivate_slab(s, page, get_freepointer(s, freelist), c);
    return freelist;
}

(1) The local CPU's partial list

The first check is the local CPU's partial list, i.e. cpu_slab->partial. It is itself a page frame, chained together with other page frames into a doubly linked circular list through the struct list_head lru; field (for slab pages this storage overlaps page->next, so the debugger shows the same links either way).

  • If it exists, that partial page is treated as a slab page descriptor: its struct list_head lru; chain yields the slab page to use, which is assigned to cpu_slab->page, and the partial pointer itself is then overwritten with the original partial's next inside slub_set_percpu_partial(c, page);.

  • Execution then jumps back to redo and loads the freelist according to the newly assigned page, and the freelist stored in the page itself is cleared. A sketch of this pop operation follows the list.
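
A user-space sketch of the per-CPU partial pop described above; struct toy_page and struct toy_cpu_slab are stand-ins, not kernel types:

// User-space model of the per-CPU partial pop at new_slab:
// c->page = c->partial, then c->partial advances to page->next
// (which is all slub_set_percpu_partial() does).
#include <stdio.h>

struct toy_page {
    struct toy_page *next;  /* per-CPU partial pages chain via next */
    void *freelist;         /* free objects parked in this page */
};

struct toy_cpu_slab {
    struct toy_page *page;     /* current active slab page */
    struct toy_page *partial;  /* per-CPU partial list */
};

/* returns nonzero if a partial page was promoted to c->page */
static int take_percpu_partial(struct toy_cpu_slab *c)
{
    struct toy_page *page = c->partial;
    if (!page)
        return 0;          /* fall back to the node partial list */
    c->page = page;            /* page = c->page = slub_percpu_partial(c) */
    c->partial = page->next;   /* slub_set_percpu_partial(c, page) */
    return 1;
}

int main(void)
{
    struct toy_page p2 = { NULL, NULL };
    struct toy_page p1 = { &p2, NULL };
    struct toy_cpu_slab c = { NULL, &p1 };

    while (take_percpu_partial(&c))
        printf("promoted page %p, partial now %p\n",
               (void *)c.page, (void *)c.partial);
    return 0;
}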

(2) The node's partial list

Next, new_slab_objects() checks whether the node belonging to this kmalloc-xx still has a partial slab; if so, the relevant assignments are made and the freelist is returned directly.

static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
            int node, struct kmem_cache_cpu **pc)
{
    void *freelist;
    struct kmem_cache_cpu *c = *pc;
    struct page *page;

    //.....
    /* check whether the node of this kmalloc-xx has a partial slab;
     * if so the assignments are done inside get_partial() and the
     * freelist is returned directly */
    freelist = get_partial(s, flags, node, c);
    if (freelist)
        return freelist;

    /* no partial slab on this node: allocate from the buddy system */
    page = new_slab(s, flags, node);
    if (page) {
        c = raw_cpu_ptr(s->cpu_slab);
        if (c->page)
            flush_slab(s, c);
        /*
         * No other reference to the page yet so we can
         * muck around with it freely without cmpxchg
         */
        freelist = page->freelist;
        page->freelist = NULL;

        stat(s, ALLOC_SLAB);
        c->page = page;
        *pc = c;
    }

    return freelist;
}

One thing that looks odd here: when walking the node under a given struct kmem_cache (kmalloc-xx), its partial pointer holds the address of the page's struct list_head lru; field rather than the address of the struct page itself, so the real page address is the lru field address minus 0x8. This is simply the kernel's intrusive-list convention: list nodes are embedded in their containing structure, and list_entry()/container_of() subtracts the field offset to recover the container, as the sketch below demonstrates.

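A user-space demonstration of why subtracting 0x8 recovers the page: container_of() undoes the embedded list-node offset. The toy struct below only mirrors the first two fields of the real struct page:

// Demonstration of the intrusive-list offset arithmetic behind the
// "lru address minus 0x8" observation above.
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

struct toy_page {
    unsigned long flags;     /* offset 0x0 */
    struct list_head lru;    /* offset 0x8 on 64-bit, like struct page */
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct toy_page page;
    struct list_head *pos = &page.lru;  /* what n->partial points at */

    printf("list node : %p\n", (void *)pos);
    printf("real page : %p (node - 0x%zx)\n",
           (void *)container_of(pos, struct toy_page, lru),
           offsetof(struct toy_page, lru));
    return 0;
}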

After that, the freelist taken from this page is installed as the freelist of the local kmem_cache_cpu (cpu_slab), i.e. it becomes the local cache.

3. Allocation from the buddy system

As shown above, once new_slab_objects() finds no partial slab, a fresh slab page descriptor is allocated from the buddy system via new_slab(). That path gets considerably more complex, so I'll study it later.