Notes on Linux Kernel Memory Allocation

Preface

Memory allocation in the Linux kernel is complicated enough to deserve its own set of notes, recorded here piece by piece.

SLAB

Reference: linux内存源码分析 - SLAB分配器概述 - tolimit - 博客园 (cnblogs.com)

Call chain:

kmem_cache_alloc()->slab_alloc()->__do_cache_alloc()->____cache_alloc()

The actual allocation finally happens in ____cache_alloc(), and that is where the allocation path forks.
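
For context, a hypothetical sketch of the public kmem_cache API that funnels into this path. The calls are the standard kernel interface, but "struct foo" and the cache name are invented for illustration:

// Hypothetical usage sketch: the standard kmem_cache API that ends up
// in the allocation paths below. "struct foo" and the cache name are
// made up for illustration.
#include <linux/module.h>
#include <linux/slab.h>

struct foo {
    int id;
    char name[32];
};

static struct kmem_cache *foo_cachep;

static int __init foo_init(void)
{
    /* one cache for one object size; objects come back cache-line aligned */
    foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                   0, SLAB_HWCACHE_ALIGN, NULL);
    return foo_cachep ? 0 : -ENOMEM;
}

static void __exit foo_exit(void)
{
    kmem_cache_destroy(foo_cachep);
}

static struct foo *foo_new(void)
{
    /* this is the call that lands in ____cache_alloc() (SLAB)
     * or slab_alloc_node() (SLUB) */
    return kmem_cache_alloc(foo_cachep, GFP_KERNEL);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");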

//v5.8 /mm/slab.c
static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *objp;
    /* the per-CPU object cache of this slab descriptor */
    struct array_cache *ac;
    //...
    /* get the per-CPU cache; effectively
     * CPU_addr + kmalloc_caches[xx][xx].cpu_cache, where
     * kmalloc_caches[xx][xx] is the slab descriptor for this size */
    ac = cpu_cache_get(cachep);
    /* fast path: a free object is available, allocate freelist-style */
    if (likely(ac->avail)) {
        ac->touched = 1;
        /* avail doubles as an index into entry[] */
        objp = ac->entry[--ac->avail];
        STATS_INC_ALLOCHIT(cachep);
        goto out;
    }

    STATS_INC_ALLOCMISS(cachep);
    /* nothing here: refill from the other caches */
    objp = cache_alloc_refill(cachep, flags);
    /*
     * the 'ac' may be updated by cache_alloc_refill(),
     * and kmemleak_erase() requires its correct value.
     */
    ac = cpu_cache_get(cachep);

out:
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}

1. Allocation from the local cache

Not much to say here: the array at CPU_addr + kmalloc_caches[xx][xx].cpu_cache.entry is treated as an array_cache, and each allocation pops one object and decrements array_cache.avail by one.
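
The hit path is nothing more than a LIFO stack pop. A compilable user-space model of it (field names mirror the kernel's struct array_cache, but this is a toy, not kernel code):

// User-space model of the per-CPU array_cache fast path (not kernel
// code; field names mirror v5.8 struct array_cache for readability).
#include <stdio.h>
#include <stdlib.h>

struct array_cache {
    unsigned int avail;     /* number of free objects in entry[] */
    unsigned int limit;
    unsigned int touched;
    void *entry[16];        /* cached object pointers, used as a stack */
};

/* the ____cache_alloc() hit path: pop the most recently freed object */
static void *ac_alloc(struct array_cache *ac)
{
    if (ac->avail) {
        ac->touched = 1;
        return ac->entry[--ac->avail];  /* avail doubles as an index */
    }
    return NULL;  /* miss: the kernel would call cache_alloc_refill() */
}

/* the matching free fast path: push the object back */
static void ac_free(struct array_cache *ac, void *objp)
{
    if (ac->avail < ac->limit)
        ac->entry[ac->avail++] = objp;
}

int main(void)
{
    struct array_cache ac = { .limit = 16 };
    ac_free(&ac, malloc(32));
    ac_free(&ac, malloc(32));
    /* LIFO: the most recently freed object comes back first */
    void *obj = ac_alloc(&ac);
    printf("alloc -> %p, avail now %u\n", obj, ac.avail);
    return 0;
}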

2. Refilling from other caches

Allocation enters cache_alloc_refill(), where the path forks again:

//v5.9 /mm/slab.c
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
    int batchcount;
    struct kmem_cache_node *n;
    struct array_cache *ac, *shared;
    int node;
    void *list = NULL;
    struct page *page;

    check_irq_off();
    /* get the NUMA node id and the per-CPU array_cache */
    node = numa_mem_id();
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    /* cap the refill size for rarely used caches (see comment below) */
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill. Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    /* the actual kmem_cache_node pointer */
    n = get_node(cachep, node);

    BUG_ON(ac->avail > 0 || !n);
    /* try to allocate from the shared object cache (shared->entry) */
    shared = READ_ONCE(n->shared);
    if (!n->free_objects && (!shared || !shared->avail))
        goto direct_grow;

    spin_lock(&n->list_lock);
    shared = READ_ONCE(n->shared);
    /* See if we can refill from the shared array */
    /* transfer_objects() moves up to batchcount free objects from the
     * shared cache into the per-CPU cache */
    if (shared && transfer_objects(ac, shared, batchcount)) {
        /* mark the shared array as recently used so the periodic
         * reaper will not shrink it */
        shared->touched = 1;
        goto alloc_done;
    }

    /* the shared cache has no free objects: walk the slabs_partial
     * (partially free) and slabs_free (fully free) lists */
    while (batchcount > 0) {
        /* Get slab alloc is to come from. */
        page = get_first_slab(n, false);
        /* if both slabs_partial and slabs_free are empty, a new slab
         * (and its backing pages) must be allocated */
        if (!page)
            goto must_grow;

        check_spinlock_acquired(cachep);

        batchcount = alloc_block(cachep, ac, page, batchcount);
        fixup_slab_list(cachep, n, page, &list);
    }

must_grow:
    n->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&n->list_lock);
    fixup_objfreelist_debug(cachep, &list);

direct_grow:
    if (unlikely(!ac->avail)) {
        /* Check if we can use obj in pfmemalloc slab */
        if (sk_memalloc_socks()) {
            void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
            if (obj)
                return obj;
        }

        /* grow the cache: allocate a fresh slab */
        page = cache_grow_begin(cachep, gfp_exact_node(flags), node);

        /*
         * cache_grow_begin() can reenable interrupts,
         * then ac could change.
         */
        ac = cpu_cache_get(cachep);
        if (!ac->avail && page)
            alloc_block(cachep, ac, page, batchcount);
        cache_grow_end(cachep, page);

        if (!ac->avail)
            return NULL;
    }
    ac->touched = 1;

    return ac->entry[--ac->avail];
}

① Allocation from the shared cache

kmalloc_caches[xx][xx].node.shared.entry is treated as another array_cache: if it holds free objects, a batch is transferred into the local cache and the allocation is served from there, as modeled in the sketch below.

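A compilable user-space model of what transfer_objects() does, simplified from v5.9 mm/slab.c (not kernel code; the 16-entry capacity and the fake object pointers are arbitrary):

// User-space model of transfer_objects(): move up to `max` free
// object pointers from the node's shared array_cache into the
// per-CPU one, as a single block copy.
#include <stdio.h>
#include <string.h>

struct array_cache {
    unsigned int avail;
    unsigned int limit;
    void *entry[16];
};

static int transfer_objects(struct array_cache *to,
                            struct array_cache *from, unsigned int max)
{
    /* nr = min3(from->avail, max, free room in `to`), as in the kernel */
    unsigned int nr = from->avail < max ? from->avail : max;
    if (to->limit - to->avail < nr)
        nr = to->limit - to->avail;
    if (!nr)
        return 0;
    memcpy(to->entry + to->avail, from->entry + from->avail - nr,
           nr * sizeof(void *));
    from->avail -= nr;
    to->avail += nr;
    return nr;
}

int main(void)
{
    struct array_cache shared = { .avail = 3, .limit = 16,
                                  .entry = { (void *)1, (void *)2, (void *)3 } };
    struct array_cache local = { .limit = 16 };

    int moved = transfer_objects(&local, &shared, 2);
    printf("moved %d, local.avail=%u, shared.avail=%u\n",
           moved, local.avail, shared.avail);
    return 0;
}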

② Allocation from slabs_partial and slabs_free

Allocation goes through get_first_slab(), which fetches a slab page descriptor from these lists; object chunks (obj) are then extracted from it by index, starting from page->mapping, and moved one by one into the array_cache of the corresponding CPU's kmalloc-xx (i.e. into the local cache). A simplified model of the per-index extraction follows the listing below.

//v5.9 /mm/slab.c
static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc)
{
    struct page *page;

    assert_spin_locked(&n->list_lock);
    page = list_first_entry_or_null(&n->slabs_partial, struct page,
                                    slab_list);
    if (!page) {
        n->free_touched = 1;
        page = list_first_entry_or_null(&n->slabs_free, struct page,
                                        slab_list);
        if (page)
            n->free_slabs--;
    }
    if (sk_memalloc_socks())
        page = get_valid_first_slab(n, page, pfmemalloc);

    return page;
}
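
As promised above, a simplified user-space model of the per-index object extraction performed on a slab page (the shape mirrors v5.9's index_to_obj()/alloc_block(); struct slab_page is a toy stand-in for the slab fields of struct page):

// User-space model of SLAB's per-page object indexing: the page keeps
// an array of free-object indices plus a count of objects already
// handed out (active); object addresses are s_mem + idx * size.
#include <stdio.h>
#include <stddef.h>

typedef unsigned char freelist_idx_t;

struct slab_page {
    void *s_mem;              /* address of the first object */
    freelist_idx_t *freelist; /* indices of free objects */
    unsigned int active;      /* objects already allocated */
    unsigned int total;       /* objects per slab */
    unsigned int size;        /* object size */
};

/* mirrors index_to_obj(): compute an object's address from its index */
static void *index_to_obj(struct slab_page *pg, unsigned int idx)
{
    return (char *)pg->s_mem + (size_t)pg->size * idx;
}

/* the core of alloc_block(): take the next free index, bump active */
static void *slab_get_obj(struct slab_page *pg)
{
    if (pg->active == pg->total)
        return NULL;          /* slab fully allocated */
    return index_to_obj(pg, pg->freelist[pg->active++]);
}

int main(void)
{
    char objs[4 * 32];                            /* 4 objects of 32 bytes */
    freelist_idx_t free_idx[4] = { 2, 0, 3, 1 };  /* randomized free order */
    struct slab_page pg = { objs, free_idx, 0, 4, 32 };

    printf("obj0=%p obj1=%p\n", slab_get_obj(&pg), slab_get_obj(&pg));
    return 0;
}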

3. Allocating a new slab

As noted above, reaching must_grow means a brand-new slab has to be allocated. That path is genuinely involved (it touches the ZONE and NODE data structures and the NUMA machinery), and I haven't worked through it yet.

SLUB

Reference: Linux内存管理:slub分配器 - 知乎 (zhihu.com)

Call chain:

kmem_cache_alloc()->slab_alloc()->slab_alloc_node()

Allocation starts in slab_alloc_node():

//v5.9 /mm/slub.c
static __always_inline void *slab_alloc_node(struct kmem_cache *s,
        gfp_t gfpflags, int node, unsigned long addr)
{
    void *object;
    struct kmem_cache_cpu *c;
    struct page *page;
    unsigned long tid;
    struct obj_cgroup *objcg = NULL;

    /* resolve the kmem_cache descriptor to allocate from */
    s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
    if (!s)
        return NULL;
redo:
    //.....
    /* fast path: try the per-CPU freelist */
    object = c->freelist;
    page = c->page;
    if (unlikely(!object || !node_match(page, node))) {
        /* the per-CPU freelist is exhausted (or the node does not
         * match): fall into the slow path __slab_alloc() */
        object = __slab_alloc(s, gfpflags, node, addr, c);
        stat(s, ALLOC_SLOWPATH);
    } else {
        /* hardened (SLUB-protected) free-pointer arithmetic */
        void *next_object = get_freepointer_safe(s, object);
        //... this_cpu_cmpxchg_double() then atomically installs
        //    next_object as the new freelist and advances tid
    }
    //... various debug/hook checks follow

    return object;
}

1. Allocation from the local cache

Not much to say here either: objects come straight off the freelist of the local kmem_cache_cpu (cpu_slab), as reflected in slab_alloc_node() above; a toy model of the free-pointer chase follows.
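
A user-space model of that pointer chase. Each free object stores the address of the next free object inside itself at s->offset; with CONFIG_SLAB_FREELIST_HARDENED the stored value is additionally obfuscated with a per-cache random key (the real kernel also mixes in the swabbed pointer address, which this toy omits):

// User-space model of the SLUB fast path: allocation pops the head of
// a linked list whose links are stored inside the free objects.
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct toy_cache {
    void *freelist;        /* c->freelist: head of the free list */
    size_t offset;         /* s->offset: where the next-pointer lives */
    uintptr_t random;      /* s->random: hardening key */
};

static void *get_freepointer(struct toy_cache *c, void *object)
{
    uintptr_t stored = *(uintptr_t *)((char *)object + c->offset);
    /* hardened decode; the kernel's version also XORs in the swabbed
     * address of the pointer itself, omitted here */
    return (void *)(stored ^ c->random);
}

static void *toy_alloc(struct toy_cache *c)
{
    void *object = c->freelist;
    if (!object)
        return NULL;   /* slow path: __slab_alloc() in the kernel */
    c->freelist = get_freepointer(c, object);
    return object;
}

int main(void)
{
    unsigned char obj_a[32], obj_b[32];
    struct toy_cache c = { obj_a, 0, 0xdeadbeef };

    /* encode: obj_a's next pointer is obj_b, obj_b ends the list */
    *(uintptr_t *)obj_a = (uintptr_t)obj_b ^ c.random;
    *(uintptr_t *)obj_b = (uintptr_t)NULL ^ c.random;

    printf("first : %p\n", toy_alloc(&c));   /* obj_a */
    printf("second: %p\n", toy_alloc(&c));   /* obj_b */
    printf("third : %p\n", toy_alloc(&c));   /* NULL: refill needed */
    return 0;
}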

When the fast path fails, __slab_alloc() runs a couple of checks and then enters ___slab_alloc(), which handles the remaining cases. The CONFIG_PREEMPTION part below reloads the per-CPU pointer because the task may have been preempted and rescheduled onto a different CPU before interrupts were disabled.

//v5.9 /mm/slub.c
static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                          unsigned long addr, struct kmem_cache_cpu *c)
{
    void *p;
    unsigned long flags;

    local_irq_save(flags);
#ifdef CONFIG_PREEMPTION
    /*
     * We may have been preempted and rescheduled on a different
     * cpu before disabling interrupts. Need to reload cpu area
     * pointer.
     */
    c = this_cpu_ptr(s->cpu_slab);
#endif

    p = ___slab_alloc(s, gfpflags, node, addr, c);
    local_irq_restore(flags);
    return p;
}

Once cpu_slab->freelist has been handed out completely, cpu_slab->page is cleared as well.

2. Allocation from partial lists

This path only exists when the kernel is configured with CONFIG_SLUB_CPU_PARTIAL=y.

Inside ___slab_alloc(), several cases are distinguished:

static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
                           unsigned long addr, struct kmem_cache_cpu *c)
{
    void *freelist;
    struct page *page;
    /* c is the cpu_slab of the corresponding kmalloc-xx.
     * If c->page is NULL, the per-CPU freelist has been fully
     * consumed, so jump to new_slab */
    page = c->page;
    if (!page) {
        /*
         * if the node is not online or has no normal memory, just
         * ignore the node constraint
         */
        if (unlikely(node != NUMA_NO_NODE &&
                     !node_state(node, N_NORMAL_MEMORY)))
            node = NUMA_NO_NODE;
        goto new_slab;
    }
redo:
    /* check that the page's NUMA node matches the requested node; on
     * a mismatch the slab is deactivated and a new one is taken
     * (mismatching partials are quite common in practice) */
    if (unlikely(!node_match(page, node))) {
        /*
         * same as above but node_match() being false already
         * implies node != NUMA_NO_NODE
         */
        if (!node_state(node, N_NORMAL_MEMORY)) {
            node = NUMA_NO_NODE;
            goto redo;
        } else {
            stat(s, ALLOC_NODE_MISMATCH);
            deactivate_slab(s, page, c->freelist, c);
            goto new_slab;
        }
    }

    //.....

    /* must check again c->freelist in case of cpu migration or IRQ */
    freelist = c->freelist;
    if (freelist)
        goto load_freelist;

    freelist = get_freelist(s, page);
    if (!freelist) {
        c->page = NULL;
        stat(s, DEACTIVATE_BYPASS);
        goto new_slab;
    }
    stat(s, ALLOC_REFILL);

    /* with c->page assigned, load the freelist from it */
load_freelist:
    /*
     * freelist is pointing to the list of objects to be used.
     * page is pointing to the page from which the objects are obtained.
     * That page must be frozen for per cpu allocations to work.
     */
    VM_BUG_ON(!c->page->frozen);
    c->freelist = get_freepointer(s, freelist);
    c->tid = next_tid(c->tid);
    return freelist;

    /* here the per-CPU partial list is tried, and failing that a new
     * slab descriptor is allocated from the buddy system */
new_slab:
    /* does the local CPU have a partial list? */
    if (slub_percpu_partial(c)) {
        /* promote the first per-CPU partial page to c->page,
         * then jump to redo to load its freelist */
        page = c->page = slub_percpu_partial(c);
        /* advance the per-CPU partial list: c->partial = page->next */
        slub_set_percpu_partial(c, page);
        stat(s, CPU_PARTIAL_ALLOC);
        /* back to redo: reload the freelist from the new page */
        goto redo;
    }
    /* new_slab_objects() checks whether the node of this kmalloc-xx
     * has a partial slab to hand out; if not, a slab page descriptor
     * is allocated from the buddy system */
    freelist = new_slab_objects(s, gfpflags, node, &c);
    /* the rest is sanity checking plus loading the freelist */
    if (unlikely(!freelist)) {
        slab_out_of_memory(s, gfpflags, node);
        return NULL;
    }
    page = c->page;
    if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
        goto load_freelist;
    //............
    deactivate_slab(s, page, get_freepointer(s, freelist), c);
    return freelist;
}

(1) The local CPU's partial list

The first check is the local CPU's partial list, i.e. cpu_slab->partial. It is itself a page frame, chained together with other page frames into a doubly linked circular list through the struct list_head lru; field (for slab pages this storage overlaps page->next, so the debugger shows the same links either way).

  • If it exists, that partial page is treated as a slab page descriptor: its struct list_head lru; chain yields the slab page to use, which is assigned to cpu_slab->page, and the partial pointer itself is then overwritten with the original partial's next inside slub_set_percpu_partial(c, page);.

  • Execution then jumps back to redo and loads the freelist according to the newly assigned page, and the freelist stored in the page itself is cleared. A sketch of this pop operation follows the list.
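
A user-space sketch of the per-CPU partial pop described above; struct toy_page and struct toy_cpu_slab are stand-ins, not kernel types:

// User-space model of the per-CPU partial pop at new_slab:
// c->page = c->partial, then c->partial advances to page->next
// (which is all slub_set_percpu_partial() does).
#include <stdio.h>

struct toy_page {
    struct toy_page *next;  /* per-CPU partial pages chain via next */
    void *freelist;         /* free objects parked in this page */
};

struct toy_cpu_slab {
    struct toy_page *page;     /* current active slab page */
    struct toy_page *partial;  /* per-CPU partial list */
};

/* returns nonzero if a partial page was promoted to c->page */
static int take_percpu_partial(struct toy_cpu_slab *c)
{
    struct toy_page *page = c->partial;
    if (!page)
        return 0;          /* fall back to the node partial list */
    c->page = page;            /* page = c->page = slub_percpu_partial(c) */
    c->partial = page->next;   /* slub_set_percpu_partial(c, page) */
    return 1;
}

int main(void)
{
    struct toy_page p2 = { NULL, NULL };
    struct toy_page p1 = { &p2, NULL };
    struct toy_cpu_slab c = { NULL, &p1 };

    while (take_percpu_partial(&c))
        printf("promoted page %p, partial now %p\n",
               (void *)c.page, (void *)c.partial);
    return 0;
}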

(2) The node's partial list

Next, new_slab_objects() checks whether the node belonging to this kmalloc-xx still has a partial slab; if so, the relevant assignments are made and the freelist is returned directly.

static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
            int node, struct kmem_cache_cpu **pc)
{
    void *freelist;
    struct kmem_cache_cpu *c = *pc;
    struct page *page;

    //.....
    /* check whether the node of this kmalloc-xx has a partial slab;
     * if so the assignments are done inside get_partial() and the
     * freelist is returned directly */
    freelist = get_partial(s, flags, node, c);
    if (freelist)
        return freelist;

    /* no partial slab on this node: allocate from the buddy system */
    page = new_slab(s, flags, node);
    if (page) {
        c = raw_cpu_ptr(s->cpu_slab);
        if (c->page)
            flush_slab(s, c);
        /*
         * No other reference to the page yet so we can
         * muck around with it freely without cmpxchg
         */
        freelist = page->freelist;
        page->freelist = NULL;

        stat(s, ALLOC_SLAB);
        c->page = page;
        *pc = c;
    }

    return freelist;
}

One thing that looks odd here: when walking the node under a given struct kmem_cache (kmalloc-xx), its partial pointer holds the address of the page's struct list_head lru; field rather than the address of the struct page itself, so the real page address is the lru field address minus 0x8. This is simply the kernel's intrusive-list convention: list nodes are embedded in their containing structure, and list_entry()/container_of() subtracts the field offset to recover the container, as the sketch below demonstrates.

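A user-space demonstration of why subtracting 0x8 recovers the page: container_of() undoes the embedded list-node offset. The toy struct below only mirrors the first two fields of the real struct page:

// Demonstration of the intrusive-list offset arithmetic behind the
// "lru address minus 0x8" observation above.
#include <stdio.h>
#include <stddef.h>

struct list_head { struct list_head *next, *prev; };

struct toy_page {
    unsigned long flags;     /* offset 0x0 */
    struct list_head lru;    /* offset 0x8 on 64-bit, like struct page */
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct toy_page page;
    struct list_head *pos = &page.lru;  /* what n->partial points at */

    printf("list node : %p\n", (void *)pos);
    printf("real page : %p (node - 0x%zx)\n",
           (void *)container_of(pos, struct toy_page, lru),
           offsetof(struct toy_page, lru));
    return 0;
}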

After that, the freelist taken from this page is installed as the freelist of the local kmem_cache_cpu (cpu_slab), i.e. it becomes the local cache.

3. Allocation from the buddy system

As shown above, once new_slab_objects() finds no partial slab, a fresh slab page descriptor is allocated from the buddy system via new_slab(). That path gets considerably more complex, so I'll study it later.