linux kernel heap

Buddy System

伙伴系统

采用二分法分配内存

分配的内存大小必须是2的整数次幂，2的指数（幂）就称为一个块的order。

块的大小单位为一个页的大小，比如一个页为4k的系统，order为0的堆块的大小就是(2^0)*4K。

分配连续的物理空间

memory zone

DMA and DMA32 zones, highmem zone and a Normal zone and other zones

vmalloc 的分配的空间在虚拟内存下时连续的，但是不一定要在物理内存下时连续的。

通过/proc/buddyinfo和/proc/pagetypeinfo查看分配的情况

1
2
3

/ # cat /proc/buddyinfo
Node 0, zone      DMA      0      0      0      1      2      1      1      0      1      1      3
Node 0, zone    DMA32      0      5      3      6      7      8      8     10      7      3     14

分配

首先有个大堆块

当分配时首先确定需要的块大小（确定需要那个oder的块）

如果当前的order有剩余的块则分配值，否则将order更大的块进行切分

Linux中通过get_free_pages实现

释放

当有块释放时，检查相邻的块（从同一个块中切分出来的），就将其合并成更大的order的块。

Linux中通过alloc_pages实现

buddy allocator

分配页的函数在”linux/gfp.h”中（gfp是get free page的意思），其中 gfp_mask是alloc_pages用来表示谁来请求这次的内存分配。比如GFP_KERNEL用来表示这个页框是在内核中使用的，GFP_USER表示在用户空间使用。

#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)

/*
 * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE,
 * prefer the current CPU's closest node. Otherwise node must be valid and
 * online.
 */
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)

/*
 * Allocate pages, preferring the node given as nid. The node must be valid and
 * online. For more general interface, see alloc_pages_node().
 */
static inline struct page *
__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)

/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
							nodemask_t *nodemask)

释放页

void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0) {
		VM_BUG_ON(!virt_addr_valid((void *)addr));
		__free_pages(virt_to_page((void *)addr), order);
	}
}
//--->....

slab

slabs_full 完全分配的slab

slabs_partital 部分分配的slab

slabs_empty 空空的slab

SLAB: 从伙伴系统分配的 2^order 个物理页就组成了一个 SLAB ，而后续的操作就是在这个 SLAB 上在进行细分的，具体的结构体是 struct slab 。

每个kmem_cache里面包含的很多slab结构，同一个kmem_cache中包含的obj的大小都是相同的，每个slab只针对一种数据类型

每个slab结构中可以包含很多的obj

每个slab是有一个或者多个连续的页组成

kmalloc是基于slab分配器的

利用cat /porc/slabinfo来打印出相关的结构

Boot took 0.99 seconds
/ # cat /proc/slabinfo
slabinfo - version: 2.1
# name            <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab> : tunables <limit> <batchcount> <sharedfactor> : slabdata <active_slabs> <num_slabs> <sharedavail>
kcopyd_job             0      0   3312    9    8 : tunables    0    0    0 : slabdata      0      0      0
scsi_sense_cache      32     32    128   32    1 : tunables    0    0    0 : slabdata      1      1      0
i915_vma               0      0    576   14    2 : tunables    0    0    0 : slabdata      0      0      0
execute_cb             0      0    128   32    1 : tunables    0    0    0 : slabdata      0      0      0
i915_request           0      0    640   12    2 : tunables    0    0    0 : slabdata      0      0      0
sgpool-16              8      8    512    8    1 : tunables    0    0    0 : slabdata      1      1      0
isofs_inode_cache      0      0    624   13    2 : tunables    0    0    0 : slabdata      0      0      0
fat_inode_cache        0      0    712   11    2 : tunables    0    0    0 : slabdata      0      0      0
fat_cache              0      0     40  102    1 : tunables    0    0    0 : slabdata      0      0      0
jbd2_journal_handle      0      0     48   85    1 : tunables    0    0    0 : slabdata      0      0      0
jbd2_journal_head      0      0    120   34    1 : tunables    0    0    0 : slabdata      0      0      0
jbd2_revoke_table_s      0      0     16  256    1 : tunables    0    0    0 : slabdata      0      0      0
ext4_inode_cache       0      0   1064   15    4 : tunables    0    0    0 : slabdata      0      0      0
ext4_allocation_context      0      0    128   32    1 : tunables    0    0    0 : slabdata      0      0      0

active_objs 正在被使用的obj的数量

num_objs obj的总量

objsize obj的大小

objperslab 每个slab中可以有多少个obj

pagesperslab 每个slab对应几个页·

slab allocator

buddy系统会产生大量的内碎片，比如（伙伴系统只能分配连续的物理内存页）一个33page的请求就伙伴系统就会分配2^6=64pages，相当于浪费了31pages。引入slab allocator就是为了把页分成更小的块来进行分配。

slab 核心思想是在保存那些常用堆块的缓存，以便能够在内核中进行分配。这是非常有意义的，通过缓存被释放obj，有一些常用的结构就能够在复制新的同样的结构时进行快速分配。通过重新利用被释放的obj，在某些情况下，内核就不必再重新申请内存了。Linux的slab allocator经过长期的发展，已经发生了很大的变化。现在slab allocator有三种不同的实现

SLOB Allocator 是在solaris os被实现的最原始的slab allocator，现在大多被用在内存很小的嵌入式系统里，分配小内存非常高效，首次适应分配算法
SLAB Allocator SLAB的升级，旨在变得更加的“cache friendly”
SLUB Allocator 通过减少使用的队列/链来回获得比SLAB更好的执行时间

今天大多数系统中使用的SLUB

一个slab cache包含许多slab，一个slab中包含很多obj

？分别对应什么结构

slab cahes —>struct kmem_cache

页 —> struct page

slab —> page

obj的结构

有两种主要的cache

Dedicate（专用的）：这些缓存是给那些常用的内核中的结构的。这写结构被分配时就是初始化过的，被释放的时候还是保持被初始化的，一边下一次分配的时候会变得更快
Generic（通用的）：这些都是通用的缓存，在大多数情况下，它们的大小对应于2的幂。

dma-kmalloc-256 以上时专用缓存，dma-kmalloc-256及以下时通用缓存。

vm_area_struct     65543  66082    208   19    1 : tunables    0    0    0 : slabdata   3478   3478      0
mm_struct            213    225   2112   15    8 : tunables    0    0    0 : slabdata     15     15      0
files_cache          228    230    704   23    4 : tunables    0    0    0 : slabdata     10     10      0
signal_cache         399    448   1024   16    4 : tunables    0    0    0 : slabdata     28     28      0
sighand_cache        414    435   2112   15    8 : tunables    0    0    0 : slabdata     29     29      0
task_struct         1102   1125   5952    5    8 : tunables    0    0    0 : slabdata    225    225      0
dma-kmalloc-256        0      0    256   16    1 : tunables    0    0    0 : slabdata      0      0      0
dma-kmalloc-128        0      0    128   32    1 : tunables    0    0    0 : slabdata      0      0      0
dma-kmalloc-64         0      0     64   64    1 : tunables    0    0    0 : slabdata      0      0      0
dma-kmalloc-32         0      0     32  128    1 : tunables    0    0    0 : slabdata      0      0      0
dma-kmalloc-16         0      0     16  256    1 : tunables    0    0    0 : slabdata      0      0      0
dma-kmalloc-8          0      0      8  512    1 : tunables    0    0    0 : slabdata      0      0      0
kmalloc-256         1801   2064    256   16    1 : tunables    0    0    0 : slabdata    129    129      0
kmalloc-192         4410   4410    192   21    1 : tunables    0    0    0 : slabdata    210    210      0
kmalloc-128         2689   2752    128   32    1 : tunables    0    0    0 : slabdata     86     86      0
kmalloc-96          6952   7350     96   42    1 : tunables    0    0    0 : slabdata    175    175      0
kmalloc-64         25933  26496     64   64    1 : tunables    0    0    0 : slabdata    414    414      0
kmalloc-32         15150  15616     32  128    1 : tunables    0    0    0 : slabdata    122    122      0
kmalloc-16         18432  18432     16  256    1 : tunables    0    0    0 : slabdata     72     72      0
kmalloc-8          10149  10240      8  512    1 : tunables    0    0    0 : slabdata     20     20      0

kmalloc

kmalloc是内核通过slab allocator进行一般内存分配的接口

// "include/linux/slab.h"
static __always_inline void *kmalloc(size_t size, gfp_t flags);
//allocates memory through slab allocator.

static inline void *kzalloc(size_t size, gfp_t flags);
//allocates memory (and zeroes it out like calloc() in libc) through the slab allocator.
//和libc中的calloc相似

void * __must_check krealloc(const void *, size_t, gfp_t);
//resize existing allocation.
//和libc中realloc相似

void kfree(const void *);
//frees memory previously allocated.

void kzfree(const void *);

专用缓存（Dedicated Cache）的初始化

以vm_area_struct的slab cache 为例子

//"kernel/fork.c"

/* slab cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;

vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
/*
 KMEM_CACHE() is a macro defined in "/include/linux/slab.h" which actually expands
 to a call to kmem_cache_create() which is the procedure to register a new slab
 cache and adds the new cache to the list of all slabs in the system.
 */

struct vm_area_struct *vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
/*
 kmem_cache_alloc() is the procedure to allocate the object from the dedicated
 cache memory. The allocation will be attempted first on a partially filled slab,
 then (if no partially filled slabs are available) in a free slab,
 and if no free slabs are found,
 it will try to allocate new page frames from the underlying buddy allocator.
 */

 kmem_cache_free(vm_area_cachep, vma);
 /*
  kmem_cache_free() is the procedure that frees the memory allocated to the vma object
  previously allocated. The (now free'd) slab,
  will be kept in order to be used for future allocations and the memory is NOT
  released immediately after this call. When all of the slabs have been free'd,
  kernel modules have to call kmem_cache_destroy() to release the pre allocated memory.
 */

SLAB Cache 管理

kmem_cache中的结构

unsigned int gfporder

每个slab能够存放的页的数量。

struct kmem_cache_node *node[MAN_NUMNODES]

用来存储不同状态的slab，主要维护三个变量，这个主要是维护这个kmem_cache_node中的slab的信息。

/*
 * The slab lists for all objects.
 */
struct kmem_cache_node {
	spinlock_t list_lock;

#ifdef CONFIG_SLAB
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long total_slabs;	/* length of all slab lists */
	unsigned long free_slabs;	/* length of free slab list only */
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	struct array_cache *shared;	/* shared per node */
	struct alien_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
#endif

#ifdef CONFIG_SLUB
	unsigned long nr_partial;
	struct list_head partial;
#ifdef CONFIG_SLUB_DEBUG
	atomic_long_t nr_slabs;
	atomic_long_t total_objects;
	struct list_head full;
#endif
#endif

};

struct array_cache __percpu *cpu_cache

这个结构是管理被释放的obj的块，entry中的每一项指向被释放的对象。

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 *
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};

page中的void s_mem指向slab中的第一个obj;void freelist;指向第一个free的obj

struct {	/* slab, slob and slub */
			union {
				struct list_head slab_list;
				struct {	/* Partial pages */
					struct page *next;
#ifdef CONFIG_64BIT
					int pages;	/* Nr of pages left */
					int pobjects;	/* Approximate count */
#else
					short int pages;
					short int pobjects;
#endif
				};
			};
			struct kmem_cache *slab_cache; /* not slob */
			/* Double-word boundary */
			void *freelist;		/* first free object */
			union {
				void *s_mem;	/* slab: first object */
				unsigned long counters;		/* SLUB */
				struct {			/* SLUB */
					unsigned inuse:16;
					unsigned objects:15;
					unsigned frozen:1;
				};
			};
		};

SLUB

用来管理被释放的obj

struct kmem_cache_cpu {
	void **freelist;	/* Pointer to next available object */
	unsigned long tid;	/* Globally unique transaction id */
	struct page *page;	/* The slab from which we are allocating */
#ifdef CONFIG_SLUB_CPU_PARTIAL
	struct page *partial;	/* Partially allocated frozen slabs */
#endif
#ifdef CONFIG_SLUB_STATS
	unsigned stat[NR_SLUB_STAT_ITEMS];
#endif
};