linux内存管理--vmalloc

时间：2016-04-29 16:26:18 阅读：214 评论：0 收藏：0 [点我收藏+]

标签：

详细的概念性解释就不说了，如果对vmalloc没有一点概念的话，可以稍微找些资料了解下，这里主要就是分析下在内核中vmalloc的实现；

直接物理内存映射（内核逻辑地址）-- 8 MB -- vm -- 1 page -- vm -- 1page --vm ......

大概就是这样：逻辑地址以high_memory为结束边界；然后是 8MB 的空洞（主要是防止指针越界访问）；接着就是以VMALLOC_START为起始边界的vmalloc 区域，该区域由多个vm小区域组成，每个小区域之间有1页（一个page大小）的空洞地址，作用还是防止越界访问；vmalloc区域以VMALLOC_END结束，后面还有个空洞地址，接着最后就是固定映射和临时映射的区域了；

结构体：

/* Describes one region handed out from the vmalloc address space. */
struct vm_struct {
    struct vm_struct    *next;// links every vm_struct into one list; vmlist is the list head
    void            *addr;// start address of this region in the virtual address space
    unsigned long       size;// length of the region (__vmalloc_area_node subtracts one guard page from it)
    unsigned long       flags;// flag bits (the code below uses VM_ALLOC, VM_UNLIST, VM_IOREMAP, VM_VPAGES)
    struct page     **pages;// array of page pointers, one entry per mapped page
    unsigned int        nr_pages;// number of pages that are mapped
    phys_addr_t     phys_addr;
    const void      *caller;// caller address recorded by the allocation path
};

这个结构体和进程虚拟地址空间的vma非常相似，值得注意；

下面这个结构体是用来管理KVA（kernel virtual address，内核虚拟地址）区间的

/* Describes one allocated range [va_start, va_end) of kernel virtual addresses. */
struct vmap_area {
    unsigned long va_start;         /* first address of the range (set to addr by alloc_vmap_area) */
    unsigned long va_end;           /* end of the range (set to addr + size by alloc_vmap_area) */
    unsigned long flags;            /* initialised to 0 by alloc_vmap_area */
    struct rb_node rb_node;         /* address sorted rbtree */
    struct list_head list;          /* address sorted list */
    struct list_head purge_list;    /* "lazy purge" list */
    struct vm_struct *vm;           /* presumably the vm_struct using this range - set by helpers not shown here */
    struct rcu_head rcu_head;
};

/**
 *  vmalloc  -  allocate virtually contiguous memory
 *  @size:      allocation size
 *
 *  Gets enough pages from the page level allocator to cover @size
 *  and maps them into contiguous kernel virtual space.  The request
 *  is not tied to a NUMA node (NUMA_NO_NODE) and uses the mask
 *  GFP_KERNEL | __GFP_HIGHMEM, i.e. the backing pages are allowed
 *  to come from high memory.
 *
 *  Use __vmalloc() instead when tight control over the page level
 *  allocator and the protection flags is required.
 *
 *  Returns whatever __vmalloc_node_flags() returns.
 */
void *vmalloc(unsigned long size)
{
    void *mem;

    mem = __vmalloc_node_flags(size, NUMA_NO_NODE,
                   GFP_KERNEL | __GFP_HIGHMEM);
    return mem;
}

/*
 * Forward a vmalloc-style request to __vmalloc_node() with an alignment
 * of 1 and PAGE_KERNEL protection, passing along this function's own
 * return address as the caller cookie.
 */
static inline void *__vmalloc_node_flags(unsigned long size,
                    int node, gfp_t flags)
{
    const void *caller = __builtin_return_address(0);

    return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
}

__builtin_return_address(0)的含义是得到当前函数的返回地址：当前函数被别的函数调用，执行完毕返回后，调用者中接着继续执行的那个地址，就是所谓的返回地址。__builtin_return_address(1)的含义是得到当前函数的调用者的返回地址。注意得到的是返回地址，而不是函数的起始地址。

/**
 *  __vmalloc_node  -  allocate virtually contiguous memory
 *  @size:      allocation size
 *  @align:     desired alignment
 *  @gfp_mask:  flags for the page level allocator
 *  @prot:      protection mask for the allocated pages
 *  @node:      node to use for allocation or NUMA_NO_NODE
 *  @caller:    caller's return address
 *
 *  Thin wrapper around __vmalloc_node_range() that fixes the search
 *  window to the whole vmalloc area, VMALLOC_START..VMALLOC_END.
 *  Pages are taken from the page level allocator using @gfp_mask and
 *  mapped into contiguous kernel virtual space with protection @prot.
 */
static void *__vmalloc_node(unsigned long size, unsigned long align,
                gfp_t gfp_mask, pgprot_t prot,
                int node, const void *caller)
{
    void *mem;

    mem = __vmalloc_node_range(size, align,
                   VMALLOC_START, VMALLOC_END,
                   gfp_mask, prot, node, caller);
    return mem;
}

下面的__vmalloc_node_range是个主要函数，先说明下它的参数：

unsigned long size ：表示要分配的内存大小；

unsigned long align：表示以什么对齐，到这里是 1；

unsigned long start：表示映射区域从什么地方开始查找，这里为：VMALLOC_START;

unsigned long end ：表示映射区域从什么地方结束查找，这里为：VMALLOC_END;

gfp_t gfp_mask：表示分配的标识，这里为：GFP_KERNEL | __GFP_HIGHMEM;

pgprot_t prot：表示区域的保护模式，这里为：PAGE_KERNEL;

int node：表示分配节点，这里为：-1；

const void *caller：表示调用者的地址，这里是在__vmalloc_node_flags()中通过__builtin_return_address(0)取得的返回地址，然后经由__vmalloc_node一路传递下来

/**
 *  __vmalloc_node_range  -  allocate virtually contiguous memory
 *  @size:      allocation size
 *  @align:     desired alignment
 *  @start:     vm area range start
 *  @end:       vm area range end
 *  @gfp_mask:  flags for the page level allocator
 *  @prot:      protection mask for the allocated pages
 *  @node:      node to use for allocation or NUMA_NO_NODE
 *  @caller:    caller's return address
 *
 *  Allocate enough pages to cover @size from the page level
 *  allocator with @gfp_mask flags.  Map them into contiguous
 *  kernel virtual space, using a pagetable protection of @prot.
 *
 *  Returns the address produced by __vmalloc_area_node(), or NULL
 *  on failure.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
            unsigned long start, unsigned long end, gfp_t gfp_mask,
            pgprot_t prot, int node, const void *caller)
{
    struct vm_struct *area;
    void *addr;
    unsigned long real_size = size;    // unrounded size, kept for kmemleak and the warning

    // The mapping is built page by page, so round the size up to whole pages.
    size = PAGE_ALIGN(size);
    // Reject a zero size and requests for more pages than totalram_pages.
    if (!size || (size >> PAGE_SHIFT) > totalram_pages)
        goto fail;

    // Reserve the virtual range and its vm_struct; area may be NULL here.
    area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
                  start, end, node, gfp_mask, caller);
    if (!area)
        goto fail;

    // Allocate the pages and map them.  On failure __vmalloc_area_node()
    // has already released the area, so return without the warning below.
    addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
    if (!addr)
        return NULL;

    /*
     * In this function, newly allocated vm_struct has VM_UNLIST flag.
     * It means that vm_struct is not fully initialized.
     * Now, it is fully initialized, so remove this flag here.
     */
    clear_vm_unlist(area);

    /*
     * A ref_count = 3 is needed because the vm_struct and vmap_area
     * structures allocated in the __get_vm_area_node() function contain
     * references to the virtual address of the vmalloc'ed block.
     */
    kmemleak_alloc(addr, real_size, 3, gfp_mask);

    return addr;

fail:
    warn_alloc_failed(gfp_mask, 0,
              "vmalloc: allocation failure: %lu bytes\n",
              real_size);
    return NULL;
}

/*
 * Allocate a vm_struct and reserve a range of kernel virtual addresses
 * for it between @start and @end.
 *
 * @size is rounded up to whole pages and one extra guard page is added
 * before the range is reserved.  With VM_IOREMAP in @flags, @align is
 * recomputed from @size.  Must not be called from interrupt context
 * (BUG_ON).
 *
 * Returns the new vm_struct, or NULL if the rounded size is zero or
 * either allocation fails.
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
        unsigned long align, unsigned long flags, unsigned long start,
        unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
    struct vmap_area *va;
    struct vm_struct *area;

    BUG_ON(in_interrupt());
    if (flags & VM_IOREMAP) {
        int bit = fls(size);

        if (bit > IOREMAP_MAX_ORDER)
            bit = IOREMAP_MAX_ORDER;
        else if (bit < PAGE_SHIFT)
            bit = PAGE_SHIFT;

        align = 1ul << bit;
    }// for ioremap areas: alignment = 1 << fls(size), clamped to PAGE_SHIFT..IOREMAP_MAX_ORDER bits

    size = PAGE_ALIGN(size);// round up to whole pages
    if (unlikely(!size))
        return NULL;
    // allocate the zero-filled vm_struct itself
    area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
    if (unlikely(!area))
        return NULL;

    /*
     * We always allocate a guard page.
     */
    size += PAGE_SIZE;// the guard page only takes address space; __vmalloc_area_node excludes it from nr_pages

    va = alloc_vmap_area(size, align, start, end, node, gfp_mask);// reserve a range of kernel virtual addresses (KVA)
    if (IS_ERR(va)) {
        kfree(area);
        return NULL;
    }

    /*
     * When this function is called from __vmalloc_node_range,
     * we add VM_UNLIST flag to avoid accessing uninitialized
     * members of vm_struct such as pages and nr_pages fields.
     * They will be set later.
     */
    // Both helpers receive area and va; presumably they fill area from va
    // (helpers not shown here - confirm).
    if (flags & VM_UNLIST)
        setup_vmalloc_vm(area, va, flags, caller);
    else
        insert_vmalloc_vm(area, va, flags, caller);

    return area;
}

下面是从vstart到vend的范围内分配一段内核虚拟地址（KVA）区间的函数alloc_vmap_area：

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * @size must be non-zero and page aligned, and @align must be a power of
 * two (all enforced with BUG_ON).  The search runs under vmap_area_lock.
 *
 * Returns the new vmap_area on success, ERR_PTR(-ENOMEM) if the
 * descriptor cannot be allocated, or ERR_PTR(-EBUSY) if no suitable
 * range is found even after one purge_vmap_area_lazy() and retry.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
                unsigned long align,
                unsigned long vstart, unsigned long vend,
                int node, gfp_t gfp_mask)
{
    struct vmap_area *va;
    struct rb_node *n;
    unsigned long addr;
    int purged = 0;
    struct vmap_area *first;

    BUG_ON(!size);// size must be non-zero
    BUG_ON(size & ~PAGE_MASK); // size must be page aligned
    BUG_ON(!is_power_of_2(align));// align must be a power of two
    // allocate the vmap_area descriptor
    va = kmalloc_node(sizeof(struct vmap_area),
            gfp_mask & GFP_RECLAIM_MASK, node);
    if (unlikely(!va))
        return ERR_PTR(-ENOMEM);

retry:
    spin_lock(&vmap_area_lock);
    /*
     * Invalidate cache if we have more permissive parameters.
     * cached_hole_size notes the largest hole noticed _below_
     * the vmap_area cached in free_vmap_cache: if size fits
     * into that hole, we want to scan from vstart to reuse
     * the hole instead of allocating above free_vmap_cache.
     * Note that __free_vmap_area may update free_vmap_cache
     * without updating cached_hole_size or cached_align.
     */
    // Decide whether the cached search start is usable: it must exist, and
    // size, vstart and align must not be smaller than the cached values.
    if (!free_vmap_cache ||
            size < cached_hole_size ||
            vstart < cached_vstart ||
            align < cached_align) {
nocache:
        cached_hole_size = 0;
        free_vmap_cache = NULL;
    }
    /* record if we encounter less permissive parameters */
    cached_vstart = vstart;
    cached_align = align;

    /* find starting point for our search */
    if (free_vmap_cache) {// start from the cached area
        first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
        addr = ALIGN(first->va_end, align);// align the cached area's end before comparing
        if (addr < vstart)// candidate lies below vstart: drop the cache and search from scratch
            goto nocache;
        if (addr + size < addr)// address arithmetic wrapped around
            goto overflow;

    } else {// no usable free_vmap_cache
        addr = ALIGN(vstart, align);// start at vstart, with the same wrap-around check
        if (addr + size < addr)
            goto overflow;

        n = vmap_area_root.rb_node;
        first = NULL;
        // Walk the rbtree looking for the lowest area whose end is >= addr.
        while (n) {
            struct vmap_area *tmp;
            tmp = rb_entry(n, struct vmap_area, rb_node);
            if (tmp->va_end >= addr) {// this area ends at or above addr: candidate
                first = tmp;
                if (tmp->va_start <= addr)// addr falls inside this area: stop here
                    break;
                n = n->rb_left;// go left, towards areas at lower addresses
            } else
                n = n->rb_right;// go right, towards areas at higher addresses
        }

        if (!first)// no area ends at or above addr, so addr is usable as it is
            goto found;
    }

    /* from the starting point, walk areas until a suitable hole is found */
    while (addr + size > first->va_start && addr + size <= vend) {// hole before first is too small, and still below vend
        // remember the largest hole skipped so far
        if (addr + cached_hole_size < first->va_start)
            cached_hole_size = first->va_start - addr;
        addr = ALIGN(first->va_end, align);// key step: addr moves to the (aligned) end of the area just examined
        if (addr + size < addr)
            goto overflow;

        if (list_is_last(&first->list, &vmap_area_list))// last area in the list: everything above it is free
            goto found;

        first = list_entry(first->list.next,
                struct vmap_area, list);// next area in the address sorted list
    }

found:// reached with first == NULL, after passing the last area, or when the walk loop ends
    if (addr + size > vend)// the range would run past vend
        goto overflow;
    // fill in the descriptor and publish it
    va->va_start = addr;
    va->va_end = addr + size;
    va->flags = 0;
    __insert_vmap_area(va);// presumably links va into the rbtree and the list - helper not shown
    free_vmap_cache = &va->rb_node;// the next search can start from this area
    spin_unlock(&vmap_area_lock);

    BUG_ON(va->va_start & (align-1));
    BUG_ON(va->va_start < vstart);
    BUG_ON(va->va_end > vend);

    return va;

overflow:// no suitable range: purge once and retry, otherwise warn and fail
    spin_unlock(&vmap_area_lock);
    if (!purged) {
        purge_vmap_area_lazy();
        purged = 1;
        goto retry;
    }
    if (printk_ratelimit())
        printk(KERN_WARNING
            "vmap allocation for size %lu failed: "
            "use vmalloc=<size> to increase size.\n", size);
    kfree(va);
    return ERR_PTR(-EBUSY);
}

/*
 * Allocate the physical pages for @area and map them into its address
 * range with protection @prot.
 *
 * The page-pointer array is allocated first (through __vmalloc_node()
 * when it is larger than one page, otherwise kmalloc_node()), then the
 * pages are allocated one at a time and finally mapped with
 * map_vm_area().
 *
 * Returns area->addr on success.  Returns NULL on failure, in which case
 * @area has already been released and must not be used by the caller.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
                 pgprot_t prot, int node, const void *caller)
{
    const int order = 0;
    struct page **pages;
    unsigned int nr_pages, array_size, i;
    gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;// mask for the pages[] array itself; __GFP_ZERO requests zeroed memory

    nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;// leave out the guard page
    array_size = (nr_pages * sizeof(struct page *));// size of the page-pointer array in bytes

    area->nr_pages = nr_pages;// number of pages that will really be mapped
    /* Please note that the recursion is strictly bounded. */
    if (array_size > PAGE_SIZE) {// array larger than one page: allocate it through the vmalloc path (recursion)
        pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
                PAGE_KERNEL, node, caller);
        area->flags |= VM_VPAGES;// marks the array as vmalloc'ed; __vunmap() uses this to pick vfree()
    } else {// small array: allocate it with kmalloc_node()
        pages = kmalloc_node(array_size, nested_gfp, node);
    }
    area->pages = pages;
    area->caller = caller;// return address captured in __vmalloc_node_flags(); presumably kept for debugging - not used in the code shown

    if (!area->pages) {// array allocation failed: release the area
        remove_vm_area(area->addr);
        kfree(area);
        return NULL;
    }

    for (i = 0; i < area->nr_pages; i++) {
        struct page *page;
        gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;

        if (node < 0)
            page = alloc_page(tmp_mask);
        else
            page = alloc_pages_node(node, tmp_mask, order);

        if (unlikely(!page)) {// one failed page fails the whole request; the pages obtained so far are freed below
            /* Successfully allocated i pages, free them in __vunmap() */
            area->nr_pages = i;
            goto fail;
        }
        area->pages[i] = page;// record the page in the array
    }

    if (map_vm_area(area, prot, &pages))// map the pages into the area; a non-zero return is treated as failure
        goto fail;
    return area->addr;

fail:
    warn_alloc_failed(gfp_mask, order,
              "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
              (area->nr_pages*PAGE_SIZE), area->size);
    vfree(area->addr);// hands area->addr to vfree() for cleanup
    return NULL;
}

-------------------------------释放vmalloc分配的页==vfree()-------------------------------------

/**
 *  vfree  -  release memory allocated by vmalloc()
 *  @addr:      memory base address
 *
 *  Free the virtually continuous memory area starting at @addr, as
 *  obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
 *  NULL, no operation is performed.
 *
 *  When called from interrupt context the free is deferred: @addr is
 *  pushed onto the per-CPU vfree_deferred list and a work item is
 *  scheduled.  Otherwise the area is released right away with
 *  __vunmap(@addr, 1).
 *
 *  Must not be called in NMI context (strictly speaking, only if we don't
 *  have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 *  conventions for vfree() arch-dependent would be a really bad idea)
 *
 *  NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
 *
 */
void vfree(const void *addr)
{
    BUG_ON(in_nmi());

    /* notify the kmemleak leak detector that addr is being freed */
    kmemleak_free(addr);

    if (!addr)
        return;
    if (unlikely(in_interrupt())) {
        struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);

        /*
         * llist_add() returns true only when the list was empty before
         * this entry was added.  If it was not empty, an earlier
         * vfree() has already scheduled the work for the pending
         * entries (assuming the work handler drains the whole list),
         * so scheduling it again would be redundant.
         */
        if (llist_add((struct llist_node *)addr, &p->list))
            schedule_work(&p->wq);
    } else
        __vunmap(addr, 1);
}

释放的主要函数，vmalloc和其他虚拟映射的地址释放也是调用该函数：参数是：addr和1

/*
 * Release the area that starts at @addr.
 *
 * The area is taken out with remove_vm_area().  If @deallocate_pages is
 * non-zero, every page in area->pages and the array itself are freed as
 * well.  The vm_struct is always freed.  A NULL @addr is ignored; an
 * unaligned or unknown @addr only triggers a WARN.
 */
static void __vunmap(const void *addr, int deallocate_pages)
{
    struct vm_struct *area;

    if (!addr)// nothing to do for NULL
        return;

    if ((PAGE_SIZE-1) & (unsigned long)addr) {// the address must be page aligned
        WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
        return;
    }

    area = remove_vm_area(addr);// take the area for this address out; NULL means no such area
    if (unlikely(!area)) {
        WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                addr);
        return;
    }

    debug_check_no_locks_freed(addr, area->size);
    debug_check_no_obj_freed(addr, area->size);

    if (deallocate_pages) {
        int i;

        for (i = 0; i < area->nr_pages; i++) {// free the physical pages
            struct page *page = area->pages[i];

            BUG_ON(!page);
            __free_page(page);
        }

        if (area->flags & VM_VPAGES)// the array came from vmalloc (it was larger than one page): use vfree()
            vfree(area->pages);
        else
            kfree(area->pages);
    }

    kfree(area);
    return;
}

linux内存管理--vmalloc

标签：

原文地址：http://blog.csdn.net/yuzhihui_no1/article/details/50782616

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行