linux内存管理--实际分配函数 buffered_rmqueue

时间：2016-04-29 16:17:11 阅读：252 评论：0 收藏：0 [点我收藏+]

标签：

不管是快速分配还是慢速分配，实际分配内存的都是 buffered_rmqueue()函数，其他的都是在选择从哪个地方来分配比较合适；

还是先来说说各个参数：

struct zone *preferred_zone 表示本次分配的首选zone；在本函数中它只被传给 zone_statistics() 用于统计，真正取页的是下面的 zone 参数

struct zone *zone 表示就在该zone上分配内存；

int order 表示分配页的阶数

gfp_t gfp_flags 分配的标识（下面调用处传入的实参名是 gfp_mask）

int migratetype 表示请求的迁移类型，用来选择对应的空闲链表

page = buffered_rmqueue(preferred_zone, zone, order,
                        gfp_mask, migratetype);



/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 *
 * Take one block of 2^order pages from @zone.
 *
 * Order-0 requests are served from this CPU's per-cpu page list for
 * @migratetype; when that list is empty it is refilled through
 * rmqueue_bulk().  Larger orders go to __rmqueue() under zone->lock.
 *
 * @preferred_zone: only forwarded to zone_statistics() in this function.
 * @zone:           zone the pages are taken from.
 * @order:          allocation order.
 * @gfp_flags:      allocation flags; __GFP_COLD and __GFP_NOFAIL are tested
 *                  here and the flags are forwarded to prep_new_page().
 * @migratetype:    migrate type used to pick the free list.
 *
 * Returns the page, or NULL when no page could be obtained.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
            struct zone *zone, int order, gfp_t gfp_flags,
            int migratetype)
{
    unsigned long flags;
    struct page *page;
    int cold = !!(gfp_flags & __GFP_COLD); /* 1 when the caller asked for a cold page */

again:
    if (likely(order == 0)) { /* single-page request */
        struct per_cpu_pages *pcp;
        struct list_head *list;

        /* Disable local interrupts; the previous state is kept in flags. */
        local_irq_save(flags);
        pcp = &this_cpu_ptr(zone->pageset)->pcp; /* this CPU's page cache for the zone */
        list = &pcp->lists[migratetype]; /* cached pages of the requested migrate type */
        if (list_empty(list)) {
            /*
             * Nothing cached for this migrate type: ask rmqueue_bulk()
             * for up to pcp->batch order-0 pages.  It returns the number
             * of pages it placed on the list.
             */
            pcp->count += rmqueue_bulk(zone, 0,
                    pcp->batch, list,
                    migratetype, cold);
            if (unlikely(list_empty(list)))
                goto failed;
        }

        /* Cold requests take the tail of the list, others take the head. */
        if (cold)
            page = list_entry(list->prev, struct page, lru);
        else
            page = list_entry(list->next, struct page, lru);

        list_del(&page->lru);
        pcp->count--;
    } else {
        if (unlikely(gfp_flags & __GFP_NOFAIL)) {
            /*
             * __GFP_NOFAIL is not to be used in new code.
             *
             * All __GFP_NOFAIL callers should be fixed so that they
             * properly detect and handle allocation failures.
             *
             * We most definitely don't want callers attempting to
             * allocate greater than order-1 page units with
             * __GFP_NOFAIL.
             */
            WARN_ON_ONCE(order > 1);
        }
        /*
         * The lock is dropped with plain spin_unlock(): the state saved
         * in flags is only restored by local_irq_restore() further down,
         * on both the success and the failure path.
         */
        spin_lock_irqsave(&zone->lock, flags);
        page = __rmqueue(zone, order, migratetype);
        spin_unlock(&zone->lock);
        if (!page)
            goto failed;
        __mod_zone_freepage_state(zone, -(1 << order),
                      get_pageblock_migratetype(page));
    }

    __count_zone_vm_events(PGALLOC, zone, 1 << order);
    zone_statistics(preferred_zone, zone, gfp_flags);
    local_irq_restore(flags);

    VM_BUG_ON(bad_range(zone, page));
    /* A non-zero return from prep_new_page() restarts the whole attempt. */
    if (prep_new_page(page, order, gfp_flags))
        goto again;
    return page;

failed:
    local_irq_restore(flags);
    return NULL;
}

struct zone结构体中有个 struct per_cpu_pageset __percpu *pageset; 成员，该成员用于冷热分配器，热页表示已经在cpu的高速缓存中了；

/*
 * Per-CPU data hanging off a zone; buffered_rmqueue() reaches it through
 * this_cpu_ptr(zone->pageset).
 */
struct per_cpu_pageset {
    struct per_cpu_pages pcp; /* the per-cpu page lists used for order-0 allocations */
#ifdef CONFIG_NUMA
    s8 expire; /* only present with CONFIG_NUMA; NOTE(review): meaning not visible here */
#endif
#ifdef CONFIG_SMP
    /* Only present with CONFIG_SMP; presumably per-cpu vm statistics deltas — confirm. */
    s8 stat_threshold;
    s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
#endif
};

cpu缓存页数组

struct per_cpu_pages {
    int count;      /* number of pages in the list */列表中页数
    int high;       /* high watermark, emptying needed */列表页数的上限
    int batch;      /* chunk size for buddy add/remove */添加和删除页时，一次操作多少页。不是单页删除和填充的，而是以该单位页来操作的

    /* Lists of pages, one per migrate type stored on the pcp-lists */
    struct list_head lists[MIGRATE_PCPTYPES];//迁移类型的链表
};

从伙伴系统中得到页，然后填充到cpu的高速缓存中

/*
 * Obtain a specified number of elements from the buddy allocator, all under
 * a single hold of the lock, for efficiency.  Add them to the supplied list.
 * Returns the number of new pages which were placed at *list.
 *
 * @zone:        zone to take the pages from; zone->lock is taken here.
 * @order:       order of each block requested from __rmqueue().
 * @count:       maximum number of blocks to fetch.
 * @list:        list the pages are linked into.
 * @migratetype: migrate type requested from __rmqueue().
 * @cold:        non-zero to append at the tail instead of the head.
 */
static int rmqueue_bulk(struct zone *zone, unsigned int order,
            unsigned long count, struct list_head *list,
            int migratetype, int cold)
{
    int mt = migratetype, i;

    spin_lock(&zone->lock);
    for (i = 0; i < count; ++i) { /* one block per iteration */
        struct page *page = __rmqueue(zone, order, migratetype); /* block of the requested migrate type, or a fallback */
        if (unlikely(page == NULL))
            break;

        /*
         * Split buddy pages returned by expand() are received here
         * in physical page order. The page is added to the callers and
         * list and the list head then moves forward. From the callers
         * perspective, the linked list is ordered by page number in
         * some conditions. This is useful for IO devices that can
         * merge IO requests if the physical pages are ordered
         * properly.
         */
        if (likely(cold == 0))
            list_add(&page->lru, list); /* hot (cold == 0): insert right after the current position */
        else 
            list_add_tail(&page->lru, list); /* cold: insert right before the current position */
        if (IS_ENABLED(CONFIG_CMA)) { /* only when the kernel is built with CONFIG_CMA */
            mt = get_pageblock_migratetype(page); /* migrate type of the page's pageblock */
            if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) /* neither CMA nor isolate: keep the requested type */
                mt = migratetype;
        }    
        set_freepage_migratetype(page, mt); /* record the migrate type on the page */
        list = &page->lru; /* the next page is inserted relative to this one */
        if (is_migrate_cma(mt)) /* CMA page: it also leaves the free-CMA counter */
            __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                          -(1 << order));
    }
    __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); /* i blocks of 2^order pages left the free pool */
    spin_unlock(&zone->lock);
    return i; /* number of blocks actually added to the list */
}

修改对应类型的页面计数

/*
 * Adjust the zone statistic @item by @delta (may be negative).
 * This version simply forwards to zone_page_state_add().
 */
static inline void __mod_zone_page_state(struct zone *zone,
            enum zone_stat_item item, int delta)
{       
    zone_page_state_add(delta, zone, item);
}  

/*
 * Add @x to statistic @item in two places: the zone's own vm_stat[]
 * counter and the vm_stat[] array that is not tied to a zone.
 */
static inline void zone_page_state_add(long x, struct zone *zone,
                 enum zone_stat_item item)
{
    atomic_long_add(x, &zone->vm_stat[item]);
    atomic_long_add(x, &vm_stat[item]);
}

/*
 * Do the hard work of removing an element from the buddy allocator.
 * Call me with the zone->lock already held.
 *
 * Tries, in this order: the free lists of @migratetype, the fallback
 * migrate types, and finally the MIGRATE_RESERVE lists.  Returns the
 * page or NULL.
 */
static struct page *__rmqueue(struct zone *zone, unsigned int order,
                        int migratetype)
{
    struct page *page;

retry_reserve:
    page = __rmqueue_smallest(zone, order, migratetype); /* normal case: the requested migrate type's own free lists */

    if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { /* nothing found, and this is not already the reserve pass */
        page = __rmqueue_fallback(zone, order, migratetype); /* take a block from another migrate type's lists */

        /*
         * Use MIGRATE_RESERVE rather than fail an allocation. goto
         * is used because __rmqueue_smallest is an inline function
         * and we want just one call site
         */
        if (!page) { /* fallback failed too: switch to MIGRATE_RESERVE */
            migratetype = MIGRATE_RESERVE;
            goto retry_reserve; /* one more pass through __rmqueue_smallest() */
        }
    }

    trace_mm_page_alloc_zone_locked(page, order, migratetype);
    return page;
}

/*
 * Go through the free lists for the given migratetype and remove
 * the smallest available page from the freelists
 *
 * Returns the first page of a block of 2^order pages, or NULL when no
 * order from @order up to MAX_ORDER-1 has a free block of @migratetype.
 */
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                        int migratetype)
{
    unsigned int current_order;
    struct free_area * area;
    struct page *page;

    /* Find a page of the appropriate size in the preferred list */
    for (current_order = order; current_order < MAX_ORDER; ++current_order) { /* from the requested order upwards */
        area = &(zone->free_area[current_order]);
        if (list_empty(&area->free_list[migratetype])) /* no free block of this type at this order */
            continue;
        /* Take the first block on the list. */
        page = list_entry(area->free_list[migratetype].next,
                            struct page, lru);
        list_del(&page->lru);
        /*
         * NOTE(review): rmv_page_order() is not shown here; per the
         * original annotation it clears the page's buddy marking — confirm.
         */
        rmv_page_order(page);
        area->nr_free--; /* decremented once per block: nr_free counts blocks of this order, not pages */
        expand(zone, page, order, current_order, area, migratetype); /* hand the unused halves back to the lower orders */
        return page;
    }

    return NULL;
}

这是buddy的一个重要函数：当从高阶分配得到内存块（比如8阶），而我们需要的是低阶的（比如6阶）时，就要调用下面的 expand() 函数：先把8阶内存块对半拆开，后一半挂到7阶的空闲链表上；再把剩下的前一半继续对半拆开，后一半挂到6阶的空闲链表上；这时剩下的前一半正好是我们需要的6阶内存块，循环结束，该内存块返回给调用者；

说下参数：

struct zone *zone：所有的操作都在该zone上完成

struct page *page：高阶上分配得到的页块

int low：我们需要的内存阶

int high：在该阶上分配到的内存

struct free_area *area：这是zone上的高阶空闲页数组项

int migratetype：迁移类型

/*
 * The order of subdivision here is critical for the IO subsystem.
 * Please do not alter this order without good reasons and regression
 * testing. Specifically, as large blocks of memory are subdivided,
 * the order in which smaller blocks are delivered depends on the order
 * they're subdivided in this function. This is the primary factor
 * influencing the order in which pages are delivered to the IO
 * subsystem according to empirical testing, and this is also justified
 * by considering the behavior of a buddy system containing a single
 * large block of memory acted on by a series of small allocations.
 * This behavior is a critical factor in sglist merging's success.
 *
 * -- nyc
 *
 * Split a block of order @high down to order @low.  On every step the
 * upper half (&page[size]) is put on the free list one order lower and
 * the lower half, starting at @page, is kept for the next step.
 *
 * @zone:        zone being operated on.
 * @page:        first page of the block that was taken at order @high.
 * @low:         order the caller actually needs.
 * @high:        order the block was found at.
 * @area:        free_area entry for order @high; walked downwards.
 * @migratetype: free list the split-off halves are added to.
 */
static inline void expand(struct zone *zone, struct page *page,
    int low, int high, struct free_area *area,
    int migratetype)
{
    unsigned long size = 1 << high;

    while (high > low) { /* never entered when the block was found at the requested order */
        area--; /* free_area entry of the next lower order */
        high--; /* next lower order */
        size >>= 1; /* half of the current block */
        VM_BUG_ON(bad_range(zone, &page[size]));

#ifdef CONFIG_DEBUG_PAGEALLOC
        if (high < debug_guardpage_minorder()) {
            /*
             * Mark as guard pages (or page), that will allow to
             * merge back to allocator when buddy will be freed.
             * Corresponding page table entries will not be touched,
             * pages will stay not present in virtual address space
             */
            INIT_LIST_HEAD(&page[size].lru);
            set_page_guard_flag(&page[size]);
            set_page_private(&page[size], high);
            /* Guard pages are not available for any usage */
            __mod_zone_freepage_state(zone, -(1 << high),
                          migratetype);
            continue;
        }
#endif
        list_add(&page[size].lru, &area->free_list[migratetype]); /* upper half goes on this order's list for the migrate type */
        area->nr_free++; /* one more free block at this order */
        /*
         * Record the order on the first page of the split-off half.
         * NOTE(review): set_page_order() is not shown here.  The original
         * annotation says it clears the buddy flag, but this half has just
         * been put on a free list, so it presumably marks the page as a
         * buddy page instead — confirm against its definition.
         */
        set_page_order(&page[size], high);
    }
}

跑到这个函数时，表明上面指定迁移类型从伙伴系统中分配内存失败，所以要用备用迁移列表；

/*
 * This array describes the order lists are fallen back to when
 * the free lists for the desirable migrate type are depleted
 *
 * Row = migrate type that was requested; entries = migrate types to try
 * instead, in order.  __rmqueue_fallback() stops walking a row when it
 * reaches MIGRATE_RESERVE, so that entry acts as the terminator.
 */
static int fallbacks[MIGRATE_TYPES][4] = {
    [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
    [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
#ifdef CONFIG_CMA
    [MIGRATE_MOVABLE]     = { MIGRATE_CMA,         MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
    [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
#else
    [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
#endif
    [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
#ifdef CONFIG_MEMORY_ISOLATION
    [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
#endif
};

根据上面的备用迁移类型来遍历

/*
 * Remove an element from the buddy allocator from the fallback list
 *
 * Scans the orders from MAX_ORDER-1 down to @order and, at each order,
 * walks fallbacks[start_migratetype][] up to its MIGRATE_RESERVE entry.
 * The first non-empty free list supplies the block, which is then split
 * down to @order by expand().  Returns the page, or NULL if every
 * candidate list was empty.
 */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
{
    struct free_area * area;
    int current_order;
    struct page *page;
    int migratetype, i;

    /* Find the largest possible block of pages in the other list */
    /*
     * Unlike __rmqueue_smallest(), the scan starts at the largest order;
     * the original annotation gives limiting fragmentation as the reason.
     */
    for (current_order = MAX_ORDER-1; current_order >= order;
                        --current_order) {
        for (i = 0;; i++) {
            migratetype = fallbacks[start_migratetype][i];

            /* MIGRATE_RESERVE handled later if necessary */
            if (migratetype == MIGRATE_RESERVE) /* end of the row: go on to the next lower order */
                break;

            area = &(zone->free_area[current_order]); /* free_area entry of the order being scanned */
            if (list_empty(&area->free_list[migratetype])) /* nothing free for this fallback type: try the next one */
                continue;

            /* Found a free block: pick it and account for it at this order. */
            page = list_entry(area->free_list[migratetype].next,
                    struct page, lru);
            area->nr_free--;

            /*
             * If breaking a large block of pages, move all free
             * pages to the preferred allocation list. If falling
             * back for a reclaimable kernel allocation, be more
             * aggressive about taking ownership of free pages
             *
             * On the other hand, never change migration
             * type of MIGRATE_CMA pageblocks nor move CMA
             * pages on different free lists. We don't
             * want unmovable pages to be allocated from
             * MIGRATE_CMA areas.
             */
            /*
             * The code below deals with the free pages left in the
             * pageblock.  NOTE(review): pageblock_order and
             * pageblock_nr_pages are defined elsewhere; per the original
             * annotation pageblock_order is the order the kernel treats
             * as a "large" block (configuration dependent, commonly
             * MAX_ORDER - 1) and pageblock_nr_pages is the page count of
             * such a block — confirm.
             */
            if (!is_migrate_cma(migratetype) && /* never for CMA blocks */
                (unlikely(current_order >= pageblock_order / 2) || /* large block */
                 start_migratetype == MIGRATE_RECLAIMABLE || /* reclaimable requests are more aggressive */
                 page_group_by_mobility_disabled)) {
                int pages;
                /* Move the pageblock's free pages onto start_migratetype's lists. */
                pages = move_freepages_block(zone, page,
                                start_migratetype);

                /* Claim the whole block if over half of it is free */
                /*
                 * This retags the pageblock as a whole, whereas
                 * move_freepages_block() above handled the individual
                 * free pages inside it.
                 */
                if (pages >= (1 << (pageblock_order-1)) ||
                        page_group_by_mobility_disabled)
                    set_pageblock_migratetype(page,
                                start_migratetype);

                migratetype = start_migratetype;
            }

            /* Remove the page from the freelists */
            list_del(&page->lru);
            /*
             * NOTE(review): rmv_page_order() is not shown here; per the
             * original annotation it clears the page's buddy marking — confirm.
             */
            rmv_page_order(page);

            /* Take ownership for orders >= pageblock_order */
            /* Retag every pageblock covered by the block being taken. */
            if (current_order >= pageblock_order &&
                !is_migrate_cma(migratetype))
                change_pageblock_range(page, current_order,
                            start_migratetype);

            /*
             * Split the block down to the requested order.  The unused
             * halves stay on the CMA lists for a CMA block and go to
             * start_migratetype's lists otherwise.
             */
            expand(zone, page, order, current_order, area,
                   is_migrate_cma(migratetype)
                 ? migratetype : start_migratetype);

            trace_mm_page_alloc_extfrag(page, order, current_order,
                start_migratetype, migratetype);

            return page;
        }
    }

    return NULL;
}

/*
 * Move the free pages of the pageblock_nr_pages-aligned range that
 * contains @page to the free lists of @migratetype.
 *
 * Returns the number of pages moved (the result of move_freepages()),
 * or 0 without moving anything when the end of the range lies outside
 * @zone.
 */
int move_freepages_block(struct zone *zone, struct page *page,
                int migratetype)
{
    unsigned long start_pfn, end_pfn;
    struct page *start_page, *end_page;

    start_pfn = page_to_pfn(page); /* page frame number of @page */
    start_pfn = start_pfn & ~(pageblock_nr_pages-1); /* round down to a pageblock_nr_pages boundary */
    start_page = pfn_to_page(start_pfn);
    /*
     * The range covers pageblock_nr_pages consecutive pages.  Per the
     * original annotation, retyping whole pageblocks at a time is meant
     * to limit fragmentation.
     */
    end_page = start_page + pageblock_nr_pages - 1;
    end_pfn = start_pfn + pageblock_nr_pages - 1;

    /* Do not cross zone boundaries */
    if (!zone_spans_pfn(zone, start_pfn)) /* aligned start is outside the zone: begin at @page itself */
        start_page = page;
    if (!zone_spans_pfn(zone, end_pfn)) /* end is outside the zone: move nothing */
        return 0;

    return move_freepages(zone, start_page, end_page, migratetype); /* do the actual move over [start_page, end_page] */
}

/*
 * Move the free pages in a range to the free lists of the requested type.
 * Note that start_page and end_pages are not aligned on a pageblock
 * boundary. If alignment is required, use move_freepages_block()
 *
 * The original annotator asked why this says "not aligned" when the
 * caller aligns to pageblock_nr_pages.  Reading the code: this function
 * performs no alignment of its own and takes the range exactly as given;
 * move_freepages_block() is the wrapper that aligns before calling here.
 *
 * Returns the number of pages moved.
 */
int move_freepages(struct zone *zone,
              struct page *start_page, struct page *end_page,
              int migratetype)
{
    struct page *page;
    unsigned long order;
    int pages_moved = 0;

#ifndef CONFIG_HOLES_IN_ZONE
    /*
     * page_zone is not safe to call in this context when
     * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
     * anyway as we check zone boundaries in move_freepages_block().
     * Remove at a later date when no bug reports exist related to
     * grouping pages by mobility
     */
    BUG_ON(page_zone(start_page) != page_zone(end_page));
#endif

    for (page = start_page; page <= end_page;) {
        /* Make sure we are not inadvertently changing nodes */
        VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));

        if (!pfn_valid_within(page_to_pfn(page))) {
            page++;
            continue;
        }

        if (!PageBuddy(page)) { /* not a free buddy block head: skip this page */
            page++;
            continue;
        }

        order = page_order(page); /* order of the free block starting at this page */
        /* Move the block onto the requested migrate type's free list for its order. */
        list_move(&page->lru,
              &zone->free_area[order].free_list[migratetype]);
        /*
         * Record the new migrate type on the page.  NOTE(review): the
         * original annotation says this is stored as
         * page->index = migratetype — confirm against the helper.
         */
        set_freepage_migratetype(page, migratetype);
        page += 1 << order; /* the whole 2^order block is handled in one step */
        pages_moved += 1 << order;
    }

    return pages_moved; /* total number of pages moved within the range */
}

/*
 * Set the migrate type of every pageblock covered by a block of order
 * @start_order that begins at @pageblock_page.  The only caller shown,
 * __rmqueue_fallback(), calls this only when the order is at least
 * pageblock_order.
 */
static void change_pageblock_range(struct page *pageblock_page,
                    int start_order, int migratetype)
{
    int nr_pageblocks = 1 << (start_order - pageblock_order); /* number of pageblock_order-sized blocks covered */

    while (nr_pageblocks--) { /* one iteration per pageblock */
        set_pageblock_migratetype(pageblock_page, migratetype); /* retag this pageblock */
        pageblock_page += pageblock_nr_pages; /* advance to the first page of the next pageblock */
    }
}

linux内存管理--实际分配函数 buffered_rmqueue

标签：

原文地址：http://blog.csdn.net/yuzhihui_no1/article/details/50793634

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行