Linux内核源代码情景分析-内存管理之用户页面的换入

时间：2015-03-01 18:33:25 阅读：216 评论：0 收藏：0 [点我收藏+]

标签：

在下面几种情况下会发生页面出错异常（也叫缺页中断）：

1、相应的页面目录项或者页面表项为空，也就是该线性地址与物理地址的映射关系尚未建立，或者已经撤销。

2、相应的物理页面不在内存中。本文讨论的就是这种情况。

3、指令中规定的访问方式与页面的权限不符，例如企图写一个“只读”的页面。

假设已经建立好了映射，但是页表项的最低位（P位）为0，表示页面不在内存中。此时整个页表项的结构如下图所示：offset表示页面在磁盘设备上的位置，也就是磁盘设备上的逻辑页面号；而type则指明该页面在哪一个磁盘设备中。

技术分享

图 1 页面交换项结构

这里假定CPU的运行已经到达了页面异常服务程序的主体do_page_fault()的入口处。

代码如下： arch/i386/mm/fault.c

/*
 * Page-fault exception handler (excerpt: the "...." lines mark code the
 * article left out, including the labels this excerpt jumps to).
 *
 * regs:       saved register state at the time of the fault; eflags (and
 *             cs/eip under TEST_VERIFY_AREA) are read below.
 * error_code: fault-cause bits; the low two bits are decoded in the switch
 *             under good_area.
 *
 * The faulting address is read from CR2. Addresses at or above TASK_SIZE go
 * to vmalloc_fault; otherwise the VMA for the address is looked up with
 * mm->mmap_sem held and the fault is passed on to handle_mm_fault().
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	__asm__("movl %%cr2,%0":"=r" (address));// store the address whose mapping failed in `address`

	tsk = current;// task_struct

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (address >= TASK_SIZE)
		goto vmalloc_fault;

	mm = tsk->mm;// mm_struct
	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down(&mm->mmap_sem);

	vma = find_vma(mm, address);// find the first region whose end address is above the given address
	if (!vma)// none found: no region ends above the address, i.e. the address lies above the stack (at or beyond 3 GB)
		goto bad_area;
	if (vma->vm_start <= address)// region starts at or below address: a mapping exists, go to good_area to examine why the access failed
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	....
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {// in this walkthrough: binary 110 & 011 = 2
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;// the walkthrough's scenario reaches this point
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up(&mm->mmap_sem);
	return;
        .......
}

内核的中断/异常响应机制还传过来两个参数。一个是pt_regs结构指针regs，它指向例外发生前夕CPU中各寄存器内容的一份副本。而error_code则进一步指明映射失败的具体原因。

error_code:

bit 0 == 0 means no page found, 1 means protection fault

bit 1 == 0 means read, 1 means write

bit 2 == 0 means kernel, 1 means user-mode

在本文的情景中，error_code为110（二进制）：bit 2为1表示用户态，bit 1为1表示写操作，bit 0为0表示页面不在内存中。

handle_mm_fault函数，代码如下：

/*
 * Resolve the page-table entry for `address` and hand the fault to
 * handle_pte_fault().
 *
 * Returns whatever handle_pte_fault() returns, or -1 when pmd_alloc() or
 * pte_alloc() yields a NULL pointer.
 */
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	int ret = -1;
	pgd_t *pgd;
	pmd_t *pmd;


	pgd = pgd_offset(mm, address);// pointer to the page directory entry for address
	pmd = pmd_alloc(pgd, address);// per the article this is only a pass-through here: still the page directory entry pointer


	if (pmd) {
		pte_t * pte = pte_alloc(pmd, address);// pointer to the page table entry
		if (pte)
			ret = handle_pte_fault(mm, vma, address, write_access, pte);
	}
	return ret;
}

handle_pte_fault函数，如下：

/*
 * Dispatch a fault on a single page-table entry.
 *
 * Not present: page_table_lock is dropped, then do_no_page() handles an
 * entry for which pte_none() is true and do_swap_page() handles any other
 * non-present entry.
 * Present: a write to a non-writable entry goes to do_wp_page(); otherwise
 * the entry is marked young (and dirty on write), re-established, and 1 is
 * returned.
 */
static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;// snapshot of the page table entry contents
	if (!pte_present(entry)) {// the page is not in memory
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		if (pte_none(entry))// taken only when the entry is empty; in this walkthrough it is not, so we fall through
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);// the walkthrough's scenario reaches this point
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}

do_swap_page函数，如下：

/*
 * Bring the page identified by swap entry `entry` back in and map it at
 * `page_table`.
 *
 * The swap cache is consulted first; on a miss, read-ahead is started and
 * the page is obtained through read_swap_cache() with the kernel lock held.
 * Returns -1 when no page could be obtained, 1 otherwise.
 */
static int do_swap_page(struct mm_struct * mm,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, swp_entry_t entry, int write_access)
{
	struct page *page = lookup_swap_cache(entry);// look the page up in the swap-cache hash table
	pte_t pte;

	if (!page) {
		lock_kernel();
		swapin_readahead(entry);// read ahead neighbouring pages
		/*
		 * Actually obtain the page: it may now be found in the hash
		 * table thanks to the read-ahead above, or a new page is
		 * allocated and its contents are read in from disk.
		 */
		page = read_swap_cache(entry);
		unlock_kernel();
		if (!page)
			return -1;

		flush_page_to_ram(page);
		flush_icache_page(vma, page);
	}

	mm->rss++;

	pte = mk_pte(page, vma->vm_page_prot);// build the page table entry value

	/*
	 * Freeze the "shared"ness of the page, ie page_count + swap_count.
	 * Must lock page before transferring our swap count to already
	 * obtained page count.
	 */
	lock_page(page);
	swap_free(entry);
	if (write_access && !is_page_shared(page))
		pte = pte_mkwrite(pte_mkdirty(pte));// write fault on an unshared page: mark the entry dirty and writable
	UnlockPage(page);

	set_pte(page_table, pte);// install the entry (attributes set above) so it points at the page
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	return 1;	/* Minor fault */
}

一、下面分别解释各个函数。首先解释swapin_readahead函数，如下：

/*
 * Start read-ahead for swap entries near `entry` (excerpt: the "......"
 * line marks code the article left out).
 *
 * valid_swaphandles() supplies the count and the starting offset; each
 * offset is read through read_swap_cache_async() with wait == 0, after
 * which the returned page reference and the swap reference are dropped.
 */
void swapin_readahead(swp_entry_t entry)
{
	int i, num;
	struct page *new_page;
	unsigned long offset;

	/*
	 * Get the number of handles we should do readahead io to. Also,
	 * grab temporary references on them, releasing them as io completes.
	 */
	num = valid_swaphandles(entry, &offset);
	for (i = 0; i < num; offset++, i++) {
		......
		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
		if (new_page != NULL)
			page_cache_release(new_page);// drop one reference on the page
		swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
	}
	return;
}

提前预读盘上相邻的页面。根据下面的描述：__get_free_page之后，page的使用计数为1；add_to_swap_cache使page的使用计数再加1；随后这里的page_cache_release又使使用计数变回1。直到有进程认领该页面，使用计数才变成2。

read_swap_cache_async函数，如下：

/*
 * Return the page for swap entry `entry`, reading it from swap if it is
 * not already in the swap cache. (This quotation is annotated for the
 * read-ahead case, where the page is not found in the cache.)
 *
 * Returns the cached page if lookup_swap_cache() finds one, the newly
 * allocated page after starting the read, or 0 when swap_duplicate()
 * fails or no page can be allocated. `wait` is passed on to rw_swap_page().
 */
struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
	struct page *found_page = 0, *new_page;
	unsigned long new_page_addr;
	
	/*
	 * Make sure the swap entry is still in use.
	 */
	if (!swap_duplicate(entry))	/* Account for the swap cache */
		goto out;
	/*
	 * Look for the page in the swap cache.
	 */
	found_page = lookup_swap_cache(entry);// walkthrough: assume it is not found
	if (found_page)
		goto out_free_swap;

	new_page_addr = __get_free_page(GFP_USER);// freshly allocated page; per the article its use count is 1
	if (!new_page_addr)
		goto out_free_swap;	/* Out of memory */
	new_page = virt_to_page(new_page_addr);// convert the address to the matching struct page pointer

	/*
	 * Check the swap cache again, in case we stalled above.
	 */
	found_page = lookup_swap_cache(entry);// walkthrough: assume it is not found
	if (found_page)
		goto out_free_page;
	/* 
	 * Add it to the swap cache and read its contents.
	 */
	lock_page(new_page);
	add_to_swap_cache(new_page, entry);// link the page into the swap-cache lists
	rw_swap_page(READ, new_page, wait);// read the data from disk into the new page (details left to the block-device chapter)
	return new_page;

out_free_page:
	page_cache_release(new_page);
out_free_swap:
	swap_free(entry);
out:
	return found_page;
}

add_to_swap_cache函数是重点，代码如下：

/*
 * Insert a locked page into the swap cache under swap entry `entry`.
 *
 * BUG()s if the page is not locked, already has the swap-cache flag set,
 * or already has a mapping. Clears PG_error and PG_arch_1, sets
 * PG_uptodate, then adds the page to swapper_space indexed by entry.val.
 */
void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	unsigned long flags;

#ifdef SWAP_CACHE_INFO
	swap_cache_add_total++;
#endif
	if (!PageLocked(page))
		BUG();
	if (PageTestandSetSwapCache(page))
		BUG();
	if (page->mapping)
		BUG();
	flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1));
	page->flags = flags | (1 << PG_uptodate);
	add_to_page_cache_locked(page, &swapper_space, entry.val);
}

add_to_page_cache_locked函数，代码如下：

/*
 * Add an already-locked page to the page cache of `mapping` at `index`.
 *
 * Takes a reference on the page, then, with pagecache_lock held, records
 * the index and links the page into the mapping's page list, the page
 * hash table and the LRU. BUG()s if the page is not locked.
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
{
	if (!PageLocked(page))
		BUG();

	page_cache_get(page);// take a reference; in the walkthrough the use count is now 2
	spin_lock(&pagecache_lock);
	page->index = index;// index holds the swap entry value
	add_page_to_inode_queue(mapping, page);// links page->list into mapping->clean_pages
	add_page_to_hash_queue(page, page_hash(mapping, index));// links page->next_hash / page->pprev_hash into the global hash table
	lru_cache_add(page);// links page->lru into the global active_list
	spin_unlock(&pagecache_lock);
}

add_page_to_inode_queue函数，代码如下：

/*
 * Attach `page` to `mapping`: bump the mapping's page count, link the page
 * into the mapping's clean_pages list and point the page back at the
 * mapping.
 */
static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
{
	struct list_head *head = &mapping->clean_pages;

	mapping->nrpages++;
	list_add(&page->list, head);// link page->list into mapping->clean_pages
	page->mapping = mapping;// in the swap-in walkthrough, mapping is &swapper_space
}

/*
 * The address_space that swap-cache pages belong to: it is the mapping
 * passed by add_to_swap_cache() and compared against in lookup_swap_cache().
 * Starts with three empty page lists and no pages; swap_aops is its
 * operations table.
 */
struct address_space swapper_space = {
	LIST_HEAD_INIT(swapper_space.clean_pages),
	LIST_HEAD_INIT(swapper_space.dirty_pages),
	LIST_HEAD_INIT(swapper_space.locked_pages),
	0,				/* nrpages	*/
	&swap_aops,
};

add_page_to_hash_queue函数，如下：

/*
 * Push `page` onto the front of the hash chain whose head slot is `p`.
 *
 * page->next_hash takes the old head and page->pprev_hash points back at
 * the slot; the old head's pprev_hash is redirected to page->next_hash.
 * PAGE_BUG()s if the page has buffers, then increments page_cache_size.
 */
static void add_page_to_hash_queue(struct page * page, struct page **p)
{
	struct page *next = *p;// current head of the chain (may be NULL)

	*p = page;
	page->next_hash = next;
	page->pprev_hash = p;
	if (next)
		next->pprev_hash = &page->next_hash;
	if (page->buffers)
		PAGE_BUG(page);
	atomic_inc(&page_cache_size);
}

lru_cache_add函数，如下：

/*
 * Put a locked page on the active list, with pagemap_lru_lock held.
 *
 * BUG()s if the page is not locked. A page whose age is zero is
 * deactivated again straight away.
 */
void lru_cache_add(struct page * page)
{
	spin_lock(&pagemap_lru_lock);
	if (!PageLocked(page))
		BUG();
	DEBUG_ADD_PAGE
	add_page_to_active_list(page);
	/* This should be relatively rare */
	if (!page->age)
		deactivate_page_nolock(page);
	spin_unlock(&pagemap_lru_lock);
}

add_page_to_active_list函数，如下：

/*
 * Mark `page` active, link it at the head of the global active_list and
 * count it in nr_active_pages.
 *
 * Each continued line must end in a backslash with nothing after it, so
 * the annotations are block comments placed before the backslash.
 */
#define add_page_to_active_list(page) { \
	DEBUG_ADD_PAGE \
	ZERO_PAGE_BUG \
	SetPageActive(page); \
	list_add(&(page)->lru, &active_list);	/* link page->lru into the global active_list */ \
	nr_active_pages++;			/* one more page on the active list */ \
}

二、下面解释read_swap_cache函数，如下：

/*
 * Synchronous variant of read_swap_cache_async(): passes wait == 1.
 * No trailing semicolon, so the macro can be used inside an expression.
 */
#define read_swap_cache(entry) read_swap_cache_async(entry, 1)

实际调用的还是read_swap_cache_async函数，只是这一次执行时，很可能在lookup_swap_cache函数中就找到了page（因为前面已经预读过）。

/*
 * Return the page for swap entry `entry`, reading it from swap if it is
 * not already in the swap cache. (Same function as quoted earlier in the
 * article, annotated here for the case where the page is found in the
 * cache.)
 *
 * Returns the cached page if lookup_swap_cache() finds one, the newly
 * allocated page after starting the read, or 0 when swap_duplicate()
 * fails or no page can be allocated. `wait` is passed on to rw_swap_page().
 */
struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
	struct page *found_page = 0, *new_page;
	unsigned long new_page_addr;
	
	/*
	 * Make sure the swap entry is still in use.
	 */
	if (!swap_duplicate(entry))	/* Account for the swap cache */
		goto out;
	/*
	 * Look for the page in the swap cache.
	 */
	found_page = lookup_swap_cache(entry);// walkthrough: the page is found in the hash table; now claimed by a process, its use count is 2
	if (found_page)
		goto out_free_swap;

	new_page_addr = __get_free_page(GFP_USER);
	if (!new_page_addr)
		goto out_free_swap;	/* Out of memory */
	new_page = virt_to_page(new_page_addr);

	/*
	 * Check the swap cache again, in case we stalled above.
	 */
	/*
	 * __get_free_page() may have found too few free pages and switched to
	 * another process; once we run again, search the hash table once more.
	 */
	found_page = lookup_swap_cache(entry);
	if (found_page)
		goto out_free_page;
	/* 
	 * Add it to the swap cache and read its contents.
	 */
	lock_page(new_page);
	add_to_swap_cache(new_page, entry);
	rw_swap_page(READ, new_page, wait);
	return new_page;

out_free_page:
	page_cache_release(new_page);
out_free_swap:
	swap_free(entry);
out:
	return found_page;
}

三、lookup_swap_cache函数，如下：

/*
 * Look up the swap-cache page for swap entry `entry`.
 *
 * Returns the page, unlocked but with the reference taken by
 * find_lock_page() still held, or 0 when no page is cached for the entry.
 * A page that lost its swap-cache flag between the lookup and the check is
 * released and the search is retried. A page whose mapping is not
 * swapper_space is reported, released, and 0 is returned.
 */
struct page * lookup_swap_cache(swp_entry_t entry)
{
	struct page *found;

#ifdef SWAP_CACHE_INFO
	swap_cache_find_total++;
#endif
	while (1) {
		/*
		 * Right now the pagecache is 32-bit only.  But it's a 32 bit index. =)
		 */
repeat:
		found = find_lock_page(&swapper_space, entry.val);// entry.val is the swap entry value used as the index
		if (!found)
			return 0;
		/*
		 * Though the "found" page was in the swap cache an instant
		 * earlier, it might have been removed by refill_inactive etc.
		 * Re search ... Since find_lock_page grabs a reference on
		 * the page, it can not be reused for anything else, namely
		 * it can not be associated with another swaphandle, so it
		 * is enough to check whether the page is still in the scache.
		 */
		if (!PageSwapCache(found)) {
			UnlockPage(found);
			page_cache_release(found);
			goto repeat;
		}
		if (found->mapping != &swapper_space)
			goto out_bad;
#ifdef SWAP_CACHE_INFO
		swap_cache_find_success++;
#endif
		UnlockPage(found);
		return found;
	}

	/* Error path: the page is locked and referenced here, so undo both. */
out_bad:
	printk (KERN_ERR "VM: Found a non-swapper swap page!\n");
	UnlockPage(found);
	page_cache_release(found);
	return 0;
}

find_lock_page函数，如下：

/*
 * Look up (and lock) the page at `index` in `mapping`; computes the hash
 * slot with page_hash() and forwards to __find_lock_page().
 */
#define find_lock_page(mapping, index) 		__find_lock_page(mapping, index, page_hash(mapping, index))

__find_lock_page函数，如下：

/*
 * Find the page for (mapping, offset) on the hash chain at `hash`, take a
 * reference on it and lock it.
 *
 * Returns the locked, referenced page, or NULL when no such page is on
 * the chain. If the page has no mapping once the page lock is obtained,
 * it is unlocked and released and the lookup starts over.
 */
struct page * __find_lock_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);// *hash is the head of one chain of the hash table
	if (page) {
		page_cache_get(page);// take a reference
		spin_unlock(&pagecache_lock);

		lock_page(page);

		/* Is the page still hashed? Ok, good.. */
		if (page->mapping)
			return page;

		/* Nope: we raced. Release and try again.. */
		UnlockPage(page);
		page_cache_release(page);
		goto repeat;
	}
	spin_unlock(&pagecache_lock);
	return NULL;
}

__find_page_nolock函数，如下：

/*
 * Walk a hash chain starting at `page` and return the entry whose mapping
 * and index match (mapping, offset), or a null pointer when the chain
 * runs out.
 *
 * The initial `goto inside` jumps into the loop body so that the chain
 * head itself is tested before the walk advances to next_hash. On a hit
 * the page is aged up, and kswapd is woken when
 * inactive_shortage() > inactive_target / 2 and free_shortage() is nonzero.
 */
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
	goto inside;

	for (;;) {
		page = page->next_hash;// advance along the hash chain
inside:
		if (!page)
			goto not_found;
		if (page->mapping != mapping)
			continue;
		if (page->index == offset)
			break;
	}
	/*
	 * Touching the page may move it to the active list.
	 * If we end up with too few inactive pages, we wake
	 * up kswapd.
	 */
	age_page_up(page);
	if (inactive_shortage() > inactive_target / 2 && free_shortage())
			wakeup_kswapd(0);
not_found:
	return page;
}

根据页面交换项，在hash表中寻找page结构。

      swapin_readahead(entry);//预读页面
      page = read_swap_cache(entry);//真正得到一个页面，这个页面可能从hash表中寻找到，因为上面预读了。或者自己申请页面，并且从盘上将其内容读进来。

对于read_swap_cache，无论页面是从hash表中找到的，还是自己申请页面并加入到对应的链表的，最后页面的使用计数都是2。

swapin_readahead预读了很多页面，如果没有被进程认领，那么使用计数为1。

Linux内核源代码情景分析-内存管理之用户页面的换入

标签：

原文地址：http://blog.csdn.net/jltxgcy/article/details/44002859

踩

(0)

评论一句话评论（0）

分享档案

更多>

2021年07月29日 (22)
2021年07月28日 (40)
2021年07月27日 (32)
2021年07月26日 (79)
2021年07月23日 (29)
2021年07月22日 (30)
2021年07月21日 (42)
2021年07月20日 (16)
2021年07月19日 (90)
2021年07月16日 (35)

周排行