标签:
一、库函数shmget()--共享内存区的创建与寻找
asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
{
struct shmid_kernel *shp;
int err, id = 0;
down(&shm_ids.sem);
if (key == IPC_PRIVATE) {
err = newseg(key, shmflg, size);//IPC_PRIVATE: always allocate a fresh segment for this process; the combined (seq*SEQ_MULTIPLIER+slot) identifier is returned
} else if ((id = ipc_findkey(&shm_ids, key)) == -1) {//look the key up in shm_ids; -1 means no segment with this key exists, otherwise id is the slot index
if (!(shmflg & IPC_CREAT))//not found and creation was not requested -> error
err = -ENOENT;
else
err = newseg(key, shmflg, size);//not found but IPC_CREAT set -> create the segment
} else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {//found, but exclusive creation was demanded -> error
err = -EEXIST;
} else {//found, plain lookup requested -- the normal path
shp = shm_lock(id);//fetch the segment by slot index (takes the ipc array spinlock)
if(shp==NULL)
BUG();
if (shp->shm_segsz < size)
err = -EINVAL;
else if (ipcperms(&shp->shm_perm, shmflg))
err = -EACCES;
else
err = shm_buildid(id, shp->shm_perm.seq);//again: the combined identifier is what gets returned
shm_unlock(id);
}
up(&shm_ids.sem);
return err;//whether created or found, the combined identifier is the result
} 键值IPC_PRIVATE,即0,是特殊的,它表示要分配一个共享内存区供本进程专用。其他键值则表示要创建或寻找的是"共享"内存区。而标志位IPC_CREAT则表示目的在于创建。1、当键值是IPC_PRIVATE时,会调用newseg,分配一个共享内存区供本进程专用,代码如下:
static int newseg (key_t key, int shmflg, size_t size)
{
int error;
struct shmid_kernel *shp;
int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
struct file * file;
char name[13];
int id;
if (size < SHMMIN || size > shm_ctlmax)
return -EINVAL;
if (shm_tot + numpages >= shm_ctlall)
return -ENOSPC;
shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_USER);//allocate the shmid_kernel descriptor
if (!shp)
return -ENOMEM;
sprintf (name, "SYSV%08x", key);
file = shmem_file_setup(name, size);//create the backing file in the special "shm" filesystem
error = PTR_ERR(file);
if (IS_ERR(file))
goto no_file;
error = -ENOSPC;
id = shm_addid(shp);//link the shmid_kernel into shm_ids (on success the ipc spinlock is still held; released by shm_unlock below)
if(id == -1)
goto no_id;
shp->shm_perm.key = key;
shp->shm_flags = (shmflg & S_IRWXUGO);
shp->shm_cprid = current->pid;
shp->shm_lprid = 0;
shp->shm_atim = shp->shm_dtim = 0;
shp->shm_ctim = CURRENT_TIME;
shp->shm_segsz = size;
shp->shm_nattch = 0;
shp->id = shm_buildid(id,shp->shm_perm.seq);//convert the slot index into the combined identifier
shp->shm_file = file;//remember the freshly created file
file->f_dentry->d_inode->i_ino = shp->id;
file->f_op = &shm_file_operations;//f_op is overridden again here: shm_file_operations, not the shmem_file_operations set by shmem_file_setup
shm_tot += numpages;
shm_unlock (id);
return shp->id;//return the combined identifier
no_id:
fput(file);
no_file:
kfree(shp);
return error;
} shmid_kernel结构如下:struct shmid_kernel /* private to the kernel */
{
struct kern_ipc_perm shm_perm; /* ownership/permission record; this is what shm_addid links into shm_ids */
struct file * shm_file; /* backing file in the "shm" filesystem (set in newseg) */
int id; /* combined identifier: seq*SEQ_MULTIPLIER + slot index */
unsigned long shm_nattch; /* number of current attaches (bumped in sys_shmat) */
unsigned long shm_segsz; /* requested segment size in bytes */
time_t shm_atim; /* last attach time (0 after newseg) */
time_t shm_dtim; /* last detach time (0 after newseg) */
time_t shm_ctim; /* creation/change time (CURRENT_TIME in newseg) */
pid_t shm_cprid; /* creator pid (current->pid in newseg) */
pid_t shm_lprid; /* initialised to 0 in newseg; updated elsewhere -- not visible here */
}; shmem_file_setup,在特殊文件系统"shm"中建立映射文件,代码如下:struct file *shmem_file_setup(char * name, loff_t size)
{
int error;
struct file *file;
struct inode * inode;
struct dentry *dentry, *root;
struct qstr this;
int vm_enough_memory(long pages);
error = -ENOMEM;
if (!vm_enough_memory((size) >> PAGE_SHIFT))
goto out;
this.name = name;
this.len = strlen(name);
this.hash = 0; /* will go */
root = shmem_fs_type.kern_mnt->mnt_root;//dentry of the root of the special "shm" filesystem
dentry = d_alloc(root, &this);//allocate a dentry for the new node under that root
if (!dentry)
goto out;
error = -ENFILE;
file = get_empty_filp();
if (!file)
goto put_dentry;
error = -ENOSPC;
inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);//allocate the inode for the node
if (!inode)
goto close_file;
d_instantiate(dentry, inode);//bind the new dentry and inode together
dentry->d_inode->i_size = size;
file->f_vfsmnt = mntget(shmem_fs_type.kern_mnt);
file->f_dentry = dentry;//point the file at the freshly made dentry
file->f_op = &shmem_file_operations;//set as shown further below
file->f_mode = FMODE_WRITE | FMODE_READ;
inode->i_nlink = 0; /* It is unlinked */
return(file);
close_file:
put_filp(file);
put_dentry:
dput (dentry);
out:
return ERR_PTR(error);
} 其中shmem_fs_type.kern_mnt->mnt_root是在init_shmem_fs中建立的。static DECLARE_FSTYPE(shmem_fs_type, "shm", shmem_read_super, FS_LITTER);
static int __init init_shmem_fs(void)
{
int error;
struct vfsmount * res;
if ((error = register_filesystem(&shmem_fs_type))) {
printk (KERN_ERR "Could not register shmem fs\n");
return error;
}
res = kern_mount(&shmem_fs_type);/* kernel-internal mount: this is where shmem_fs_type.kern_mnt (and thus mnt_root) comes from */
if (IS_ERR (res)) {
printk (KERN_ERR "could not kern_mount shmem fs\n");
unregister_filesystem(&shmem_fs_type);
return PTR_ERR(res);
}
devfs_mk_dir (NULL, "shm", NULL);
return 0;
} shmem_get_inode,分配shm节点的inode结构,代码如下:struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
{
struct inode * inode;
spin_lock (&sb->u.shmem_sb.stat_lock);
if (!sb->u.shmem_sb.free_inodes) {
spin_unlock (&sb->u.shmem_sb.stat_lock);
return NULL;
}
sb->u.shmem_sb.free_inodes--;
spin_unlock (&sb->u.shmem_sb.stat_lock);
inode = new_inode(sb);
if (inode) {
inode->i_mode = mode;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
inode->i_blksize = PAGE_CACHE_SIZE;
inode->i_blocks = 0;
inode->i_rdev = to_kdev_t(dev);
inode->i_mapping->a_ops = &shmem_aops;//shmem_aops is shown further below (writepage only)
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
spin_lock_init (&inode->u.shmem_i.lock);
switch (mode & S_IFMT) {
default:
init_special_inode(inode, mode, dev);
break;
case S_IFREG://regular file: i_op and i_fop are set as shown below
inode->i_op = &shmem_inode_operations;
inode->i_fop = &shmem_file_operations;
break;
case S_IFDIR:
inode->i_op = &shmem_dir_inode_operations;
inode->i_fop = &shmem_dir_operations;
break;
case S_IFLNK:
inode->i_op = &page_symlink_inode_operations;
break;
}
spin_lock (&shmem_ilock);
list_add (&inode->u.shmem_i.list, &shmem_inodes);
spin_unlock (&shmem_ilock);
}
return inode;
} inode->i_op = &shmem_inode_operations,代码如下:static struct inode_operations shmem_inode_operations = {
truncate: shmem_truncate, /* the only inode operation supplied; the rest use VFS defaults */
}; inode->i_fop = &shmem_file_operations,代码如下:static struct file_operations shmem_file_operations = {
mmap: shmem_mmap /* the only file operation supplied */
}; inode->i_mapping->a_ops = &shmem_aops,代码如下:
static struct address_space_operations shmem_aops = {
writepage: shmem_writepage /* used later by page_launder when pages are pushed to swap */
}; 返回到shmem_file_setup,file->f_op = &shmem_file_operations,如下:static struct file_operations shmem_file_operations = {
mmap: shmem_mmap /* same single-entry table as above */
}; 返回到newseg,shm_addid,将shmid_kernel结构链入shm_ids,代码如下:
static inline int shm_addid(struct shmid_kernel *shp)
{
return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni+1);//shp->shm_perm is the embedded kern_ipc_perm
}
int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size)
{
int id;
size = grow_ary(ids,size);
for (id = 0; id < size; id++) {
if(ids->entries[id].p == NULL)
goto found;
}
return -1;
found:
ids->in_use++;
if (id > ids->max_id)
ids->max_id = id;
new->cuid = new->uid = current->euid;
new->gid = new->cgid = current->egid;
new->seq = ids->seq++;
if(ids->seq > ids->seq_max)
ids->seq = 0;
spin_lock(&ids->ary);
ids->entries[id].p = new;//publish shp->shm_perm in the global shm_ids table
return id;//return the slot index; NOTE: the ids->ary spinlock is still held here -- the caller releases it (shm_unlock in newseg)
}
int size; /* capacity of entries[] */
int in_use; /* number of occupied slots */
int max_id; /* highest slot index handed out so far */
unsigned short seq; /* next sequence number, folded into the combined identifier */
unsigned short seq_max;
struct semaphore sem; /* serialises shmget/shmat bookkeeping */
spinlock_t ary; /* protects entries[] */
struct ipc_id* entries;
};
static struct ipc_ids shm_ids;
struct ipc_id {
struct kern_ipc_perm* p; /* points at the kern_ipc_perm embedded in a shmid_kernel */
}; 继续执行,shm_buildid,将这个标识号转换成一个一体化的标示号,代码如下:#define shm_buildid(id, seq) ipc_buildid(&shm_ids, id, seq)
extern inline int ipc_buildid(struct ipc_ids* ids, int id, int seq)
{
/* Fold the slot index and its sequence number into the single
 * user-visible identifier; the inverse is id % SEQ_MULTIPLIER. */
return id + seq * SEQ_MULTIPLIER;
}
int ipc_findkey(struct ipc_ids* ids, key_t key)
{
int id;
struct kern_ipc_perm* p;
for (id = 0; id <= ids->max_id; id++) {
p = ids->entries[id].p;
if(p==NULL)
continue;
if (key == p->key)
return id;//returns the slot index, NOT the combined identifier
}
return -1;
} 如果找到了,也不要求创建,就是正常情况下了,执行shm_lock,通过标识号id获取共享内存区,如下:#define shm_lock(id) ((struct shmid_kernel*)ipc_lock(&shm_ids,id))
extern inline struct kern_ipc_perm* ipc_lock(struct ipc_ids* ids, int id)
{
struct kern_ipc_perm* out;
int lid = id % SEQ_MULTIPLIER;//works for both a raw slot index and a combined identifier
if(lid >= ids->size)//entries[] holds ids->size slots, so lid == ids->size is already out of bounds; the article's transcription had '>', an off-by-one (the 2.4 kernel source uses '>=')
return NULL;
spin_lock(&ids->ary);
out = ids->entries[lid].p;
if(out==NULL)//empty slot: release the lock; on success the spinlock stays held until shm_unlock()
spin_unlock(&ids->ary);
return out;
}二、库函数shmat()--建立共享内存区的映射
通过shmget()以给定键值创建了一个共享内存区,或者取得了已创建共享内存区的一体化的标示号以后,还要通过shmat()将这个内存区映射到本进程的虚拟空间,sys_shmat代码如下:
asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)//shmaddr is the address the caller asks to map at, i.e. the start of the region in this process's user space
{
struct shmid_kernel *shp;
unsigned long addr;
struct file * file;
int err;
unsigned long flags;
unsigned long prot;
unsigned long o_flags;
int acc_mode;
void *user_addr;
if (shmid < 0)
return -EINVAL;
if ((addr = (ulong)shmaddr)) {
if (addr & (SHMLBA-1)) {
if (shmflg & SHM_RND)
addr &= ~(SHMLBA-1); /* round down */
else
return -EINVAL;
}
flags = MAP_SHARED | MAP_FIXED;
} else
flags = MAP_SHARED;
if (shmflg & SHM_RDONLY) {
prot = PROT_READ;
o_flags = O_RDONLY;
acc_mode = S_IRUGO;
} else {
prot = PROT_READ | PROT_WRITE;
o_flags = O_RDWR;
acc_mode = S_IRUGO | S_IWUGO;
}
/*
 * We cannot rely on the fs check since SYSV IPC does have an
 * additional creator id...
 */
shp = shm_lock(shmid);//look the segment up by its combined identifier
if(shp == NULL)
return -EINVAL;
if (ipcperms(&shp->shm_perm, acc_mode)) {
shm_unlock(shmid);
return -EACCES;
}
file = shp->shm_file;//grab the backing file created by newseg
shp->shm_nattch++;//temporary attach count keeps the segment alive across the unlocked do_mmap below
shm_unlock(shmid);
down(&current->mm->mmap_sem);/* article text had the HTML-entity garble "¤t" here; it is &current */
user_addr = (void *) do_mmap (file, addr, file->f_dentry->d_inode->i_size, prot, flags, 0);//map the backing file into this process's address space
up(&current->mm->mmap_sem);
down (&shm_ids.sem);
if(!(shp = shm_lock(shmid)))
BUG();
shp->shm_nattch--;//drop the temporary reference; shm_open (via mmap) holds the real one
if(shp->shm_nattch == 0 &&
shp->shm_flags & SHM_DEST)
shm_destroy (shp);
shm_unlock(shmid);
up (&shm_ids.sem);
*raddr = (unsigned long) user_addr;
err = 0;
if (IS_ERR(user_addr))
err = PTR_ERR(user_addr);
return err;
} do_mmap,建立起文件与虚拟空间的映射。代码如下:static inline unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
unsigned long flag, unsigned long offset)
{
unsigned long ret = -EINVAL;
if ((offset + PAGE_ALIGN(len)) < offset)/* reject arithmetic overflow of offset+len */
goto out;
if (!(offset & ~PAGE_MASK))/* offset must be page-aligned; it is passed on as a page index */
ret = do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
out:
return ret;
}unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flags, unsigned long pgoff)
{
struct mm_struct * mm = current->mm;
struct vm_area_struct * vma;
int correct_wcount = 0;
int error;
......
if (flags & MAP_FIXED) {
if (addr & ~PAGE_MASK)
return -EINVAL;
} else {
addr = get_unmapped_area(addr, len);//if addr is 0 the kernel picks a free range of virtual space itself
if (!addr)
return -ENOMEM;
}
/* Determine the object being mapped and call the appropriate
 * specific mapper. the address has already been validated, but
 * not unmapped, but the maps are removed from the list.
 */
vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);//allocate the vm_area_struct describing the new region
if (!vma)
return -ENOMEM;
vma->vm_mm = mm;
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags(prot,flags) | mm->def_flags;
if (file) {
VM_ClearReadHint(vma);
vma->vm_raend = 0;
if (file->f_mode & FMODE_READ)
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_SHARED) {
vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
/* This looks strange, but when we don't have the file open
 * for writing, we can demote the shared mapping to a simpler
 * private mapping. That also takes care of a security hole
 * with ptrace() writing to a shared mapping without write
 * permissions.
 *
 * We leave the VM_MAYSHARE bit on, just to get correct output
 * from /proc/xxx/maps..
 */
if (!(file->f_mode & FMODE_WRITE))
vma->vm_flags &= ~(VM_MAYWRITE | VM_SHARED);
}
} else {
vma->vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_SHARED)
vma->vm_flags |= VM_SHARED | VM_MAYSHARE;
}
vma->vm_page_prot = protection_map[vma->vm_flags & 0x0f];
vma->vm_ops = NULL;
vma->vm_pgoff = pgoff;
vma->vm_file = NULL;
vma->vm_private_data = NULL;
/* Clear old maps */
error = -ENOMEM;
if (do_munmap(mm, addr, len))
goto free_vma;
/* Check against address space limit. */
if ((mm->total_vm << PAGE_SHIFT) + len
> current->rlim[RLIMIT_AS].rlim_cur)
goto free_vma;
/* Private writable mapping? Check memory availability.. */
if ((vma->vm_flags & (VM_SHARED | VM_WRITE)) == VM_WRITE &&
!(flags & MAP_NORESERVE) &&
!vm_enough_memory(len >> PAGE_SHIFT))
goto free_vma;
if (file) {
if (vma->vm_flags & VM_DENYWRITE) {
error = deny_write_access(file);
if (error)
goto free_vma;
correct_wcount = 1;
}
vma->vm_file = file;//the key step: tie the vma to the backing file
get_file(file);
error = file->f_op->mmap(file, vma);//for shm segments newseg replaced f_op with shm_file_operations, so this calls shm_mmap
if (error)
goto unmap_and_free_vma;
} else if (flags & MAP_SHARED) {
error = shmem_zero_setup(vma);
if (error)
goto free_vma;
}
/* Can addr have changed??
 *
 * Answer: Yes, several device drivers can do it in their
 * f_op->mmap method. -DaveM
 */
flags = vma->vm_flags;
addr = vma->vm_start;
insert_vm_struct(mm, vma);
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
mm->total_vm += len >> PAGE_SHIFT;
if (flags & VM_LOCKED) {
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
return addr;//start virtual address of the new mapping
unmap_and_free_vma:
if (correct_wcount)
atomic_inc(&file->f_dentry->d_inode->i_writecount);
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
flush_cache_range(mm, vma->vm_start, vma->vm_end);
zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start);
flush_tlb_range(mm, vma->vm_start, vma->vm_end);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
return error;
} file->f_op->mmap(file, vma),最后设置成shmem_mmap,代码如下:static int shm_mmap(struct file * file, struct vm_area_struct * vma)
{
UPDATE_ATIME(file->f_dentry->d_inode);
vma->vm_ops = &shm_vm_ops;/* installs shmem_nopage as the fault handler -- no page tables are built yet */
shm_inc(file->f_dentry->d_inode->i_ino);/* NOTE(review): i_ino was set to the combined id in newseg; shm_inc presumably bumps shm_nattch for that segment -- confirm against shm.c */
return 0;
}static struct vm_operations_struct shm_vm_ops = {
open: shm_open, /* callback for a new vm-area open */
close: shm_close, /* callback for when the vm-area is released */
nopage: shmem_nopage,
};在sys_shmat()中实际上并没有建立页面的映射,而是把它推迟到了实际需要的时候。
三、所以,在将一块共享内存区纳入一个进程的存储空间以后,当其中的任何一个页面首次受到访问时就会因为“缺页”而产生一次页面异常。从do_page_fault()开始,顺着handle_mm_fault()、handle_pte_fault(),一直到do_no_page。在do_no_page()中,如果产生异常的地址所属区间的指针vm_ops指向一个vm_operations_struct数据结构,并且该结构中的函数指针nopage非零,就会调用这个函数来建立所在页面的映射表项。
static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma,
unsigned long address, int write_access, pte_t *page_table)
{
struct page * new_page;
pte_t entry;
if (!vma->vm_ops || !vma->vm_ops->nopage)
return do_anonymous_page(mm, vma, page_table, write_access, address);
/*
 * The third argument is "no_share", which tells the low-level code
 * to copy, not share the page even if sharing is possible. It's
 * essentially an early COW detection.
 */
new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, (vma->vm_flags & VM_SHARED)?0:write_access);//for shared memory this calls shmem_nopage
if (new_page == NULL) /* no page was available -- SIGBUS */
return 0;
if (new_page == NOPAGE_OOM)
return -1;
++mm->rss;
/*
 * This silly early PAGE_DIRTY setting removes a race
 * due to the bad i386 page protection. But it's valid
 * for other architectures too.
 *
 * Note that if write_access is true, we either now have
 * an exclusive copy of the page, or this is a shared mapping,
 * so we can make it writable and dirty to avoid having to
 * handle that later.
 */
flush_page_to_ram(new_page);
flush_icache_page(vma, new_page);
entry = mk_pte(new_page, vma->vm_page_prot);
if (write_access) {
entry = pte_mkwrite(pte_mkdirty(entry));
} else if (page_count(new_page) > 1 &&
!(vma->vm_flags & VM_SHARED))
entry = pte_wrprotect(entry);
set_pte(page_table, entry);//point the page-table entry at the new page: the mapping now exists
/* no need to invalidate: a not-present page shouldn't be cached */
update_mmu_cache(vma, address, entry);
return 2; /* Major fault */
} vma->vm_ops->nopage,对于共享内存来说,指向了shmem_page,代码如下:struct page * shmem_nopage(struct vm_area_struct * vma, unsigned long address, int no_share)
{
unsigned long size;
struct page * page;
unsigned int idx;
swp_entry_t *entry;
struct inode * inode = vma->vm_file->f_dentry->d_inode;
struct address_space * mapping = inode->i_mapping;
struct shmem_inode_info *info;
idx = (address - vma->vm_start) >> PAGE_SHIFT;
idx += vma->vm_pgoff;
down (&inode->i_sem);
size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;//file size in pages
page = NOPAGE_SIGBUS;
if ((idx >= size) && (vma->vm_mm == current->mm))
goto out;
/* retry, we may have slept */
page = __find_lock_page(mapping, idx, page_hash (mapping, idx));//a second faulting process finds the page already in the cache here
if (page)
goto cached_page;
info = &inode->u.shmem_i;
entry = shmem_swp_entry (info, idx);
if (!entry)
goto oom;
if (entry->val) {//non-zero only if the page was previously pushed to swap; zero on the first fault
unsigned long flags;
/* Look it up and read it in.. */
page = lookup_swap_cache(*entry);
if (!page) {
lock_kernel();
swapin_readahead(*entry);
page = read_swap_cache(*entry);
unlock_kernel();
if (!page)
goto oom;
}
/* We have to do this with page locked to prevent races */
spin_lock (&info->lock);
swap_free(*entry);
lock_page(page);
delete_from_swap_cache_nolock(page);
*entry = (swp_entry_t) {0};
flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1));
page->flags = flags | (1 << PG_dirty);
add_to_page_cache_locked(page, mapping, idx);
info->swapped--;
spin_unlock (&info->lock);
} else {//first-fault path: the page has never existed, allocate it now
spin_lock (&inode->i_sb->u.shmem_sb.stat_lock);
if (inode->i_sb->u.shmem_sb.free_blocks == 0)
goto no_space;
inode->i_sb->u.shmem_sb.free_blocks--;
spin_unlock (&inode->i_sb->u.shmem_sb.stat_lock);
/* Ok, get a new page */
page = page_cache_alloc();//allocate a fresh page frame
if (!page)
goto oom;
clear_user_highpage(page, address);
inode->i_blocks++;
add_to_page_cache (page, mapping, idx);//key call: mapping here is inode->i_mapping, NOT the swap cache &swapper_space
}
/* We have the page */
SetPageUptodate (page);
cached_page:
UnlockPage (page);
up(&inode->i_sem);
if (no_share) {
struct page *new_page = page_cache_alloc();
if (new_page) {
copy_user_highpage(new_page, page, address);
flush_page_to_ram(new_page);
} else
new_page = NOPAGE_OOM;
page_cache_release(page);
return new_page;
}
flush_page_to_ram (page);
return(page);
no_space:
spin_unlock (&inode->i_sb->u.shmem_sb.stat_lock);
oom:
page = NOPAGE_OOM;
out:
up(&inode->i_sem);
return page;
} add_to_page_cache,将page加入到相关队列中去。相关代码请参考Linux内核源代码情景分析-内存管理之用户页面的换入,只不过此时的mapping是inode->mapping,而不是交换分区&swapper_space。 page->list链入mapping->clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的active_list;
page->mapping来自于inode->mapping。也是在这里赋值的。
返回到do_no_page,把页表项指向新申请的page。这样就建立了映射。
假设两个进程一个是申请sys_shmget的共享内存区,一个是查找sys_shmget刚刚申请的共享内存区;都通过sys_shmat,将这个内存区映射到本进程的虚拟空间。
第一个进程如上面的步骤,建立了映射后,往共享内存区写数据。第二个进程会调用page = __find_lock_page(mapping, idx, page_hash (mapping, idx));找到刚刚分配的内存,并建立映射。这样第二个进程就能读取刚刚写入的数据。
四、当内存紧张时,共享内存区也会被换入到交换分区,参考Linux内核源代码情景分析-内存管理之用户页面的定期换出。
kswapd内核线程:
1、refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age小于0。
2、page_launder,把不活跃脏的页面变成不活跃干净的页面。
我们这里主要分析page_launder,算法如下:
if (PageDirty(page)) {
int (*writepage)(struct page *) = page->mapping->a_ops->writepage;//还记得我们设置过shmem_writepage
int result;
if (!writepage)
goto page_active;
/* First time through? Move it to the back of the list */
if (!launder_loop) {
list_del(page_lru);
list_add(page_lru, &inactive_dirty_list);
UnlockPage(page);
continue;
}
/* OK, do a physical asynchronous write to swap. */
ClearPageDirty(page);
page_cache_get(page);
spin_unlock(&pagemap_lru_lock);
result = writepage(page);//shmem_writepage
page_cache_release(page);
/* And re-start the thing.. */
spin_lock(&pagemap_lru_lock);
if (result != 1)
continue;
/* writepage refused to do anything */
set_page_dirty(page);
goto page_active;
} inode->i_mapping->a_ops = &shmem_aops,代码如下:
static struct address_space_operations shmem_aops = {
writepage: shmem_writepage
}; writepage(page),也就是shmem_writepage(page),代码如下:static int shmem_writepage(struct page * page)
{
int error;
struct shmem_inode_info *info;
swp_entry_t *entry, swap;
info = &page->mapping->host->u.shmem_i;
if (info->locked)
return 1;
swap = __get_swap_page(2);//allocate a page slot on the swap device
if (!swap.val)
return 1;
spin_lock(&info->lock);
entry = shmem_swp_entry (info, page->index);//locate the swp_entry_t slot for this page index in the file's table; it will record the page's location on the swap device and is still empty here
if (!entry) /* this had been allocted on page allocation */
BUG();
error = -EAGAIN;
if (entry->val) {
__swap_free(swap, 2);
goto out;
}
*entry = swap;//record where the page now lives on the swap device
error = 0;
/* Remove the from the page cache */
lru_cache_del(page);
remove_inode_page(page);
/* Add it to the swap cache */
add_to_swap_cache(page, swap);
page_cache_release(page);
set_page_dirty(page);
info->swapped++;
out:
spin_unlock(&info->lock);
UnlockPage(page);
return error;
}shmem_swp_entry,根据物理页面号,通过这个函数在文件的swp_entry_t表中找到相应的表项,此表项表示一个页面在交换设备上的页面号。
static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index)
{
if (index < SHMEM_NR_DIRECT)/* small files: the entry lives in the inline i_direct[] array */
return info->i_direct+index;
index -= SHMEM_NR_DIRECT;
if (index >= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)/* beyond what one doubly-indirect level can address */
return NULL;
if (!info->i_indirect) {/* allocate the top-level indirect page on demand */
info->i_indirect = (swp_entry_t **) get_zeroed_page(GFP_USER);
if (!info->i_indirect)
return NULL;
}
if(!(info->i_indirect[index/ENTRIES_PER_PAGE])) {/* allocate the second-level page on demand */
info->i_indirect[index/ENTRIES_PER_PAGE] = (swp_entry_t *) get_zeroed_page(GFP_USER);
if (!info->i_indirect[index/ENTRIES_PER_PAGE])
return NULL;
}
return info->i_indirect[index/ENTRIES_PER_PAGE]+index%ENTRIES_PER_PAGE;
}struct shmem_inode_info {
spinlock_t lock;
swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */
swp_entry_t **i_indirect; /* doubly indirect blocks */
unsigned long swapped; /* pages currently out on swap (++ in shmem_writepage, -- in shmem_nopage) */
int locked; /* into memory */
struct list_head list; /* linked onto the global shmem_inodes list in shmem_get_inode */
}; 返回到shmem_writepage,执行如下:/* Remove the from the page cache */ lru_cache_del(page); remove_inode_page(page);
page->list为空;
page->next_hash和page->pprev_hash位空;
page->lru为空;
继续执行,代码如下:
/* Add it to the swap cache */ add_to_swap_cache(page, swap); page_cache_release(page);
void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
unsigned long flags;
/* Re-insert the page into the page cache, but now keyed by its swap
 * entry under &swapper_space instead of the shmem inode's mapping. */
#ifdef SWAP_CACHE_INFO
swap_cache_add_total++;
#endif
if (!PageLocked(page))
BUG();
if (PageTestandSetSwapCache(page))
BUG();
if (page->mapping)
BUG();
flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1));
page->flags = flags | (1 << PG_uptodate);
add_to_page_cache_locked(page, &swapper_space, entry.val);
} 参考Linux内核源代码情景分析-内存管理之用户页面的换入,执行后的结果如下: page->list链入mapping->clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的active_list;
只是此时mapping是交换分区&swapper_space,而不是inode->mapping,所以page->mapping->a_ops->writepage就指向了swap_writepage了。
当page_launder再次扫描到这个页面时,它的page->mapping->a_ops->writepage已经指向了swap_writepage了。流程就和Linux内核源代码情景分析-内存管理之用户页面的定期换出完全一样了。
static int swap_writepage(struct page *page)
{
rw_swap_page(WRITE, page, 0);/* actually write the page out to the swap device */
return 0;
} 把页面写入了交换分区。最后: page->list链入mapping->dirty_pages或者clean_pages(保持原样);
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了page->zone->inactive_clean_list;
五、恢复映射
1、如果refill_inactive_scan和swap_out,把活跃的页面变成不活跃脏的页面。挑选的原则是最近没有被访问,且age小于0。
或者,page_launder,把不活跃脏的页面变成不活跃干净的页面。
不活跃脏的页面,有如下特点:
使用计数为1;
page->list链入mapping->dirty_pages/clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的inactive_dirty_list;
page->flags对应为设置为PG_dirty。
不活跃干净的页面,有如下特点:
使用计数为1;
page->list链入mapping->dirty_pages/clean_pages(保持原样);
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了page->zone->inactive_clean_list;
如果发生缺页中断,do_no_page中调用shmem_nopage,再次访问到这个页面,那么会调用lookup_swap_cache,会在全局的Hash表找到对应的页面,并且引用计数加1,变成2,但还没有移到活跃队列中。什么时候转移到活跃队列中呢?
答案在,page_launder和reclaim_page中。
page_launder:
if (PageTestandClearReferenced(page) || page->age > 0 || //此时引用计数大于1
(!page->buffers && page_count(page) > 1) ||
page_ramdisk(page)) {
del_page_from_inactive_dirty_list(page);
add_page_to_active_list(page);
continue;
} reclaim_page:if (PageTestandClearReferenced(page) || page->age > 0 ||
(!page->buffers && page_count(page) > 1)) {//此时引用计数大于1
del_page_from_inactive_clean_list(page);
add_page_to_active_list(page);
continue;
} 如果发生缺页中断,do_no_page调用shmem_nopage,再次访问到这个页面,调用lookup_swap_cache返回NULL,所以继续执行,代码位于shmem_nopage:
if (entry->val) {//目前不为0了,应为刚刚换出时设置了
unsigned long flags;
/* Look it up and read it in.. */
page = lookup_swap_cache(*entry);
if (!page) {
lock_kernel();
swapin_readahead(*entry);//从交换区预读
page = read_swap_cache(*entry);//从交换区真读
unlock_kernel();
if (!page)
goto oom;
}
/* We have to this with page locked to prevent races */
spin_lock (&info->lock);
swap_free(*entry);
lock_page(page);
delete_from_swap_cache_nolock(page);//从交换区队列中移除
*entry = (swp_entry_t) {0};//swap_entry_t项清零
flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1));
page->flags = flags | (1 << PG_dirty);
add_to_page_cache_locked(page, mapping, idx);
info->swapped--;
spin_unlock (&info->lock);
} else { add_to_page_cache_locked,最后的结构就是: page->list链入mapping->clean_pages;
page->next_hash和page->pprev_hash链入全局的Hash表;
page->lru链入了全局的active_list。
标签:
原文地址:http://blog.csdn.net/jltxgcy/article/details/45190057