This function is responsible for registering and mounting the tmpfs and shm filesystems.
1451 #ifdef CONFIG_TMPFS
1453 static DECLARE_FSTYPE(shmem_fs_type, "shm",
shmem_read_super, FS_LITTER);
1454 static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs",
shmem_read_super, FS_LITTER);
1455 #else
1456 static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs",
shmem_read_super, FS_LITTER|FS_NOMOUNT);
1457 #endif
1560 static int __init init_tmpfs(void)
1561 {
1562 int error;
1563
1564 error = register_filesystem(&tmpfs_fs_type);
1565 if (error) {
1566 printk(KERN_ERR "Could not register tmpfs\n");
1567 goto out3;
1568 }
1569 #ifdef CONFIG_TMPFS
1570 error = register_filesystem(&shmem_fs_type);
1571 if (error) {
1572 printk(KERN_ERR "Could not register shm fs\n");
1573 goto out2;
1574 }
1575 devfs_mk_dir(NULL, "shm", NULL);
1576 #endif
1577 shm_mnt = kern_mount(&tmpfs_fs_type);
1578 if (IS_ERR(shm_mnt)) {
1579 error = PTR_ERR(shm_mnt);
1580 printk(KERN_ERR "Could not kern_mount tmpfs\n");
1581 goto out1;
1582 }
1583
1584 /* The internal instance should not do size checking */
1585 shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX);
1586 return 0;
1587
1588 out1:
1589 #ifdef CONFIG_TMPFS
1590 unregister_filesystem(&shmem_fs_type);
1591 out2:
1592 #endif
1593 unregister_filesystem(&tmpfs_fs_type);
1594 out3:
1595 shm_mnt = ERR_PTR(error);
1596 return error;
1597 }
1598 module_init(init_tmpfs)
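To see the other side of this registration, the following userspace sketch (an illustration, not part of the kernel source; the /mnt/tmp mount point is an assumption) mounts a tmpfs instance with options that shmem_parse_options() will parse inside shmem_read_super().

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* size, nr_inodes and mode are parsed by shmem_parse_options() */
	if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0,
	          "size=32m,nr_inodes=1024,mode=1777") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}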
This is the callback function provided for the filesystem which “reads” the superblock. With an ordinary filesystem, this would entail reading the information from the disk but as this is a RAM-based filesystem, it instead populates a struct super_block.
1452 static struct super_block *shmem_read_super(struct super_block *sb,
void* data, int silent)
1453 {
1454 struct inode *inode;
1455 struct dentry *root;
1456 unsigned long blocks, inodes;
1457 int mode = S_IRWXUGO | S_ISVTX;
1458 uid_t uid = current->fsuid;
1459 gid_t gid = current->fsgid;
1460 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1461 struct sysinfo si;
1462
1463 /*
1464 * Per default we only allow half of the physical ram per
1465 * tmpfs instance
1466 */
1467 si_meminfo(&si);
1468 blocks = inodes = si.totalram / 2;
1469
1470 #ifdef CONFIG_TMPFS
1471 if (shmem_parse_options(data, &mode, &uid,
&gid, &blocks, &inodes))
1472 return NULL;
1473 #endif
1474
1475 spin_lock_init(&sbinfo->stat_lock);
1476 sbinfo->max_blocks = blocks;
1477 sbinfo->free_blocks = blocks;
1478 sbinfo->max_inodes = inodes;
1479 sbinfo->free_inodes = inodes;
1480 sb->s_maxbytes = SHMEM_MAX_BYTES;
1481 sb->s_blocksize = PAGE_CACHE_SIZE;
1482 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1483 sb->s_magic = TMPFS_MAGIC;
1484 sb->s_op = &shmem_ops;
1485 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
1486 if (!inode)
1487 return NULL;
1488
1489 inode->i_uid = uid;
1490 inode->i_gid = gid;
1491 root = d_alloc_root(inode);
1492 if (!root) {
1493 iput(inode);
1494 return NULL;
1495 }
1496 sb->s_root = root;
1497 return sb;
1498 }
This function updates the number of available blocks and inodes in the filesystem. It is called while the filesystem is being mounted or remounted.
861 static int shmem_set_size(struct shmem_sb_info *info,
862 unsigned long max_blocks,
unsigned long max_inodes)
863 {
864 int error;
865 unsigned long blocks, inodes;
866
867 spin_lock(&info->stat_lock);
868 blocks = info->max_blocks - info->free_blocks;
869 inodes = info->max_inodes - info->free_inodes;
870 error = -EINVAL;
871 if (max_blocks < blocks)
872 goto out;
873 if (max_inodes < inodes)
874 goto out;
875 error = 0;
876 info->max_blocks = max_blocks;
877 info->free_blocks = max_blocks - blocks;
878 info->max_inodes = max_inodes;
879 info->free_inodes = max_inodes - inodes;
880 out:
881 spin_unlock(&info->stat_lock);
882 return error;
883 }
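To illustrate the remount path (a sketch under assumptions, not from the source; the mount point is hypothetical): shrinking an instance below its current usage makes shmem_set_size() return -EINVAL, and the remount fails.

#include <sys/mount.h>

/* Fails with EINVAL if more than 16m of blocks, or more inodes than
 * the new limit allows, are already in use. */
int shrink_tmpfs(const char *mountpoint)
{
	return mount("tmpfs", mountpoint, "tmpfs", MS_REMOUNT, "size=16m");
}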
This is the top-level function called when creating a new file.
1164 static int shmem_create(struct inode *dir,
struct dentry *dentry,
int mode)
1165 {
1166 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1167 }
1139 static int shmem_mknod(struct inode *dir,
struct dentry *dentry,
int mode, int dev)
1140 {
1141 struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1142 int error = -ENOSPC;
1143
1144 if (inode) {
1145 dir->i_size += BOGO_DIRENT_SIZE;
1146 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1147 d_instantiate(dentry, inode);
1148 dget(dentry); /* Extra count - pin the dentry in core */
1149 error = 0;
1150 }
1151 return error;
1152 }
809 struct inode *shmem_get_inode(struct super_block *sb,
int mode,
int dev)
810 {
811 struct inode *inode;
812 struct shmem_inode_info *info;
813 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
814
815 spin_lock(&sbinfo->stat_lock);
816 if (!sbinfo->free_inodes) {
817 spin_unlock(&sbinfo->stat_lock);
818 return NULL;
819 }
820 sbinfo->free_inodes--;
821 spin_unlock(&sbinfo->stat_lock);
822
823 inode = new_inode(sb);
This preamble section is responsible for updating the free inode count and allocating an inode with new_inode().
824 if (inode) {
825 inode->i_mode = mode;
826 inode->i_uid = current->fsuid;
827 inode->i_gid = current->fsgid;
828 inode->i_blksize = PAGE_CACHE_SIZE;
829 inode->i_blocks = 0;
830 inode->i_rdev = NODEV;
831 inode->i_mapping->a_ops = &shmem_aops;
832 inode->i_atime = inode->i_mtime
= inode->i_ctime
= CURRENT_TIME;
833 info = SHMEM_I(inode);
834 info->inode = inode;
835 spin_lock_init(&info->lock);
836 switch (mode & S_IFMT) {
837 default:
838 init_special_inode(inode, mode, dev);
839 break;
840 case S_IFREG:
841 inode->i_op = &shmem_inode_operations;
842 inode->i_fop = &shmem_file_operations;
843 spin_lock(&shmem_ilock);
844 list_add_tail(&info->list, &shmem_inodes);
845 spin_unlock(&shmem_ilock);
846 break;
847 case S_IFDIR:
848 inode->i_nlink++;
849 /* Some things misbehave if size == 0 on a directory */
850 inode->i_size = 2 * BOGO_DIRENT_SIZE;
851 inode->i_op = &shmem_dir_inode_operations;
852 inode->i_fop = &dcache_dir_ops;
853 break;
854 case S_IFLNK:
855 break;
856 }
857 }
858 return inode;
859 }
The tasks for memory mapping a virtual file are simple. The only change that needs to be made is to update the VMA's vm_operations_struct field (vma→vm_ops) to use the shmfs equivalents for faulting.
796 static int shmem_mmap(struct file * file, struct vm_area_struct * vma)
797 {
798 struct vm_operations_struct *ops;
799 struct inode *inode = file->f_dentry->d_inode;
800
801 ops = &shmem_vm_ops;
802 if (!S_ISREG(inode->i_mode))
803 return -EACCES;
804 UPDATE_ATIME(inode);
805 vma->vm_ops = ops;
806 return 0;
807 }
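As a usage illustration (the path is an assumption, not from the source): mapping a tmpfs file only installs shmem_vm_ops; no pages are allocated until shmem_nopage() faults them in.

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

char *map_tmpfs_file(size_t len)
{
	char *p;
	int fd = open("/mnt/tmp/data", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return NULL;
	if (ftruncate(fd, len) != 0) {	/* give the file a size to fault in */
		close(fd);
		return NULL;
	}
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	close(fd);			/* the mapping keeps its own reference */
	return (p == MAP_FAILED) ? NULL : p;
}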
This is the top-level function called for read()ing a tmpfs file.
1088 static ssize_t shmem_file_read(struct file *filp, char *buf,
size_t count, loff_t *ppos)
1089 {
1090 read_descriptor_t desc;
1091
1092 if ((ssize_t) count < 0)
1093 return -EINVAL;
1094 if (!access_ok(VERIFY_WRITE, buf, count))
1095 return -EFAULT;
1096 if (!count)
1097 return 0;
1098
1099 desc.written = 0;
1100 desc.count = count;
1101 desc.buf = buf;
1102 desc.error = 0;
1103
1104 do_shmem_file_read(filp, ppos, &desc);
1105 if (desc.written)
1106 return desc.written;
1107 return desc.error;
1108 }
This function retrieves the pages needed for the file read with shmem_getpage() and calls file_read_actor() to copy the data to userspace.
1003 static void do_shmem_file_read(struct file *filp,
loff_t *ppos,
read_descriptor_t *desc)
1004 {
1005 struct inode *inode = filp->f_dentry->d_inode;
1006 struct address_space *mapping = inode->i_mapping;
1007 unsigned long index, offset;
1008
1009 index = *ppos >> PAGE_CACHE_SHIFT;
1010 offset = *ppos & ~PAGE_CACHE_MASK;
1011
1012 for (;;) {
1013 struct page *page = NULL;
1014 unsigned long end_index, nr, ret;
1015
1016 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1017 if (index > end_index)
1018 break;
1019 if (index == end_index) {
1020 nr = inode->i_size & ~PAGE_CACHE_MASK;
1021 if (nr <= offset)
1022 break;
1023 }
1024
1025 desc->error = shmem_getpage(inode, index, &page, SGP_READ);
1026 if (desc->error) {
1027 if (desc->error == -EINVAL)
1028 desc->error = 0;
1029 break;
1030 }
1031
1036 nr = PAGE_CACHE_SIZE;
1037 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1038 if (index == end_index) {
1039 nr = inode->i_size & ~PAGE_CACHE_MASK;
1040 if (nr <= offset) {
1041 page_cache_release(page);
1042 break;
1043 }
1044 }
1045 nr -= offset;
1046
1047 if (page != ZERO_PAGE(0)) {
1053 if (mapping->i_mmap_shared != NULL)
1054 flush_dcache_page(page);
1055 /*
1056 * Mark the page accessed if we read the
1057 * beginning or we just did an lseek.
1058 */
1059 if (!offset || !filp->f_reada)
1060 mark_page_accessed(page);
1061 }
1062
1073 ret = file_read_actor(desc, page, offset, nr);
1074 offset += ret;
1075 index += offset >> PAGE_CACHE_SHIFT;
1076 offset &= ~PAGE_CACHE_MASK;
1077
1078 page_cache_release(page);
1079 if (ret != nr || !desc->count)
1080 break;
1081 }
1082
1083 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1084 filp->f_reada = 1;
1085 UPDATE_ATIME(inode);
1086 }
This function is responsible for copying data from a page to a userspace buffer. It is ultimately called by a number of functions, including generic_file_read() and shmem_file_read().
1669 int file_read_actor(read_descriptor_t * desc,
struct page *page,
unsigned long offset,
unsigned long size)
1670 {
1671 char *kaddr;
1672 unsigned long left, count = desc->count;
1673
1674 if (size > count)
1675 size = count;
1676
1677 kaddr = kmap(page);
1678 left = __copy_to_user(desc->buf, kaddr + offset, size);
1679 kunmap(page);
1680
1681 if (left) {
1682 size -= left;
1683 desc->error = -EFAULT;
1684 }
1685 desc->count = count - size;
1686 desc->written += size;
1687 desc->buf += size;
1688 return size;
1689 }
This is the top-level function called for write()ing a tmpfs file.
924 static ssize_t
925 shmem_file_write(struct file *file, const char *buf,
size_t count, loff_t *ppos)
926 {
927 struct inode *inode = file->f_dentry->d_inode;
928 loff_t pos;
929 unsigned long written;
930 int err;
931
932 if ((ssize_t) count < 0)
933 return -EINVAL;
934
935 if (!access_ok(VERIFY_READ, buf, count))
936 return -EFAULT;
937
938 down(&inode->i_sem);
939
940 pos = *ppos;
941 written = 0;
942
943 err = precheck_file_write(file, inode, &count, &pos);
944 if (err || !count)
945 goto out;
946
947 remove_suid(inode);
948 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
949
Function preamble.
950 do {
951 struct page *page = NULL;
952 unsigned long bytes, index, offset;
953 char *kaddr;
954 int left;
955
956 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
957 index = pos >> PAGE_CACHE_SHIFT;
958 bytes = PAGE_CACHE_SIZE - offset;
959 if (bytes > count)
960 bytes = count;
961
962 /*
963 * We don't hold page lock across copy from user -
964 * what would it guard against? - so no deadlock here.
965 */
966
967 err = shmem_getpage(inode, index, &page, SGP_WRITE);
968 if (err)
969 break;
970
971 kaddr = kmap(page);
972 left = __copy_from_user(kaddr + offset, buf, bytes);
973 kunmap(page);
974
975 written += bytes;
976 count -= bytes;
977 pos += bytes;
978 buf += bytes;
979 if (pos > inode->i_size)
980 inode->i_size = pos;
981
982 flush_dcache_page(page);
983 SetPageDirty(page);
984 SetPageReferenced(page);
985 page_cache_release(page);
986
987 if (left) {
988 pos -= left;
989 written -= left;
990 err = -EFAULT;
991 break;
992 }
993 } while (count);
994
995 *ppos = pos;
996 if (written)
997 err = written;
998 out:
999 up(&inode->i_sem);
1000 return err;
1001 }
This function is responsible for creating a symbolic link symname and deciding where to store the information. The name of the link will be stored in the inode if the name is small enough and in a page frame otherwise.
1272 static int shmem_symlink(struct inode * dir,
struct dentry *dentry,
const char * symname)
1273 {
1274 int error;
1275 int len;
1276 struct inode *inode;
1277 struct page *page = NULL;
1278 char *kaddr;
1279 struct shmem_inode_info *info;
1280
1281 len = strlen(symname) + 1;
1282 if (len > PAGE_CACHE_SIZE)
1283 return -ENAMETOOLONG;
1284
1285 inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1286 if (!inode)
1287 return -ENOSPC;
1288
1289 info = SHMEM_I(inode);
1290 inode->i_size = len-1;
This block performs basic sanity checks and creates a new inode for the symbolic link.
1291 if (len <= sizeof(struct shmem_inode_info)) {
1292 /* do it inline */
1293 memcpy(info, symname, len);
1294 inode->i_op = &shmem_symlink_inline_operations;
1295 } else {
1296 error = shmem_getpage(inode, 0, &page, SGP_WRITE);
1297 if (error) {
1298 iput(inode);
1299 return error;
1300 }
1301 inode->i_op = &shmem_symlink_inode_operations;
1302 spin_lock(&shmem_ilock);
1303 list_add_tail(&info->list, &shmem_inodes);
1304 spin_unlock(&shmem_ilock);
1305 kaddr = kmap(page);
1306 memcpy(kaddr, symname, len);
1307 kunmap(page);
1308 SetPageDirty(page);
1309 page_cache_release(page);
1310 }
This block is responsible for storing the link information.
1311 dir->i_size += BOGO_DIRENT_SIZE;
1312 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1313 d_instantiate(dentry, inode);
1314 dget(dentry);
1315 return 0;
1316 }
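For illustration (the exact figure is architecture-dependent and is an assumption here): if sizeof(struct shmem_inode_info) were 84 bytes, a link target of up to 83 characters plus the trailing NUL would be memcpy()d straight over the shmem_inode_info area of the inode and read back later by shmem_readlink_inline(); any longer target is written into its own page with shmem_getpage().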
1318 static int shmem_readlink_inline(struct dentry *dentry,
char *buffer, int buflen)
1319 {
1320 return vfs_readlink(dentry, buffer, buflen,
(const char *)SHMEM_I(dentry->d_inode));
1321 }
1323 static int shmem_follow_link_inline(struct dentry *dentry,
struct nameidata *nd)
1324 {
1325 return vfs_follow_link(nd,
(const char *)SHMEM_I(dentry->d_inode));
1326 }
1328 static int shmem_readlink(struct dentry *dentry,
char *buffer, int buflen)
1329 {
1330 struct page *page = NULL;
1331 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
1332 if (res)
1333 return res;
1334 res = vfs_readlink(dentry,buffer,buflen, kmap(page));
1335 kunmap(page);
1336 mark_page_accessed(page);
1337 page_cache_release(page);
1338 return res;
1339 }
1231 static int shmem_follow_link(struct dentry *dentry,
struct nameidata *nd)
1232 {
1233 struct page * page;
1234 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
1235 if (res)
1236 return res;
1237
1238 res = vfs_follow_link(nd, kmap(page));
1239 kunmap(page);
1240 page_cache_release(page);
1241 return res;
1242 }
This function simply returns 0 as the file exists only in memory and does not need to be synchronised with a file on disk.
1446 static int shmem_sync_file(struct file * file,
struct dentry *dentry,
int datasync)
1447 {
1448 return 0;
1449 }
By the time this function has been called, the inode→i_size has been set to the new size by vmtruncate(). It is the job of this function to remove pages as necessary to match the new size of the file.
351 static void shmem_truncate(struct inode *inode)
352 {
353 struct shmem_inode_info *info = SHMEM_I(inode);
354 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
355 unsigned long freed = 0;
356 unsigned long index;
357
358 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
359 index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
360 if (index >= info->next_index)
361 return;
362
363 spin_lock(&info->lock);
364 while (index < info->next_index)
365 freed += shmem_truncate_indirect(info, index);
366 BUG_ON(info->swapped > info->next_index);
367 spin_unlock(&info->lock);
368
369 spin_lock(&sbinfo->stat_lock);
370 sbinfo->free_blocks += freed;
371 inode->i_blocks -= freed*BLOCKS_PER_PAGE;
372 spin_unlock(&sbinfo->stat_lock);
373 }
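As a worked example (assuming PAGE_CACHE_SHIFT is 12, giving 4KiB pages, which is an assumption about the architecture): if vmtruncate() has set inode→i_size to 10,000 bytes, then index = (10000 + 4095) >> 12 = 3. Pages 0 to 2 are kept, with page 2 still holding bytes 8192 to 9999, and the while loop frees every page from index 3 up to info→next_index.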
This function locates the last doubly-indirect block in the inode and calls shmem_truncate_direct() to truncate it.
308 static inline unsigned long
309 shmem_truncate_indirect(struct shmem_inode_info *info,
unsigned long index)
310 {
311 swp_entry_t ***base;
312 unsigned long baseidx, start;
313 unsigned long len = info->next_index;
314 unsigned long freed;
315
316 if (len <= SHMEM_NR_DIRECT) {
317 info->next_index = index;
318 if (!info->swapped)
319 return 0;
320 freed = shmem_free_swp(info->i_direct + index,
321 info->i_direct + len);
322 info->swapped -= freed;
323 return freed;
324 }
325
326 if (len <= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT) {
327 len -= SHMEM_NR_DIRECT;
328 base = (swp_entry_t ***) &info->i_indirect;
329 baseidx = SHMEM_NR_DIRECT;
330 } else {
331 len -= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
332 BUG_ON(len > ENTRIES_PER_PAGEPAGE*ENTRIES_PER_PAGE/2);
333 baseidx = len - 1;
334 baseidx -= baseidx % ENTRIES_PER_PAGEPAGE;
335 base = (swp_entry_t ***) info->i_indirect +
336 ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGEPAGE;
337 len -= baseidx;
338 baseidx += ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
339 }
340
341 if (index > baseidx) {
342 info->next_index = index;
343 start = index - baseidx;
344 } else {
345 info->next_index = baseidx;
346 start = 0;
347 }
348 return *base? shmem_truncate_direct(info, base, start, len): 0;
349 }
This function is responsible for cycling through an indirect block and calling shmem_free_swp() for each page of swap vectors that is to be truncated.
264 static inline unsigned long
265 shmem_truncate_direct(struct shmem_inode_info *info,
swp_entry_t ***dir,
unsigned long start, unsigned long len)
266 {
267 swp_entry_t **last, **ptr;
268 unsigned long off, freed_swp, freed = 0;
269
270 last = *dir + (len + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE;
271 off = start % ENTRIES_PER_PAGE;
272
273 for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++, off = 0) {
274 if (!*ptr)
275 continue;
276
277 if (info->swapped) {
278 freed_swp = shmem_free_swp(*ptr + off,
279 *ptr + ENTRIES_PER_PAGE);
280 info->swapped -= freed_swp;
281 freed += freed_swp;
282 }
283
284 if (!off) {
285 freed++;
286 free_page((unsigned long) *ptr);
287 *ptr = 0;
288 }
289 }
290
291 if (!start) {
292 freed++;
293 free_page((unsigned long) *dir);
294 *dir = 0;
295 }
296 return freed;
297 }
This frees the swap entries from dir up to, but not including, edir.
240 static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
241 {
242 swp_entry_t *ptr;
243 int freed = 0;
244
245 for (ptr = dir; ptr < edir; ptr++) {
246 if (ptr->val) {
247 free_swap_and_cache(*ptr);
248 *ptr = (swp_entry_t){0};
249 freed++;
250 }
251 }
252 return freed;
254 }
This function creates a hard link with dentry to old_dentry.
1172 static int shmem_link(struct dentry *old_dentry,
struct inode *dir,
struct dentry *dentry)
1173 {
1174 struct inode *inode = old_dentry->d_inode;
1175
1176 if (S_ISDIR(inode->i_mode))
1177 return -EPERM;
1178
1179 dir->i_size += BOGO_DIRENT_SIZE;
1180 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1181 inode->i_nlink++;
1182 atomic_inc(&inode->i_count);
1183 dget(dentry);
1184 d_instantiate(dentry, inode);
1185 return 0;
1186 }
1221 static int shmem_unlink(struct inode* dir,
struct dentry *dentry)
1222 {
1223 struct inode *inode = dentry->d_inode;
1224
1225 dir->i_size -= BOGO_DIRENT_SIZE;
1226 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1227 inode->i_nlink--;
1228 dput(dentry);
1229 return 0;
1230 }
1154 static int shmem_mkdir(struct inode *dir,
struct dentry *dentry,
int mode)
1155 {
1156 int error;
1157
1158 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
1159 return error;
1160 dir->i_nlink++;
1161 return 0;
1162 }
1232 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1233 {
1234 if (!shmem_empty(dentry))
1235 return -ENOTEMPTY;
1236
1237 dir->i_nlink--;
1238 return shmem_unlink(dir, dentry);
1239 }
This function checks to see if a directory is empty or not.
1201 static int shmem_empty(struct dentry *dentry)
1202 {
1203 struct list_head *list;
1204
1205 spin_lock(&dcache_lock);
1206 list = dentry->d_subdirs.next;
1207
1208 while (list != &dentry->d_subdirs) {
1209 struct dentry *de = list_entry(list,
struct dentry, d_child);
1210
1211 if (shmem_positive(de)) {
1212 spin_unlock(&dcache_lock);
1213 return 0;
1214 }
1215 list = list->next;
1216 }
1217 spin_unlock(&dcache_lock);
1218 return 1;
1219 }
1188 static inline int shmem_positive(struct dentry *dentry)
1189 {
1190 return dentry->d_inode && !d_unhashed(dentry);
1191 }
This is the top-level nopage() function that is called by do_no_page() when faulting in a page. It is called regardless of whether it is the first fault on the page or whether the page is being faulted in from backing storage.
763 struct page * shmem_nopage(struct vm_area_struct *vma,
unsigned long address,
int unused)
764 {
765 struct inode *inode = vma->vm_file->f_dentry->d_inode;
766 struct page *page = NULL;
767 unsigned long idx;
768 int error;
769
770 idx = (address - vma->vm_start) >> PAGE_SHIFT;
771 idx += vma->vm_pgoff;
772 idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
773
774 error = shmem_getpage(inode, idx, &page, SGP_CACHE);
775 if (error)
776 return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
777
778 mark_page_accessed(page);
779 flush_page_to_ram(page);
780 return page;
781 }
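As a worked example (assuming PAGE_SHIFT and PAGE_CACHE_SHIFT are both 12, an architecture assumption): a fault at vma→vm_start + 0x5000 in a region mapped with vma→vm_pgoff of 2 gives idx = (0x5000 >> 12) + 2 = 7, so page 7 of the file is looked up, and if necessary allocated, with SGP_CACHE.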
This function is the workhorse of shmfs. It locates the page at index idx for the inode, allocating, swapping in or copying the page as necessary.
583 static int shmem_getpage(struct inode *inode,
unsigned long idx,
struct page **pagep,
enum sgp_type sgp)
584 {
585 struct address_space *mapping = inode->i_mapping;
586 struct shmem_inode_info *info = SHMEM_I(inode);
587 struct shmem_sb_info *sbinfo;
588 struct page *filepage = *pagep;
589 struct page *swappage;
590 swp_entry_t *entry;
591 swp_entry_t swap;
592 int error = 0;
593
594 if (idx >= SHMEM_MAX_INDEX)
595 return -EFBIG;
596 /*
597 * Normally, filepage is NULL on entry, and either found
598 * uptodate immediately, or allocated and zeroed, or read
599 * in under swappage, which is then assigned to filepage.
600 * But shmem_readpage and shmem_prepare_write pass in a locked
601 * filepage, which may be found not uptodate by other callers
602 * too, and may need to be copied from the swappage read in.
603 */
604 repeat:
605 if (!filepage)
606 filepage = find_lock_page(mapping, idx);
607 if (filepage && Page_Uptodate(filepage))
608 goto done;
609
610 spin_lock(&info->lock);
611 entry = shmem_swp_alloc(info, idx, sgp);
612 if (IS_ERR(entry)) {
613 spin_unlock(&info->lock);
614 error = PTR_ERR(entry);
615 goto failed;
616 }
617 swap = *entry;
619 if (swap.val) {
620 /* Look it up and read it in.. */
621 swappage = lookup_swap_cache(swap);
622 if (!swappage) {
623 spin_unlock(&info->lock);
624 swapin_readahead(swap);
625 swappage = read_swap_cache_async(swap);
626 if (!swappage) {
627 spin_lock(&info->lock);
628 entry = shmem_swp_alloc(info, idx, sgp);
629 if (IS_ERR(entry))
630 error = PTR_ERR(entry);
631 else if (entry->val == swap.val)
632 error = -ENOMEM;
633 spin_unlock(&info->lock);
634 if (error)
635 goto failed;
636 goto repeat;
637 }
638 wait_on_page(swappage);
639 page_cache_release(swappage);
640 goto repeat;
641 }
642
643 /* We have to do this with page locked to prevent races */
644 if (TryLockPage(swappage)) {
645 spin_unlock(&info->lock);
646 wait_on_page(swappage);
647 page_cache_release(swappage);
648 goto repeat;
649 }
650 if (!Page_Uptodate(swappage)) {
651 spin_unlock(&info->lock);
652 UnlockPage(swappage);
653 page_cache_release(swappage);
654 error = -EIO;
655 goto failed;
656 }
In this block, a valid swap entry exists for the page. The page is first searched for in the swap cache and, if it does not exist there, it is read in from backing storage.
658 delete_from_swap_cache(swappage);
659 if (filepage) {
660 entry->val = 0;
661 info->swapped--;
662 spin_unlock(&info->lock);
663 flush_page_to_ram(swappage);
664 copy_highpage(filepage, swappage);
665 UnlockPage(swappage);
666 page_cache_release(swappage);
667 flush_dcache_page(filepage);
668 SetPageUptodate(filepage);
669 SetPageDirty(filepage);
670 swap_free(swap);
671 } else if (add_to_page_cache_unique(swappage,
672 mapping, idx, page_hash(mapping, idx)) == 0) {
673 entry->val = 0;
674 info->swapped--;
675 spin_unlock(&info->lock);
676 filepage = swappage;
677 SetPageUptodate(filepage);
678 SetPageDirty(filepage);
679 swap_free(swap);
680 } else {
681 if (add_to_swap_cache(swappage, swap) != 0)
682 BUG();
683 spin_unlock(&info->lock);
684 SetPageUptodate(swappage);
685 SetPageDirty(swappage);
686 UnlockPage(swappage);
687 page_cache_release(swappage);
688 goto repeat;
689 }
At this point, the page exists in the swap cache.
690 } else if (sgp == SGP_READ && !filepage) {
691 filepage = find_get_page(mapping, idx);
692 if (filepage &&
693 (!Page_Uptodate(filepage) || TryLockPage(filepage))) {
694 spin_unlock(&info->lock);
695 wait_on_page(filepage);
696 page_cache_release(filepage);
697 filepage = NULL;
698 goto repeat;
699 }
700 spin_unlock(&info->lock);
In this block, a valid swap entry does not exist for the index. If the page is being read and pagep is NULL, the page is located in the page cache.
701 } else {
702 sbinfo = SHMEM_SB(inode->i_sb);
703 spin_lock(&sbinfo->stat_lock);
704 if (sbinfo->free_blocks == 0) {
705 spin_unlock(&sbinfo->stat_lock);
706 spin_unlock(&info->lock);
707 error = -ENOSPC;
708 goto failed;
709 }
710 sbinfo->free_blocks--;
711 inode->i_blocks += BLOCKS_PER_PAGE;
712 spin_unlock(&sbinfo->stat_lock);
713
714 if (!filepage) {
715 spin_unlock(&info->lock);
716 filepage = page_cache_alloc(mapping);
717 if (!filepage) {
718 shmem_free_block(inode);
719 error = -ENOMEM;
720 goto failed;
721 }
722
723 spin_lock(&info->lock);
724 entry = shmem_swp_alloc(info, idx, sgp);
725 if (IS_ERR(entry))
726 error = PTR_ERR(entry);
727 if (error || entry->val ||
728 add_to_page_cache_unique(filepage,
729 mapping, idx, page_hash(mapping, idx)) != 0) {
730 spin_unlock(&info->lock);
731 page_cache_release(filepage);
732 shmem_free_block(inode);
733 filepage = NULL;
734 if (error)
735 goto failed;
736 goto repeat;
737 }
738 }
739
740 spin_unlock(&info->lock);
741 clear_highpage(filepage);
742 flush_dcache_page(filepage);
743 SetPageUptodate(filepage);
744 }
Otherwise, a page that is not in the page cache is being written to and will need to be allocated.
745 done:
746 if (!*pagep) {
747 if (filepage) {
748 UnlockPage(filepage);
749 *pagep = filepage;
750 } else
751 *pagep = ZERO_PAGE(0);
752 }
753 return 0;
754
755 failed:
756 if (*pagep != filepage) {
757 UnlockPage(filepage);
758 page_cache_release(filepage);
759 }
760 return error;
761 }
This is a top-level function that returns the swap entry corresponding to a particular page index within a file. If the swap entry does not exist, one will be allocated.
183 static inline swp_entry_t * shmem_alloc_entry (
struct shmem_inode_info *info,
unsigned long index)
184 {
185 unsigned long page = 0;
186 swp_entry_t * res;
187
188 if (index >= SHMEM_MAX_INDEX)
189 return ERR_PTR(-EFBIG);
190
191 if (info->next_index <= index)
192 info->next_index = index + 1;
193
194 while ((res = shmem_swp_entry(info,index,&page)) ==
ERR_PTR(-ENOMEM)) {
195 page = get_zeroed_page(GFP_USER);
196 if (!page)
197 break;
198 }
199 return res;
200 }
This function uses information within the inode to locate the swp_entry_t for a given index. The inode itself is able to store SHMEM_NR_DIRECT swap vectors. After that indirect blocks are used.
127 static swp_entry_t *shmem_swp_entry (struct shmem_inode_info *info,
unsigned long index,
unsigned long *page)
128 {
129 unsigned long offset;
130 void **dir;
131
132 if (index < SHMEM_NR_DIRECT)
133 return info->i_direct+index;
134 if (!info->i_indirect) {
135 if (page) {
136 info->i_indirect = (void **) *page;
137 *page = 0;
138 }
139 return NULL;
140 }
141
142 index -= SHMEM_NR_DIRECT;
143 offset = index % ENTRIES_PER_PAGE;
144 index /= ENTRIES_PER_PAGE;
145 dir = info->i_indirect;
146
147 if (index >= ENTRIES_PER_PAGE/2) {
148 index -= ENTRIES_PER_PAGE/2;
149 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
150 index %= ENTRIES_PER_PAGE;
151 if (!*dir) {
152 if (page) {
153 *dir = (void *) *page;
154 *page = 0;
155 }
156 return NULL;
157 }
158 dir = ((void **)*dir);
159 }
160
161 dir += index;
162 if (!*dir) {
163 if (!page || !*page)
164 return NULL;
165 *dir = (void *) *page;
166 *page = 0;
167 }
168 return (swp_entry_t *) *dir + offset;
169 }
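As a worked example (assuming 4KiB pages and 4-byte swap entries, so ENTRIES_PER_PAGE is 1024, and assuming SHMEM_NR_DIRECT is 16): for index 5000, the function computes 5000 - 16 = 4984, giving offset = 4984 % 1024 = 888 and index = 4984 / 1024 = 4. As 4 is less than ENTRIES_PER_PAGE/2, dir becomes info→i_indirect + 4 and the entry returned is ((swp_entry_t *) *dir) + 888.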
This function is responsible for moving a page from the page cache to the swap cache.
522 static int shmem_writepage(struct page *page)
523 {
524 struct shmem_inode_info *info;
525 swp_entry_t *entry, swap;
526 struct address_space *mapping;
527 unsigned long index;
528 struct inode *inode;
529
530 BUG_ON(!PageLocked(page));
531 if (!PageLaunder(page))
532 return fail_writepage(page);
533
534 mapping = page->mapping;
535 index = page->index;
536 inode = mapping->host;
537 info = SHMEM_I(inode);
538 if (info->flags & VM_LOCKED)
539 return fail_writepage(page);
This block is function preamble to make sure the operation is possible.
540 getswap:
541 swap = get_swap_page();
542 if (!swap.val)
543 return fail_writepage(page);
544
545 spin_lock(&info->lock);
546 BUG_ON(index >= info->next_index);
547 entry = shmem_swp_entry(info, index, NULL);
548 BUG_ON(!entry);
549 BUG_ON(entry->val);
550
This block is responsible for allocating a swap slot from the backing storage and a swp_entry_t within the inode.
551 /* Remove it from the page cache */
552 remove_inode_page(page);
553 page_cache_release(page);
554
555 /* Add it to the swap cache */
556 if (add_to_swap_cache(page, swap) != 0) {
557 /*
558 * Raced with "speculative" read_swap_cache_async.
559 * Add page back to page cache, unref swap, try again.
560 */
561 add_to_page_cache_locked(page, mapping, index);
562 spin_unlock(&info->lock);
563 swap_free(swap);
564 goto getswap;
565 }
566
567 *entry = swap;
568 info->swapped++;
569 spin_unlock(&info->lock);
570 SetPageUptodate(page);
571 set_page_dirty(page);
572 UnlockPage(page);
573 return 0;
574 }
Move from the page cache to the swap cache and update statistics.
This function will search the shmem_inodes list for the inode that holds the information for the requested entry and page. It is a very expensive operation, but it is only called when a swap area is being deactivated, so it is not a significant problem. On return, the swap entry will be freed and the page will be moved from the swap cache to the page cache.
498 int shmem_unuse(swp_entry_t entry, struct page *page)
499 {
500 struct list_head *p;
501 struct shmem_inode_info *info;
502 int found = 0;
503 spin_lock(&shmem_ilock);
504 list_for_each(p, &shmem_inodes) {
505 info = list_entry(p, struct shmem_inode_info, list);
506
507 if (info->swapped && shmem_unuse_inode(info, entry, page)) {
508 /* move head to start search for next from here */
509 list_move_tail(&shmem_inodes, &info->list);
510 found = 1;
511 break;
512 }
513 }
514 spin_unlock(&shmem_ilock);
515 return found;
516 }
This function searches the inode information in info to determine if the entry and page belong to it. If they do, the entry will be cleared and the page will be removed from the swap cache and moved to the page cache instead.
436 static int shmem_unuse_inode(struct shmem_inode_info *info,
swp_entry_t entry,
struct page *page)
437 {
438 struct inode *inode;
439 struct address_space *mapping;
440 swp_entry_t *ptr;
441 unsigned long idx, limit;
442 int offset;
443
444 idx = 0;
445 ptr = info->i_direct;
446 spin_lock(&info->lock);
447 offset = info->next_index;
448 if (offset > SHMEM_NR_DIRECT)
449 offset = SHMEM_NR_DIRECT;
450 offset = shmem_find_swp(entry, ptr, ptr + offset);
451 if (offset >= 0)
452 goto found;
453
454 for (idx = SHMEM_NR_DIRECT; idx < info->next_index;
455 idx += ENTRIES_PER_PAGE) {
456 ptr = shmem_swp_entry(info, idx, NULL);
457 if (!ptr)
458 continue;
459 offset = info->next_index - idx;
460 if (offset > ENTRIES_PER_PAGE)
461 offset = ENTRIES_PER_PAGE;
462 offset = shmem_find_swp(entry, ptr, ptr + offset);
463 if (offset >= 0)
464 goto found;
465 }
466 spin_unlock(&info->lock);
467 return 0;
468 found:
470 idx += offset;
471 inode = info->inode;
472 mapping = inode->i_mapping;
473 delete_from_swap_cache(page);
474
475 /* Racing against delete or truncate?
* Must leave out of page cache */
476 limit = (inode->i_state & I_FREEING)? 0:
477 (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
478
479 if (idx >= limit || add_to_page_cache_unique(page,
480 mapping, idx, page_hash(mapping, idx)) == 0) {
481 ptr[offset].val = 0;
482 info->swapped--;
483 } else if (add_to_swap_cache(page, entry) != 0)
484 BUG();
485 spin_unlock(&info->lock);
486 SetPageUptodate(page);
487 /*
488 * Decrement swap count even when the entry is left behind:
489 * try_to_unuse will skip over mms, then reincrement count.
490 */
491 swap_free(entry);
492 return 1;
493 }
This function searches an indirect block between the two pointers ptr and eptr for the requested entry. Note that the two pointers must be in the same indirect block.
425 static inline int shmem_find_swp(swp_entry_t entry,
swp_entry_t *dir,
swp_entry_t *edir)
426 {
427 swp_entry_t *ptr;
428
429 for (ptr = dir; ptr < edir; ptr++) {
430 if (ptr->val == entry.val)
431 return ptr - dir;
432 }
433 return -1;
434 }
This function is called to set up a VMA that is a shared region backed by anonymous pages. The call graph which shows this function is in Figure 12.5. This occurs when mmap() creates an anonymous region with the MAP_SHARED flag.
1664 int shmem_zero_setup(struct vm_area_struct *vma)
1665 {
1666 struct file *file;
1667 loff_t size = vma->vm_end - vma->vm_start;
1668
1669 file = shmem_file_setup("dev/zero", size);
1670 if (IS_ERR(file))
1671 return PTR_ERR(file);
1672
1673 if (vma->vm_file)
1674 fput(vma->vm_file);
1675 vma->vm_file = file;
1676 vma->vm_ops = &shmem_vm_ops;
1677 return 0;
1678 }
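For illustration, the sketch below (an assumption, not taken from the book) shows the userspace side: a MAP_SHARED|MAP_ANONYMOUS mapping reaches shmem_zero_setup() through do_mmap_pgoff(), as does a shared mapping of /dev/zero.

#include <sys/mman.h>

/* Returns MAP_FAILED on error; the region is backed by a "dev/zero"
 * file on the internal shm mount created by shmem_file_setup(). */
void *shared_anon(size_t len)
{
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
	            MAP_SHARED | MAP_ANONYMOUS, -1, 0);
}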
This function is called to create a new file in shmfs, the internal filesystem. As the filesystem is internal, the supplied name does not have to be unique within each directory. Hence, every file that is created by an anonymous region with shmem_zero_setup() will simply be called “dev/zero” and regions created with shmget() will be called “SYSVNN” where NN is the key that is passed as the first argument to shmget().
1607 struct file *shmem_file_setup(char *name, loff_t size)
1608 {
1609 int error;
1610 struct file *file;
1611 struct inode *inode;
1612 struct dentry *dentry, *root;
1613 struct qstr this;
1614 int vm_enough_memory(long pages);
1615
1616 if (IS_ERR(shm_mnt))
1617 return (void *)shm_mnt;
1618
1619 if (size > SHMEM_MAX_BYTES)
1620 return ERR_PTR(-EINVAL);
1621
1622 if (!vm_enough_memory(VM_ACCT(size)))
1623 return ERR_PTR(-ENOMEM);
1624
1625 this.name = name;
1626 this.len = strlen(name);
1627 this.hash = 0; /* will go */
1628 root = shm_mnt->mnt_root;
1629 dentry = d_alloc(root, &this);
1630 if (!dentry)
1631 return ERR_PTR(-ENOMEM);
1632
1633 error = -ENFILE;
1634 file = get_empty_filp();
1635 if (!file)
1636 goto put_dentry;
1637
1638 error = -ENOSPC;
1639 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
1640 if (!inode)
1641 goto close_file;
1642
1643 d_instantiate(dentry, inode);
1644 inode->i_size = size;
1645 inode->i_nlink = 0; /* It is unlinked */
1646 file->f_vfsmnt = mntget(shm_mnt);
1647 file->f_dentry = dentry;
1648 file->f_op = &shmem_file_operations;
1649 file->f_mode = FMODE_WRITE | FMODE_READ;
1650 return file;
1651
1652 close_file:
1653 put_filp(file);
1654 put_dentry:
1655 dput(dentry);
1656 return ERR_PTR(error);
1657 }
This is the top-level function called to create a new shared memory segment or to find an existing one by key.
229 asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
230 {
231 struct shmid_kernel *shp;
232 int err, id = 0;
233
234 down(&shm_ids.sem);
235 if (key == IPC_PRIVATE) {
236 err = newseg(key, shmflg, size);
237 } else if ((id = ipc_findkey(&shm_ids, key)) == -1) {
238 if (!(shmflg & IPC_CREAT))
239 err = -ENOENT;
240 else
241 err = newseg(key, shmflg, size);
242 } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
243 err = -EEXIST;
244 } else {
245 shp = shm_lock(id);
246 if(shp==NULL)
247 BUG();
248 if (shp->shm_segsz < size)
249 err = -EINVAL;
250 else if (ipcperms(&shp->shm_perm, shmflg))
251 err = -EACCES;
252 else
253 err = shm_buildid(id, shp->shm_perm.seq);
254 shm_unlock(id);
255 }
256 up(&shm_ids.sem);
257 return err;
258 }
This function creates a new shared segment.
178 static int newseg (key_t key, int shmflg, size_t size)
179 {
180 int error;
181 struct shmid_kernel *shp;
182 int numpages = (size + PAGE_SIZE -1) >> PAGE_SHIFT;
183 struct file * file;
184 char name[13];
185 int id;
186
187 if (size < SHMMIN || size > shm_ctlmax)
188 return -EINVAL;
189
190 if (shm_tot + numpages >= shm_ctlall)
191 return -ENOSPC;
192
193 shp = (struct shmid_kernel *) kmalloc (sizeof (*shp), GFP_USER);
194 if (!shp)
195 return -ENOMEM;
196 sprintf (name, "SYSV%08x", key);
This block allocates the segment descriptor.
197 file = shmem_file_setup(name, size);
198 error = PTR_ERR(file);
199 if (IS_ERR(file))
200 goto no_file;
201
202 error = -ENOSPC;
203 id = shm_addid(shp);
204 if(id == -1)
205 goto no_id;
206 shp->shm_perm.key = key;
207 shp->shm_flags = (shmflg & S_IRWXUGO);
208 shp->shm_cprid = current->pid;
209 shp->shm_lprid = 0;
210 shp->shm_atim = shp->shm_dtim = 0;
211 shp->shm_ctim = CURRENT_TIME;
212 shp->shm_segsz = size;
213 shp->shm_nattch = 0;
214 shp->id = shm_buildid(id,shp->shm_perm.seq);
215 shp->shm_file = file;
216 file->f_dentry->d_inode->i_ino = shp->id;
217 file->f_op = &shm_file_operations;
218 shm_tot += numpages;
219 shm_unlock (id);
220 return shp->id;
221
222 no_id:
223 fput(file);
224 no_file:
225 kfree(shp);
226 return error;
227 }
568 asmlinkage long sys_shmat (int shmid, char *shmaddr,
int shmflg, ulong *raddr)
569 {
570 struct shmid_kernel *shp;
571 unsigned long addr;
572 unsigned long size;
573 struct file * file;
574 int err;
575 unsigned long flags;
576 unsigned long prot;
577 unsigned long o_flags;
578 int acc_mode;
579 void *user_addr;
580
581 if (shmid < 0)
582 return -EINVAL;
583
584 if ((addr = (ulong)shmaddr)) {
585 if (addr & (SHMLBA-1)) {
586 if (shmflg & SHM_RND)
587 addr &= ~(SHMLBA-1); /* round down */
588 else
589 return -EINVAL;
590 }
591 flags = MAP_SHARED | MAP_FIXED;
592 } else {
593 if ((shmflg & SHM_REMAP))
594 return -EINVAL;
595
596 flags = MAP_SHARED;
597 }
598
599 if (shmflg & SHM_RDONLY) {
600 prot = PROT_READ;
601 o_flags = O_RDONLY;
602 acc_mode = S_IRUGO;
603 } else {
604 prot = PROT_READ | PROT_WRITE;
605 o_flags = O_RDWR;
606 acc_mode = S_IRUGO | S_IWUGO;
607 }
This section ensures the parameters to shmat() are valid.
613 shp = shm_lock(shmid);
614 if(shp == NULL)
615 return -EINVAL;
616 err = shm_checkid(shp,shmid);
617 if (err) {
618 shm_unlock(shmid);
619 return err;
620 }
621 if (ipcperms(&shp->shm_perm, acc_mode)) {
622 shm_unlock(shmid);
623 return -EACCES;
624 }
625 file = shp->shm_file;
626 size = file->f_dentry->d_inode->i_size;
627 shp->shm_nattch++;
628 shm_unlock(shmid);
This block ensures the IPC permissions are valid.
630 down_write(&current->mm->mmap_sem);
631 if (addr && !(shmflg & SHM_REMAP)) {
632 user_addr = ERR_PTR(-EINVAL);
633 if (find_vma_intersection(current->mm, addr, addr + size))
634 goto invalid;
635 /*
636 * If shm segment goes below stack, make sure there is some
637 * space left for the stack to grow (at least 4 pages).
638 */
639 if (addr < current->mm->start_stack &&
640 addr > current->mm->start_stack - size - PAGE_SIZE * 5)
641 goto invalid;
642 }
643
644 user_addr = (void*) do_mmap (file, addr, size, prot, flags, 0);
This block is where do_mmap() will be called to attach the region to the calling process.
646 invalid:
647 up_write(&current->mm->mmap_sem);
648
649 down (&shm_ids.sem);
650 if(!(shp = shm_lock(shmid)))
651 BUG();
652 shp->shm_nattch--;
653 if(shp->shm_nattch == 0 &&
654 shp->shm_flags & SHM_DEST)
655 shm_destroy (shp);
656 else
657 shm_unlock(shmid);
658 up (&shm_ids.sem);
659
660 *raddr = (unsigned long) user_addr;
661 err = 0;
662 if (IS_ERR(user_addr))
663 err = PTR_ERR(user_addr);
664 return err;
665
666 }
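Tying the System V calls together, this minimal userspace sketch (not from the source; error handling abbreviated) goes through the paths above: shmget() reaches newseg() and shmem_file_setup(), and shmat() attaches the segment with do_mmap().

#include <string.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	char *p;
	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	if (id < 0)
		return 1;
	p = shmat(id, NULL, 0);		/* kernel chooses the address */
	if (p == (void *) -1)
		return 1;
	strcpy(p, "hello");		/* pages fault in via shmem_nopage() */
	shmdt(p);
	shmctl(id, IPC_RMID, NULL);	/* no attachments left, so destroyed */
	return 0;
}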