This section addresses how pages are added to and removed from the page cache and the LRU lists, two structures that are heavily intertwined.
Acquire the lock protecting the page cache before calling __add_to_page_cache(), which adds the page to the page hash table and the inode queue so that pages belonging to files can be found quickly.
667 void add_to_page_cache(struct page * page,
struct address_space * mapping,
unsigned long offset)
668 {
669 spin_lock(&pagecache_lock);
670 __add_to_page_cache(page, mapping,
offset, page_hash(mapping, offset));
671 spin_unlock(&pagecache_lock);
672 lru_cache_add(page);
673 }
In many respects, this function is very similar to add_to_page_cache(). The principal difference is that it checks the page cache, with the pagecache_lock spinlock held, before adding the page. It exists for callers that may race with another process to insert a page in the cache, such as add_to_swap_cache() (see Section K.2.1.1).
675 int add_to_page_cache_unique(struct page * page,
676 struct address_space *mapping, unsigned long offset,
677 struct page **hash)
678 {
679 int err;
680 struct page *alias;
681
682 spin_lock(&pagecache_lock);
683 alias = __find_page_nolock(mapping, offset, *hash);
684
685 err = 1;
686 if (!alias) {
687 __add_to_page_cache(page,mapping,offset,hash);
688 err = 0;
689 }
690
691 spin_unlock(&pagecache_lock);
692 if (!err)
693 lru_cache_add(page);
694 return err;
695 }
Clear the uptodate, error, dirty, referenced, arch_1 and checked flags, lock the page, take a reference to it and add it to the inode and hash queues.
653 static inline void __add_to_page_cache(struct page * page,
654 struct address_space *mapping, unsigned long offset,
655 struct page **hash)
656 {
657 unsigned long flags;
658
659 flags = page->flags & ~(1 << PG_uptodate |
1 << PG_error | 1 << PG_dirty |
1 << PG_referenced | 1 << PG_arch_1 |
1 << PG_checked);
660 page->flags = flags | (1 << PG_locked);
661 page_cache_get(page);
662 page->index = offset;
663 add_page_to_inode_queue(mapping, page);
664 add_page_to_hash_queue(page, hash);
665 }
85 static inline void add_page_to_inode_queue(
struct address_space *mapping, struct page * page)
86 {
87 struct list_head *head = &mapping->clean_pages;
88
89 mapping->nrpages++;
90 list_add(&page->list, head);
91 page->mapping = mapping;
92 }
This adds the page to the top of the hash bucket headed by p. Bear in mind that p is an element of the array page_hash_table.
71 static void add_page_to_hash_queue(struct page * page,
struct page **p)
72 {
73 struct page *next = *p;
74
75 *p = page;
76 page->next_hash = next;
77 page->pprev_hash = p;
78 if (next)
79 next->pprev_hash = &page->next_hash;
80 if (page->buffers)
81 PAGE_BUG(page);
82 atomic_inc(&page_cache_size);
83 }
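The hash argument passed to these functions comes from page_hash(), which is not listed in this section. For reference, the 2.4 implementation in include/linux/pagemap.h looks roughly like the following; treat it as a paraphrased sketch rather than a verbatim listing.

static inline unsigned long _page_hashfn(struct address_space * mapping,
                                         unsigned long index)
{
#define i (((unsigned long) mapping) / \
        (sizeof(struct inode) & ~(sizeof(struct inode) - 1)))
#define s(x) ((x) + ((x) >> PAGE_HASH_BITS))
        /* Mix the mapping pointer with the file index and fold the result
         * into the power-of-two sized page_hash_table */
        return s(i + index) & (PAGE_HASH_SIZE - 1);
#undef i
#undef s
}

#define page_hash(mapping,index) (page_hash_table + _page_hashfn(mapping,index))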
Acquire the pagecache_lock spinlock and call __remove_inode_page() to remove the page from the page cache. The page must already be locked by the caller.
130 void remove_inode_page(struct page *page)
131 {
132 if (!PageLocked(page))
133 PAGE_BUG(page);
134
135 spin_lock(&pagecache_lock);
136 __remove_inode_page(page);
137 spin_unlock(&pagecache_lock);
138 }
This is the top-level function for removing a page from the page cache for callers with the pagecache_lock spinlock held. Callers that do not have this lock acquired should call remove_inode_page().
124 void __remove_inode_page(struct page *page)
125 {
126 remove_page_from_inode_queue(page);
127 remove_page_from_hash_queue(page);
128 }
94 static inline void remove_page_from_inode_queue(struct page * page)
95 {
96 struct address_space * mapping = page->mapping;
97
98 if (mapping->a_ops->removepage)
99 mapping->a_ops->removepage(page);
100 list_del(&page->list);
101 page->mapping = NULL;
102 wmb();
103 mapping->nrpages--;
104 }
107 static inline void remove_page_from_hash_queue(struct page * page)
108 {
109 struct page *next = page->next_hash;
110 struct page **pprev = page->pprev_hash;
111
112 if (next)
113 next->pprev_hash = pprev;
114 *pprev = next;
115 page->pprev_hash = NULL;
116 atomic_dec(&page_cache_size);
117 }
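The pprev_hash field deserves a note: it holds the address of whatever pointer currently points at the page, whether that is a bucket head in page_hash_table or the next_hash field of the previous page, so deletion needs no special case for the head of a chain. The following self-contained toy program (ordinary user-space C, not kernel code) models the same idiom.

#include <stdio.h>

struct node {
        int key;
        struct node *next;
        struct node **pprev;    /* address of the pointer that points at us */
};

static void hash_add(struct node **bucket, struct node *n)
{
        struct node *next = *bucket;

        *bucket = n;
        n->next = next;
        n->pprev = bucket;
        if (next)
                next->pprev = &n->next;
}

static void hash_del(struct node *n)
{
        struct node *next = n->next;
        struct node **pprev = n->pprev;

        if (next)
                next->pprev = pprev;
        *pprev = next;          /* works for head and middle nodes alike */
        n->pprev = NULL;
}

int main(void)
{
        struct node *bucket = NULL;
        struct node a = { .key = 1 }, b = { .key = 2 };

        hash_add(&bucket, &a);
        hash_add(&bucket, &b);  /* bucket -> b -> a */
        hash_del(&b);           /* removing the head needs no special case */

        for (struct node *n = bucket; n; n = n->next)
                printf("%d\n", n->key);
        return 0;
}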
Source: include/linux/pagemap.h
31 #define page_cache_get(x) get_page(x)
Source: include/linux/pagemap.h
32 #define page_cache_release(x) __free_page(x)
Source: include/linux/pagemap.h
Top level macro for finding a page in the page cache. It simply looks up the page hash table entry and calls __find_get_page() to do the real work.
75 #define find_get_page(mapping, index) \
76         __find_get_page(mapping, index, page_hash(mapping, index))
This function is responsible for finding a struct page given an entry in page_hash_table as a starting point.
931 struct page * __find_get_page(struct address_space *mapping,
932 unsigned long offset, struct page **hash)
933 {
934 struct page *page;
935
936 /*
937 * We scan the hash list read-only. Addition to and removal from
938 * the hash-list needs a held write-lock.
939 */
940 spin_lock(&pagecache_lock);
941 page = __find_page_nolock(mapping, offset, *hash);
942 if (page)
943 page_cache_get(page);
944 spin_unlock(&pagecache_lock);
945 return page;
946 }
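A brief sketch of how a caller typically uses this pair of operations (a hypothetical caller, not taken from the kernel source): the page is returned with its reference count raised, so the caller must drop the reference with page_cache_release() when it is finished.

struct page *page;

page = find_get_page(mapping, index);    /* NULL if not in the page cache */
if (page) {
        /* ... examine or copy the page contents ... */
        page_cache_release(page);        /* drop the reference taken above */
}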
This function traverses the hash collision list looking for the page specified by the address_space and offset.
443 static inline struct page * __find_page_nolock(
struct address_space *mapping,
unsigned long offset,
struct page *page)
444 {
445 goto inside;
446
447 for (;;) {
448 page = page->next_hash;
449 inside:
450 if (!page)
451 goto not_found;
452 if (page->mapping != mapping)
453 continue;
454 if (page->index == offset)
455 break;
456 }
457
458 not_found:
459 return page;
460 }
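The goto inside construct simply enters the loop with the bucket head already loaded into page. Written without the goto, the traversal is equivalent to the following sketch (for clarity only):

for (; page; page = page->next_hash) {
        if (page->mapping != mapping)
                continue;
        if (page->index == offset)
                break;
}
return page;    /* NULL if no page in this bucket matched */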
Source: include/linux/pagemap.h
This is the top level macro for searching the page cache for a page and having it returned in a locked state.
84 #define find_lock_page(mapping, index) \
85         __find_lock_page(mapping, index, page_hash(mapping, index))
This function acquires the pagecache_lock spinlock before calling the core function __find_lock_page_helper() to locate the page and lock it.
1005 struct page * __find_lock_page (struct address_space *mapping,
1006 unsigned long offset, struct page **hash)
1007 {
1008 struct page *page;
1009
1010 spin_lock(&pagecache_lock);
1011 page = __find_lock_page_helper(mapping, offset, *hash);
1012 spin_unlock(&pagecache_lock);
1013 return page;
1014 }
This function uses __find_page_nolock() to locate a page within the page cache. If it is found, the page is locked before being returned to the caller.
972 static struct page * __find_lock_page_helper(
struct address_space *mapping,
973 unsigned long offset, struct page *hash)
974 {
975 struct page *page;
976
977 /*
978 * We scan the hash list read-only. Addition to and removal from
979 * the hash-list needs a held write-lock.
980 */
981 repeat:
982 page = __find_page_nolock(mapping, offset, hash);
983 if (page) {
984 page_cache_get(page);
985 if (TryLockPage(page)) {
986 spin_unlock(&pagecache_lock);
987 lock_page(page);
988 spin_lock(&pagecache_lock);
989
990 /* Has the page been re-allocated while we slept? */
991 if (page->mapping != mapping || page->index != offset) {
992 UnlockPage(page);
993 page_cache_release(page);
994 goto repeat;
995 }
996 }
997 }
998 return page;
999 }
Adds a page to the LRU inactive_list.
58 void lru_cache_add(struct page * page)
59 {
60 if (!PageLRU(page)) {
61 spin_lock(&pagemap_lru_lock);
62 if (!TestSetPageLRU(page))
63 add_page_to_inactive_list(page);
64 spin_unlock(&pagemap_lru_lock);
65 }
66 }
Adds the page to the active_list.
178 #define add_page_to_active_list(page) \
179 do { \
180 DEBUG_LRU_PAGE(page); \
181 SetPageActive(page); \
182 list_add(&(page)->lru, &active_list); \
183 nr_active_pages++; \
184 } while (0)
Adds the page to the inactive_list.
186 #define add_page_to_inactive_list(page) \
187 do { \
188 DEBUG_LRU_PAGE(page); \
189 list_add(&(page)->lru, &inactive_list); \
190 nr_inactive_pages++; \
191 } while (0)
Acquire the lock protecting the LRU lists before calling __lru_cache_del().
90 void lru_cache_del(struct page * page)
91 {
92 spin_lock(&pagemap_lru_lock);
93 __lru_cache_del(page);
94 spin_unlock(&pagemap_lru_lock);
95 }
Select which function is needed to remove the page from the LRU list.
75 void __lru_cache_del(struct page * page)
76 {
77 if (TestClearPageLRU(page)) {
78 if (PageActive(page)) {
79 del_page_from_active_list(page);
80 } else {
81 del_page_from_inactive_list(page);
82 }
83 }
84 }
Remove the page from the active_list.
193 #define del_page_from_active_list(page) \
194 do { \
195 list_del(&(page)->lru); \
196 ClearPageActive(page); \
197 nr_active_pages--; \
198 } while (0)
Remove the page from the inactive_list.
200 #define del_page_from_inactive_list(page) \
201 do { \
202 list_del(&(page)->lru); \
203 nr_inactive_pages--; \
204 } while (0)
This marks that a page has been referenced. If the page is already on the active_list or the referenced flag is clear, the referenced flag will simply be set. If the page is on the inactive_list and the referenced flag has already been set, activate_page() will be called to move the page to the top of the active_list.
1332 void mark_page_accessed(struct page *page)
1333 {
1334 if (!PageActive(page) && PageReferenced(page)) {
1335 activate_page(page);
1336 ClearPageReferenced(page);
1337 } else
1338 SetPageReferenced(page);
1339 }
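As a quick illustration of the two-touch behaviour, consider the following hypothetical calling sequence on a page sitting on the inactive_list (not taken from the kernel source):

mark_page_accessed(page);  /* !PageActive, !PageReferenced: PG_referenced is set */
mark_page_accessed(page);  /* !PageActive,  PageReferenced: activate_page() moves
                            * the page to the active_list and clears PG_referenced */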
Acquire the LRU lock before calling activate_page_nolock() which moves the page from the inactive_list to the active_list.
47 void activate_page(struct page * page)
48 {
49 spin_lock(&pagemap_lru_lock);
50 activate_page_nolock(page);
51 spin_unlock(&pagemap_lru_lock);
52 }
Move the page from the inactive_list to the active_list.
39 static inline void activate_page_nolock(struct page * page)
40 {
41 if (PageLRU(page) && !PageActive(page)) {
42 del_page_from_inactive_list(page);
43 add_page_to_active_list(page);
44 }
45 }
This section covers how pages are moved from the active lists to the inactive lists.
Move nr_pages pages from the active_list to the inactive_list. The parameter nr_pages is calculated by shrink_caches() and is chosen to try to keep the active list at roughly two-thirds the size of the page cache.
533 static void refill_inactive(int nr_pages)
534 {
535 struct list_head * entry;
536
537 spin_lock(&pagemap_lru_lock);
538 entry = active_list.prev;
539 while (nr_pages && entry != &active_list) {
540 struct page * page;
541
542 page = list_entry(entry, struct page, lru);
543 entry = entry->prev;
544 if (PageTestandClearReferenced(page)) {
545 list_del(&page->lru);
546 list_add(&page->lru, &active_list);
547 continue;
548 }
549
550 nr_pages--;
551
552 del_page_from_active_list(page);
553 add_page_to_inactive_list(page);
554 SetPageReferenced(page);
555 }
556 spin_unlock(&pagemap_lru_lock);
557 }
This section covers how a page is reclaimed once it has been selected for pageout.
338 static int shrink_cache(int nr_pages, zone_t * classzone,
unsigned int gfp_mask, int priority)
339 {
340 struct list_head * entry;
341 int max_scan = nr_inactive_pages / priority;
342 int max_mapped = min((nr_pages << (10 - priority)),
max_scan / 10);
343
344 spin_lock(&pagemap_lru_lock);
345 while (--max_scan >= 0 &&
(entry = inactive_list.prev) != &inactive_list) {
346 struct page * page;
347
348 if (unlikely(current->need_resched)) {
349 spin_unlock(&pagemap_lru_lock);
350 __set_current_state(TASK_RUNNING);
351 schedule();
352 spin_lock(&pagemap_lru_lock);
353 continue;
354 }
355
356 page = list_entry(entry, struct page, lru);
357
358 BUG_ON(!PageLRU(page));
359 BUG_ON(PageActive(page));
360
361 list_del(entry);
362 list_add(entry, &inactive_list);
363
364 /*
365 * Zero page counts can happen because we unlink the pages
366 * _after_ decrementing the usage count..
367 */
368 if (unlikely(!page_count(page)))
369 continue;
370
371 if (!memclass(page_zone(page), classzone))
372 continue;
373
374 /* Racy check to avoid trylocking when not worthwhile */
375 if (!page->buffers && (page_count(page) != 1 || !page->mapping))
376 goto page_mapped;
382 if (unlikely(TryLockPage(page))) {
383 if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
384 page_cache_get(page);
385 spin_unlock(&pagemap_lru_lock);
386 wait_on_page(page);
387 page_cache_release(page);
388 spin_lock(&pagemap_lru_lock);
389 }
390 continue;
391 }
The block above deals with a page that is locked by someone else and has the launder bit set. In this case, it is the second time the page has been found dirty: the first time, it was scheduled for IO and placed back on the list; this time, we wait until the IO is complete and then try to free the page.
392
393 if (PageDirty(page) &&
is_page_cache_freeable(page) &&
page->mapping) {
394 /*
395 * It is not critical here to write it only if
396 * the page is unmapped beause any direct writer
397 * like O_DIRECT would set the PG_dirty bitflag
398 * on the phisical page after having successfully
399 * pinned it and after the I/O to the page is finished,
400 * so the direct writes to the page cannot get lost.
401 */
402 int (*writepage)(struct page *);
403
404 writepage = page->mapping->a_ops->writepage;
405 if ((gfp_mask & __GFP_FS) && writepage) {
406 ClearPageDirty(page);
407 SetPageLaunder(page);
408 page_cache_get(page);
409 spin_unlock(&pagemap_lru_lock);
410
411 writepage(page);
412 page_cache_release(page);
413
414 spin_lock(&pagemap_lru_lock);
415 continue;
416 }
417 }
The block above handles the case where the page is dirty, is not mapped by any process, has no buffers and is backed by a file or device mapping. The page is cleaned with writepage() and will be reclaimed by the launder-handling block shown earlier once the IO is complete.
424 if (page->buffers) {
425 spin_unlock(&pagemap_lru_lock);
426
427 /* avoid to free a locked page */
428 page_cache_get(page);
429
430 if (try_to_release_page(page, gfp_mask)) {
431 if (!page->mapping) {
438 spin_lock(&pagemap_lru_lock);
439 UnlockPage(page);
440 __lru_cache_del(page);
441
442 /* effectively free the page here */
443 page_cache_release(page);
444
445 if (--nr_pages)
446 continue;
447 break;
448 } else {
454 page_cache_release(page);
455
456 spin_lock(&pagemap_lru_lock);
457 }
458 } else {
459 /* failed to drop the buffers so stop here */
460 UnlockPage(page);
461 page_cache_release(page);
462
463 spin_lock(&pagemap_lru_lock);
464 continue;
465 }
466 }
The block above handled a page with buffers associated with it: the buffers must be released with try_to_release_page() before the page itself can be freed.
468 spin_lock(&pagecache_lock);
469
470 /*
471 * this is the non-racy check for busy page.
472 */
473 if (!page->mapping || !is_page_cache_freeable(page)) {
474 spin_unlock(&pagecache_lock);
475 UnlockPage(page);
476 page_mapped:
477 if (--max_mapped >= 0)
478 continue;
479
484 spin_unlock(&pagemap_lru_lock);
485 swap_out(priority, gfp_mask, classzone);
486 return nr_pages;
487 }
493 if (PageDirty(page)) {
494 spin_unlock(&pagecache_lock);
495 UnlockPage(page);
496 continue;
497 }
498
499 /* point of no return */
500 if (likely(!PageSwapCache(page))) {
501 __remove_inode_page(page);
502 spin_unlock(&pagecache_lock);
503 } else {
504 swp_entry_t swap;
505 swap.val = page->index;
506 __delete_from_swap_cache(page);
507 spin_unlock(&pagecache_lock);
508 swap_free(swap);
509 }
510
511 __lru_cache_del(page);
512 UnlockPage(page);
513
514 /* effectively free the page here */
515 page_cache_release(page);
516
517 if (--nr_pages)
518 continue;
519 break;
520 }
521 spin_unlock(&pagemap_lru_lock);
522
523 return nr_pages;
524 }
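To put concrete numbers on the scanning limits at the top of the function: with the default priority of 6 (DEF_PRIORITY), say 3,000 pages on the inactive_list and nr_pages = 32, max_scan is 3000/6 = 500 and max_mapped is min(32 << 4, 500/10) = min(512, 50) = 50. If earlier passes fail and priority drops to 1, the same list gives max_scan = 3000 and max_mapped = min(32 << 9, 300) = 300, so each pass scans more of the list and tolerates more mapped pages before falling back to swap_out().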
The call graph for this function is shown in Figure 10.4. This is the top-level function for shrinking the caches: it reaps the slab caches, moves pages from the active_list to the inactive_list with refill_inactive(), shrinks the page cache with shrink_cache() and finally shrinks the dentry, inode and quota caches.
560 static int shrink_caches(zone_t * classzone, int priority,
unsigned int gfp_mask, int nr_pages)
561 {
562 int chunk_size = nr_pages;
563 unsigned long ratio;
564
565 nr_pages -= kmem_cache_reap(gfp_mask);
566 if (nr_pages <= 0)
567 return 0;
568
569 nr_pages = chunk_size;
570 /* try to keep the active list 2/3 of the size of the cache */
571 ratio = (unsigned long) nr_pages *
nr_active_pages / ((nr_inactive_pages + 1) * 2);
572 refill_inactive(ratio);
573
574 nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority);
575 if (nr_pages <= 0)
576 return 0;
577
578 shrink_dcache_memory(priority, gfp_mask);
579 shrink_icache_memory(priority, gfp_mask);
580 #ifdef CONFIG_QUOTA
581 shrink_dqcache_memory(DEF_PRIORITY, gfp_mask);
582 #endif
583
584 return nr_pages;
585 }
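To make the ratio calculation on line 571 concrete with made-up numbers: if nr_pages is 32 (SWAP_CLUSTER_MAX), nr_active_pages is 6000 and nr_inactive_pages is 2999, then ratio = 32 * 6000 / (3000 * 2) = 32, so one page is deactivated for each page the caller wants reclaimed. If the active list were already small relative to the cache, say 3000 active pages against 6000 inactive, the same calculation gives 32 * 3000 / (6001 * 2) = 7 and far fewer pages are moved.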
This function cycles through all pgdats and tries to balance the preferred allocation zone (usually ZONE_NORMAL) for each of them. This function is only called from one place, buffer.c:free_more_memory() when the buffer manager fails to create new buffers or grow existing ones. It calls try_to_free_pages() with GFP_NOIO as the gfp_mask.
This results in the first zone in pg_data_t→node_zonelists having pages freed so that buffers can grow. This array is the preferred order of zones to allocate from and usually begins with ZONE_NORMAL, which is required by the buffer manager. On NUMA architectures, some nodes may have ZONE_DMA as the preferred zone if the memory bank is dedicated to IO devices, and UML also uses only this zone. As the buffer manager is restricted in the zones it uses, there is no point balancing other zones.
607 int try_to_free_pages(unsigned int gfp_mask)
608 {
609 pg_data_t *pgdat;
610 zonelist_t *zonelist;
611 unsigned long pf_free_pages;
612 int error = 0;
613
614 pf_free_pages = current->flags & PF_FREE_PAGES;
615 current->flags &= ~PF_FREE_PAGES;
616
617 for_each_pgdat(pgdat) {
618 zonelist = pgdat->node_zonelists +
(gfp_mask & GFP_ZONEMASK);
619 error |= try_to_free_pages_zone(
zonelist->zones[0], gfp_mask);
620 }
621
622 current->flags |= pf_free_pages;
623 return error;
624 }
Try to free SWAP_CLUSTER_MAX pages from the requested zone. As well as being used by kswapd, this function is the entry point for the buddy allocator's direct-reclaim path.
587 int try_to_free_pages_zone(zone_t *classzone,
unsigned int gfp_mask)
588 {
589 int priority = DEF_PRIORITY;
590 int nr_pages = SWAP_CLUSTER_MAX;
591
592 gfp_mask = pf_gfp_mask(gfp_mask);
593 do {
594 nr_pages = shrink_caches(classzone, priority,
gfp_mask, nr_pages);
595 if (nr_pages <= 0)
596 return 1;
597 } while (--priority);
598
599 /*
600 * Hmm.. Cache shrink failed - time to kill something?
601 * Mhwahahhaha! This is the part I really like. Giggle.
602 */
603 out_of_memory();
604 return 0;
605 }
This section covers the path taken when too many process-mapped pages have been found in the LRU lists. This path starts scanning whole processes and reclaiming the mapped pages.
The call graph for this function is shown in Figure 10.5. This function linearly searches through the page tables of every process, trying to swap out SWAP_CLUSTER_MAX pages. The process it starts with is swap_mm and the starting address is mm→swap_address.
296 static int swap_out(unsigned int priority, unsigned int gfp_mask,
zone_t * classzone)
297 {
298 int counter, nr_pages = SWAP_CLUSTER_MAX;
299 struct mm_struct *mm;
300
301 counter = mmlist_nr;
302 do {
303 if (unlikely(current->need_resched)) {
304 __set_current_state(TASK_RUNNING);
305 schedule();
306 }
307
308 spin_lock(&mmlist_lock);
309 mm = swap_mm;
310 while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
311 mm->swap_address = 0;
312 mm = list_entry(mm->mmlist.next,
struct mm_struct, mmlist);
313 if (mm == swap_mm)
314 goto empty;
315 swap_mm = mm;
316 }
317
318 /* Make sure the mm doesn't disappear
when we drop the lock.. */
319 atomic_inc(&mm->mm_users);
320 spin_unlock(&mmlist_lock);
321
322 nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
323
324 mmput(mm);
325
326 if (!nr_pages)
327 return 1;
328 } while (--counter >= 0);
329
330 return 0;
331
332 empty:
333 spin_unlock(&mmlist_lock);
334 return 0;
335 }
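For orientation, the page table walk in the rest of this section forms a straightforward call chain, summarized below (a summary of the listed functions, not additional kernel code):

swap_out()                 selects an mm_struct, starting from swap_mm
  swap_out_mm()            walks the VMAs of that mm
    swap_out_vma()         walks the PGD entries covering one VMA
      swap_out_pgd()       walks the PMD entries in one PGD entry
        swap_out_pmd()     walks the PTEs in one PMD entry
          try_to_swap_out()  attempts to unmap and swap out a single page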
Walk through each VMA in the process and call swap_out_vma() for each one.
256 static inline int swap_out_mm(struct mm_struct * mm, int count,
int * mmcounter, zone_t * classzone)
257 {
258 unsigned long address;
259 struct vm_area_struct* vma;
260
265 spin_lock(&mm->page_table_lock);
266 address = mm->swap_address;
267 if (address == TASK_SIZE || swap_mm != mm) {
268 /* We raced: don't count this mm but try again */
269 ++*mmcounter;
270 goto out_unlock;
271 }
272 vma = find_vma(mm, address);
273 if (vma) {
274 if (address < vma->vm_start)
275 address = vma->vm_start;
276
277 for (;;) {
278 count = swap_out_vma(mm, vma, address,
count, classzone);
279 vma = vma->vm_next;
280 if (!vma)
281 break;
282 if (!count)
283 goto out_unlock;
284 address = vma->vm_start;
285 }
286 }
287 /* Indicate that we reached the end of address space */
288 mm->swap_address = TASK_SIZE;
289
290 out_unlock:
291 spin_unlock(&mm->page_table_lock);
292 return count;
293 }
Walk through this VMA and for each PGD in it, call swap_out_pgd().
227 static inline int swap_out_vma(struct mm_struct * mm,
struct vm_area_struct * vma,
unsigned long address, int count,
zone_t * classzone)
228 {
229 pgd_t *pgdir;
230 unsigned long end;
231
232 /* Don't swap out areas which are reserved */
233 if (vma->vm_flags & VM_RESERVED)
234 return count;
235
236 pgdir = pgd_offset(mm, address);
237
238 end = vma->vm_end;
239 BUG_ON(address >= end);
240 do {
241 count = swap_out_pgd(mm, vma, pgdir,
address, end, count, classzone);
242 if (!count)
243 break;
244 address = (address + PGDIR_SIZE) & PGDIR_MASK;
245 pgdir++;
246 } while (address && (address < end));
247 return count;
248 }
Step through all PMDs in the supplied PGD and call swap_out_pmd() for each.
197 static inline int swap_out_pgd(struct mm_struct * mm,
struct vm_area_struct * vma, pgd_t *dir,
unsigned long address, unsigned long end,
int count, zone_t * classzone)
198 {
199 pmd_t * pmd;
200 unsigned long pgd_end;
201
202 if (pgd_none(*dir))
203 return count;
204 if (pgd_bad(*dir)) {
205 pgd_ERROR(*dir);
206 pgd_clear(dir);
207 return count;
208 }
209
210 pmd = pmd_offset(dir, address);
211
212 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
213 if (pgd_end && (end > pgd_end))
214 end = pgd_end;
215
216 do {
217 count = swap_out_pmd(mm, vma, pmd,
address, end, count, classzone);
218 if (!count)
219 break;
220 address = (address + PMD_SIZE) & PMD_MASK;
221 pmd++;
222 } while (address && (address < end));
223 return count;
224 }
For each PTE in this PMD, call try_to_swap_out(). On completion, mm→swap_address is updated to record where the scan finished so the same pages are not examined again soon after this scan.
158 static inline int swap_out_pmd(struct mm_struct * mm,
struct vm_area_struct * vma, pmd_t *dir,
unsigned long address, unsigned long end,
int count, zone_t * classzone)
159 {
160 pte_t * pte;
161 unsigned long pmd_end;
162
163 if (pmd_none(*dir))
164 return count;
165 if (pmd_bad(*dir)) {
166 pmd_ERROR(*dir);
167 pmd_clear(dir);
168 return count;
169 }
170
171 pte = pte_offset(dir, address);
172
173 pmd_end = (address + PMD_SIZE) & PMD_MASK;
174 if (end > pmd_end)
175 end = pmd_end;
176
177 do {
178 if (pte_present(*pte)) {
179 struct page *page = pte_page(*pte);
180
181 if (VALID_PAGE(page) && !PageReserved(page)) {
182 count -= try_to_swap_out(mm, vma,
address, pte,
page, classzone);
183 if (!count) {
184 address += PAGE_SIZE;
185 break;
186 }
187 }
188 }
189 address += PAGE_SIZE;
190 pte++;
191 } while (address && (address < end));
192 mm->swap_address = address;
193 return count;
194 }
This function tries to swap out a page from a process. It is quite a large function, so it will be dealt with in parts: the preamble checks that decide whether the page should be unmapped at all, the removal of the page from the page tables, the case where the page is already in the swap cache and the case where a new swap entry must be allocated for a dirty page.
47 static inline int try_to_swap_out(struct mm_struct * mm,
struct vm_area_struct* vma,
unsigned long address,
pte_t * page_table,
struct page *page,
zone_t * classzone)
48 {
49 pte_t pte;
50 swp_entry_t entry;
51
52 /* Don't look at this pte if it's been accessed recently. */
53 if ((vma->vm_flags & VM_LOCKED) ||
ptep_test_and_clear_young(page_table)) {
54 mark_page_accessed(page);
55 return 0;
56 }
57
58 /* Don't bother unmapping pages that are active */
59 if (PageActive(page))
60 return 0;
61
62 /* Don't bother replenishing zones not under pressure.. */
63 if (!memclass(page_zone(page), classzone))
64 return 0;
65
66 if (TryLockPage(page))
67 return 0;
74 flush_cache_page(vma, address);
75 pte = ptep_get_and_clear(page_table);
76 flush_tlb_page(vma, address);
77
78 if (pte_dirty(pte))
79 set_page_dirty(page);
80
86 if (PageSwapCache(page)) {
87 entry.val = page->index;
88 swap_duplicate(entry);
89 set_swap_pte:
90 set_pte(page_table, swp_entry_to_pte(entry));
91 drop_pte:
92 mm->rss--;
93 UnlockPage(page);
94 {
95 int freeable =
page_count(page) - !!page->buffers <= 2;
96 page_cache_release(page);
97 return freeable;
98 }
99 }
The block above handles the case where the page is already in the swap cache: the existing swap entry is reused, the PTE is replaced with a swap entry and the page reference is dropped. The checks that follow decide what to do with other pages: file-backed and clean pages are simply dropped, and pages with buffers are preserved.
115 if (page->mapping)
116 goto drop_pte;
117 if (!PageDirty(page))
118 goto drop_pte;
124 if (page->buffers)
125 goto preserve;
126
127 /*
128 * This is a dirty, swappable page. First of all,
129 * get a suitable swap entry for it, and make sure
130 * we have the swap cache set up to associate the
131 * page with that swap entry.
132 */
133 for (;;) {
134 entry = get_swap_page();
135 if (!entry.val)
136 break;
137 /* Add it to the swap cache and mark it dirty
138 * (adding to the page cache will clear the dirty
139 * and uptodate bits, so we need to do it again)
140 */
141 if (add_to_swap_cache(page, entry) == 0) {
142 SetPageUptodate(page);
143 set_page_dirty(page);
144 goto set_swap_pte;
145 }
146 /* Raced with "speculative" read_swap_cache_async */
147 swap_free(entry);
148 }
149
150 /* No swap space left */
151 preserve:
152 set_pte(page_table, pte);
153 UnlockPage(page);
154 return 0;
155 }
This section details the main loops used by the kswapd daemon, which is woken up when memory is low. The main functions covered are the ones that determine if kswapd can sleep and how it determines which nodes need balancing.
Start the kswapd kernel thread.
767 static int __init kswapd_init(void)
768 {
769 printk("Starting kswapd\n");
770 swap_setup();
771 kernel_thread(kswapd, NULL, CLONE_FS
| CLONE_FILES
| CLONE_SIGNAL);
772 return 0;
773 }
The main function of the kswapd kernel thread.
720 int kswapd(void *unused)
721 {
722 struct task_struct *tsk = current;
723 DECLARE_WAITQUEUE(wait, tsk);
724
725 daemonize();
726 strcpy(tsk->comm, "kswapd");
727 sigfillset(&tsk->blocked);
728
741 tsk->flags |= PF_MEMALLOC;
742
746 for (;;) {
747 __set_current_state(TASK_INTERRUPTIBLE);
748 add_wait_queue(&kswapd_wait, &wait);
749
750 mb();
751 if (kswapd_can_sleep())
752 schedule();
753
754 __set_current_state(TASK_RUNNING);
755 remove_wait_queue(&kswapd_wait, &wait);
756
762 kswapd_balance();
763 run_task_queue(&tq_disk);
764 }
765 }
Simple function to cycle through all pgdats to call kswapd_can_sleep_pgdat() on each.
695 static int kswapd_can_sleep(void)
696 {
697 pg_data_t * pgdat;
698
699 for_each_pgdat(pgdat) {
700 if (!kswapd_can_sleep_pgdat(pgdat))
701 return 0;
702 }
703
704 return 1;
705 }
Cycles through all zones in the node to make sure none of them needs balancing. The zone→need_balance flag is set by __alloc_pages() when the number of free pages in the zone reaches the pages_low watermark.
680 static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
681 {
682 zone_t * zone;
683 int i;
684
685 for (i = pgdat->nr_zones-1; i >= 0; i--) {
686 zone = pgdat->node_zones + i;
687 if (!zone->need_balance)
688 continue;
689 return 0;
690 }
691
692 return 1;
693 }
Continuously cycle through each pgdat until none requires balancing.
667 static void kswapd_balance(void)
668 {
669 int need_more_balance;
670 pg_data_t * pgdat;
671
672 do {
673 need_more_balance = 0;
674
675 for_each_pgdat(pgdat)
676 need_more_balance |= kswapd_balance_pgdat(pgdat);
677 } while (need_more_balance);
678 }
This function checks if a node requires balancing by examining each of the zones in it. If any zone requires balancing, try_to_free_pages_zone() will be called.
641 static int kswapd_balance_pgdat(pg_data_t * pgdat)
642 {
643 int need_more_balance = 0, i;
644 zone_t * zone;
645
646 for (i = pgdat->nr_zones-1; i >= 0; i--) {
647 zone = pgdat->node_zones + i;
648 if (unlikely(current->need_resched))
649 schedule();
650 if (!zone->need_balance)
651 continue;
652 if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
653 zone->need_balance = 0;
654 __set_current_state(TASK_INTERRUPTIBLE);
655 schedule_timeout(HZ);
656 continue;
657 }
658 if (check_classzone_need_balance(zone))
659 need_more_balance = 1;
660 else
661 zone->need_balance = 0;
662 }
663
664 return need_more_balance;
665 }
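check_classzone_need_balance() is not listed here. From memory, the 2.4 implementation in mm/page_alloc.c is roughly the following sketch; it reports that balancing is still needed unless some zone, from the classzone down to the lowest zone in the node, has free pages above its pages_high watermark.

static int check_classzone_need_balance(zone_t * classzone)
{
        zone_t * first_classzone;

        first_classzone = classzone->zone_pgdat->node_zones;
        while (classzone >= first_classzone) {
                /* a zone with free pages above pages_high can satisfy
                 * allocations, so no more balancing is needed */
                if (classzone->free_pages > classzone->pages_high)
                        return 0;
                classzone--;
        }
        return 1;
}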