The call graph for this function is shown in Figure 6.3. It is declared as follows:
439 static inline struct page * alloc_pages(unsigned int gfp_mask,
unsigned int order)
440 {
444 if (order >= MAX_ORDER)
445 return NULL;
446 return _alloc_pages(gfp_mask, order);
447 }
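As a point of reference, a minimal caller sketch is shown below. It is illustrative only and assumes a 2.4 kernel context (headers such as <linux/mm.h> included) where the caller is allowed to sleep; the function name is hypothetical.

/* Illustrative fragment: request an order-2 block of four contiguous pages. */
static void example_order2_alloc(void)
{
        struct page *block;

        block = alloc_pages(GFP_KERNEL, 2);    /* 2^2 = 4 physically contiguous pages */
        if (block == NULL)
                return;

        /* ... use the pages ... */

        __free_pages(block, 2);                /* return the whole order-2 block */
}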
The function _alloc_pages() comes in two varieties. The first is designed to work only with UMA architectures such as the x86 and is in mm/page_alloc.c. It refers only to the static node contig_page_data. The second is in mm/numa.c and is a simple extension. It uses a node-local allocation policy, which means that memory will be allocated from the bank closest to the processor. For the purposes of this book, only the mm/page_alloc.c version will be examined, but developers on NUMA architectures should also read _alloc_pages() and _alloc_pages_pgdat() in mm/numa.c.
244 #ifndef CONFIG_DISCONTIGMEM
245 struct page *_alloc_pages(unsigned int gfp_mask,
unsigned int order)
246 {
247 return __alloc_pages(gfp_mask, order,
248 contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
249 }
250 #endif
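Note that the zonelist passed on to __alloc_pages() is chosen purely from the zone modifier bits of the gfp_mask. The standalone sketch below (not kernel code) models that indexing; the constant values are stated here as an assumption mirroring the 2.4 definitions.

#include <stdio.h>

/* Assumed 2.4-style values for the zone modifier bits. */
#define __GFP_DMA      0x01
#define __GFP_HIGHMEM  0x02
#define GFP_ZONEMASK   0x0f

int main(void)
{
        unsigned int masks[] = { 0x0, __GFP_HIGHMEM, __GFP_DMA };
        unsigned int i;

        for (i = 0; i < 3; i++)
                printf("zone bits 0x%02x select node_zonelists[%u]\n",
                       masks[i], masks[i] & GFP_ZONEMASK);
        return 0;
}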
At this stage, we've reached what is described as the “heart of the zoned buddy allocator”, the __alloc_pages() function. It is responsible for cycling through the fallback zones and selecting one suitable for the allocation. If memory is tight, it will take some steps to address the problem: it will wake kswapd and, if necessary, do the work of kswapd manually.
327 struct page * __alloc_pages(unsigned int gfp_mask,
unsigned int order,
zonelist_t *zonelist)
328 {
329 unsigned long min;
330 zone_t **zone, * classzone;
331 struct page * page;
332 int freed;
333
334 zone = zonelist->zones;
335 classzone = *zone;
336 if (classzone == NULL)
337 return NULL;
338 min = 1UL << order;
339 for (;;) {
340 zone_t *z = *(zone++);
341 if (!z)
342 break;
343
344 min += z->pages_low;
345 if (z->free_pages > min) {
346 page = rmqueue(z, order);
347 if (page)
348 return page;
349 }
350 }
352 classzone->need_balance = 1;
353 mb();
354 if (waitqueue_active(&kswapd_wait))
355 wake_up_interruptible(&kswapd_wait);
356
357 zone = zonelist->zones;
358 min = 1UL << order;
359 for (;;) {
360 unsigned long local_min;
361 zone_t *z = *(zone++);
362 if (!z)
363 break;
364
365 local_min = z->pages_min;
366 if (!(gfp_mask & __GFP_WAIT))
367 local_min >>= 2;
368 min += local_min;
369 if (z->free_pages > min) {
370 page = rmqueue(z, order);
371 if (page)
372 return page;
373 }
374 }
375
376 /* here we're in the low on memory slow path */
377
378 rebalance:
379 if (current->flags & (PF_MEMALLOC | PF_MEMDIE)) {
380 zone = zonelist->zones;
381 for (;;) {
382 zone_t *z = *(zone++);
383 if (!z)
384 break;
385
386 page = rmqueue(z, order);
387 if (page)
388 return page;
389 }
390 return NULL;
391 }
393 /* Atomic allocations - we can't balance anything */
394 if (!(gfp_mask & __GFP_WAIT))
395 return NULL;
396
397 page = balance_classzone(classzone, gfp_mask, order, &freed);
398 if (page)
399 return page;
400
401 zone = zonelist->zones;
402 min = 1UL << order;
403 for (;;) {
404 zone_t *z = *(zone++);
405 if (!z)
406 break;
407
408 min += z->pages_min;
409 if (z->free_pages > min) {
410 page = rmqueue(z, order);
411 if (page)
412 return page;
413 }
414 }
415
416 /* Don't let big-order allocations loop */
417 if (order > 3)
418 return NULL;
419
420 /* Yield for kswapd, and try again */
421 yield();
422 goto rebalance;
423 }
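Note how min accumulates z->pages_low across the fallback list, so each lower zone must cover its own reserve on top of the reserves of the zones already skipped. The short userspace sketch below models that first pass; the zone names and values are made up purely for illustration.

#include <stdio.h>

struct zone_model {                 /* toy stand-in for zone_t */
        const char   *name;
        unsigned long free_pages;
        unsigned long pages_low;
};

int main(void)
{
        /* Hypothetical fallback order for an order-0 request. */
        struct zone_model zones[] = {
                { "Normal", 200, 255 },
                { "DMA",    500, 128 },
        };
        unsigned int i, order = 0;
        unsigned long min = 1UL << order;

        for (i = 0; i < 2; i++) {
                min += zones[i].pages_low;      /* cumulative watermark */
                if (zones[i].free_pages > min) {
                        printf("allocate from %s (free %lu > min %lu)\n",
                               zones[i].name, zones[i].free_pages, min);
                        return 0;
                }
                printf("skip %s (free %lu <= min %lu)\n",
                       zones[i].name, zones[i].free_pages, min);
        }
        printf("no zone passed the watermark; wake kswapd\n");
        return 0;
}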
This function is called from __alloc_pages(). It is responsible for finding a block of memory large enough to be used for the allocation. If a block of memory of the requested size is not available, it will look for a larger order that may be split into two buddies. The actual splitting is performed by the expand() function (see Section F.1.5).
198 static FASTCALL(struct page *rmqueue(zone_t *zone,
unsigned int order));
199 static struct page * rmqueue(zone_t *zone, unsigned int order)
200 {
201 free_area_t * area = zone->free_area + order;
202 unsigned int curr_order = order;
203 struct list_head *head, *curr;
204 unsigned long flags;
205 struct page *page;
206
207 spin_lock_irqsave(&zone->lock, flags);
208 do {
209 head = &area->free_list;
210 curr = head->next;
211
212 if (curr != head) {
213 unsigned int index;
214
215 page = list_entry(curr, struct page, list);
216 if (BAD_RANGE(zone,page))
217 BUG();
218 list_del(curr);
219 index = page - zone->zone_mem_map;
220 if (curr_order != MAX_ORDER-1)
221 MARK_USED(index, curr_order, area);
222 zone->free_pages -= 1UL << order;
223
224 page = expand(zone, page, index, order,
curr_order, area);
225 spin_unlock_irqrestore(&zone->lock, flags);
226
227 set_page_count(page, 1);
228 if (BAD_RANGE(zone,page))
229 BUG();
230 if (PageLRU(page))
231 BUG();
232 if (PageActive(page))
233 BUG();
234 return page;
235 }
236 curr_order++;
237 area++;
238 } while (curr_order < MAX_ORDER);
239 spin_unlock_irqrestore(&zone->lock, flags);
240
241 return NULL;
242 }
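The loop above simply walks the free_area array upwards from the requested order until a non-empty free list is found; the block taken from that list is then split down by expand(). The toy model below (not kernel code, with made-up free list counts) shows the search on its own.

#include <stdio.h>

#define MAX_ORDER 10

int main(void)
{
        /* Hypothetical number of free blocks on each order's free list. */
        int free_blocks[MAX_ORDER] = { 0, 0, 0, 5, 2, 0, 0, 1, 0, 0 };
        unsigned int order = 1;
        unsigned int curr_order = order;

        while (curr_order < MAX_ORDER && free_blocks[curr_order] == 0)
                curr_order++;

        if (curr_order < MAX_ORDER)
                printf("order-%u request satisfied from an order-%u block\n",
                       order, curr_order);
        else
                printf("no block large enough: rmqueue() returns NULL\n");
        return 0;
}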
This function splits page blocks of higher orders until a page block of the needed order is available.
177 static inline struct page * expand (zone_t *zone,
struct page *page,
unsigned long index,
int low,
int high,
free_area_t * area)
179 {
180 unsigned long size = 1 << high;
181
182 while (high > low) {
183 if (BAD_RANGE(zone,page))
184 BUG();
185 area--;
186 high--;
187 size >>= 1;
188 list_add(&(page)->list, &(area)->free_list);
189 MARK_USED(index, high, area);
190 index += size;
191 page += size;
192 }
193 if (BAD_RANGE(zone,page))
194 BUG();
195 return page;
196 }
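A worked example may help here. The standalone sketch below (not kernel code) follows the same index arithmetic as expand() when an order-0 page must be carved out of an order-3 block of eight pages: at each step the first half of the remaining block goes back on the free list of the next lower order and the search continues in the second half.

#include <stdio.h>

int main(void)
{
        unsigned int high = 3, low = 0;       /* order found / order wanted */
        unsigned long index = 0;              /* start of the order-3 block */
        unsigned long size = 1UL << high;

        while (high > low) {
                high--;
                size >>= 1;
                printf("put order-%u buddy (pages %lu-%lu) on the free list\n",
                       high, index, index + size - 1);
                index += size;
        }
        printf("return the order-%u page at index %lu\n", low, index);
        return 0;
}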
This function is part of the direct-reclaim path. Allocation requests that can sleep will call this function to perform the work of kswapd in a synchronous fashion. As the process is performing the work itself, the pages it frees of the desired order are reserved in a linked list in current->local_pages and the number of page blocks in the list is stored in current->nr_local_pages. Note that the number of page blocks is not the same as the number of pages; a page block could be of any order.
253 static struct page * balance_classzone(zone_t * classzone,
unsigned int gfp_mask,
unsigned int order,
int * freed)
254 {
255 struct page * page = NULL;
256 int __freed = 0;
257
258 if (!(gfp_mask & __GFP_WAIT))
259 goto out;
260 if (in_interrupt())
261 BUG();
262
263 current->allocation_order = order;
264 current->flags |= PF_MEMALLOC | PF_FREE_PAGES;
265
266 __freed = try_to_free_pages_zone(classzone, gfp_mask);
267
268 current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);
269
270 if (current->nr_local_pages) {
271 struct list_head * entry, * local_pages;
272 struct page * tmp;
273 int nr_pages;
274
275 local_pages = &current->local_pages;
276
277 if (likely(__freed)) {
278 /* pick from the last inserted so we're lifo */
279 entry = local_pages->next;
280 do {
281 tmp = list_entry(entry, struct page, list);
282 if (tmp->index == order &&
memclass(page_zone(tmp), classzone)) {
283 list_del(entry);
284 current->nr_local_pages--;
285 set_page_count(tmp, 1);
286 page = tmp;
287
288 if (page->buffers)
289 BUG();
290 if (page->mapping)
291 BUG();
292 if (!VALID_PAGE(page))
293 BUG();
294 if (PageLocked(page))
295 BUG();
296 if (PageLRU(page))
297 BUG();
298 if (PageActive(page))
299 BUG();
300 if (PageDirty(page))
301 BUG();
302
303 break;
304 }
305 } while ((entry = entry->next) != local_pages);
306 }
Presuming that pages exist in the local_pages list, this block cycles through the list looking for a page block belonging to the desired zone and of the desired order.
308 nr_pages = current->nr_local_pages;
309 /* free in reverse order so that the global
* order will be lifo */
310 while ((entry = local_pages->prev) != local_pages) {
311 list_del(entry);
312 tmp = list_entry(entry, struct page, list);
313 __free_pages_ok(tmp, tmp->index);
314 if (!nr_pages--)
315 BUG();
316 }
317 current->nr_local_pages = 0;
318 }
319 out:
320 *freed = __freed;
321 return page;
322 }
This block frees the remaining pages in the list.
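To reinforce the point that nr_local_pages counts page blocks rather than pages, the small model below (not kernel code) sums 1 << order over a hypothetical local list, using the fact that each entry remembers its order in page->index.

#include <stdio.h>

int main(void)
{
        /* Hypothetical orders of the blocks on current->local_pages. */
        unsigned int orders[] = { 2, 0, 1 };
        unsigned int nr_local_pages = 3;
        unsigned int nr_pages = 0;
        unsigned int i;

        for (i = 0; i < nr_local_pages; i++)
                nr_pages += 1U << orders[i];

        printf("%u page blocks holding %u pages in total\n",
               nr_local_pages, nr_pages);
        return 0;
}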
This section covers the miscellaneous helper functions and macros that the Buddy Allocator provides for allocating pages. Very few of them do “real” work; they are available just for the convenience of the programmer.
This trivial macro just calls alloc_pages() with an order of 0 to return 1 page. It is declared as follows:
449 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
This trivial macro calls __get_free_pages() with an order of 0 to return 1 page. It is declared as follows:
454 #define __get_free_page(gfp_mask) \
455 __get_free_pages((gfp_mask),0)
This function is for callers who do not want to worry about struct pages and just want to get back an address they can use. It is declared as follows:
428 unsigned long __get_free_pages(unsigned int gfp_mask,
unsigned int order)
429 {
430 struct page * page;
431
432 page = alloc_pages(gfp_mask, order);
433 if (!page)
434 return 0;
435 return (unsigned long) page_address(page);
436 }
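An illustrative caller fragment is shown below (2.4 kernel context assumed, hypothetical function name): the caller only wants a virtual address, so it pairs __get_free_pages() with the matching free_pages() rather than dealing with struct page directly.

static void example_address_alloc(void)
{
        unsigned long addr;

        addr = __get_free_pages(GFP_KERNEL, 1);   /* two contiguous pages */
        if (addr == 0)
                return;

        /* ... use the buffer at addr ... */

        free_pages(addr, 1);
}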
This macro is of principal interest to device drivers. It will return memory from ZONE_DMA suitable for use with DMA devices. It is declared as follows:
457 #define __get_dma_pages(gfp_mask, order) \
458 __get_free_pages((gfp_mask) | GFP_DMA,(order))
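A hypothetical driver fragment is sketched below (illustrative only, 2.4 kernel context assumed): it requests an order-1 buffer guaranteed to come from ZONE_DMA and releases it with free_pages().

static void example_dma_buffer(void)
{
        unsigned long buf;

        buf = __get_dma_pages(GFP_KERNEL, 1);   /* 2 pages from ZONE_DMA */
        if (buf == 0)
                return;

        /* ... hand the buffer to the device (e.g. via virt_to_bus(),
         *     depending on the architecture) ... */

        free_pages(buf, 1);
}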
This function allocates one page and then zeroes out its contents. It is declared as follows:
438 unsigned long get_zeroed_page(unsigned int gfp_mask)
439 {
440 struct page * page;
441
442 page = alloc_pages(gfp_mask, 0);
443 if (page) {
444 void *address = page_address(page);
445 clear_page(address);
446 return (unsigned long) address;
447 }
448 return 0;
449 }
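A short usage fragment follows (illustrative only, 2.4 kernel context and hypothetical function name): a zero-filled page is useful wherever old data must not leak, for example into a buffer later exposed to userspace.

static void example_zeroed_page(void)
{
        unsigned long addr;

        addr = get_zeroed_page(GFP_KERNEL);
        if (addr == 0)
                return;

        /* ... addr points to PAGE_SIZE bytes of zeroed memory ... */

        free_page(addr);
}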
The call graph for this function is shown in Figure 6.4. Just to be confusing, the opposite of alloc_pages() is not free_pages(), it is __free_pages(). free_pages() is a helper function which takes an address as a parameter; it will be discussed in a later section.
451 void __free_pages(struct page *page, unsigned int order)
452 {
453 if (!PageReserved(page) && put_page_testzero(page))
454 __free_pages_ok(page, order);
455 }
This function will do the actual freeing of the page and coalesce the buddies if possible.
81 static void FASTCALL(__free_pages_ok (struct page *page,
unsigned int order));
82 static void __free_pages_ok (struct page *page, unsigned int order)
83 {
84 unsigned long index, page_idx, mask, flags;
85 free_area_t *area;
86 struct page *base;
87 zone_t *zone;
88
93 if (PageLRU(page)) {
94 if (unlikely(in_interrupt()))
95 BUG();
96 lru_cache_del(page);
97 }
98
99 if (page->buffers)
100 BUG();
101 if (page->mapping)
102 BUG();
103 if (!VALID_PAGE(page))
104 BUG();
105 if (PageLocked(page))
106 BUG();
107 if (PageActive(page))
108 BUG();
109 page->flags &= ~((1<<PG_referenced) | (1<<PG_dirty));
110
111 if (current->flags & PF_FREE_PAGES)
112 goto local_freelist;
113 back_local_freelist:
114
115 zone = page_zone(page);
116
117 mask = (~0UL) << order;
118 base = zone->zone_mem_map;
119 page_idx = page - base;
120 if (page_idx & ~mask)
121 BUG();
122 index = page_idx >> (1 + order);
123
124 area = zone->free_area + order;
125
126 spin_lock_irqsave(&zone->lock, flags);
127
128 zone->free_pages -= mask;
129
130 while (mask + (1 << (MAX_ORDER-1))) {
131 struct page *buddy1, *buddy2;
132
133 if (area >= zone->free_area + MAX_ORDER)
134 BUG();
135 if (!__test_and_change_bit(index, area->map))
136 /*
137 * the buddy page is still allocated.
138 */
139 break;
140 /*
141 * Move the buddy up one level.
142 * This code is taking advantage of the identity:
143 * -mask = 1+~mask
144 */
145 buddy1 = base + (page_idx ^ -mask);
146 buddy2 = base + page_idx;
147 if (BAD_RANGE(zone,buddy1))
148 BUG();
149 if (BAD_RANGE(zone,buddy2))
150 BUG();
151
152 list_del(&buddy1->list);
153 mask <<= 1;
154 area++;
155 index >>= 1;
156 page_idx &= mask;
157 }
158 list_add(&(base + page_idx)->list, &area->free_list);
159
160 spin_unlock_irqrestore(&zone->lock, flags);
161 return;
162
163 local_freelist:
164 if (current->nr_local_pages)
165 goto back_local_freelist;
166 if (in_interrupt())
167 goto back_local_freelist;
168
169 list_add(&page->list, &current->local_pages);
170 page->index = order;
171 current->nr_local_pages++;
172 }
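The coalescing loop relies on the identity noted in the source comment, -mask = 1 + ~mask, so that page_idx ^ -mask flips exactly the bit distinguishing a block from its buddy. The standalone sketch below (not kernel code) demonstrates the arithmetic for an order-2 block.

#include <stdio.h>

int main(void)
{
        unsigned int order = 2;
        unsigned long mask = (~0UL) << order;   /* ...11111100 */
        unsigned long page_idx = 12;            /* aligned to 1 << order */

        /* -mask == 1 + ~mask == 1 << order, so the XOR toggles the buddy bit. */
        printf("buddy of the order-%u block at index %lu is at index %lu\n",
               order, page_idx, page_idx ^ -mask);
        return 0;
}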
These functions are very similar to the page allocation helper functions in that they do no “real” work themselves and depend on the __free_pages() function to perform the actual free.
This function takes a virtual address instead of a struct page as the parameter identifying what to free. It is declared as follows:
457 void free_pages(unsigned long addr, unsigned int order)
458 {
459 if (addr != 0)
460 __free_pages(virt_to_page(addr), order);
461 }
This trivial macro just calls __free_pages() (see Section F.3.1) with an order of 0 to free a single page. It is declared as follows:
472 #define __free_page(page) __free_pages((page), 0)
This trivial macro just calls free_pages(). The essential difference between this macro and __free_page() is that free_page() takes a virtual address as a parameter while __free_page() takes a struct page. It is declared as follows:
472 #define free_page(addr) free_pages((addr),0)