The call graph for this function is shown in Figure 8.3. This function is responsible for the creation of a new cache and will be dealt with in chunks due to its size. The chunks roughly are: perform basic sanity checks for bad usage; perform debugging checks if CONFIG_SLAB_DEBUG is set; allocate a kmem_cache_t from the cache_cache slab cache; align the object size to the word size and, if requested, to the hardware cache; calculate how many objects will fit on a slab; calculate the colour offsets; initialise the remaining fields in the cache descriptor; and add the new cache to the cache chain.
621 kmem_cache_t *
622 kmem_cache_create (const char *name, size_t size,
623 size_t offset, unsigned long flags,
void (*ctor)(void*, kmem_cache_t *, unsigned long),
624 void (*dtor)(void*, kmem_cache_t *, unsigned long))
625 {
626 const char *func_nm = KERN_ERR "kmem_create: ";
627 size_t left_over, align, slab_size;
628 kmem_cache_t *cachep = NULL;
629
633 if ((!name) ||
634 ((strlen(name) >= CACHE_NAMELEN - 1)) ||
635 in_interrupt() ||
636 (size < BYTES_PER_WORD) ||
637 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
638 (dtor && !ctor) ||
639 (offset < 0 || offset > size))
640 BUG();
641
Perform basic sanity checks for bad usage
642 #if DEBUG
643 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
645 printk("%sNo con, but init state check
requested - %s\n", func_nm, name);
646 flags &= ~SLAB_DEBUG_INITIAL;
647 }
648
649 if ((flags & SLAB_POISON) && ctor) {
651 printk("%sPoisoning requested, but con given - %s\n",
func_nm, name);
652 flags &= ~SLAB_POISON;
653 }
654 #if FORCED_DEBUG
655 if ((size < (PAGE_SIZE>>3)) &&
!(flags & SLAB_MUST_HWCACHE_ALIGN))
660 flags |= SLAB_RED_ZONE;
661 if (!ctor)
662 flags |= SLAB_POISON;
663 #endif
664 #endif
670 BUG_ON(flags & ~CREATE_MASK);
This block performs debugging checks if CONFIG_SLAB_DEBUG is set
673 cachep =
(kmem_cache_t *) kmem_cache_alloc(&cache_cache,
SLAB_KERNEL);
674 if (!cachep)
675 goto opps;
676 memset(cachep, 0, sizeof(kmem_cache_t));
Allocate a kmem_cache_t from the cache_cache slab cache.
682 if (size & (BYTES_PER_WORD-1)) {
683 size += (BYTES_PER_WORD-1);
684 size &= ~(BYTES_PER_WORD-1);
685 printk("%sForcing size word alignment
- %s\n", func_nm, name);
686 }
687
688 #if DEBUG
689 if (flags & SLAB_RED_ZONE) {
694 flags &= ~SLAB_HWCACHE_ALIGN;
695 size += 2*BYTES_PER_WORD;
696 }
697 #endif
698 align = BYTES_PER_WORD;
699 if (flags & SLAB_HWCACHE_ALIGN)
700 align = L1_CACHE_BYTES;
701
703 if (size >= (PAGE_SIZE>>3))
708 flags |= CFLGS_OFF_SLAB;
709
710 if (flags & SLAB_HWCACHE_ALIGN) {
714 while (size < align/2)
715 align /= 2;
716 size = (size+align-1)&(~(align-1));
717 }
Align the object size to the word size of the architecture and, if SLAB_HWCACHE_ALIGN was requested, to the hardware cache. The while loop halves the alignment while the object is small enough for several objects to be packed into one cache line; for example, with 32-byte cache lines an 8-byte object is aligned to 16 bytes so that two objects share a line.
724 do {
725 unsigned int break_flag = 0;
726 cal_wastage:
727 kmem_cache_estimate(cachep->gfporder,
size, flags,
728 &left_over,
&cachep->num);
729 if (break_flag)
730 break;
731 if (cachep->gfporder >= MAX_GFP_ORDER)
732 break;
733 if (!cachep->num)
734 goto next;
735 if (flags & CFLGS_OFF_SLAB &&
cachep->num > offslab_limit) {
737 cachep->gfporder--;
738 break_flag++;
739 goto cal_wastage;
740 }
741
746 if (cachep->gfporder >= slab_break_gfp_order)
747 break;
748
749 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
750 break;
751 next:
752 cachep->gfporder++;
753 } while (1);
754
755 if (!cachep->num) {
756 printk("kmem_cache_create: couldn't
create cache %s.\n", name);
757 kmem_cache_free(&cache_cache, cachep);
758 cachep = NULL;
759 goto opps;
760 }
Calculate how many objects will fit on a slab and adjust the slab size as necessary
761 slab_size = L1_CACHE_ALIGN(
cachep->num*sizeof(kmem_bufctl_t) +
sizeof(slab_t));
762
767 if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
768 flags &= ~CFLGS_OFF_SLAB;
769 left_over -= slab_size;
770 }
Align the slab descriptor size to the L1 cache and, if there is enough space left over, keep the slab descriptor on-slab by clearing CFLGS_OFF_SLAB
773 offset += (align-1);
774 offset &= ~(align-1);
775 if (!offset)
776 offset = L1_CACHE_BYTES;
777 cachep->colour_off = offset;
778 cachep->colour = left_over/offset;
Calculate colour offsets. The requested offset is rounded up to the alignment and colour is the number of different offsets that fit in the left-over space; for example, 100 bytes left over with a 32-byte colour_off allows three colours.
781 if (!cachep->gfporder && !(flags & CFLGS_OFF_SLAB))
782 flags |= CFLGS_OPTIMIZE;
783
784 cachep->flags = flags;
785 cachep->gfpflags = 0;
786 if (flags & SLAB_CACHE_DMA)
787 cachep->gfpflags |= GFP_DMA;
788 spin_lock_init(&cachep->spinlock);
789 cachep->objsize = size;
790 INIT_LIST_HEAD(&cachep->slabs_full);
791 INIT_LIST_HEAD(&cachep->slabs_partial);
792 INIT_LIST_HEAD(&cachep->slabs_free);
793
794 if (flags & CFLGS_OFF_SLAB)
795 cachep->slabp_cache =
kmem_find_general_cachep(slab_size,0);
796 cachep->ctor = ctor;
797 cachep->dtor = dtor;
799 strcpy(cachep->name, name);
800
801 #ifdef CONFIG_SMP
802 if (g_cpucache_up)
803 enable_cpucache(cachep);
804 #endif
Initialise remaining fields in cache descriptor
806 down(&cache_chain_sem);
807 {
808 struct list_head *p;
809
810 list_for_each(p, &cache_chain) {
811 kmem_cache_t *pc = list_entry(p,
kmem_cache_t, next);
812
814 if (!strcmp(pc->name, name))
815 BUG();
816 }
817 }
818
822 list_add(&cachep->next, &cache_chain);
823 up(&cache_chain_sem);
824 opps:
825 return cachep;
826 }
Check that no cache of the same name already exists and add the new cache to the cache chain
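To put the interface in context, the following is a minimal sketch of how a 2.4-era driver might create a cache with this function and allocate objects from it. The object type, cache name and error handling are hypothetical; only the slab calls themselves are taken from the code commented in this section.

#include <linux/slab.h>
#include <linux/errno.h>

struct my_object {                      /* hypothetical object type */
        int id;
        char data[60];
};

static kmem_cache_t *my_cachep;

static int my_cache_setup(void)
{
        /* No constructor or destructor, objects aligned to the L1 cache */
        my_cachep = kmem_cache_create("my_object_cache",
                                      sizeof(struct my_object), 0,
                                      SLAB_HWCACHE_ALIGN, NULL, NULL);
        return my_cachep ? 0 : -ENOMEM;
}

static struct my_object *my_object_alloc(void)
{
        /* SLAB_KERNEL: the allocation may sleep while a new slab is grown */
        return kmem_cache_alloc(my_cachep, SLAB_KERNEL);
}

static void my_object_free(struct my_object *obj)
{
        kmem_cache_free(my_cachep, obj);
}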
During cache creation, it is determined how many objects can be stored in a slab and how much wastage there will be. The following function calculates how many objects may be stored, taking into account whether the slab descriptor and the bufctls must be stored on-slab.
388 static void kmem_cache_estimate (unsigned long gfporder,
size_t size,
389 int flags, size_t *left_over, unsigned int *num)
390 {
391 int i;
392 size_t wastage = PAGE_SIZE<<gfporder;
393 size_t extra = 0;
394 size_t base = 0;
395
396 if (!(flags & CFLGS_OFF_SLAB)) {
397 base = sizeof(slab_t);
398 extra = sizeof(kmem_bufctl_t);
399 }
400 i = 0;
401 while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
402 i++;
403 if (i > 0)
404 i--;
405
406 if (i > SLAB_LIMIT)
407 i = SLAB_LIMIT;
408
409 *num = i;
410 wastage -= i*size;
411 wastage -= L1_CACHE_ALIGN(base+i*extra);
412 *left_over = wastage;
413 }
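As a worked example of this calculation, the following user-space sketch runs the same loop with assumed values: a 4KiB page (gfporder 0), 100-byte objects with on-slab management, sizeof(slab_t) taken as 32 bytes, sizeof(kmem_bufctl_t) as 4 bytes and 32-byte L1 cache lines. The sizes are illustrative rather than those of any particular kernel build.

#include <stdio.h>

#define L1_ALIGN(x) (((x) + 31UL) & ~31UL)   /* assumed 32-byte cache lines */

int main(void)
{
        unsigned long wastage = 4096;   /* PAGE_SIZE << gfporder */
        unsigned long size = 100;       /* object size */
        unsigned long base = 32;        /* assumed sizeof(slab_t), on-slab */
        unsigned long extra = 4;        /* assumed sizeof(kmem_bufctl_t) */
        unsigned long i = 0;

        while (i * size + L1_ALIGN(base + i * extra) <= wastage)
                i++;
        if (i > 0)
                i--;

        wastage -= i * size;
        wastage -= L1_ALIGN(base + i * extra);

        /* Prints: num = 39 objects, left_over = 4 bytes */
        printf("num = %lu objects, left_over = %lu bytes\n", i, wastage);
        return 0;
}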
The call graph for kmem_cache_shrink() is shown in Figure 8.5. Two varieties of shrink functions are provided. kmem_cache_shrink() removes all slabs from slabs_free and returns the number of pages freed as a result. __kmem_cache_shrink() frees all slabs from slabs_free and then verifies that slabs_partial and slabs_full are empty. This is important during cache destruction when it doesn't matter how many pages are freed, just that the cache is empty.
This function performs basic debugging checks and then acquires the cache descriptor lock before freeing slabs. At one time, it also used to call drain_cpu_caches() to free up objects in the per-CPU caches. It is curious that this was removed because it is possible slabs could not be freed due to an object being held in a per-CPU cache while not actually in use.
966 int kmem_cache_shrink(kmem_cache_t *cachep)
967 {
968 int ret;
969
970 if (!cachep || in_interrupt() ||
!is_chained_kmem_cache(cachep))
971 BUG();
972
973 spin_lock_irq(&cachep->spinlock);
974 ret = __kmem_cache_shrink_locked(cachep);
975 spin_unlock_irq(&cachep->spinlock);
976
977 return ret << cachep->gfporder;
978 }
This function is identical to kmem_cache_shrink() except that it reports whether the cache still contains objects, rather than the number of pages freed. This is important during cache destruction, when it is not important how much memory was freed, just that it is safe to delete the cache and not leak memory.
945 static int __kmem_cache_shrink(kmem_cache_t *cachep)
946 {
947 int ret;
948
949 drain_cpu_caches(cachep);
950
951 spin_lock_irq(&cachep->spinlock);
952 __kmem_cache_shrink_locked(cachep);
953 ret = !list_empty(&cachep->slabs_full) ||
954 !list_empty(&cachep->slabs_partial);
955 spin_unlock_irq(&cachep->spinlock);
956 return ret;
957 }
This does the dirty work of freeing slabs. It keeps destroying them until the growing flag gets set, indicating the cache is in use, or until there are no more slabs left in slabs_free.
917 static int __kmem_cache_shrink_locked(kmem_cache_t *cachep)
918 {
919 slab_t *slabp;
920 int ret = 0;
921
923 while (!cachep->growing) {
924 struct list_head *p;
925
926 p = cachep->slabs_free.prev;
927 if (p == &cachep->slabs_free)
928 break;
929
930 slabp = list_entry(cachep->slabs_free.prev,
slab_t, list);
931 #if DEBUG
932 if (slabp->inuse)
933 BUG();
934 #endif
935 list_del(&slabp->list);
936
937 spin_unlock_irq(&cachep->spinlock);
938 kmem_slab_destroy(cachep, slabp);
939 ret++;
940 spin_lock_irq(&cachep->spinlock);
941 }
942 return ret;
943 }
When a module is unloaded, it is responsible for destroying any cache it has created because, during module loading, it is ensured that no two caches have the same name. Core kernel code often does not destroy its caches as their existence persists for the life of the system. The steps taken to destroy a cache are: delete it from the cache chain, shrink it to delete all slabs, free any per-CPU caches with kfree() and finally free the cache descriptor back to the cache_cache.
The call graph for this function is shown in Figure 8.7.
997 int kmem_cache_destroy (kmem_cache_t * cachep)
998 {
999 if (!cachep || in_interrupt() || cachep->growing)
1000 BUG();
1001
1002 /* Find the cache in the chain of caches. */
1003 down(&cache_chain_sem);
1004 /* the chain is never empty, cache_cache is never destroyed */
1005 if (clock_searchp == cachep)
1006 clock_searchp = list_entry(cachep->next.next,
1007 kmem_cache_t, next);
1008 list_del(&cachep->next);
1009 up(&cache_chain_sem);
1010
1011 if (__kmem_cache_shrink(cachep)) {
1012 printk(KERN_ERR
"kmem_cache_destroy: Can't free all objects %p\n",
1013 cachep);
1014 down(&cache_chain_sem);
1015 list_add(&cachep->next,&cache_chain);
1016 up(&cache_chain_sem);
1017 return 1;
1018 }
1019 #ifdef CONFIG_SMP
1020 {
1021 int i;
1022 for (i = 0; i < NR_CPUS; i++)
1023 kfree(cachep->cpudata[i]);
1024 }
1025 #endif
1026 kmem_cache_free(&cache_cache, cachep);
1027
1028 return 0;
1029 }
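A sketch of the unload path described above, assuming the module created my_cachep at load time with kmem_cache_create(); the module and cache names are hypothetical.

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/slab.h>

extern kmem_cache_t *my_cachep;         /* created at module init */

static void __exit my_module_exit(void)
{
        /* Every object must already have been freed back to the cache */
        if (kmem_cache_destroy(my_cachep))
                printk(KERN_ERR "my_module: cache still has active objects\n");
}

module_exit(my_module_exit);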
The call graph for this function is shown in Figure 8.4. Because of the size of this function, it will be broken up into three separate sections. The first is the simple function preamble, the second is the selection of a cache to reap and the third is the freeing of the slabs. The basic tasks were described in Section 8.1.7.
1738 int kmem_cache_reap (int gfp_mask)
1739 {
1740 slab_t *slabp;
1741 kmem_cache_t *searchp;
1742 kmem_cache_t *best_cachep;
1743 unsigned int best_pages;
1744 unsigned int best_len;
1745 unsigned int scan;
1746 int ret = 0;
1747
1748 if (gfp_mask & __GFP_WAIT)
1749 down(&cache_chain_sem);
1750 else
1751 if (down_trylock(&cache_chain_sem))
1752 return 0;
1753
1754 scan = REAP_SCANLEN;
1755 best_len = 0;
1756 best_pages = 0;
1757 best_cachep = NULL;
1758 searchp = clock_searchp;
1759 do {
1760 unsigned int pages;
1761 struct list_head* p;
1762 unsigned int full_free;
1763
1765 if (searchp->flags & SLAB_NO_REAP)
1766 goto next;
1767 spin_lock_irq(&searchp->spinlock);
1768 if (searchp->growing)
1769 goto next_unlock;
1770 if (searchp->dflags & DFLGS_GROWN) {
1771 searchp->dflags &= ~DFLGS_GROWN;
1772 goto next_unlock;
1773 }
1774 #ifdef CONFIG_SMP
1775 {
1776 cpucache_t *cc = cc_data(searchp);
1777 if (cc && cc->avail) {
1778 __free_block(searchp, cc_entry(cc),
cc->avail);
1779 cc->avail = 0;
1780 }
1781 }
1782 #endif
1783
1784 full_free = 0;
1785 p = searchp->slabs_free.next;
1786 while (p != &searchp->slabs_free) {
1787 slabp = list_entry(p, slab_t, list);
1788 #if DEBUG
1789 if (slabp->inuse)
1790 BUG();
1791 #endif
1792 full_free++;
1793 p = p->next;
1794 }
1795
1801 pages = full_free * (1<<searchp->gfporder);
1802 if (searchp->ctor)
1803 pages = (pages*4+1)/5;
1804 if (searchp->gfporder)
1805 pages = (pages*4+1)/5;
1806 if (pages > best_pages) {
1807 best_cachep = searchp;
1808 best_len = full_free;
1809 best_pages = pages;
1810 if (pages >= REAP_PERFECT) {
1811 clock_searchp =
list_entry(searchp->next.next,
1812 kmem_cache_t,next);
1813 goto perfect;
1814 }
1815 }
1816 next_unlock:
1817 spin_unlock_irq(&searchp->spinlock);
1818 next:
1819 searchp =
list_entry(searchp->next.next,kmem_cache_t,next);
1820 } while (--scan && searchp != clock_searchp);
This block examines REAP_SCANLEN caches to select one to free. The count of freeable pages is discounted for caches with constructors and for caches using high-order allocations; for example, 10 free order-1 slabs count as 20 pages, reduced first to 16 and then to 13 by the two adjustments.
1822 clock_searchp = searchp;
1823
1824 if (!best_cachep)
1826 goto out;
1827
1828 spin_lock_irq(&best_cachep->spinlock);
1829 perfect:
1830 /* free only 50% of the free slabs */
1831 best_len = (best_len + 1)/2;
1832 for (scan = 0; scan < best_len; scan++) {
1833 struct list_head *p;
1834
1835 if (best_cachep->growing)
1836 break;
1837 p = best_cachep->slabs_free.prev;
1838 if (p == &best_cachep->slabs_free)
1839 break;
1840 slabp = list_entry(p,slab_t,list);
1841 #if DEBUG
1842 if (slabp->inuse)
1843 BUG();
1844 #endif
1845 list_del(&slabp->list);
1846 STATS_INC_REAPED(best_cachep);
1847
1848 /* Safe to drop the lock. The slab is no longer
1849 * linked to the cache.
1850 */
1851 spin_unlock_irq(&best_cachep->spinlock);
1852 kmem_slab_destroy(best_cachep, slabp);
1853 spin_lock_irq(&best_cachep->spinlock);
1854 }
1855 spin_unlock_irq(&best_cachep->spinlock);
1856 ret = scan * (1 << best_cachep->gfporder);
1857 out:
1858 up(&cache_chain_sem);
1859 return ret;
1860 }
This block will free half of the slabs from the selected cache
This function will either allocate space to keep the slab descriptor off-slab or reserve enough space at the beginning of the slab for the descriptor and the bufctls.
1032 static inline slab_t * kmem_cache_slabmgmt (
kmem_cache_t *cachep,
1033 void *objp,
int colour_off,
int local_flags)
1034 {
1035 slab_t *slabp;
1036
1037 if (OFF_SLAB(cachep)) {
1039 slabp = kmem_cache_alloc(cachep->slabp_cache,
local_flags);
1040 if (!slabp)
1041 return NULL;
1042 } else {
1047 slabp = objp+colour_off;
1048 colour_off += L1_CACHE_ALIGN(cachep->num *
1049 sizeof(kmem_bufctl_t) +
sizeof(slab_t));
1050 }
1051 slabp->inuse = 0;
1052 slabp->colouroff = colour_off;
1053 slabp->s_mem = objp+colour_off;
1054
1055 return slabp;
1056 }
If the slab descriptor is to be kept off-slab, this function, called during cache creation, will find the appropriate sizes cache to use; a pointer to it is stored in the cache descriptor field slabp_cache. For example, a slab descriptor requiring 600 bytes would be allocated from the size-1024 cache, assuming the usual power-of-two geometry of the sizes caches.
1620 kmem_cache_t * kmem_find_general_cachep (size_t size,
int gfpflags)
1621 {
1622 cache_sizes_t *csizep = cache_sizes;
1623
1628 for ( ; csizep->cs_size; csizep++) {
1629 if (size > csizep->cs_size)
1630 continue;
1631 break;
1632 }
1633 return (gfpflags & GFP_DMA) ? csizep->cs_dmacachep :
csizep->cs_cachep;
1634 }
The call graph for this function is shown in Figure 8.11. The basic tasks for this function are: perform basic sanity checks to guard against bad usage; calculate the colour offset for objects in this slab; allocate memory for the slab and acquire a slab descriptor; link the pages used for the slab to the slab and cache descriptors; initialise the objects in the slab; and add the slab to the cache.
1105 static int kmem_cache_grow (kmem_cache_t * cachep, int flags)
1106 {
1107 slab_t *slabp;
1108 struct page *page;
1109 void *objp;
1110 size_t offset;
1111 unsigned int i, local_flags;
1112 unsigned long ctor_flags;
1113 unsigned long save_flags;
Basic declarations. The parameters of the function are cachep, the cache to grow by one slab, and flags, the flags used for the allocation.
1118 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
1119 BUG();
1120 if (flags & SLAB_NO_GROW)
1121 return 0;
1122
1129 if (in_interrupt() &&
(flags & SLAB_LEVEL_MASK) != SLAB_ATOMIC)
1130 BUG();
1131
1132 ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1133 local_flags = (flags & SLAB_LEVEL_MASK);
1134 if (local_flags == SLAB_ATOMIC)
1139 ctor_flags |= SLAB_CTOR_ATOMIC;
Perform basic sanity checks to guard against bad usage. The checks are made here rather than in kmem_cache_alloc() to protect the speed-critical path. There is no point checking the flags every time an object needs to be allocated.
1142 spin_lock_irqsave(&cachep->spinlock, save_flags);
1143
1145 offset = cachep->colour_next;
1146 cachep->colour_next++;
1147 if (cachep->colour_next >= cachep->colour)
1148 cachep->colour_next = 0;
1149 offset *= cachep->colour_off;
1150 cachep->dflags |= DFLGS_GROWN;
1151
1152 cachep->growing++;
1153 spin_unlock_irqrestore(&cachep->spinlock, save_flags);
Calculate the colour offset for objects in this slab. colour_next cycles from 0 up to colour - 1 so that successive slabs start their objects at different offsets within the hardware cache.
1165 if (!(objp = kmem_getpages(cachep, flags)))
1166 goto failed;
1167
1169 if (!(slabp = kmem_cache_slabmgmt(cachep,
objp, offset,
local_flags)))
1170 goto opps1;
Allocate memory for the slab and acquire a slab descriptor
1173 i = 1 << cachep->gfporder;
1174 page = virt_to_page(objp);
1175 do {
1176 SET_PAGE_CACHE(page, cachep);
1177 SET_PAGE_SLAB(page, slabp);
1178 PageSetSlab(page);
1179 page++;
1180 } while (--i);
Link the pages used for the slab to the slab and cache descriptors
1182 kmem_cache_init_objs(cachep, slabp, ctor_flags);
1184 spin_lock_irqsave(&cachep->spinlock, save_flags);
1185 cachep->growing--;
1186
1188 list_add_tail(&slabp->list, &cachep->slabs_free);
1189 STATS_INC_GROWN(cachep);
1190 cachep->failures = 0;
1191
1192 spin_unlock_irqrestore(&cachep->spinlock, save_flags);
1193 return 1;
Add the slab to the cache
1194 opps1:
1195 kmem_freepages(cachep, objp);
1196 failed:
1197 spin_lock_irqsave(&cachep->spinlock, save_flags);
1198 cachep->growing--;
1199 spin_unlock_irqrestore(&cachep->spinlock, save_flags);
1200 return 0;
1201 }
Error handling
The call graph for this function is shown in Figure 8.13. For readability, the debugging sections have been omitted from this function, but they are almost identical to the debugging sections during object allocation. See Section H.3.1.1 for how the markers and poison pattern are checked.
555 static void kmem_slab_destroy (kmem_cache_t *cachep, slab_t *slabp)
556 {
557 if (cachep->dtor
561 ) {
562 int i;
563 for (i = 0; i < cachep->num; i++) {
564 void* objp = slabp->s_mem+cachep->objsize*i;
565-574 DEBUG: Check red zone markers
575 if (cachep->dtor)
576 (cachep->dtor)(objp, cachep, 0);
577-584 DEBUG: Check poison pattern
585 }
586 }
587
588 kmem_freepages(cachep, slabp->s_mem-slabp->colouroff);
589 if (OFF_SLAB(cachep))
590 kmem_cache_free(cachep->slabp_cache, slabp);
591 }
This section will cover how objects are managed. At this point, most of the real hard work has been completed by either the cache or slab managers.
The vast majority of this function is involved with debugging, so we will start with the function without the debugging and explain that in detail before handling the debugging part. The two debugging sections are marked in the code excerpt below as Part 1 and Part 2.
1058 static inline void kmem_cache_init_objs (kmem_cache_t * cachep,
1059 slab_t * slabp, unsigned long ctor_flags)
1060 {
1061 int i;
1062
1063 for (i = 0; i < cachep->num; i++) {
1064 void* objp = slabp->s_mem+cachep->objsize*i;
1065-1072 /* Debugging Part 1 */
1079 if (cachep->ctor)
1080 cachep->ctor(objp, cachep, ctor_flags);
1081-1094 /* Debugging Part 2 */
1095 slab_bufctl(slabp)[i] = i+1;
1096 }
1097 slab_bufctl(slabp)[i-1] = BUFCTL_END;
1098 slabp->free = 0;
1099 }
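To make the bufctl handling concrete, the following user-space sketch models the free list built by the loop above for a slab of four objects. BUFCTL_END is assumed to be an all-ones end-of-list marker; the array entries form a singly linked list of free object indices and slabp->free holds the head.

#include <stdio.h>

#define BUFCTL_END 0xffffffffU          /* assumed end-of-list marker */
#define NUM        4                    /* objects per slab in this example */

int main(void)
{
        unsigned int bufctl[NUM];
        unsigned int free, i;

        /* Mirror of the initialisation loop: each entry points to the next */
        for (i = 0; i < NUM; i++)
                bufctl[i] = i + 1;
        bufctl[NUM - 1] = BUFCTL_END;
        free = 0;                       /* slabp->free: first free object */

        /* Allocation follows the list, as kmem_cache_alloc_one_tail() does */
        while (free != BUFCTL_END) {
                printf("allocated object %u\n", free);
                free = bufctl[free];
        }
        return 0;
}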
That covers the core of initialising objects. Next, the first debugging part will be covered.
1065 #if DEBUG
1066 if (cachep->flags & SLAB_RED_ZONE) {
1067 *((unsigned long*)(objp)) = RED_MAGIC1;
1068 *((unsigned long*)(objp + cachep->objsize -
1069 BYTES_PER_WORD)) = RED_MAGIC1;
1070 objp += BYTES_PER_WORD;
1071 }
1072 #endif
1081 #if DEBUG
1082 if (cachep->flags & SLAB_RED_ZONE)
1083 objp -= BYTES_PER_WORD;
1084 if (cachep->flags & SLAB_POISON)
1086 kmem_poison_obj(cachep, objp);
1087 if (cachep->flags & SLAB_RED_ZONE) {
1088 if (*((unsigned long*)(objp)) != RED_MAGIC1)
1089 BUG();
1090 if (*((unsigned long*)(objp + cachep->objsize -
1091 BYTES_PER_WORD)) != RED_MAGIC1)
1092 BUG();
1093 }
1094 #endif
This is the debugging block that takes place after the constructor, if it exists, has been called.
The call graph for this function is shown in Figure 8.14. This trivial function simply calls __kmem_cache_alloc().
1529 void * kmem_cache_alloc (kmem_cache_t *cachep, int flags)
1531 {
1532 return __kmem_cache_alloc(cachep, flags);
1533 }
This shows the parts of the function specific to the UP case. The SMP case will be dealt with in the next section.
1338 static inline void * __kmem_cache_alloc (kmem_cache_t *cachep,
int flags)
1339 {
1340 unsigned long save_flags;
1341 void* objp;
1342
1343 kmem_cache_alloc_head(cachep, flags);
1344 try_again:
1345 local_irq_save(save_flags);
1367 objp = kmem_cache_alloc_one(cachep);
1369 local_irq_restore(save_flags);
1370 return objp;
1371 alloc_new_slab:
1376 local_irq_restore(save_flags);
1377 if (kmem_cache_grow(cachep, flags))
1381 goto try_again;
1382 return NULL;
1383 }
This is what the function looks like in the SMP case.
1338 static inline void * __kmem_cache_alloc (kmem_cache_t *cachep,
int flags)
1339 {
1340 unsigned long save_flags;
1341 void* objp;
1342
1343 kmem_cache_alloc_head(cachep, flags);
1344 try_again:
1345 local_irq_save(save_flags);
1347 {
1348 cpucache_t *cc = cc_data(cachep);
1349
1350 if (cc) {
1351 if (cc->avail) {
1352 STATS_INC_ALLOCHIT(cachep);
1353 objp = cc_entry(cc)[--cc->avail];
1354 } else {
1355 STATS_INC_ALLOCMISS(cachep);
1356 objp =
kmem_cache_alloc_batch(cachep,cc,flags);
1357 if (!objp)
1358 goto alloc_new_slab_nolock;
1359 }
1360 } else {
1361 spin_lock(&cachep->spinlock);
1362 objp = kmem_cache_alloc_one(cachep);
1363 spin_unlock(&cachep->spinlock);
1364 }
1365 }
1366 local_irq_restore(save_flags);
1370 return objp;
1371 alloc_new_slab:
1373 spin_unlock(&cachep->spinlock);
1374 alloc_new_slab_nolock:
1375 local_irq_restore(save_flags);
1377 if (kmem_cache_grow(cachep, flags))
1381 goto try_again;
1382 return NULL;
1383 }
This simple function ensures the right combination of slab and GFP flags are used for allocation from a slab. If a cache is for DMA use, this function will make sure the caller does not accidentally request normal memory, and vice versa.
1231 static inline void kmem_cache_alloc_head(kmem_cache_t *cachep,
int flags)
1232 {
1233 if (flags & SLAB_DMA) {
1234 if (!(cachep->gfpflags & GFP_DMA))
1235 BUG();
1236 } else {
1237 if (cachep->gfpflags & GFP_DMA)
1238 BUG();
1239 }
1240 }
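As an illustration of the pairing being checked, the following sketch creates a DMA cache and allocates from it with SLAB_DMA set; the cache name and object size are made up. Passing plain SLAB_KERNEL to this cache, or SLAB_DMA to an ordinary cache, would trigger the BUG() above.

#include <linux/slab.h>
#include <linux/errno.h>

static kmem_cache_t *dma_cachep;

static int dma_pool_init(void)
{
        /* SLAB_CACHE_DMA means the cache is backed by ZONE_DMA pages,
         * so cachep->gfpflags has GFP_DMA set at creation time */
        dma_cachep = kmem_cache_create("dma_buffers", 512, 0,
                                       SLAB_CACHE_DMA, NULL, NULL);
        return dma_cachep ? 0 : -ENOMEM;
}

static void *dma_buffer_alloc(void)
{
        /* The allocation flags must also carry SLAB_DMA to satisfy
         * kmem_cache_alloc_head() */
        return kmem_cache_alloc(dma_cachep, SLAB_KERNEL | SLAB_DMA);
}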
This is a preprocessor macro. It may seem strange not to make this an inline function, but the macro contains a goto to the label alloc_new_slab, which is declared in the caller __kmem_cache_alloc(). This only works because the preprocessor expands the code textually into the caller (see Section H.3.2.2). A sketch of the technique follows the macro.
1283 #define kmem_cache_alloc_one(cachep) \
1284 ({ \
1285 struct list_head * slabs_partial, * entry; \
1286 slab_t *slabp; \
1287 \
1288 slabs_partial = &(cachep)->slabs_partial; \
1289 entry = slabs_partial->next; \
1290 if (unlikely(entry == slabs_partial)) { \
1291 struct list_head * slabs_free; \
1292 slabs_free = &(cachep)->slabs_free; \
1293 entry = slabs_free->next; \
1294 if (unlikely(entry == slabs_free)) \
1295 goto alloc_new_slab; \
1296 list_del(entry); \
1297 list_add(entry, slabs_partial); \
1298 } \
1299 \
1300 slabp = list_entry(entry, slab_t, list); \
1301 kmem_cache_alloc_one_tail(cachep, slabp); \
1302 })
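To illustrate why a macro is required, the following user-space sketch (using the same GCC statement-expression extension as the kernel) contains a goto whose target label lives in the calling function, which is exactly what kmem_cache_alloc_one() relies on and what an inline function could not do. The names take_one, refill and try_again are invented for the illustration.

#include <stdio.h>

#define take_one(counter)                                         \
({                                                                \
        if ((counter) == 0)                                       \
                goto refill;            /* label in the caller */ \
        --(counter);                                              \
})

int main(void)
{
        int available = 0;

try_again:
        take_one(available);
        printf("got one, %d left\n", available);
        return 0;

refill:
        available = 3;                  /* pretend a new slab was grown */
        goto try_again;
}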
This function is responsible for the allocation of one object from a slab. Much of it is debugging code.
1242 static inline void * kmem_cache_alloc_one_tail (
kmem_cache_t *cachep,
1243 slab_t *slabp)
1244 {
1245 void *objp;
1246
1247 STATS_INC_ALLOCED(cachep);
1248 STATS_INC_ACTIVE(cachep);
1249 STATS_SET_HIGH(cachep);
1250
1252 slabp->inuse++;
1253 objp = slabp->s_mem + slabp->free*cachep->objsize;
1254 slabp->free=slab_bufctl(slabp)[slabp->free];
1255
1256 if (unlikely(slabp->free == BUFCTL_END)) {
1257 list_del(&slabp->list);
1258 list_add(&slabp->list, &cachep->slabs_full);
1259 }
1260 #if DEBUG
1261 if (cachep->flags & SLAB_POISON)
1262 if (kmem_check_poison_obj(cachep, objp))
1263 BUG();
1264 if (cachep->flags & SLAB_RED_ZONE) {
1266 if (xchg((unsigned long *)objp, RED_MAGIC2) !=
1267 RED_MAGIC1)
1268 BUG();
1269 if (xchg((unsigned long *)(objp+cachep->objsize -
1270 BYTES_PER_WORD), RED_MAGIC2) != RED_MAGIC1)
1271 BUG();
1272 objp += BYTES_PER_WORD;
1273 }
1274 #endif
1275 return objp;
1276 }
This function allocates a batch of objects to a CPU cache of objects. It is only used in the SMP case. In many ways it is very similar to kmem_cache_alloc_one() (see Section H.3.2.5).
1305 void* kmem_cache_alloc_batch(kmem_cache_t* cachep,
cpucache_t* cc, int flags)
1306 {
1307 int batchcount = cachep->batchcount;
1308
1309 spin_lock(&cachep->spinlock);
1310 while (batchcount--) {
1311 struct list_head * slabs_partial, * entry;
1312 slab_t *slabp;
1313 /* Get slab alloc is to come from. */
1314 slabs_partial = &(cachep)->slabs_partial;
1315 entry = slabs_partial->next;
1316 if (unlikely(entry == slabs_partial)) {
1317 struct list_head * slabs_free;
1318 slabs_free = &(cachep)->slabs_free;
1319 entry = slabs_free->next;
1320 if (unlikely(entry == slabs_free))
1321 break;
1322 list_del(entry);
1323 list_add(entry, slabs_partial);
1324 }
1325
1326 slabp = list_entry(entry, slab_t, list);
1327 cc_entry(cc)[cc->avail++] =
1328 kmem_cache_alloc_one_tail(cachep, slabp);
1329 }
1330 spin_unlock(&cachep->spinlock);
1331
1332 if (cc->avail)
1333 return cc_entry(cc)[--cc->avail];
1334 return NULL;
1335 }
The call graph for this function is shown in Figure 8.15.
1576 void kmem_cache_free (kmem_cache_t *cachep, void *objp)
1577 {
1578 unsigned long flags;
1579 #if DEBUG
1580 CHECK_PAGE(virt_to_page(objp));
1581 if (cachep != GET_PAGE_CACHE(virt_to_page(objp)))
1582 BUG();
1583 #endif
1584
1585 local_irq_save(flags);
1586 __kmem_cache_free(cachep, objp);
1587 local_irq_restore(flags);
1588 }
This covers what the function looks like in the UP case. Clearly, it simply releases the object to the slab.
1493 static inline void __kmem_cache_free (kmem_cache_t *cachep,
void* objp)
1494 {
1517 kmem_cache_free_one(cachep, objp);
1519 }
This case is slightly more interesting. The object is returned to the per-CPU cache if one exists; if the per-CPU cache is already full, a batch of objects is first freed back to the slabs with free_block() to make room.
1493 static inline void __kmem_cache_free (kmem_cache_t *cachep,
void* objp)
1494 {
1496 cpucache_t *cc = cc_data(cachep);
1497
1498 CHECK_PAGE(virt_to_page(objp));
1499 if (cc) {
1500 int batchcount;
1501 if (cc->avail < cc->limit) {
1502 STATS_INC_FREEHIT(cachep);
1503 cc_entry(cc)[cc->avail++] = objp;
1504 return;
1505 }
1506 STATS_INC_FREEMISS(cachep);
1507 batchcount = cachep->batchcount;
1508 cc->avail -= batchcount;
1509 free_block(cachep,
1510 &cc_entry(cc)[cc->avail],batchcount);
1511 cc_entry(cc)[cc->avail++] = objp;
1512 return;
1513 } else {
1514 free_block(cachep, &objp, 1);
1515 }
1519 }
1414 static inline void kmem_cache_free_one(kmem_cache_t *cachep,
void *objp)
1415 {
1416 slab_t* slabp;
1417
1418 CHECK_PAGE(virt_to_page(objp));
1425 slabp = GET_PAGE_SLAB(virt_to_page(objp));
1426
1427 #if DEBUG
1428 if (cachep->flags & SLAB_DEBUG_INITIAL)
1433 cachep->ctor(objp, cachep,
SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
1434
1435 if (cachep->flags & SLAB_RED_ZONE) {
1436 objp -= BYTES_PER_WORD;
1437 if (xchg((unsigned long *)objp, RED_MAGIC1) !=
RED_MAGIC2)
1438 BUG();
1440 if (xchg((unsigned long *)(objp+cachep->objsize -
1441 BYTES_PER_WORD), RED_MAGIC1) !=
RED_MAGIC2)
1443 BUG();
1444 }
1445 if (cachep->flags & SLAB_POISON)
1446 kmem_poison_obj(cachep, objp);
1447 if (kmem_extra_free_checks(cachep, slabp, objp))
1448 return;
1449 #endif
1450 {
1451 unsigned int objnr = (objp-slabp->s_mem)/cachep->objsize;
1452
1453 slab_bufctl(slabp)[objnr] = slabp->free;
1454 slabp->free = objnr;
1455 }
1456 STATS_DEC_ACTIVE(cachep);
1457
1459 {
1460 int inuse = slabp->inuse;
1461 if (unlikely(!--slabp->inuse)) {
1462 /* Was partial or full, now empty. */
1463 list_del(&slabp->list);
1464 list_add(&slabp->list, &cachep->slabs_free);
1465 } else if (unlikely(inuse == cachep->num)) {
1466 /* Was full. */
1467 list_del(&slabp->list);
1468 list_add(&slabp->list, &cachep->slabs_partial);
1469 }
1470 }
1471 }
This function is only used in the SMP case when the per-CPU cache gets too full. It is used to free a batch of objects in bulk.
1481 static void free_block (kmem_cache_t* cachep, void** objpp,
int len)
1482 {
1483 spin_lock(&cachep->spinlock);
1484 __free_block(cachep, objpp, len);
1485 spin_unlock(&cachep->spinlock);
1486 }
This function is responsible for freeing each of the objects in the per-CPU array objpp.
1474 static inline void __free_block (kmem_cache_t* cachep,
1475 void** objpp, int len)
1476 {
1477 for ( ; len > 0; len--, objpp++)
1478 kmem_cache_free_one(cachep, *objpp);
1479 }
This function is responsible for creating pairs of caches for small memory buffers suitable for either normal or DMA memory.
436 void __init kmem_cache_sizes_init(void)
437 {
438 cache_sizes_t *sizes = cache_sizes;
439 char name[20];
440
444 if (num_physpages > (32 << 20) >> PAGE_SHIFT)
445 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
446 do {
452 snprintf(name, sizeof(name), "size-%Zd",
sizes->cs_size);
453 if (!(sizes->cs_cachep =
454 kmem_cache_create(name, sizes->cs_size,
455 0, SLAB_HWCACHE_ALIGN, NULL, NULL))) {
456 BUG();
457 }
458
460 if (!(OFF_SLAB(sizes->cs_cachep))) {
461 offslab_limit = sizes->cs_size-sizeof(slab_t);
462 offslab_limit /= 2;
463 }
464 snprintf(name, sizeof(name), "size-%Zd(DMA)",
sizes->cs_size);
465 sizes->cs_dmacachep = kmem_cache_create(name,
sizes->cs_size, 0,
466 SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN,
NULL, NULL);
467 if (!sizes->cs_dmacachep)
468 BUG();
469 sizes++;
470 } while (sizes->cs_size);
471 }
The call graph for this function is shown in Figure 8.16.
1555 void * kmalloc (size_t size, int flags)
1556 {
1557 cache_sizes_t *csizep = cache_sizes;
1558
1559 for (; csizep->cs_size; csizep++) {
1560 if (size > csizep->cs_size)
1561 continue;
1562 return __kmem_cache_alloc(flags & GFP_DMA ?
1563 csizep->cs_dmacachep :
csizep->cs_cachep, flags);
1564 }
1565 return NULL;
1566 }
The call graph for this function is shown in Figure 8.17. It is worth noting that the work this function does is almost identical to the function kmem_cache_free() with debugging enabled (See Section H.3.3.1).
1597 void kfree (const void *objp)
1598 {
1599 kmem_cache_t *c;
1600 unsigned long flags;
1601
1602 if (!objp)
1603 return;
1604 local_irq_save(flags);
1605 CHECK_PAGE(virt_to_page(objp));
1606 c = GET_PAGE_CACHE(virt_to_page(objp));
1607 __kmem_cache_free(c, (void*)objp);
1608 local_irq_restore(flags);
1609 }
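A minimal sketch of the kmalloc()/kfree() pair discussed in this section; the function and buffer usage are hypothetical.

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

static int copy_to_heap(const char *src, size_t len)
{
        char *buf;

        /* Served from the smallest sizes cache that can hold len bytes */
        buf = kmalloc(len, GFP_KERNEL);
        if (!buf)
                return -ENOMEM;

        memcpy(buf, src, len);
        /* ... use buf ... */

        /* kfree() finds the originating cache from the page descriptor */
        kfree(buf);
        return 0;
}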
The structure of the Per-CPU object cache and how objects are added or removed from them is covered in detail in Sections 8.5.1 and 8.5.2.
Figure H.1: Call Graph: enable_all_cpucaches()
This function locks the cache chain and enables the cpucache for every cache. This is important after the cache_cache and the sizes caches have been set up.
1714 static void enable_all_cpucaches (void)
1715 {
1716 struct list_head* p;
1717
1718 down(&cache_chain_sem);
1719
1720 p = &cache_cache.next;
1721 do {
1722 kmem_cache_t* cachep = list_entry(p, kmem_cache_t, next);
1723
1724 enable_cpucache(cachep);
1725 p = cachep->next.next;
1726 } while (p != &cache_cache.next);
1727
1728 up(&cache_chain_sem);
1729 }
This function calculates what size the cpucache should be based on the size of the objects the cache contains before calling kmem_tune_cpucache(), which does the actual allocation. For example, a cache of 512-byte objects is given a limit of 124 and a batchcount of 62.
1693 static void enable_cpucache (kmem_cache_t *cachep)
1694 {
1695 int err;
1696 int limit;
1697
1699 if (cachep->objsize > PAGE_SIZE)
1700 return;
1701 if (cachep->objsize > 1024)
1702 limit = 60;
1703 else if (cachep->objsize > 256)
1704 limit = 124;
1705 else
1706 limit = 252;
1707
1708 err = kmem_tune_cpucache(cachep, limit, limit/2);
1709 if (err)
1710 printk(KERN_ERR
"enable_cpucache failed for %s, error %d.\n",
1711 cachep->name, -err);
1712 }
This function is responsible for allocating memory for the cpucaches. For each CPU on the system, kmalloc() allocates a block of memory large enough for one cpucache and fills a ccupdate_struct_t struct. The function smp_call_function_all_cpus() then calls do_ccupdate_local(), which swaps the new information with the old information in the cache descriptor.
1639 static int kmem_tune_cpucache (kmem_cache_t* cachep,
int limit, int batchcount)
1640 {
1641 ccupdate_struct_t new;
1642 int i;
1643
1644 /*
1645 * These are admin-provided, so we are more graceful.
1646 */
1647 if (limit < 0)
1648 return -EINVAL;
1649 if (batchcount < 0)
1650 return -EINVAL;
1651 if (batchcount > limit)
1652 return -EINVAL;
1653 if (limit != 0 && !batchcount)
1654 return -EINVAL;
1655
1656 memset(&new.new,0,sizeof(new.new));
1657 if (limit) {
1658 for (i = 0; i< smp_num_cpus; i++) {
1659 cpucache_t* ccnew;
1660
1661 ccnew = kmalloc(sizeof(void*)*limit+
1662 sizeof(cpucache_t),
GFP_KERNEL);
1663 if (!ccnew)
1664 goto oom;
1665 ccnew->limit = limit;
1666 ccnew->avail = 0;
1667 new.new[cpu_logical_map(i)] = ccnew;
1668 }
1669 }
1670 new.cachep = cachep;
1671 spin_lock_irq(&cachep->spinlock);
1672 cachep->batchcount = batchcount;
1673 spin_unlock_irq(&cachep->spinlock);
1674
1675 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
1676
1677 for (i = 0; i < smp_num_cpus; i++) {
1678 cpucache_t* ccold = new.new[cpu_logical_map(i)];
1679 if (!ccold)
1680 continue;
1681 local_irq_disable();
1682 free_block(cachep, cc_entry(ccold), ccold->avail);
1683 local_irq_enable();
1684 kfree(ccold);
1685 }
1686 return 0;
1687 oom:
1688 for (i--; i >= 0; i--)
1689 kfree(new.new[cpu_logical_map(i)]);
1690 return -ENOMEM;
1691 }
This calls the function func() for all CPUs. In the context of the slab allocator, the function is do_ccupdate_local() and the argument is a ccupdate_struct_t.
859 static void smp_call_function_all_cpus(void (*func) (void *arg),
void *arg)
860 {
861 local_irq_disable();
862 func(arg);
863 local_irq_enable();
864
865 if (smp_call_function(func, arg, 1, 1))
866 BUG();
867 }
This function swaps the cpucache information in the cache descriptor with the information in info for this CPU.
874 static void do_ccupdate_local(void *info)
875 {
876 ccupdate_struct_t *new = (ccupdate_struct_t *)info;
877 cpucache_t *old = cc_data(new->cachep);
878
879 cc_data(new->cachep) = new->new[smp_processor_id()];
880 new->new[smp_processor_id()] = old;
881 }
This function is called to drain all objects in a per-CPU cache. It is called when a cache needs to be shrunk so that slabs can be freed. A slab would not be freeable if an object were sitting in the per-CPU cache, even though it is not in use.
885 static void drain_cpu_caches(kmem_cache_t *cachep)
886 {
887 ccupdate_struct_t new;
888 int i;
889
890 memset(&new.new,0,sizeof(new.new));
891
892 new.cachep = cachep;
893
894 down(&cache_chain_sem);
895 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
896
897 for (i = 0; i < smp_num_cpus; i++) {
898 cpucache_t* ccold = new.new[cpu_logical_map(i)];
899 if (!ccold || (ccold->avail == 0))
900 continue;
901 local_irq_disable();
902 free_block(cachep, cc_entry(ccold), ccold->avail);
903 local_irq_enable();
904 ccold->avail = 0;
905 }
906 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
907 up(&cache_chain_sem);
908 }
This function will initialise the cache chain and its semaphore, use kmem_cache_estimate() to calculate the number of objects that will fit on a cache_cache slab and then calculate how much colouring is available for cache_cache.
416 void __init kmem_cache_init(void)
417 {
418 size_t left_over;
419
420 init_MUTEX(&cache_chain_sem);
421 INIT_LIST_HEAD(&cache_chain);
422
423 kmem_cache_estimate(0, cache_cache.objsize, 0,
424 &left_over, &cache_cache.num);
425 if (!cache_cache.num)
426 BUG();
427
428 cache_cache.colour = left_over/cache_cache.colour_off;
429 cache_cache.colour_next = 0;
430 }
This allocates pages for the slab allocator
486 static inline void * kmem_getpages (kmem_cache_t *cachep,
unsigned long flags)
487 {
488 void *addr;
495 flags |= cachep->gfpflags;
496 addr = (void*) __get_free_pages(flags, cachep->gfporder);
503 return addr;
504 }
This frees pages for the slab allocator. Before it calls the buddy allocator API, it will remove the PG_slab bit from the page flags.
507 static inline void kmem_freepages (kmem_cache_t *cachep, void *addr)
508 {
509 unsigned long i = (1<<cachep->gfporder);
510 struct page *page = virt_to_page(addr);
511
517 while (i--) {
518 PageClearSlab(page);
519 page++;
520 }
521 free_pages((unsigned long)addr, cachep->gfporder);
522 }