

在Linux Kernel物理内存管理的Buddy System中,引入了冷热页的概念。冷页表示该空闲页已经不在高速缓存中了(一般是指L2 Cache),热页表示该空闲页仍然在高速缓存中。冷热页是针对每个CPU而言的:在每个zone中,都会为所有的CPU各初始化一个保存冷热页的per-cpu-pageset。



 Buddy Allocator在分配order为0的空闲页的时候,如果分配的是热页,由于该页已经存在于L2 Cache中,CPU进行写访问时就不需要先把内存中的内容读到Cache中再写。如果分配的是冷页,说明该页不在L2 Cache中。一般情况下尽可能使用热页,这是容易理解的。那么什么时候用冷页呢?下面这段英文说明给出了解释:While allocating a physical page frame, there is a bit specifying whether we would like a hot or a cold page (that is, a page likely to be in the CPU cache, or a page not likely to be there). If the page will be used by the CPU, a hot page will be faster. If the page will be used for device DMA the CPU cache would be invalidated anyway, and a cold page does not waste precious cache contents. 

 Buddy System在给某个进程分配某个zone中的空闲页时,首先需要用自旋锁锁住该zone,然后再分配页。这样,如果多个CPU上的进程同时分配页,便会竞争该自旋锁。引入per-cpu-pageset之后,分配单个页(order为0)时,每个CPU优先从自己的per-cpu-pageset中取页,只有当其中的链表为空时才需要从Buddy System中批量补充,因此大多数情况下不会发生竞争,提高了效率。另外,当释放单个页面时,空闲页面首先放回到per-cpu-pageset中,以减少zone中自旋锁的使用;当per-cpu-pageset中的页面数量超过阈值(high)时,再将页面放回到伙伴系统中。



   struct per_cpu_pages {    int count;    // number of pages in the list    int high;    // high watermark, emptying needed    int batch;    // chunk size for buddy add/remove     // Lists of pages, one per migrate type stored on the pcp-lists     每个CPU在每个zone上都有MIGRATE_PCPTYPES个冷热页链表(根据迁移类型划分)     struct list_head lists[MIGRATE_PCPTYPES];   };   





   /*
    * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
    * we cheat by calling it from here, in the order > 0 path. Saves a branch
    * or two.
    *
    * Takes one block of 2^order pages from @zone for the given @migratetype.
    * Order-0 requests are served from the current CPU's per-cpu page lists;
    * larger orders go to __rmqueue() under zone->lock.
    *
    * Returns the page on success, or NULL if no page could be obtained.
    * If prep_new_page() returns non-zero, the whole allocation is retried.
    */
   static inline
   struct page *buffered_rmqueue(struct zone *preferred_zone,
               struct zone *zone, int order, gfp_t gfp_flags,
               int migratetype)
   {
       unsigned long flags;
       struct page *page;
       /* A cold page is handed out only when the caller passed __GFP_COLD. */
       int cold = !!(gfp_flags & __GFP_COLD);

   again:
       if (likely(order == 0)) {
           struct per_cpu_pages *pcp;
           struct list_head *list;

           /* Interrupts stay disabled until local_irq_restore() below. */
           local_irq_save(flags);
           pcp = &this_cpu_ptr(zone->pageset)->pcp;
           list = &pcp->lists[migratetype];
           if (list_empty(list)) {
               /*
                * The per-cpu list has run dry: refill it from the buddy
                * system, requesting pcp->batch order-0 pages.
                */
               pcp->count += rmqueue_bulk(zone, 0,
                       pcp->batch, list,
                       migratetype, cold);
               /* Still empty after the refill attempt: give up. */
               if (unlikely(list_empty(list)))
                   goto failed;
           }

           if (cold)
               /*
                * Cold pages are taken from the tail of the list; "list" is
                * the list head, so list->prev is the last entry.
                */
               page = list_entry(list->prev, struct page, lru);
           else
               /* Hot pages are taken from the front of the list. */
               page = list_entry(list->next, struct page, lru);

           /* The chosen page leaves the per-cpu hot/cold list. */
           list_del(&page->lru);
           pcp->count--;
       } else { /* order != 0 (more than one page frame): the per-cpu lists are not used */
           if (unlikely(gfp_flags & __GFP_NOFAIL)) {
               /*
                * __GFP_NOFAIL is not to be used in new code.
                *
                * All __GFP_NOFAIL callers should be fixed so that they
                * properly detect and handle allocation failures.
                *
                * We most definitely don't want callers attempting to
                * allocate greater than order-1 page units with
                * __GFP_NOFAIL.
                */
               WARN_ON_ONCE(order > 1);
           }
           spin_lock_irqsave(&zone->lock, flags);
           page = __rmqueue(zone, order, migratetype);
           /*
            * Only the lock is dropped here; the saved interrupt state in
            * "flags" is restored later by local_irq_restore(), on both the
            * success and the failure path.
            */
           spin_unlock(&zone->lock);
           if (!page)
               goto failed;
           __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
       }

       /* Statistics are updated before interrupts are restored. */
       __count_zone_vm_events(PGALLOC, zone, 1 << order);
       zone_statistics(preferred_zone, zone, gfp_flags);
       local_irq_restore(flags);

       VM_BUG_ON(bad_range(zone, page));
       /* A non-zero return from prep_new_page() restarts the allocation. */
       if (prep_new_page(page, order, gfp_flags))
           goto again;
       return page;

   failed:
       local_irq_restore(flags);
       return NULL;
   }

