Linux Memory Management (4): Allocating Physical Pages
Series: Linux Memory Management
Keywords: allocation mask (gfp_mask), buddy system, watermark, merging of free buddy blocks.
As we know, Linux manages memory at page granularity, and physical memory is managed through the buddy system.
As the Linux memory management framework diagram shows, the page allocator is the foundation on which all other memory operations are built.
That is why, after "Linux Memory Management (1): Physical Memory Initialization", "(2): The Page Table Mapping Process" and "(3): Kernel Memory Layout", the next thing to understand is the page allocator.
1. Important data structures
1.1 Page allocation mask
alloc_pages is the kernel's commonly used interface for allocating physical pages; it takes two parameters, one of which is the allocation mask.
include/linux/gfp.h defines the GFP (Get Free Page) allocation masks, which fall into two groups: masks starting with __GFP_, and masks starting with GFP_, which are generally combinations of the __GFP_ flags.
The __GFP_ masks fall into two categories: zone modifiers and action modifiers.
Zone modifiers occupy the low 4 bits of the mask and specify which zone the pages are allocated from.
Action modifiers define the attributes of the allocation.
/* Plain integer GFP bitmasks. Do not use this directly. */
#define ___GFP_DMA		0x01u
#define ___GFP_HIGHMEM		0x02u
#define ___GFP_DMA32		0x04u
#define ___GFP_MOVABLE		0x08u
#define ___GFP_WAIT		0x10u
#define ___GFP_HIGH		0x20u
#define ___GFP_IO		0x40u
#define ___GFP_FS		0x80u
#define ___GFP_COLD		0x100u
#define ___GFP_NOWARN		0x200u
#define ___GFP_REPEAT		0x400u
#define ___GFP_NOFAIL		0x800u
#define ___GFP_NORETRY		0x1000u
#define ___GFP_MEMALLOC		0x2000u
#define ___GFP_COMP		0x4000u
#define ___GFP_ZERO		0x8000u
#define ___GFP_NOMEMALLOC	0x10000u
#define ___GFP_HARDWALL		0x20000u
#define ___GFP_THISNODE		0x40000u
#define ___GFP_RECLAIMABLE	0x80000u
#define ___GFP_NOTRACK		0x200000u
#define ___GFP_NO_KSWAPD	0x400000u
#define ___GFP_OTHER_NODE	0x800000u
#define ___GFP_WRITE		0x1000000u
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
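To make the zone modifiers concrete, here is a simplified sketch of how the low 4 bits select a zone. The real kernel implementation is gfp_zone() with the packed GFP_ZONE_TABLE in include/linux/gfp.h, not an if/else chain; pick_zone is only an illustrative name, and which zones actually exist depends on the kernel configuration.

/* Simplified illustration only; see gfp_zone()/GFP_ZONE_TABLE in
 * include/linux/gfp.h for the real bit-table implementation. */
static enum zone_type pick_zone(gfp_t gfp_mask)
{
	if (gfp_mask & __GFP_DMA)
		return ZONE_DMA;
	if (gfp_mask & __GFP_DMA32)
		return ZONE_DMA32;
	if ((gfp_mask & __GFP_HIGHMEM) && (gfp_mask & __GFP_MOVABLE))
		return ZONE_MOVABLE;
	if (gfp_mask & __GFP_HIGHMEM)
		return ZONE_HIGHMEM;
	return ZONE_NORMAL;	/* no zone modifier set: allocate from ZONE_NORMAL */
}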
In practice, the GFP_-prefixed masks are used most of the time:
/* This equals 0, but use constants in case they ever change */
#define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
/* GFP_ATOMIC means both !wait (__GFP_WAIT not set) and use emergency pool */
#define GFP_ATOMIC	(__GFP_HIGH)
#define GFP_NOIO	(__GFP_WAIT)
#define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_TEMPORARY	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
			 __GFP_RECLAIMABLE)
#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)
#define GFP_HIGHUSER_MOVABLE	(GFP_HIGHUSER | __GFP_MOVABLE)
#define GFP_IOFS	(__GFP_IO | __GFP_FS)
#define GFP_TRANSHUGE	(GFP_HIGHUSER_MOVABLE | __GFP_COMP | \
			 __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \
			 __GFP_NO_KSWAPD)

/*
 * GFP_THISNODE does not perform any reclaim, you most likely want to
 * use __GFP_THISNODE to allocate from a given node without fallback!
 */
#ifdef CONFIG_NUMA
#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY)
#else
#define GFP_THISNODE	((__force gfp_t)0)
#endif

/* This mask makes up all the page movable related flags */
#define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)

/* Control page allocator reclaim behavior */
#define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
			__GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\
			__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC)

/* Control slab gfp mask during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS))

/* Control allocation constraints */
#define GFP_CONSTRAINT_MASK (__GFP_HARDWALL|__GFP_THISNODE)

/* Do not use these with a slab allocator */
#define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK)

/* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
   platforms, used as appropriate on others */
#define GFP_DMA		__GFP_DMA

/* 4GB DMA on some platforms */
#define GFP_DMA32	__GFP_DMA32
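As a quick illustration of how these masks are used, the sketch below allocates and frees pages with two common masks. alloc_pages, __free_pages and page_address are the standard kernel interfaces; the surrounding function, the chosen order and the buffer use are invented for the example.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>

static int demo_alloc(void)
{
	struct page *page;
	void *vaddr;

	/* GFP_KERNEL = __GFP_WAIT | __GFP_IO | __GFP_FS: may sleep, process context only */
	page = alloc_pages(GFP_KERNEL, 2);	/* 2^2 = 4 physically contiguous pages */
	if (!page)
		return -ENOMEM;
	vaddr = page_address(page);		/* lowmem pages have a linear mapping */
	memset(vaddr, 0, 4 * PAGE_SIZE);
	__free_pages(page, 2);

	/* GFP_ATOMIC = __GFP_HIGH: never sleeps, usable in interrupt/atomic context */
	page = alloc_pages(GFP_ATOMIC, 0);	/* a single page */
	if (page)
		__free_pages(page, 0);

	return 0;
}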
2. Allocating memory from the buddy system
alloc_page-----------------------------------allocate a single page
get_zeroed_page-->__get_free_pages
    alloc_pages------------------------------allocate 2^order pages
        alloc_pages_node---------------------adds a node id parameter
            __alloc_pages
                __alloc_pages_nodemask-------adds a nodemask parameter

__alloc_pages_nodemask is the 'heart' of the zoned buddy allocator.
Two points follow from this: first, __alloc_pages_nodemask is the function that matters; second, the buddy page allocator here works on a per-zone basis.
struct alloc_context is the data structure the buddy allocation functions use to carry the allocation parameters.
struct alloc_context {
	struct zonelist *zonelist;
	nodemask_t *nodemask;
	struct zone *preferred_zone;
	int classzone_idx;
	int migratetype;
	enum zone_type high_zoneidx;
};

The zonelist here has already been obtained via node_zonelist(nid, gfp_mask): zonelist = NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags).
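For reference, around this kernel version the helpers in include/linux/gfp.h are essentially the following (quoted from memory, so treat the exact form as approximate):

static inline int gfp_zonelist(gfp_t flags)
{
	if (IS_ENABLED(CONFIG_NUMA) && unlikely(flags & __GFP_THISNODE))
		return 1;	/* the node-local-only zonelist */
	return 0;		/* the normal fallback zonelist */
}

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
	/* pick one of the node's zonelists according to __GFP_THISNODE */
	return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}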
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	struct zoneref *preferred_zoneref;
	struct page *page = NULL;
	unsigned int cpuset_mems_cookie;
	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
	struct alloc_context ac = {
		.high_zoneidx = gfp_zone(gfp_mask),-----------------------gfp_zone uses the low 4 bits of gfp_mask to find the corresponding zone_type: ZONE_NORMAL? ZONE_HIGHMEM?
		.nodemask = nodemask,
		.migratetype = gfpflags_to_migratetype(gfp_mask),----------derive the page migratetype from gfp_mask: MIGRATE_RECLAIMABLE? MIGRATE_MOVABLE?
	};

	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_WAIT);

	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

	if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
		alloc_flags |= ALLOC_CMA;

retry_cpuset:
	cpuset_mems_cookie = read_mems_allowed_begin();

	/* We set it here, as __alloc_pages_slowpath might have changed it */
	ac.zonelist = zonelist;
	/* The preferred zone is used for statistics later */
	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
				ac.nodemask ? : &cpuset_current_mems_allowed,
				&ac.preferred_zone);
	if (!ac.preferred_zone)
		goto out;
	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);

	/* First allocation attempt */
	alloc_mask = gfp_mask|__GFP_HARDWALL;
	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);---------first attempt to allocate physical pages
	if (unlikely(!page)) {
		/*
		 * Runtime PM, block IO and its error handling path
		 * can deadlock because I/O on the device might not
		 * complete.
		 */
		alloc_mask = memalloc_noio_flags(gfp_mask);

		page = __alloc_pages_slowpath(alloc_mask, order, &ac);---------------if the first attempt fails, many special cases are handled here.
	}

	if (kmemcheck_enabled && page)
		kmemcheck_pagealloc_alloc(page, order, gfp_mask);

	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);

out:
	/*
	 * When updating a task's mems_allowed, it is possible to race with
	 * parallel threads in such a way that an allocation can fail while
	 * the mask is being updated. If a page allocation is about to fail,
	 * check if the cpuset changed during allocation and if so, retry.
	 */
	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
		goto retry_cpuset;-----------------------------------------------retry the page allocation

	return page;
}

get_page_from_freelist walks the zones in ac->zonelist, looks for one that satisfies the constraints, takes a page from it and returns.
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
						const struct alloc_context *ac)
{
	struct zonelist *zonelist = ac->zonelist;
	struct zoneref *z;
	struct page *page = NULL;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
	bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
				(gfp_mask & __GFP_WRITE);
	int nr_fair_skipped = 0;
	bool zonelist_rescan;

zonelist_scan:------------------------------------------------------------------start scanning ac->zonelist.
	zonelist_rescan = false;

	/*
	 * Scan zonelist, looking for a zone with enough free.
	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,--------search the zonelist starting from ac->high_zoneidx; each iteration yields a zone.
								ac->nodemask) {
...-----------------------------------------------------------------------------a series of checks; if any fails, skip to the next zone; if they pass, move on to the watermark check.
		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];-------------alloc_flags contains ALLOC_WMARK_LOW here,
		if (!zone_watermark_ok(zone, order, mark,---------------------------so the zone's low watermark is checked; if it is not met, further checks are made or zone_reclaim is attempted.
				       ac->classzone_idx, alloc_flags)) {
			int ret;

			/* Checked here to keep the fast path fast */
			BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
			if (alloc_flags & ALLOC_NO_WATERMARKS)
				goto try_this_zone;
...
			ret = zone_reclaim(zone, gfp_mask, order);------------------reclaim some pages via zone_reclaim
			switch (ret) {
...
			default:
				/* did we reclaim enough */
				if (zone_watermark_ok(zone, order, mark,
						ac->classzone_idx, alloc_flags))----check again whether the watermark is satisfied
					goto try_this_zone;

				/*
				 * Failed to reclaim enough to meet watermark.
				 * Only mark the zone full if checking the min
				 * watermark or if we failed to reclaim just
				 * 1<<order pages or else the page allocator
				 * fastpath will prematurely mark zones full
				 * when the watermark is between the low and
				 * min watermarks.
				 */
				if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
				    ret == ZONE_RECLAIM_SOME)
					goto this_zone_full;

				continue;
			}
		}

try_this_zone:------------------------------------------------------------------once the watermark and all other conditions are satisfied, pages can be allocated from this zone.
		page = buffered_rmqueue(ac->preferred_zone, zone, order,------------allocate pages from this zone
						gfp_mask, ac->migratetype);
		if (page) {
			if (prep_new_page(page, order, gfp_mask, alloc_flags))
				goto try_this_zone;
			return page;
		}
this_zone_full:
		if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
			zlc_mark_zone_full(zonelist, z);
	}

	/*
	 * The first pass makes sure allocations are spread fairly within the
	 * local node.  However, the local node might have free pages left
	 * after the fairness batches are exhausted, and remote zones haven't
	 * even been considered yet.  Try once more without fairness, and
	 * include remote zones now, before entering the slowpath and waking
	 * kswapd: prefer spilling to a remote zone over swapping locally.
	 */
	if (alloc_flags & ALLOC_FAIR) {
		alloc_flags &= ~ALLOC_FAIR;
		if (nr_fair_skipped) {
			zonelist_rescan = true;
			reset_alloc_batches(ac->preferred_zone);
		}
		if (nr_online_nodes > 1)
			zonelist_rescan = true;
	}

	if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		zonelist_rescan = true;
	}

	if (zonelist_rescan)
		goto zonelist_scan;

	return NULL;
}
The watermark calculation itself is covered in detail in the watermark article.
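For orientation while reading the check below: each zone's WMARK_MIN is its share of min_free_kbytes, and (as set up in __setup_per_zone_wmarks() in mm/page_alloc.c, roughly) WMARK_LOW and WMARK_HIGH are derived from it. The standalone sketch below only illustrates that relation, with an invented min value.

#include <stdio.h>

/* Hedged sketch of the relation established in __setup_per_zone_wmarks();
 * "min" stands for the zone's share of min_free_kbytes converted to pages,
 * here an invented number. */
int main(void)
{
	unsigned long min  = 1024;			/* e.g. 4 MB with 4 KB pages */
	unsigned long low  = min + (min >> 2);		/* low  watermark ~ min + 25% */
	unsigned long high = min + (min >> 1);		/* high watermark ~ min + 50% */

	printf("min=%lu low=%lu high=%lu\n", min, low, high);	/* 1024 1280 1536 */
	return 0;
}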
Next, look at __zone_watermark_ok, the function that decides whether a zone's free pages satisfy the watermark selected by alloc_flags.
Its parameters: z is the zone, order is the order of the requested allocation, mark is the watermark value, classzone_idx is the zone index, alloc_flags is the allocation mask, and free_pages is the current number of free pages.
static bool __zone_watermark_ok(struct zone *z, unsigned int order,
			unsigned long mark, int classzone_idx, int alloc_flags,
			long free_pages)
{
	/* free_pages may go negative - that's OK */
	long min = mark;
	int o;
	long free_cma = 0;

	free_pages -= (1 << order) - 1;-----------------------------------------remaining free pages after subtracting the block to be allocated; why the -1?
	if (alloc_flags & ALLOC_HIGH)
		min -= min / 2;
	if (alloc_flags & ALLOC_HARDER)
		min -= min / 4;
...
	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])----free pages must stay above the sum of min and the lowmem_reserve value
		return false;
	for (o = 0; o < order; o++) {-------------------------------------------walk all buddy orders below the requested order and check at each step whether the free pages still satisfy the watermark
		/* At the next order, this order's pages become unavailable */
		free_pages -= z->free_area[o].nr_free << o;---------------------subtract this order's free pages from the total
		/* Require fewer higher order pages to be free */
		min >>= 1;------------------------------------------------------halve the watermark value
		if (free_pages <= min)------------------------------------------and re-check against the watermark
			return false;
	}
	return true;------------------------------------------------------------all conditions above are satisfied; return true
}

The purpose of the loop in this function can be summarized as follows:
Iterate order by order, checking whether enough large (i.e. higher-order) free blocks remain.
In each iteration, the free pages of the current order are first subtracted from the total free pages, because what we want to know is whether enough larger blocks are left.
Of course, since part of the free pages has been written off, the comparison threshold should be relaxed accordingly.
How much it is relaxed is exactly the right shift of min mentioned above.
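A small user-space sketch of the same loop, with invented numbers, makes the effect visible; watermark_ok below mirrors the shape of the kernel loop but deliberately drops the lowmem_reserve and CMA terms.

#include <stdbool.h>
#include <stdio.h>

/* Illustration only: same shape as the loop in __zone_watermark_ok, without
 * the lowmem_reserve and CMA adjustments. */
static bool watermark_ok(unsigned int order, long mark, long free_pages,
			 const long nr_free[])
{
	long min = mark;

	free_pages -= (1 << order) - 1;		/* account for the block being taken */
	if (free_pages <= min)
		return false;

	for (unsigned int o = 0; o < order; o++) {
		free_pages -= nr_free[o] << o;	/* blocks of this order cannot serve the request */
		min >>= 1;			/* relax the requirement accordingly */
		if (free_pages <= min)
			return false;
	}
	return true;
}

int main(void)
{
	/* invented per-order free block counts: 40 order-0 blocks, 10 order-1, 4 order-2, 1 order-3 */
	const long nr_free[] = { 40, 10, 4, 1 };
	/* total free pages = 40*1 + 10*2 + 4*4 + 1*8 = 84 */
	printf("%d\n", watermark_ok(2, 16, 84, nr_free));	/* prints 1: enough order>=2 blocks remain */
	return 0;
}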
Reference: an analysis of __zone_watermark_ok.
zone_reclaim:
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	int node_id;
	int ret;

	/*
	 * Zone reclaim reclaims unmapped file backed pages and
	 * slab pages if we are over the defined limits.
	 *
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated. So we do not reclaim
	 * if less than a specified percentage of the zone is used by
	 * unmapped file backed pages.
	 */
	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
		return ZONE_RECLAIM_FULL;

	if (!zone_reclaimable(zone))
		return ZONE_RECLAIM_FULL;

	/*
	 * Do not scan if the allocation should not be delayed.
	 */
	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
		return ZONE_RECLAIM_NOSCAN;

	/*
	 * Only run zone reclaim on the local zone or on zones that do not
	 * have associated processors. This will favor the local processor
	 * over remote processors and spread off node memory allocations
	 * as wide as possible.
	 */
	node_id = zone_to_nid(zone);
	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
		return ZONE_RECLAIM_NOSCAN;

	if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags))
		return ZONE_RECLAIM_NOSCAN;

	ret = __zone_reclaim(zone, gfp_mask, order);
	clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags);

	if (!ret)
		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

	return ret;
}
buffered_rmqueue:
/*
 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, unsigned int order,
			gfp_t gfp_flags, int migratetype)
{
	unsigned long flags;
	struct page *page;
	bool cold = ((gfp_flags & __GFP_COLD) != 0);

	if (likely(order == 0)) {
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);
		pcp = &this_cpu_ptr(zone->pageset)->pcp;
		list = &pcp->lists[migratetype];
		if (list_empty(list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)
			page = list_entry(list->prev, struct page, lru);
		else
			page = list_entry(list->next, struct page, lru);

		list_del(&page->lru);
		pcp->count--;
	} else {
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order, migratetype);
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_freepage_state(zone, -(1 << order),
					  get_freepage_migratetype(page));
	}

	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
	    !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
		set_bit(ZONE_FAIR_DEPLETED, &zone->flags);

	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(preferred_zone, zone, gfp_flags);
	local_irq_restore(flags);

	VM_BUG_ON_PAGE(bad_range(zone, page), page);
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
3. Freeing pages
__free_page
free_page-->free_pages
    __free_pages
        free_hot_cold_page
        __free_pages_ok
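The dispatch between the last two happens in __free_pages (mm/page_alloc.c); around this kernel version it looks roughly like the following: order-0 pages go back to the per-CPU lists via free_hot_cold_page, while larger blocks go to __free_pages_ok, which returns them to the buddy free lists.

void __free_pages(struct page *page, unsigned int order)
{
	if (put_page_testzero(page)) {	/* drop the reference; free only when it hits zero */
		if (order == 0)
			free_hot_cold_page(page, false);	/* single page: back to the per-CPU (pcp) list */
		else
			__free_pages_ok(page, order);		/* 2^order block: back to the buddy free lists */
	}
}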
4. Buddy-system-related proc nodes
4.1 /proc/pagetypeinfo
Page block order: 10
Pages per block:  1024

Free pages count per migrate type at order       0      1      2      3      4      5      6      7      8      9     10
Node    0, zone   Normal, type    Unmovable    243    105     26      7      2      0      1      0      0      0      0
Node    0, zone   Normal, type  Reclaimable      1      1      0      2      0      0      0      1      1      1      0
Node    0, zone   Normal, type      Movable      4      2      3      4      4      2      3      3      2      2    156
Node    0, zone   Normal, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone   Normal, type          CMA      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone   Normal, type      Isolate      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type    Unmovable      1      1      1      0      1      0      0      1      1      1      0
Node    0, zone  HighMem, type  Reclaimable      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type      Movable      1      0      1      0      1      0      1      1      1      0     63
Node    0, zone  HighMem, type      Reserve      0      0      0      0      0      0      0      0      0      0      1
Node    0, zone  HighMem, type          CMA      0      0      0      0      0      0      0      0      0      0      0
Node    0, zone  HighMem, type      Isolate      0      0      0      0      0      0      0      0      0      0      0

Number of blocks type     Unmovable  Reclaimable      Movable      Reserve          CMA      Isolate
Node 0, zone   Normal             6           19          164            1            0            0
Node 0, zone  HighMem             1            0           64            1            0            0
Reposted from: https://www.cnblogs.com/arnoldlu/p/8250734.html