A Brief Look at the cgroup Code (2)
Data Structures
How cgroups and tasks are linked
task->css_set
```c
struct task_struct {
	/* Every task points to a css_set. A css_set is simply a collection of
	 * cgroup_subsys_state objects, and each cgroup_subsys_state represents
	 * one subsystem. */
	struct css_set __rcu *cgroups;
	...
};

struct css_set {
	struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
	...
};
```

The css_set machinery is initialized during kernel boot, as the following code shows:
```c
asmlinkage __visible void __init start_kernel(void)
{
	...
	cpuset_init();
	cgroup_init();
	...
}
```

A task can belong to multiple cgroups, and a cgroup can contain multiple tasks. This M:N relationship is represented in the Linux kernel by the cgrp_cset_link structure:
```c
/*
 * A cgroup can be associated with multiple css_sets as different tasks may
 * belong to different cgroups on different hierarchies.  In the other
 * direction, a css_set is naturally associated with multiple cgroups.
 * This M:N relationship is represented by the following link structure
 * which exists for each association and allows traversing the associations
 * from both sides.
 */
struct cgrp_cset_link {
	/* the cgroup and css_set this link associates */
	struct cgroup		*cgrp;
	struct css_set		*cset;

	/* list of cgrp_cset_links anchored at cgrp->cset_links */
	struct list_head	cset_link;

	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
	struct list_head	cgrp_link;
};
```

This structure really is just a link: cgrp is the cgroup the link attaches to, while cset belongs to a task and can therefore stand in for a process.
The cset_link member is what a struct cgroup uses to find its struct cgrp_cset_link objects. So how is the lookup done?
Let us first look at how a cgroup is associated with a css_set.
```c
/**
 * link_css_set - a helper function to link a css_set to a cgroup
 * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
 * @cset: the css_set to be linked
 * @cgrp: the destination cgroup
 */
/* link_css_set ties a css_set to a cgroup through a struct cgrp_cset_link. */
static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
			 struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	BUG_ON(list_empty(tmp_links));

	if (cgroup_on_dfl(cgrp))
		cset->dfl_cgrp = cgrp;

	/* take one entry off the pre-allocated cgrp_cset_link list (headed by
	 * tmp_links) and fill in the cgroup and css_set pointers */
	link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
	link->cset = cset;
	link->cgrp = cgrp;

	/* move this cgrp_cset_link off the temporary list and onto the
	 * cset_links list of cgrp (i.e. of the cgroup itself) */
	list_move_tail(&link->cset_link, &cgrp->cset_links);
	/* add the link's cgrp_link to the cset's cgrp_links list */
	list_add_tail(&link->cgrp_link, &cset->cgrp_links);

	if (cgroup_parent(cgrp))
		cgroup_get(cgrp);
}
```

As the comment above mentions, the function that allocates the cgrp_cset_link objects (headed by tmp_links) is allocate_cgrp_cset_links, defined as follows:
```c
/**
 * allocate_cgrp_cset_links - allocate cgrp_cset_links
 * @count: the number of links to allocate
 * @tmp_links: list_head the allocated links are put on
 *
 * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
 * through ->cset_link.  Returns 0 on success or -errno.
 */
static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
{
	struct cgrp_cset_link *link;
	int i;

	INIT_LIST_HEAD(tmp_links);

	for (i = 0; i < count; i++) {
		link = kzalloc(sizeof(*link), GFP_KERNEL);
		if (!link) {
			free_cgrp_cset_links(tmp_links);
			return -ENOMEM;
		}
		list_add(&link->cset_link, tmp_links);
	}
	return 0;
}
```

This function is simple: it allocates count struct cgrp_cset_link objects and adds them one by one to the tmp_links list. These count structures are chained together through struct cgrp_cset_link->cset_link, even though, as noted above, that member belongs to the struct cgroup side. The reason is that the freshly allocated structures are only temporary: the member is borrowed for the moment, and its real purpose is restored later. This is also why, in link_css_set, the cgrp_link member is inserted with list_add_tail while cset_link is moved with list_move_tail.
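To make the borrow-and-move trick concrete, here is a small kernel-style sketch (demo_link, demo and real_head are invented names for illustration): an entry is first parked on a temporary list and later moved onto its permanent list with list_move_tail, exactly the way link_css_set reuses cset_link.

```c
#include <linux/list.h>
#include <linux/slab.h>

struct demo_link {
	struct list_head node;	/* plays the role of cgrp_cset_link->cset_link */
	int payload;
};

static int demo(struct list_head *real_head)
{
	LIST_HEAD(tmp);		/* temporary list, like tmp_links */
	struct demo_link *link;

	link = kzalloc(sizeof(*link), GFP_KERNEL);
	if (!link)
		return -ENOMEM;

	/* park the entry on the temporary list first ... */
	list_add(&link->node, &tmp);

	/* ... and later move it onto the list it really belongs to */
	link->payload = 42;
	list_move_tail(&link->node, real_head);
	return 0;
}
```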
The result of allocate_cgrp_cset_links is therefore a chain of count blank cgrp_cset_link objects strung onto tmp_links through their cset_link members.
After link_css_set, each of those links instead hangs off its cgroup's cset_links list via cset_link, and off its css_set's cgrp_links list via cgrp_link.
This layout also shows how the Linux code expresses the many-to-many relationship between cgroups and css_sets. Each struct cgroup can follow cgroup->cset_links and cgrp_cset_link->cset_link to reach a chain of struct cgrp_cset_link objects; each struct cgrp_cset_link has an associated css_set, that css_set belongs to a task_struct (in fact to several), and it contains the per-subsystem state.
So by walking this list we can find every task that belongs to a given cgroup (strictly speaking we find the css_sets, but from the cgroup module's point of view it is the css_set, not the task_struct, that matters). The reverse also works: through a task_struct's cgroups field (of type struct css_set *) we can find every cgroup the process belongs to.
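As a minimal sketch (a hypothetical helper, not a function that exists under this name in the kernel), the cgroup-to-css_set direction looks like this; a real task walk would additionally traverse each css_set's task lists under css_set_lock:

```c
/* Hypothetical helper: visit every css_set linked to @cgrp.
 * Caller must hold css_set_lock, as the kernel's own iterators do. */
static void for_each_cset_of_cgroup(struct cgroup *cgrp)
{
	struct cgrp_cset_link *link;

	lockdep_assert_held(&css_set_lock);

	list_for_each_entry(link, &cgrp->cset_links, cset_link) {
		struct css_set *cset = link->cset;

		/* every task whose ->cgroups points at this cset is in @cgrp */
		pr_debug("css_set %p is linked to cgroup %p\n", cset, cgrp);
	}
}
```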
For example, given a task, to find its cgroup in a particular hierarchy we can call the following function (linux-4.4.19/kernel/cgroup.c):
```c
/*
 * Return the cgroup for "task" from the given hierarchy. Must be
 * called with cgroup_mutex and css_set_lock held.
 */
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
					    struct cgroup_root *root)
{
	/*
	 * No need to lock the task - since we hold cgroup_mutex the
	 * task can't change groups, so the only thing that can happen
	 * is that it exits and its css is set back to init_css_set.
	 */
	return cset_cgroup_from_root(task_css_set(task), root);
}

/* look up cgroup associated with given css_set on the specified hierarchy */
static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
					    struct cgroup_root *root)
{
	struct cgroup *res = NULL;

	lockdep_assert_held(&cgroup_mutex);
	lockdep_assert_held(&css_set_lock);

	if (cset == &init_css_set) {
		res = &root->cgrp;
	} else {
		struct cgrp_cset_link *link;

		list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
			struct cgroup *c = link->cgrp;

			if (c->root == root) {
				res = c;
				break;
			}
		}
	}

	BUG_ON(!res);
	return res;
}
```

Cgroups and subsystems
All the subsystems are defined in linux-4.4.19/include/linux/cgroup_subsys.h.
As you can see, there are 12 of them: cpuset, debug, cpu, cpuacct, memory, devices, freezer, net_cls, blkio, perf_event, net_prio, and hugetlb.
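The header itself is nothing more than a list of SUBSYS() macro invocations, each guarded by its config option; abridged and sketched from memory, it looks roughly like this:

```c
#if IS_ENABLED(CONFIG_CPUSETS)
SUBSYS(cpuset)
#endif

#if IS_ENABLED(CONFIG_CGROUP_SCHED)
SUBSYS(cpu)
#endif

#if IS_ENABLED(CONFIG_CGROUP_CPUACCT)
SUBSYS(cpuacct)
#endif

#if IS_ENABLED(CONFIG_MEMCG)
SUBSYS(memory)
#endif

/* ... devices, freezer, net_cls, blkio, perf_event, net_prio, hugetlb, debug ... */
```

cgroup.c and the cgroup headers #define SUBSYS() in different ways before including this file, so the same list expands into the subsystem id enum, the array of cgroup_subsys pointers, and so on.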
cpu subsystem
struct task_group is the subclass that corresponds to the cpu subsystem; see the code:
```c
/* task group related information */
struct task_group {
	struct cgroup_subsys_state css;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;

#ifdef CONFIG_SMP
	atomic_long_t load_avg;
#endif
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	struct list_head list;

	struct task_group *parent;
	struct list_head siblings;
	struct list_head children;

#ifdef CONFIG_SCHED_AUTOGROUP
	struct autogroup *autogroup;
#endif

	struct cfs_bandwidth cfs_bandwidth;
};
```

Cgroups interacts with user space through the VFS. Once the user mounts the individual subsystems at some directory, the cgroup filesystem automatically creates a series of virtual files, and the user controls Cgroups' behavior by reading from and writing to these files. For the CPU subsystem in particular, there is a tasks file: writing process pids into it adds those processes to the cgroup. There is also a cpu.shares file: writing a number into it sets the weight of the processes in this cgroup.
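For example, a minimal user-space sketch of this interface (assuming the v1 cpu controller is mounted at /sys/fs/cgroup/cpu and a child group named demo has already been created; both the path and the group name are assumptions about the local setup):

```c
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f;

	/* move the current process into the cgroup */
	f = fopen("/sys/fs/cgroup/cpu/demo/tasks", "w");
	if (!f)
		return 1;
	fprintf(f, "%d\n", getpid());
	fclose(f);

	/* halve the group's CPU weight (the default cpu.shares is 1024) */
	f = fopen("/sys/fs/cgroup/cpu/demo/cpu.shares", "w");
	if (!f)
		return 1;
	fprintf(f, "512\n");
	fclose(f);
	return 0;
}
```

The same effect can be had from a shell with echo $$ > tasks and echo 512 > cpu.shares.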
Every filesystem (including the cgroup filesystem that backs Cgroups) has a data structure containing a set of function pointers; when the filesystem is read or written, the kernel calls the corresponding function pointer. So when data is written to a file through the VFS, the function behind that pointer can do whatever extra work is needed. For the CPU subsystem specifically, when a number is written to cpu.shares, the function the kernel runs modifies the shares field of the struct task_group that corresponds to this cgroup. That function is:
linux-4.4.19/kernel/sched/core.c #8270
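In the 4.4 tree the write handler registered for cpu.shares is cpu_shares_write_u64; sketched from memory (so treat it as approximate rather than a verbatim quote), it simply converts the value and hands it to the scheduler:

```c
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cftype, u64 shareval)
{
	/* css_tg() recovers the task_group embedding this css */
	return sched_group_set_shares(css_tg(css), scale_load(shareval));
}
```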
Here, css_tg locates the concrete subsystem subclass, which in this case is the struct task_group. sched_group_set_shares is defined as follows:
```c
int sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
	int i;
	unsigned long flags;

	/*
	 * We can't change the weight of the root cgroup.
	 */
	if (!tg->se[0])
		return -EINVAL;

	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));

	mutex_lock(&shares_mutex);
	if (tg->shares == shares)
		goto done;

	tg->shares = shares;
	for_each_possible_cpu(i) {
		struct rq *rq = cpu_rq(i);
		struct sched_entity *se;

		se = tg->se[i];
		/* Propagate contribution to hierarchy */
		raw_spin_lock_irqsave(&rq->lock, flags);

		/* Possible calls to update_curr() need rq clock */
		update_rq_clock(rq);
		for_each_sched_entity(se)
			update_cfs_shares(group_cfs_rq(se));
		raw_spin_unlock_irqrestore(&rq->lock, flags);
	}

done:
	mutex_unlock(&shares_mutex);
	return 0;
}
```

Variables
The root group:
```c
extern struct mem_cgroup *root_mem_cgroup;
```

Functions
Getting the mem_cgroup from a page: page_mem_cgroup()
```c
static inline struct mem_cgroup *page_mem_cgroup(struct page *page)
{
	return page->mem_cgroup;
}
```

Getting the lruvec from a pglist_data plus a memcg: mem_cgroup_lruvec()
```c
static inline struct lruvec *mem_cgroup_lruvec(struct pglist_data *pgdat,
					       struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *mz;
	struct lruvec *lruvec;

	/* if memcg is disabled, the lruvec is simply the node's own lruvec */
	if (mem_cgroup_disabled()) {
		lruvec = node_lruvec(pgdat);
		goto out;
	}

	/* fetch the memcg's per-node info (mz) for this node; mz holds the
	 * lruvec this memcg uses on this node */
	mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->pgdat here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->pgdat != pgdat))
		lruvec->pgdat = pgdat;
	return lruvec;
}
```

Example:
```c
/* nid is passed in here so the fragment compiles; the original snippet
 * left it undefined */
static void reclaim_pages_from_memcg(struct mem_cgroup *memcg, int nid)
{
	pg_data_t *pgdat;
	struct lruvec *lruvec;

	pgdat = NODE_DATA(nid);		/* the NUMA node to reclaim from */
	lruvec = mem_cgroup_lruvec(pgdat, memcg);
}
```

Common functions
mem_cgroup_disabled()

Printing-related:
memcg_stat_show()

Charge-related:
```c
int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
			  gfp_t gfp_mask, struct mem_cgroup **memcgp,
			  bool compound);
void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
			      bool lrucare, bool compound);
void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
			      bool compound);
void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list);
```
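Taken together, these calls form a two-phase protocol: try_charge reserves the accounting, and the caller later either commits or cancels. A hedged sketch of a typical sequence for a newly allocated page (charge_new_page and map_page_somewhere are invented names; the real 4.x fault paths interleave this with PTE setup and LRU insertion):

```c
#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Hypothetical helper showing the charge protocol around a page that was
 * just allocated and is not yet visible to anyone else. */
static int charge_new_page(struct page *page, struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
		return -ENOMEM;			/* memcg limit hit, back out */

	if (map_page_somewhere(page)) {		/* hypothetical step that may fail */
		mem_cgroup_cancel_charge(page, memcg, false);
		return -EFAULT;
	}

	/* success: the charge becomes permanent and the page is accounted */
	mem_cgroup_commit_charge(page, memcg, false, false);
	return 0;
}
```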
charge/uncharge

mem_cgroup_uncharge
```c
void mem_cgroup_uncharge(struct page *page)
{
	if (mem_cgroup_disabled())
		return;

	/* Don't touch page->lru of any random page, pre-check: */
	if (!page->mem_cgroup)
		return;

	INIT_LIST_HEAD(&page->lru);
	uncharge_list(&page->lru);
}
```
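For context, a sketch of where the uncharge typically happens: when the last reference to a page goes away, along the lines of the 4.4 release path (release_one_page is a made-up name, and LRU removal is omitted for brevity):

```c
/* Hypothetical illustration: the memcg charge taken at allocation time is
 * dropped just before the page is returned to the buddy allocator. */
static void release_one_page(struct page *page)
{
	if (put_page_testzero(page)) {		/* last reference gone */
		mem_cgroup_uncharge(page);	/* drop the memcg accounting */
		free_hot_cold_page(page, false);/* hand the page back */
	}
}
```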
memcg_stat_show

```c
static int memcg_stat_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
	unsigned long memory, memsw;
	struct mem_cgroup *mi;
	unsigned int i;
	struct accumulated_stats acc;

	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
		     MEM_CGROUP_STAT_NSTATS);
	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
		     MEM_CGROUP_EVENTS_NSTATS);
	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);

	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
			continue;
		seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
	}

	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
			   mem_cgroup_read_events(memcg, i));

	for (i = 0; i < NR_LRU_LISTS; i++)
		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
			   mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

	/* ... the hierarchical limits and "total_*" counters that follow in
	 * the full function are omitted here ... */
	return 0;
}
```