Linux数据报文接收发送总结6
2.3 協議棧注冊
內核實現了網絡層的ip協議,也實現了傳輸層的tcp協議和udp協議。這些協議對應的實現函數分別是ip_rcv(),tcp_v4_rcv()和udp_rcv()。和我們平時寫代碼的方式不一樣的是,內核是通過注冊的方式來實現的。Linux內核中的fs_initcall和subsys_initcall類似,也是初始化模塊的入口。fs_initcall調用inet_init后開始網絡協議棧注冊。通過inet_init,將這些函數注冊到了inet_protos(傳輸層協議)和ptype_base(網絡/鏈路層協議)數據結構中了。如下圖:
相關代碼如下
//file: net/ipv4/af_inet.c

/* Address-family descriptor for PF_INET; inet_create is its create callback. */
static const struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner = THIS_MODULE,
};

/* Upon startup we insert all the elements in inetsw_array[] into
 * the linked list inetsw.
 */
static struct inet_protosw inetsw_array[] = {
	{
		.type = SOCK_STREAM,
		.protocol = IPPROTO_TCP,
		.prot = &tcp_prot,
		.ops = &inet_stream_ops,
		.flags = INET_PROTOSW_PERMANENT |
			 INET_PROTOSW_ICSK,
	},
	{
		.type = SOCK_DGRAM,
		.protocol = IPPROTO_UDP,
		.prot = &udp_prot,
		.ops = &inet_dgram_ops,
		.flags = INET_PROTOSW_PERMANENT,
	},
	{
		.type = SOCK_DGRAM,
		.protocol = IPPROTO_ICMP,
		.prot = &ping_prot,
		.ops = &inet_sockraw_ops,
		.flags = INET_PROTOSW_REUSE,
	},
	{
		.type = SOCK_RAW,
		.protocol = IPPROTO_IP, /* wild card */
		.prot = &raw_prot,
		.ops = &inet_sockraw_ops,
		.flags = INET_PROTOSW_REUSE,
	}
};

/* Packet-type entry that routes ETH_P_IP frames to ip_rcv. */
static struct packet_type ip_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),
	.func = ip_rcv,
};

/* Transport-layer entry for TCP: tcp_v4_rcv is the receive handler. */
static const struct net_protocol tcp_protocol = {
	.early_demux = tcp_v4_early_demux,
	.handler = tcp_v4_rcv,
	.err_handler = tcp_v4_err,
	.no_policy = 1,
	.netns_ok = 1,
	.icmp_strict_tag_validation = 1,
};

/* Transport-layer entry for UDP: udp_rcv is the receive handler. */
static const struct net_protocol udp_protocol = {
	.early_demux = udp_v4_early_demux,
	.handler = udp_rcv,
	.err_handler = udp_err,
	.no_policy = 1,
	.netns_ok = 1,
};

/* Entry for ICMP: icmp_rcv is the receive handler. */
static const struct net_protocol icmp_protocol = {
	.handler = icmp_rcv,
	.err_handler = icmp_err,
	.no_policy = 1,
	.netns_ok = 1,
};

/*
 * inet_init - register the IPv4 protocol stack.
 *
 * Registers the tcp/udp/raw/ping protos, the PF_INET socket family, the
 * ICMP/UDP/TCP (and optionally IGMP) net_protocol handlers and every entry
 * of inetsw_array[], then initialises the individual IPv4 modules and
 * finally adds ip_packet_type via dev_add_pack().
 *
 * Returns 0 on success. If one of the proto_register() calls fails, its
 * error code is returned after the protos registered so far have been
 * unregistered in reverse order through the out_unregister_* labels.
 */
static int __init inet_init(void)
{
	struct inet_protosw *q;
	struct list_head *r;
	int rc = -EINVAL;

	sock_skb_cb_check_size(sizeof(struct inet_skb_parm));

	rc = proto_register(&tcp_prot, 1);
	if (rc)
		goto out;

	rc = proto_register(&udp_prot, 1);
	if (rc)
		goto out_unregister_tcp_proto;

	rc = proto_register(&raw_prot, 1);
	if (rc)
		goto out_unregister_udp_proto;

	rc = proto_register(&ping_prot, 1);
	if (rc)
		goto out_unregister_raw_proto;

	/*
	 *	Tell SOCKET that we are alive...
	 */

	/* Protocol-family registration: corresponds to the first argument of socket(). */
	(void)sock_register(&inet_family_ops);

#ifdef CONFIG_SYSCTL
	ip_static_sysctl_init();
#endif

	/*
	 *	Add all the base protocols.
	 */

	/* Protocol registration: these handlers are what the IP layer calls,
	 * once it has parsed its own header, to hand the packet up to the
	 * transport layer.
	 */
	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
		pr_crit("%s: Cannot add ICMP protocol\n", __func__);
	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
		pr_crit("%s: Cannot add UDP protocol\n", __func__);
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		pr_crit("%s: Cannot add TCP protocol\n", __func__);
#ifdef CONFIG_IP_MULTICAST
	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
		pr_crit("%s: Cannot add IGMP protocol\n", __func__);
#endif

	/* Register the socket-side information for inet_create. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	/* Socket-type registration: corresponds to the second argument of socket(). */
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);

	/*
	 *	Set the ARP module up
	 */

	arp_init();

	/*
	 *	Set the IP module up
	 */

	ip_init();

	tcp_v4_init();

	/* Setup TCP slab cache for open requests. */
	tcp_init();

	/* Setup UDP memory threshold */
	udp_init();

	/* Add UDP-Lite (RFC 3828) */
	udplite4_register();

	ping_init();

	/*
	 *	Set the ICMP layer up
	 */

	if (icmp_init() < 0)
		panic("Failed to create the ICMP control socket.\n");

	/*
	 *	Initialise the multicast router
	 */
#if defined(CONFIG_IP_MROUTE)
	if (ip_mr_init())
		pr_crit("%s: Cannot init ipv4 mroute\n", __func__);
#endif

	if (init_inet_pernet_ops())
		pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);

	/*
	 *	Initialise per-cpu ipv4 mibs
	 */

	if (init_ipv4_mibs())
		pr_crit("%s: Cannot init ipv4 mibs\n", __func__);

	ipv4_proc_init();

	ipfrag_init();

	/* Register ip_packet_type in ptype_base; on receive, the handler
	 * matching the frame's protocol is the one that gets called.
	 */
	dev_add_pack(&ip_packet_type);

	ip_tunnel_core_init();

	rc = 0;
out:
	return rc;
	/* Error unwinding: each label undoes the registrations made before it. */
out_unregister_raw_proto:
	proto_unregister(&raw_prot);
out_unregister_udp_proto:
	proto_unregister(&udp_prot);
out_unregister_tcp_proto:
	proto_unregister(&tcp_prot);
	goto out;
}

fs_initcall(inet_init);

proto_register注冊函數,將對應協議加到proto_list鏈表中。proto_list是一個全局的靜態鏈表,inet域支持的所有協議全部在這個鏈表中,但這個鏈表在協議棧中并沒有太大用途,它只是用于在/proc/net/protocols文件中輸出當前系統所支持的所有協議。
inet_register_protosw注冊函數,將對應協議加到 inetsw 數組中,在socket函數系統調用時選擇具體的協議時會用到,發送報文時也會用到。
/*
 * proto_register - add @prot to the global proto_list.
 *
 * The alloc_slab branch is elided in this excerpt. The visible part takes
 * proto_list_mutex, links prot->node into proto_list, calls
 * assign_proto_idx() and returns 0.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		......
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;
}
EXPORT_SYMBOL(proto_register);

/*
 * inet_register_protosw - insert @p into the inetsw[] list for its type.
 *
 * Runs under inetsw_lock. Jumps to out_illegal when p->type is not below
 * SOCK_MAX, and to out_permanent when a permanent entry with the same
 * protocol number is already on the list; both labels are elided in this
 * excerpt. Otherwise @p is linked right after the last permanent entry
 * found.
 */
void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	int protocol = p->protocol;
	struct list_head *last_perm;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX)
		goto out_illegal;

	/* If we are trying to override a permanent protocol, bail. */
	last_perm = &inetsw[p->type];
	list_for_each(lh, &inetsw[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);
		/* Check only the non-wild match. */
		if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
			break;
		if (protocol == answer->protocol)
			goto out_permanent;
		last_perm = lh;
	}

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry. This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	list_add_rcu(&p->list, last_perm);
out:
	spin_unlock_bh(&inetsw_lock);

	return;
	......
}

上面的代碼中我們可以看到,udp_protocol結構體中的handler是udp_rcv,tcp_protocol結構體中的handler是tcp_v4_rcv,通過inet_add_protocol被初始化了進來。
/*
 * inet_add_protocol - install @prot as the handler for IP protocol number
 * @protocol.
 *
 * Returns -EINVAL (after logging) when prot->netns_ok is not set.
 * Otherwise @prot is stored into inet_protos[protocol] with cmpxchg(),
 * i.e. only if that slot currently holds NULL: 0 is returned when the
 * store happened, -1 when the slot was already occupied.
 */
int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
{
	if (!prot->netns_ok) {
		pr_err("Protocol %u is not namespace aware, cannot register.\n",
			protocol);
		return -EINVAL;
	}

	/* cmpxchg() yields the previous slot value; NULL means we won the slot. */
	return !cmpxchg((const struct net_protocol **)&inet_protos[protocol],
			NULL, prot) ? 0 : -1;
}

inet_add_protocol函數將tcp和udp對應的處理函數都注冊到了 inet_protos (接收報文時,解析傳輸層使用)數組中了。
再看dev_add_pack(&ip_packet_type);這一行,ip_packet_type結構體中的type是協議名,func是ip_rcv函數,在dev_add_pack中會被注冊到ptype_base(接收報文時,解析網絡層使用)哈希表中。(上面在net_dev_init時,最多支持16個協議)
//file: net/core/dev.c void dev_add_pack(struct packet_type *pt){struct list_head *head = ptype_head(pt);...... } static inline struct list_head *ptype_head(const struct packet_type *pt){if (pt->type == htons(ETH_P_ALL))return &ptype_all;elsereturn &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; }這里我們需要記住inet_protos記錄著udp,tcp的處理函數(shù)地址,ptype_base存儲(chǔ)著ip_rcv()函數(shù)的處理地址。后面我們會(huì)看到軟中斷中會(huì)通過(guò)ptype_base找到ip_rcv函數(shù)地址,進(jìn)而將ip包正確地送到ip_rcv()中執(zhí)行。在ip_rcv中將會(huì)通過(guò)inet_protos找到tcp或者udp的處理函數(shù),再而把包轉(zhuǎn)發(fā)給udp_rcv()或tcp_v4_rcv()函數(shù)。
擴展一下,如果看一下ip_rcv和udp_rcv等函數的代碼能看到很多協議的處理過程。例如,ip_rcv中會處理netfilter和iptables過濾,如果你有很多或者很復雜的 netfilter 或 iptables 規則,這些規則都是在軟中斷的上下文中執行的,會加大網絡延遲。再例如,udp_rcv中會判斷socket接收隊列是否滿了。對應的相關內核參數是net.core.rmem_max和net.core.rmem_default。如果有興趣,建議大家好好讀一下inet_init這個函數的代碼。
總結
以上是生活随笔為你收集整理的Linux数据报文接收发送总结6的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Linux数据报文接收发送总结5
- 下一篇: Linux数据报文接收发送总结7