linux socket原理,socket 的工作原理
首先談一下Socket
機制本身:socket是為各種協議提供統一接口的一種IPC機制。在linux中,它由幾個部分組成。為了便於討論,先來看幾個數據結構,如下所示:
/*
 * struct net_proto_family - registration record for one protocol family.
 *
 * Each protocol family (AF_INET, AF_TIPC, ...) registers one of these via
 * sock_register(); the kernel later calls create() to build a socket of
 * that family.
 */
struct net_proto_family {
	int	family;			/* protocol family id (AF_*) */
	int	(*create)(struct socket *sock, int protocol);
					/* builds a socket of this family */
	/* NOTE(review): the three fields below are unused placeholders in
	 * this kernel version — confirm against the exact tree. */
	short	authentication;
	short	encryption;
	short	encrypt_net;
	struct module	*owner;		/* module implementing this family */
};
這個數據結構定義在linux的kernel中,在文件中。其中family是用來標示協議號的。而那個create函數指針則表示用來創建socket時所對應的create函數,owner則是這個協議的module結構。同時,還定義一個協議數:
/* Maximum number of protocol families the kernel can register. */
#define NPROTO	64
再看一下socket的本身的定義:
struct
socket {
socket_state?state;
unsigned
long?flags;
struct proto_ops?*ops;
struct
fasync_struct?*fasync_list;
struct
file?*file;
struct
sock?*sk;
wait_queue_head_t?wait;
short?type;
};
Ops指針所對應的是在這個socket上的一些操作,它的定義如下:
struct
proto_ops {
int?family;
struct
module?*owner;
int?(*release)?(struct socket
*sock);
int?(*bind)?(struct socket *sock,
struct sockaddr *myaddr,
int sockaddr_len);
int?(*connect)?(struct socket
*sock,
struct sockaddr *vaddr,
int sockaddr_len, int flags);
int?(*socketpair)(struct socket *sock1,
struct socket *sock2);
int?(*accept)?(struct socket *sock,
struct socket *newsock, int flags);
int?(*getname)?(struct socket
*sock,
struct sockaddr *addr,
int *sockaddr_len, int peer);
unsigned int?(*poll)?(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int?(*ioctl)?(struct socket *sock, unsigned int cmd,
unsigned long arg);
int?(*listen)?(struct socket *sock, int len);
int?(*shutdown)?(struct socket *sock, int
flags);
int?(*setsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int optlen);
int?(*getsockopt)(struct socket *sock, int level,
int optname, char __user *optval, int __user *optlen);
int?(*sendmsg)?(struct kiocb *iocb,
struct socket *sock,
struct msghdr *m, size_t total_len);
int?(*recvmsg)?(struct kiocb *iocb,
struct socket *sock,
struct msghdr *m, size_t total_len,
int flags);
int?(*mmap)?(struct file *file, struct socket *sock,
struct vm_area_struct * vma);
ssize_t?(*sendpage)?(struct socket *sock, struct page
*page,
int offset, size_t size, int flags);
};
從這個定義可以看出它定義了很多函數指針,也就是當生成某個協議的socket時,這個協議所對應的函數可以賦給這些函數指針。這樣協議的實現者和socket本身的實現機制就可以分開。
在kernel中定義了一個靜態的全局數組,如下所示:
/* One slot per protocol family, indexed by the AF_* id. */
static struct net_proto_family *net_families[NPROTO];
這個定義在kernel的socket.c中。當linux系統啟動時,系統的init進程會調用sock_init函數對這個數組初始化,
在init進程中的調用過程是:start_kernel -> rest_init -> kernel_thread(init, NULL, CLONE_FS |
CLONE_SIGHAND) -> init -> do_basic_setup -> sock_init:
for(i = 0; i < NPROTO; i++)
也就是每一個協議對應這個數組的一項。同時在這個socket.c文件中還定義了一些socket注冊函數:
/*
 * sock_register - register a protocol family's creation hook.
 *
 * Stores @ops into net_families[ops->family] so that socket(2) can later
 * look up the family's create() function by its AF_* number.
 * Returns 0 on success, -ENOBUFS if the family id is out of range, or
 * -EEXIST if the slot is already taken.
 */
int sock_register(struct net_proto_family *ops)
{
int err;
/* The family id must fit in the fixed-size net_families[] array. */
if (ops->family >= NPROTO) {
printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
return -ENOBUFS;
}
net_family_write_lock();
err = -EEXIST;
/* Only claim the slot if this family was not registered before. */
if (net_families[ops->family] == NULL) {
net_families[ops->family]=ops;
err = 0;
}
net_family_write_unlock();
/* NOTE(review): this message prints even when err == -EEXIST. */
printk(KERN_INFO "NET: Registered protocol family %d\n",
ops->family);
return err;
}
從這個代碼可以看出,它最主要的工作就是在數組所對應的項中,把協議所對應的socket操作函數的net_proto_family結構指針賦上值。這樣當給定某個協議的socket時,就能通過協議號在這個數組中找到對應的項,進而得到這個socket實際的創建函數,從而在需要生成一個新的該協議的socket時調用這個創建函數。那麼這個socket注冊函數是在哪裡調用的呢?一般是在協議初始化時被調用的。如tipc協議在linux中是作為一個module來實現的,那麼在module的
module_init(tipc_init);這個tipc_init調用關系如下:
tipc_init -> start_core -> start_core_base -> socket_init -> sock_register(&tipc_family_ops);
這個tipc_family_ops的定義如下:
static struct net_proto_family tipc_family_ops = {
.owner ?= THIS_MODULE,
.family?= AF_TIPC,
.create?= tipc_create
};
AF_TIPC就是TIPC對應的協議標示,其值是30。而tipc_create函數就是tipc的socket的創建函數。
/*
 * tipc_create - net_proto_family create() hook for AF_TIPC.
 *
 * Builds a new TIPC socket: allocates a TIPC port, installs the proto_ops
 * table matching sock->type, allocates the struct sock, and links the
 * TIPC port and the socket together.
 * Returns 0 on success or a negative errno.
 */
static int tipc_create(struct socket *sock, int protocol)
{
struct tipc_sock *tsock;
struct tipc_port *port;
struct sock *sk;
u32 ref;
struct task_struct *tsk;
/* Copy at most as many bytes as the smaller of the two comm buffers. */
int size = (sizeof(tsock->comm) < sizeof(tsk->comm)) ?
sizeof(tsock->comm) : sizeof(tsk->comm);
if ((protocol < 0) || (protocol >= MAX_TIPC_STACKS)) {
warn("Invalid protocol number : %d, permitted range 0 - %d.\n",
protocol, MAX_TIPC_STACKS);
return -EPROTONOSUPPORT;
}
/* Non-zero protocol selects an alternate stack; handled elsewhere. */
if (protocol != 0) {
int vres = handle_protocol(sock, protocol);
return vres;
}
/* Allocate the underlying TIPC port; ref is its handle (0 = failure). */
ref = tipc_createport_raw(0, &dispatch, &wakeupdispatch,
TIPC_LOW_IMPORTANCE, 0);
if (unlikely(!ref))
return -ENOMEM;
sock->state = SS_UNCONNECTED;
/* Pick the operation table matching the requested socket type. */
switch (sock->type) {
case SOCK_STREAM:
sock->ops = &stream_ops;
break;
case SOCK_SEQPACKET:
sock->ops = &packet_ops;
break;
case SOCK_DGRAM:
tipc_set_portunreliable(ref, 1);
/* fall through: DGRAM also gets the RDM setup below */
case SOCK_RDM:
tipc_set_portunreturnable(ref, 1);
sock->ops = &msg_ops;
sock->state = SS_READY;
break;
default:
tipc_deleteport(ref);
return -EPROTOTYPE;
}
/* sk_alloc()'s signature changed in 2.6.12; pick the right variant. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,12)
sk = sk_alloc(AF_TIPC, GFP_KERNEL, &tipc_proto, 1);
#else
sk = sk_alloc(AF_TIPC, GFP_KERNEL, 1, tipc_cache);
#endif
if (!sk) {
/* Undo the port allocation on failure. */
tipc_deleteport(ref);
return -ENOMEM;
}
sock_init_data(sock, sk);
init_waitqueue_head(sk->sk_sleep);
sk->sk_rcvtimeo = 8 * HZ;
/* Cross-link the TIPC-private state with the port. */
tsock = tipc_sk(sk);
port = tipc_get_port(ref);
tsock->p = port;
port->usr_handle = tsock;
init_MUTEX(&tsock->sem);
/* Record the creating task's pid and command name for diagnostics. */
memset(tsock->comm, 0, size);
tsk = current;
task_lock(tsk);
tsock->pid = tsk->pid;
memcpy(tsock->comm, tsk->comm, size);
task_unlock(tsk);
tsock->comm[size-1]=0;
tsock->overload_hwm = 0;
tsock->ovld_limit = tipc_persocket_overload;
dbg("sock_create: %x\n",tsock);
atomic_inc(&tipc_user_count);
return 0;
}
從這個函數的定義中可以看出,根據socket的不同類型(如SOCK_STREAM還是SOCK_SEQPACKET),會給所生成socket的ops指針賦予不同的操作函數表,如下所示:
static struct proto_ops packet_ops = {
.owner ?= THIS_MODULE,
.family?= AF_TIPC,
.release?= release,
.bind?= bind,
.connect?= connect,
.socketpair?= no_skpair,
.accept?= accept,
.getname?= get_name,
.poll?= poll,
.ioctl?= ioctl,
.listen?= listen,
.shutdown?= shutdown,
.setsockopt?= setsockopt,
.getsockopt?= getsockopt,
.sendmsg?= send_packet,
.recvmsg?= recv_msg,
.mmap?= no_mmap,
.sendpage = no_sendpage
};
static struct proto_ops stream_ops = {
.owner ?= THIS_MODULE,
.family?= AF_TIPC,
.release?= release,
.bind?= bind,
.connect?= connect,
.socketpair?= no_skpair,
.accept?= accept,
.getname?= get_name,
.poll?= poll,
.ioctl?= ioctl,
.listen?= listen,
.shutdown?= shutdown,
.setsockopt?= setsockopt,
.getsockopt?= getsockopt,
.sendmsg?= send_stream,
.recvmsg?= recv_stream,
.mmap?= no_mmap,
.sendpage = no_sendpage
};
以上所討論的都是linux內核當中的部分,但對于應用程序來說,使用socket編程時,并不是直接與這些內核當中的接口打交道的。由於應用程序運行在用戶空間,而這些接口需要在內核空間才可以調用到。那麼就有一個問題,應用程序是如何調用到這些接口的呢?其中的奧秘就在於glibc這個庫。linux應用程序是調用glibc中的socket函數來編程的,在glibc中socket的函數只有一套,通過以上的這個機制它就可以對應各種協議的socket函數。那麼glibc中是如何調用到內核中的函數的呢?
我們先來看一下內核socket.c這個文件,在這個文件中還定義了一個如下的函數:
#ifdef __ARCH_WANT_SYS_SOCKETCALL
/* nargs[call] = number of argument BYTES each SYS_* subcode passes;
 * AL(n) converts an argument count into a byte count. */
#define AL(x) ((x) * sizeof(unsigned long))
static unsigned char nargs[18]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)};
#undef AL
/*
 * sys_socketcall - single entry point multiplexing all BSD socket calls.
 *
 * @call: SYS_* subcode selecting the operation (SYS_SOCKET..SYS_RECVMSG);
 * @args: user-space array holding that operation's actual arguments.
 * Copies the right number of argument words in, then dispatches to the
 * corresponding sys_* implementation. Returns its result or -EINVAL /
 * -EFAULT on a bad subcode or unreadable argument block.
 */
asmlinkage long sys_socketcall(int call, unsigned long __user *args)
{
unsigned long a[6];
unsigned long a0,a1;
int err;
/* Reject subcodes outside the known range. */
if(call<1||call>SYS_RECVMSG)
return -EINVAL;
/* Copy exactly as many argument bytes as this subcode uses. */
if (copy_from_user(a, args, nargs[call]))
return -EFAULT;
err = audit_socketcall(nargs[call]/sizeof(unsigned long), a);
if (err)
return err;
a0=a[0];
a1=a[1];
trace_socket_call(call, a0);
/* Fan out to the real system-call implementation. */
switch(call)
{
case SYS_SOCKET:
err = sys_socket(a0,a1,a[2]);
break;
case SYS_BIND:
err = sys_bind(a0,(struct sockaddr __user *)a1, a[2]);
break;
case SYS_CONNECT:
err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
break;
case SYS_LISTEN:
err = sys_listen(a0,a1);
break;
case SYS_ACCEPT:
err = sys_accept(a0,(struct sockaddr __user *)a1, (int __user *)a[2]);
break;
case SYS_GETSOCKNAME:
err = sys_getsockname(a0,(struct sockaddr __user *)a1, (int __user *)a[2]);
break;
case SYS_GETPEERNAME:
err = sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]);
break;
case SYS_SOCKETPAIR:
err = sys_socketpair(a0,a1, a[2], (int __user *)a[3]);
break;
case SYS_SEND:
err = sys_send(a0, (void __user *)a1, a[2], a[3]);
break;
case SYS_SENDTO:
err = sys_sendto(a0,(void __user *)a1, a[2], a[3],
(struct sockaddr __user *)a[4], a[5]);
break;
case SYS_RECV:
err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
break;
case SYS_RECVFROM:
err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
(struct sockaddr __user *)a[4], (int __user *)a[5]);
break;
case SYS_SHUTDOWN:
err = sys_shutdown(a0,a1);
break;
case SYS_SETSOCKOPT:
err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
break;
case SYS_GETSOCKOPT:
err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]);
break;
case SYS_SENDMSG:
err = sys_sendmsg(a0, (struct msghdr __user *) a1, a[2]);
break;
case SYS_RECVMSG:
err = sys_recvmsg(a0, (struct msghdr __user *) a1, a[2]);
break;
default:
err = -EINVAL;
break;
}
return err;
}
#endif
這個sys_socketcall是一個系統調用,所有的glibc中的socket函數都是通過這個系統調用進入到內核空間的。我們來看accept的調用。Glibc中accept的調用在:sysdeps\unix\sysv\linux\accept.S文件中:
#define?socket?accept
#define?__socket __libc_accept
#define?NARGS?3
#define NEED_CANCELLATION
#include
libc_hidden_def (accept)
這段代碼與socket.S一起構成accept()從用戶態進入內核態的關鍵代碼。accept.S中將socket定義為accept,__socket定義為__libc_accept,NARGS定義為3,表示調用參數有3個。接下來包含了socket.S文件,如下:
/* glibc sysdeps/unix/sysv/linux/i386/socket.S — generic template for all
 * socket calls.  The including file (e.g. accept.S) #defines `socket' to
 * the desired call name and NARGS to its argument count. */
#include <sysdep-cancel.h>
#include <socketcall.h>
#include <tls.h>

/* Two-level paste so SOCKOP_socket expands AFTER `socket' is #defined. */
#define P(a, b) P2(a, b)
#define P2(a, b) a##b

	.text
#ifndef __socket
# ifndef NO_WEAK_ALIAS
#  define __socket P(__,socket)
# else
#  define __socket socket
# endif
#endif

.globl __socket
	cfi_startproc
ENTRY (__socket)
#if defined NEED_CANCELLATION && defined CENABLE
	SINGLE_THREAD_P
	jne 1f				/* multi-threaded: take the cancellable path */
#endif

	/* Save %ebx (the PIC register) in %edx across the trap.  */
	movl %ebx, %edx
	cfi_register (3, 2)

	movl $SYS_ify(socketcall), %eax	/* syscall number (__NR_socketcall = 102) */
	movl $P(SOCKOP_,socket), %ebx	/* SOCKOP_* subcode = 1st syscall arg */
	lea 4(%esp), %ecx		/* address of the user args = 2nd syscall arg */
	ENTER_KERNEL			/* trap into the kernel */

	/* Restore %ebx.  */
	movl %edx, %ebx
	cfi_restore (3)

	/* Return values in [-125, -1] are errno codes.  */
	cmpl $-125, %eax
	jae SYSCALL_ERROR_LABEL
L(pseudo_end):
	ret

#if defined NEED_CANCELLATION && defined CENABLE
	/* Cancellable variant: bracket the trap with CENABLE/CDISABLE.  */
1:	pushl %esi
	cfi_adjust_cfa_offset(4)
	CENABLE
	movl %eax, %esi			/* remember the old cancellation state */
	cfi_offset(6, -8)
	movl %ebx, %edx
	cfi_register (3, 2)
	movl $SYS_ify(socketcall), %eax
	movl $P(SOCKOP_,socket), %ebx
	lea 8(%esp), %ecx		/* args one slot further up (saved %esi) */
	ENTER_KERNEL
	movl %edx, %ebx
	cfi_restore (3)
	xchgl %esi, %eax
	CDISABLE
	movl %esi, %eax
	popl %esi
	cfi_restore (6)
	cfi_adjust_cfa_offset(-4)
	cmpl $-125, %eax
	jae SYSCALL_ERROR_LABEL
	ret
#endif
	cfi_endproc
PSEUDO_END (__socket)

#ifndef NO_WEAK_ALIAS
weak_alias (__socket, socket)
#endif
在sysdeps\unix\sysv\linux\i386\sysdep.h文件中
#undef SYS_ify
/* SYS_ify(name) token-pastes to the kernel syscall-number macro __NR_<name>. */
#define SYS_ify(syscall_name)	__NR_##syscall_name
可以看到,通過SYS_ify(socketcall),我們得到了__NR_socketcall
在內核linux/include/asm/unistd.h中,定義了:
/* Excerpt of linux/include/asm/unistd.h: i386 system-call numbers. */
#define __NR_restart_syscall	0
#define __NR_exit		1
#define __NR_fork		2
#define __NR_read		3
/* ... */
#define __NR_socketcall		102	/* multiplexed socket system call */
/* ... */
通過movl $SYS_ify(socketcall), %eax我們可以看到,__NR_socketcall被定義為102,這行代碼即是將eax的值賦成102,即此系統調用的調用號。
下面我們看movl $P(SOCKOP_,socket), %ebx這一句。在socketcall.h中有相應的定義:
在glibc的sysdeps\unix\sysv\linux\socketcall.h文件中,定義如下:
/* SOCKOP_* subcodes: the first argument to the socketcall(2) multiplexer,
 * selecting which socket operation the kernel should perform. */
#define SOCKOP_socket		1
#define SOCKOP_bind		2
#define SOCKOP_connect		3
#define SOCKOP_listen		4
#define SOCKOP_accept		5
#define SOCKOP_getsockname	6
#define SOCKOP_getpeername	7
#define SOCKOP_socketpair	8
#define SOCKOP_send		9
#define SOCKOP_recv		10
#define SOCKOP_sendto		11
#define SOCKOP_recvfrom		12
#define SOCKOP_shutdown		13
#define SOCKOP_setsockopt	14
#define SOCKOP_getsockopt	15
#define SOCKOP_sendmsg		16
#define SOCKOP_recvmsg		17
那么這行代碼的意思就是將相應的操作碼賦予ebx,accept的操作碼是5。在sysdeps\unix\sysv\linux\i386\sysdep.h文件中,ENTER_KERNEL定義為:
/* How the trap into the kernel is performed on i386: via the vDSO's
 * sysenter stub when available, otherwise via the classic int $0x80. */
#ifdef I386_USE_SYSENTER
# ifdef SHARED
#  define ENTER_KERNEL call *%gs:SYSINFO_OFFSET
# else
#  define ENTER_KERNEL call *_dl_sysinfo
# endif
#else
# define ENTER_KERNEL int $0x80
#endif
這就通過中斷進入內核,linux/arch/i386/kernel/entry.S文件中:
… … …
# system call handler stub (linux/arch/i386/kernel/entry.S, excerpt)
ENTRY(system_call)
	pushl %eax			# save orig_eax
	SAVE_ALL
	GET_THREAD_INFO(%ebp)
					# system call tracing in operation / emulation
	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
	jnz syscall_trace_entry
	cmpl $(nr_syscalls), %eax	# reject out-of-range syscall numbers
	jae syscall_badsys
syscall_call:
	# ...
#ifdef CONFIG_DPA_ACCOUNTING
	CHECK_DPA(%eax,no_dpa_syscall_enter,dpa_syscall_enter)
#endif
	call *sys_call_table(,%eax,4)	# index the table by syscall number (4-byte slots)
	movl %eax,EAX(%esp)		# store the return value
syscall_exit:
	# ...
在linux/arch/i386/kernel/syscall_table.S文件中定義了sys_call_table,而socketcall系統調用在這個表中的索引就是102;由於傳入eax的正是102,內核便會調用到sys_socketcall。通過上面sys_socketcall代碼的分析可以看出,它基本上就是一個socket調用的分發函數。
這樣當應用程序調用如下的一行代碼產生一個tipc的socket時,其調用關系就是:
int sd = socket (AF_TIPC, SOCK_SEQPACKET,0);
glibc的socket匯編代碼socket.S,經系統調用sys_socketcall進入內核,調用鏈為sys_socket -> sock_create -> __sock_create -> tipc_create。由于這個socket是SOCK_SEQPACKET類型,它的ops就被賦為:
/* Operation table installed for SOCK_SEQPACKET TIPC sockets. */
static struct proto_ops packet_ops = {
	.owner      = THIS_MODULE,
	.family     = AF_TIPC,
	.release    = release,
	.bind       = bind,
	.connect    = connect,
	.socketpair = no_skpair,
	.accept     = accept,
	.getname    = get_name,
	.poll       = poll,
	.ioctl      = ioctl,
	.listen     = listen,
	.shutdown   = shutdown,
	.setsockopt = setsockopt,
	.getsockopt = getsockopt,
	.sendmsg    = send_packet,
	.recvmsg    = recv_msg,
	.mmap       = no_mmap,
	.sendpage   = no_sendpage
};
這樣當應用程序調用glibc的bind,recvmsg等,就會通過系統調用,進而調到這個tipc socket所對應的packet_ops的函數。
總結
以上是生活随笔為你收集整理的linux socket原理,socket 的工作原理的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Linux读写锁释放,Linux读写锁的
- 下一篇: linux 找出耗io的,linux根据