What is epoll? According to the man page, it is a poll that has been improved to handle large numbers of file descriptors. It is not new to the 2.6 kernel: it was introduced in kernel 2.5.44 ("epoll(4) is a new API introduced in Linux kernel 2.5.44"). It has virtually all of the advantages described earlier and is widely regarded as the best-performing multiplexed I/O readiness-notification mechanism on Linux 2.6.
If we used the EPOLLET flag when adding RFD to the epoll descriptor in step 1, the epoll_wait(2) call in step 5 may well hang, even though the remaining data still sits in the file's input buffer and the sender is still waiting for a response to the data it has already sent. The reason is that in ET mode an event is reported only when a new event occurs on the monitored file descriptor, so in step 5 the caller may end up waiting for data that is already present in the input buffer. In the example above, an event is generated on RFD by the write in step 2 and consumed in step 3; because the read in step 4 does not drain the input buffer, whether the epoll_wait(2) call in step 5 blocks is indeterminate. When epoll works in ET mode, non-blocking sockets must be used, so that a blocking read or write on one file descriptor does not starve the task that is handling many descriptors. It is best to drive the ET-mode epoll interface in the manner shown below, which avoids these potential pitfalls.
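To make the rule concrete: in ET mode the only safe exit from a read loop is EAGAIN. A minimal sketch (the helper name drain_fd and the buffer size are illustrative, not from the original example; fd is assumed to be non-blocking):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

/* Drain a non-blocking fd completely before returning to epoll_wait().
   Stopping early in ET mode can leave data in the buffer with no
   further notification. Illustrative sketch. */
static void
drain_fd (int fd)
{
  char buf[4096];
  ssize_t n;

  for (;;)
    {
      n = read (fd, buf, sizeof buf);
      if (n > 0)
        {
          /* process buf[0..n) here */
          continue;
        }
      if (n == 0)
        break;              /* EOF: the peer closed the connection */
      if (errno == EAGAIN || errno == EWOULDBLOCK)
        break;              /* buffer drained; safe to call epoll_wait() again */
      perror ("read");      /* genuine error */
      break;
    }
}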
ET (edge-triggered) is the high-speed mode; it supports only non-blocking sockets and is more efficient than LT. The difference between ET and LT: when a new event arrives, ET mode does deliver it through epoll_wait, but if you do not finish processing the data in that socket's buffer this time, then as long as no new event arrives on the socket, epoll_wait will never report the event again in ET mode. LT mode behaves the other way around: as long as the socket buffer for an event still holds data, epoll_wait will keep reporting the event.
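The mode is chosen per descriptor at registration time; only the EPOLLET flag differs. A minimal sketch (the helper watch_fd is hypothetical; epfd and fd are assumed to exist already):

#include <stdio.h>
#include <sys/epoll.h>

/* Register fd with epfd in LT or ET mode. Illustrative helper. */
static int
watch_fd (int epfd, int fd, int edge_triggered)
{
  struct epoll_event ev;

  ev.data.fd = fd;
  ev.events = EPOLLIN;      /* LT (default): re-reported while unread data remains */
  if (edge_triggered)
    ev.events |= EPOLLET;   /* ET: reported once per new arrival */

  if (epoll_ctl (epfd, EPOLL_CTL_ADD, fd, &ev) == -1)
    {
      perror ("epoll_ctl");
      return -1;
    }
  return 0;
}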
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
    /* Protect the access to this structure */
    spinlock_t lock;

    /*
     * This mutex is used to ensure that files are not removed
     * while epoll is using them. This is held during the event
     * collection loop, the file cleanup path, the epoll file exit
     * code and the ctl operations.
     */
    struct mutex mtx;

    /* Wait queue used by sys_epoll_wait() */
    wait_queue_head_t wq;

    /* Wait queue used by file->poll() */
    wait_queue_head_t poll_wait;

    /* List of ready file descriptors: the events that have become ready
     * and will be returned to the user by epoll_wait() */
    struct list_head rdllist;

    /* RB tree root used to store monitored fd structs. This tree holds
     * every event added to this epoll instance, i.e. everything the
     * instance monitors. */
    struct rb_root rbr;

    /*
     * This is a single linked list that chains all the "struct epitem" that
     * happened while transferring ready events to userspace w/out
     * holding ->lock.
     */
    struct epitem *ovflist;

    /* wakeup_source used when ep_scan_ready_list is running */
    struct wakeup_source *ws;

    /* The user that created the eventpoll descriptor */
    struct user_struct *user;

    struct file *file;

    /* used to optimize loop detection check */
    int visited;
    struct list_head visited_list_link;
};
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
    /* RB tree node used to link this structure to the eventpoll RB tree */
    struct rb_node rbn;

    /* List header used to link this structure to the eventpoll ready list */
    struct list_head rdllink;

    /*
     * Works together "struct eventpoll"->ovflist in keeping the
     * single linked chain of items.
     */
    struct epitem *next;

    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;

    /* Number of active wait queue attached to poll operations */
    int nwait;

    /* List containing poll wait queues */
    struct list_head pwqlist;

    /* The "container" of this item */
    struct eventpoll *ep;

    /* List header used to link this item to the "struct file" items list */
    struct list_head fllink;

    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source __rcu *ws;

    /* The structure that describe the interested events and the source fd */
    struct epoll_event event;
};
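Tying the two structures to the userspace API (a rough mapping inferred from the structures above; an illustrative summary, not kernel code):

/*
 * epoll_create1(0)                     allocates one struct eventpoll
 * epoll_ctl(epfd, EPOLL_CTL_ADD, ...)  allocates a struct epitem and
 *                                      inserts it into eventpoll->rbr
 * a monitored fd becomes ready         the epitem is linked onto
 *                                      eventpoll->rdllist
 * epoll_wait(epfd, ...)                drains rdllist, copying each
 *                                      item's epoll_event to userspace
 */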
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <sys/socket.h>
#include <netdb.h>
#include <fcntl.h>
#include <sys/epoll.h>
#include <string.h>

#define MAXEVENTS 64

// Function: create_and_bind
// Purpose:  create and bind a TCP socket
// Param:    port
// Returns:  the created socket, or -1 on failure
static int
create_and_bind (char *port)
{
  struct addrinfo hints;
  struct addrinfo *result, *rp;
  int s, sfd;

  memset (&hints, 0, sizeof (struct addrinfo));
  hints.ai_family = AF_UNSPEC;     /* Return IPv4 and IPv6 choices */
  hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */
  hints.ai_flags = AI_PASSIVE;     /* All interfaces */

  s = getaddrinfo (NULL, port, &hints, &result);
  if (s != 0)
    {
      fprintf (stderr, "getaddrinfo: %s\n", gai_strerror (s));
      return -1;
    }

  for (rp = result; rp != NULL; rp = rp->ai_next)
    {
      sfd = socket (rp->ai_family, rp->ai_socktype, rp->ai_protocol);
      if (sfd == -1)
        continue;

      s = bind (sfd, rp->ai_addr, rp->ai_addrlen);
      if (s == 0)
        {
          /* We managed to bind successfully! */
          break;
        }

      close (sfd);
    }

  if (rp == NULL)
    {
      fprintf (stderr, "Could not bind\n");
      freeaddrinfo (result);  /* avoid leaking the address list on failure */
      return -1;
    }

  freeaddrinfo (result);

  return sfd;
}

// Function: make_socket_non_blocking
// Purpose:  set the socket to non-blocking mode
static int
make_socket_non_blocking (int sfd)
{
  int flags, s;

  // Get the current file status flags
  flags = fcntl (sfd, F_GETFL, 0);
  if (flags == -1)
    {
      perror ("fcntl");
      return -1;
    }

  // Set the file status flags, adding O_NONBLOCK
  flags |= O_NONBLOCK;
  s = fcntl (sfd, F_SETFL, flags);
  if (s == -1)
    {
      perror ("fcntl");
      return -1;
    }

  return 0;
}

// The port is supplied as argv[1]
int
main (int argc, char *argv[])
{
  int sfd, s;
  int efd;
  struct epoll_event event;
  struct epoll_event *events;

  if (argc != 2)
    {
      fprintf (stderr, "Usage: %s [port]\n", argv[0]);
      exit (EXIT_FAILURE);
    }

  sfd = create_and_bind (argv[1]);
  if (sfd == -1)
    abort ();

  s = make_socket_non_blocking (sfd);
  if (s == -1)
    abort ();

  s = listen (sfd, SOMAXCONN);
  if (s == -1)
    {
      perror ("listen");
      abort ();
    }

  /* epoll_create1 is like epoll_create, except that the obsolete size
     hint is replaced by a flags argument (0 here) */
  efd = epoll_create1 (0);
  if (efd == -1)
    {
      perror ("epoll_create1");
      abort ();
    }

  event.data.fd = sfd;
  event.events = EPOLLIN | EPOLLET; /* Readable, edge-triggered */
  s = epoll_ctl (efd, EPOLL_CTL_ADD, sfd, &event);
  if (s == -1)
    {
      perror ("epoll_ctl");
      abort ();
    }

  /* Buffer where events are returned */
  events = calloc (MAXEVENTS, sizeof event);

  /* The event loop */
  while (1)
    {
      int n, i;

      n = epoll_wait (efd, events, MAXEVENTS, -1);
      for (i = 0; i < n; i++)
        {
          if ((events[i].events & EPOLLERR) ||
              (events[i].events & EPOLLHUP) ||
              (!(events[i].events & EPOLLIN)))
            {
              /* An error has occurred on this fd, or the socket is not
                 ready for reading (why were we notified then?) */
              fprintf (stderr, "epoll error\n");
              close (events[i].data.fd);
              continue;
            }

          else if (sfd == events[i].data.fd)
            {
              /* We have a notification on the listening socket, which
                 means one or more incoming connections. */
              while (1)
                {
                  struct sockaddr in_addr;
                  socklen_t in_len;
                  int infd;
                  char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];

                  in_len = sizeof in_addr;
                  infd = accept (sfd, &in_addr, &in_len);
                  if (infd == -1)
                    {
                      if ((errno == EAGAIN) ||
                          (errno == EWOULDBLOCK))
                        {
                          /* We have processed all incoming
                             connections. */
                          break;
                        }
                      else
                        {
                          perror ("accept");
                          break;
                        }
                    }

                  /* Convert the address to a host name and service name;
                     NI_NUMERICHOST | NI_NUMERICSERV request the numeric
                     forms of the host address and the port */
                  s = getnameinfo (&in_addr, in_len,
                                   hbuf, sizeof hbuf,
                                   sbuf, sizeof sbuf,
                                   NI_NUMERICHOST | NI_NUMERICSERV);
                  if (s == 0)
                    {
                      printf ("Accepted connection on descriptor %d "
                              "(host=%s, port=%s)\n", infd, hbuf, sbuf);
                    }

                  /* Make the incoming socket non-blocking and add it to the
                     list of fds to monitor. */
                  s = make_socket_non_blocking (infd);
                  if (s == -1)
                    abort ();

                  event.data.fd = infd;
                  event.events = EPOLLIN | EPOLLET;
                  s = epoll_ctl (efd, EPOLL_CTL_ADD, infd, &event);
                  if (s == -1)
                    {
                      perror ("epoll_ctl");
                      abort ();
                    }
                }
              continue;
            }
          else
            {
              /* We have data on the fd waiting to be read. Read and
                 display it. We must read whatever data is available
                 completely, as we are running in edge-triggered mode
                 and won't get a notification again for the same
                 data. */
              int done = 0;

              while (1)
                {
                  ssize_t count;
                  char buf[512];

                  count = read (events[i].data.fd, buf, sizeof (buf));
                  if (count == -1)
                    {
                      /* If errno == EAGAIN, that means we have read all
                         data. So go back to the main loop. */
                      if (errno != EAGAIN)
                        {
                          perror ("read");
                          done = 1;
                        }
                      break;
                    }
                  else if (count == 0)
                    {
                      /* End of file. The remote has closed the
                         connection. */
                      done = 1;
                      break;
                    }

                  /* Write the buffer to standard output */
                  s = write (1, buf, count);
                  if (s == -1)
                    {
                      perror ("write");
                      abort ();
                    }
                }

              if (done)
                {
                  printf ("Closed connection on descriptor %d\n",
                          events[i].data.fd);

                  /* Closing the descriptor will make epoll remove it
                     from the set of descriptors which are monitored. */
                  close (events[i].data.fd);
                }
            }
        }
    }

  free (events);
  close (sfd);

  return EXIT_SUCCESS;
}
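To try it out (the file name epoll_et.c is arbitrary), compile the program, start it on a port, and connect with a client such as nc; anything the client sends is echoed to the server's standard output:

gcc -Wall -o epoll_et epoll_et.c
./epoll_et 8080
nc localhost 8080     # in another terminal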