Understanding the poll System Call Implementation
poll(2) is a system call that works similarly to select(2): it waits for a set of file descriptors to become ready for I/O operations.
- Usage
- Implementation
Limitations of select(2):
- The maximum number of file descriptors that can be monitored is 1024.
- The fd_set is a bitmap indexed by descriptor value, so the kernel must scan linearly from 0 up to the highest-numbered descriptor on every call — inefficient when monitoring a sparse set of arbitrary file descriptors.
poll(2) improves upon select(2) by allowing a dynamically sized set of file descriptors and enabling arbitrary selection of file descriptors.
// One entry of the descriptor set passed to poll(2): the caller fills in
// fd and events; the kernel writes the result into revents.
struct pollfd {
int fd; /* file descriptor to monitor; a negative value is ignored */
short events; /* requested events (input bitmask, POLL* flags) */
short revents; /* returned events (output bitmask, filled by the kernel) */
};
- fd is the file descriptor to monitor.
- events represents the events to watch (bitmask).
- revents indicates the events that have occurred (bitmask).
#include <poll.h>
int poll(struct pollfd *fds, nfds_t nfds, int timeout);
- fds is the address of the file descriptor set.
- nfds is the number of file descriptors in the set.
- timeout is the maximum time to wait, in milliseconds.
The return value is the count of file descriptors with non-zero revents. A return of -1 indicates an error.
A simple example: wait for standard input to be ready, with a timeout of 3 seconds.
#include <poll.h>
#include <unistd.h>
#include <stdio.h>
/*
 * Wait up to 3 seconds for standard input to become readable.
 * Demonstrates the minimal poll(2) call: one pollfd, POLLIN, ms timeout.
 */
int main(void)
{
	int timeout = 3000;                 /* milliseconds */
	struct pollfd fds = {0};
	fds.fd = STDIN_FILENO;              /* watch standard input explicitly */
	fds.events = POLLIN;                /* wait for readable data */
	int ret = poll(&fds, 1, timeout);
	if (ret == -1)
		perror("poll");             /* report the actual errno, not just a label */
	else if (ret)
		printf("data is available now.\n");
	else
		printf("no data within 3000 ms.\n");
	return 0;
}
Implementation
The code is located in fs/select.c. Some references provide details on file callbacks and the poll structure.
poll()
/*
 * poll(2) entry point: convert the millisecond timeout into an absolute
 * timespec64 deadline, run do_sys_poll(), and arrange for a transparent
 * restart if the wait was interrupted by a signal.
 *
 * Fix: the quoted source was garbled — "¤t" is a mangled HTML
 * entity (&curren;) and must read "&current", i.e. the address of the
 * current task's restart_block.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
int, timeout_msecs)
{
struct timespec64 end_time, *to = NULL;
int ret;
/* A negative timeout means wait forever, so `to` stays NULL */
if (timeout_msecs >= 0) {
to = &end_time;
poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
}
ret = do_sys_poll(ufds, nfds, to);
if (ret == -EINTR) {
/*
 * Interrupted by a signal: stash enough state in the task's
 * restart_block so the call can be resumed transparently
 * after the signal handler returns.
 */
struct restart_block *restart_block;
restart_block = &current->restart_block;
restart_block->fn = do_restart_poll;
restart_block->poll.ufds = ufds;
restart_block->poll.nfds = nfds;
if (timeout_msecs >= 0) {
restart_block->poll.tv_sec = end_time.tv_sec;
restart_block->poll.tv_nsec = end_time.tv_nsec;
restart_block->poll.has_timeout = 1;
} else
restart_block->poll.has_timeout = 0;
ret = -ERESTART_RESTARTBLOCK;
}
return ret;
}
poll() is straightforward:
- Handle timeout.
- Execute poll(2).
- Handle any post-processing: check for timeout or restart.
do_sys_poll()
/*
 * Core of poll(2): copy the user's pollfd array into kernel memory
 * (on-stack buffer first, then page-sized kmalloc chunks linked through
 * struct poll_list), run do_poll(), then copy each entry's revents back.
 * Returns the number of ready descriptors or a negative errno.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec64 *end_time)
{
struct poll_wqueues table;
int err = -EFAULT, fdcount, len, size;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; // 256 bytes
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
unsigned long todo = nfds; // entries still to be copied from user space
if (nfds > rlimit(RLIMIT_NOFILE)) // Maximum open files limit
return -EINVAL;
// N_STACK_PPS = (256 - 16) / 8 = 30, stack space holds 30 pollfd structures
// Copy user-space struct pollfd into a stack-based array
len = min_t(unsigned int, nfds, N_STACK_PPS);
for (;;) {
walk->next = NULL;
walk->len = len;
if (!len)
break;
if (copy_from_user(walk->entries, ufds + nfds-todo,
sizeof(struct pollfd) * walk->len))
goto out_fds; // err was preinitialized to -EFAULT for this path
todo -= walk->len;
if (!todo)
break;
// POLLFD_PER_PAGE = (4096 - 16) / 8 = 510, each page holds 510 pollfd structures
len = min(todo, POLLFD_PER_PAGE);
size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
walk = walk->next = kmalloc(size, GFP_KERNEL); // overflow chunk, freed at out_fds
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
// All pollfd structures are now in kernel space, chained from head
poll_initwait(&table); // Initialize table, see select analysis for more details
fdcount = do_poll(head, &table, end_time);
poll_freewait(&table); // Free table
// Copy revents back to user space, one entry at a time
for (walk = head; walk; walk = walk->next) {
struct pollfd *fds = walk->entries;
int j;
for (j = 0; j < walk->len; j++, ufds++)
if (__put_user(fds[j].revents, &ufds->revents))
goto out_fds;
}
err = fdcount;
out_fds:
// Free only the kmalloc'd overflow chunks; head lives on the stack
walk = head->next;
while (walk) {
struct poll_list *pos = walk;
walk = walk->next;
kfree(pos);
}
return err;
}
do_sys_poll() is also divided into three steps:
- Copy data from user space to kernel space.
- Call the core implementation do_poll().
- Copy ready event data from kernel space back to user space.
do_poll()
/*
 * Main poll loop: repeatedly scan every pollfd, counting ready
 * descriptors. If none are ready, sleep on the wait queues registered
 * during the first pass until an event, the timeout, or a signal.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
struct timespec64 *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
u64 slack = 0;
// Network busy-poll support (CONFIG_NET_RX_BUSY_POLL); zero disables it
__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
unsigned long busy_start = 0;
/* Optimise the no-wait case */
// A zero timeout means: scan once, never register on any wait queue
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt->_qproc = NULL;
timed_out = 1;
}
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time); // Estimated timer slack, in nanoseconds
for (;;) {
struct poll_list *walk;
bool can_busy_loop = false;
// Outer walk: each chunk of the poll_list chain
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) { // Process all struct pollfd entries, do_pollfd handles individual fd
/*
* Fish for events. If we found one, record it
* and kill poll_table->_qproc, so we don't
* needlessly register any other waiters after
* this. They'll get immediately deregistered
* when we break out and return.
*/
if (do_pollfd(pfd, pt, &can_busy_loop,
busy_flag)) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
busy_flag = 0;
can_busy_loop = false;
}
}
}
/*
* All waiters have already been registered, so don't provide
* a poll_table->_qproc to them on the next loop iteration.
*/
pt->_qproc = NULL;
// Nothing ready: surface a queue error or a pending signal instead
if (!count) {
count = wait->error;
if (signal_pending(current))
count = -EINTR;
}
if (count || timed_out)
break;
/* only if found POLL_BUSY_LOOP sockets && not out of time */
if (can_busy_loop && !need_resched()) {
if (!busy_start) {
busy_start = busy_loop_current_time();
continue;
}
if (!busy_loop_timeout(busy_start))
continue;
}
busy_flag = 0;
/*
* If this is the first loop and we have a timeout
* given, then we convert to ktime_t and set the to
* pointer to the expiry value.
*/
if (end_time && !to) {
expire = timespec64_to_ktime(*end_time);
to = &expire;
}
// Sleep until an event wakes us or the deadline passes
if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) // Schedule until timeout
timed_out = 1;
}
return count;
}
This function is well-documented with many comments.
- can_busy_loop is related to the CONFIG_NET_RX_BUSY_POLL configuration option and is not part of the general case, so it's ignored for now.
- count is the return value, which increments when do_pollfd returns a matching mask, representing the number of ready file descriptors. When no file descriptors are ready, it returns the error code from the wait queue.
pt->_qproc is the function used to register the current task on a file's wait queue during a poll operation. Setting it to NULL means there is no need to register additional waiters again. This is explained in detail in another article on select(2).
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
bool *can_busy_poll,
__poll_t busy_flag)
{
__poll_t mask;
int fd;
mask = 0;
fd = pollfd->fd;
if (fd >= 0) { // negative fds are skipped entirely: revents ends up 0
struct fd f = fdget(fd);
mask = EPOLLNVAL; // reported when fd is not a valid open descriptor
if (f.file) {
/* userland u16 ->events contains POLL... bitmap */
// EPOLLERR and EPOLLHUP are always reported, even if not requested
__poll_t filter = demangle_poll(pollfd->events) |
EPOLLERR | EPOLLHUP;
mask = DEFAULT_POLLMASK; // (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
if (f.file->f_op->poll) {
pwait->_key = filter;
pwait->_key |= busy_flag; // key used in wake-up function
mask = f.file->f_op->poll(f.file, pwait); // Get ready events mask
if (mask & busy_flag)
*can_busy_poll = true;
}
/* Mask out unneeded events. */
mask &= filter; // keep only requested events (plus ERR/HUP)
fdput(f);
}
}
/* ... and so does ->revents */
pollfd->revents = mangle_poll(mask); // translate back to userspace POLL* bits
return mask;
}
In the absence of errors, poll(2) returns the count of entries with non-zero revents; in do_poll(), each non-zero mask returned by do_pollfd() increments count. The mask can be zero in two cases:
- fd >= 0 but none of the events returned by the file's poll callback intersect the requested filter.
- fd < 0, which do_pollfd() skips entirely — negative descriptors are simply ignored, as documented in the poll(2) man page.
For known file descriptors: eventfd and regular file poll functions return:
- EPOLLIN or EPOLLOUT or both
- (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
If the events are not among these, it may return zero, and the count will not increase.
struct pollfd fds[n];
rn = poll(fds, n, 0);
for (int i = 0; i < rn; ++i)
if (fds[i].revents ...)
Such an approach is wrong: the ready descriptors are not necessarily the first rn entries of the array, so the loop must scan all n entries and test each revents rather than stopping at index rn.
mangle_poll() and demangle_poll() translate between event mask formats
The function shown below is demangle_poll(), which converts the userspace u16 POLL* bitmap in events into the kernel's __poll_t EPOLL* bitmap; mangle_poll() performs the inverse conversion when writing revents back. The __MAP macro moves a flag's bit from one position to another. In Linux 4.17 the POLLIN and EPOLLIN constants happen to have the same value, so most of these mappings are identities.
// __MAP relocates the bits of v selected by `from` into the bit positions
// of `to`: multiply when the target position is higher, divide when lower.
// The scaling is exact for single-bit flag masks, which is how it is used.
#define __MAP(v, from, to) \
(from < to ? (v & from) * (to/from) : (v & from) / (from/to))
// Translate the userspace u16 POLL* bitmap into the kernel __poll_t
// EPOLL* bitmap, one flag at a time.
static inline __poll_t demangle_poll(u16 val) {
return (__force __poll_t)__MAP(val, POLLIN, (__force __u16)EPOLLIN) |
(__force __poll_t)__MAP(val, POLLOUT, (__force __u16)EPOLLOUT) |
(__force __poll_t)__MAP(val, POLLPRI, (__force __u16)EPOLLPRI) |
(__force __poll_t)__MAP(val, POLLERR, (__force __u16)EPOLLERR) |
(__force __poll_t)__MAP(val, POLLNVAL, (__force __u16)EPOLLNVAL) |
(__force __poll_t)__MAP(val, POLLRDNORM,
(__force __u16)EPOLLRDNORM) |
(__force __poll_t)__MAP(val, POLLRDBAND,
(__force __u16)EPOLLRDBAND) |
(__force __poll_t)__MAP(val, POLLWRNORM,
(__force __u16)EPOLLWRNORM) |
(__force __poll_t)__MAP(val, POLLWRBAND,
(__force __u16)EPOLLWRBAND) |
(__force __poll_t)__MAP(val, POLLHUP, (__force __u16)EPOLLHUP) |
(__force __poll_t)__MAP(val, POLLRDHUP, (__force __u16)EPOLLRDHUP) |
(__force __poll_t)__MAP(val, POLLMSG, (__force __u16)EPOLLMSG);
}
References
Analysis of select source code, a previous article discussing select, includes some details about the poll structure and file callbacks.