Fading Coder

One Final Commit for the Last Sprint

Home > Tech > Content

Understanding the poll System Call Implementation

Tech May 12 2

poll(2) is a system call that works similarly to select(2): it waits for a set of file descriptors to become ready for I/O operations.

  • Usage
  • Implementation

Limitations of select(2):

  • The maximum number of file descriptors that can be monitored is 1024.
  • File descriptors are processed in order, making it inefficient to monitor arbitrary file descriptors.

poll(2) improves upon select(2) by allowing a dynamically sized set of file descriptors and enabling arbitrary selection of file descriptors.

/* One entry of the poll(2) descriptor set: one monitored file descriptor. */
struct pollfd {
       int   fd;         /* file descriptor */
       short events;     /* requested events */
       short revents;    /* returned events */
};

- fd is the file descriptor to monitor.
- events represents the events to watch (bitmask).
- revents indicates the events that have occurred (bitmask).

#include <poll.h>

/* Wait for an event on any of the nfds descriptors in fds, up to timeout ms. */
int poll(struct pollfd *fds, nfds_t nfds, int timeout);

- fds is the address of the file descriptor set.
- nfds is the number of file descriptors in the set.
- timeout is the maximum time to wait, in milliseconds.

The return value is the count of file descriptors with non-zero revents. A return of -1 indicates an error.


A simple example: wait for standard input to be ready, with a timeout of 3 seconds.

#include <poll.h>
#include <unistd.h>
#include <stdio.h>

/* Wait up to 3 seconds for standard input to become readable. */
int main(void)
{
        int timeout = 3000;  /* milliseconds */

        struct pollfd fds = {0};
        fds.fd = STDIN_FILENO;  /* explicit: watch standard input (fd 0) */
        fds.events = POLLIN;    /* wait until it is readable */

        int ret = poll(&fds, 1, timeout);
        if (ret == -1)
                perror("poll");  /* report the actual errno on stderr */
        else if (ret)
                printf("data is available now.\n");
        else
                printf("no data within 3000 ms.\n");

        return 0;
}


Implementation

The code is located in fs/select.c. Some references provide details on file callbacks and the poll structure.

poll()

/*
 * poll(2) entry point: convert the millisecond timeout into an absolute
 * timespec64, run do_sys_poll(), and arrange for a transparent restart
 * if the wait was interrupted by a signal.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
                int, timeout_msecs)
{
        struct timespec64 end_time, *to = NULL;
        int ret;

        /* A negative timeout means "wait forever": leave to == NULL. */
        if (timeout_msecs >= 0) {
                to = &end_time;
                poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
                        NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
        }

        ret = do_sys_poll(ufds, nfds, to);

        /*
         * Interrupted by a signal: stash the arguments (and the absolute
         * expiry, if any) in the task's restart_block so the call can be
         * restarted after the signal handler returns.
         */
        if (ret == -EINTR) {
                struct restart_block *restart_block;

                restart_block = &current->restart_block;
                restart_block->fn = do_restart_poll;
                restart_block->poll.ufds = ufds;
                restart_block->poll.nfds = nfds;

                if (timeout_msecs >= 0) {
                        restart_block->poll.tv_sec = end_time.tv_sec;
                        restart_block->poll.tv_nsec = end_time.tv_nsec;
                        restart_block->poll.has_timeout = 1;
                } else
                        restart_block->poll.has_timeout = 0;

                ret = -ERESTART_RESTARTBLOCK;
        }
        return ret;
}


poll() is straightforward:

  1. Handle timeout.
  2. Execute poll(2).
  3. Handle any post-processing: check for timeout or restart.

do_sys_poll()


/*
 * Copy the user's pollfd array into the kernel (stack chunk first, then
 * kmalloc'd page-sized chunks), run do_poll(), and copy each revents
 * back to user space.
 */
static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                struct timespec64 *end_time)
{
        struct poll_wqueues table;
         int err = -EFAULT, fdcount, len, size;
        /* Allocate small arguments on the stack to save memory and be
           faster - use long to make sure the buffer is aligned properly
           on 64 bit archs to avoid unaligned access */
        long stack_pps[POLL_STACK_ALLOC/sizeof(long)];  // 256 bytes
        struct poll_list *const head = (struct poll_list *)stack_pps;
         struct poll_list *walk = head;
         unsigned long todo = nfds;  // entries still to be copied from user space

        if (nfds > rlimit(RLIMIT_NOFILE))  // Maximum open files limit
                return -EINVAL;

        // N_STACK_PPS = (256 - 16) / 8 = 30, stack space holds 30 pollfd structures
        // Copy user-space struct pollfd into a stack-based array
        len = min_t(unsigned int, nfds, N_STACK_PPS);
        for (;;) {
                walk->next = NULL;
                walk->len = len;
                if (!len)
                        break;

                // ufds + nfds-todo points at the next entry not yet copied
                if (copy_from_user(walk->entries, ufds + nfds-todo,
                                        sizeof(struct pollfd) * walk->len))
                        goto out_fds;

                todo -= walk->len;
                if (!todo)
                        break;

                // POLLFD_PER_PAGE = (4096 - 16) / 8 = 510, each page holds 510 pollfd structures
                len = min(todo, POLLFD_PER_PAGE);
                size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
                walk = walk->next = kmalloc(size, GFP_KERNEL);
                if (!walk) {
                        err = -ENOMEM;
                        goto out_fds;
                }
        }
        // All pollfd structures are now in kernel space, chained from head

        poll_initwait(&table);  // Initialize table, see select analysis for more details
        fdcount = do_poll(head, &table, end_time);
        poll_freewait(&table);  // Free table

        // Copy revents back to user space
        for (walk = head; walk; walk = walk->next) {
                struct pollfd *fds = walk->entries;
                int j;

                for (j = 0; j < walk->len; j++, ufds++)
                        if (__put_user(fds[j].revents, &ufds->revents))
                                goto out_fds;
          }

        err = fdcount;
out_fds:
        // Free every kmalloc'd chunk; the first chunk lives on the stack
        walk = head->next;
        while (walk) {
                struct poll_list *pos = walk;
                walk = walk->next;
                kfree(pos);
        }

        return err;
}


do_sys_poll() is also divided into three steps:

  1. Copy data from user space to kernel space.
  2. Call the core implementation do_poll().
  3. Copy ready event data from kernel space back to user space.

do_poll()

/*
 * Core poll loop: repeatedly scan every pollfd, count descriptors that
 * report events, and sleep until a wake-up, a signal, or the timeout.
 */
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
                   struct timespec64 *end_time)
{
        poll_table* pt = &wait->pt;
        ktime_t expire, *to = NULL;
        int timed_out = 0, count = 0;
        u64 slack = 0;
        __poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
        unsigned long busy_start = 0;

        /* Optimise the no-wait case */
        // Zero timeout: poll each fd once without registering any waiters
        if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
                pt->_qproc = NULL;
                timed_out = 1;
        }

        if (end_time && !timed_out)
                slack = select_estimate_accuracy(end_time);  // Estimate waiting time, returns nanoseconds

        for (;;) {
                struct poll_list *walk;
                bool can_busy_loop = false;

                // Walk each chunk of the poll_list chain built by do_sys_poll()
                for (walk = list; walk != NULL; walk = walk->next) {
                        struct pollfd * pfd, * pfd_end;

                        pfd = walk->entries;
                        pfd_end = pfd + walk->len;
                        for (; pfd != pfd_end; pfd++) {  // Process all struct pollfd entries, do_pollfd handles individual fd
                                /*
                                 * Fish for events. If we found one, record it
                                 * and kill poll_table->_qproc, so we don't
                                 * needlessly register any other waiters after
                                 * this. They'll get immediately deregistered
                                 * when we break out and return.
                                 */
                                if (do_pollfd(pfd, pt, &can_busy_loop,
                                              busy_flag)) {
                                        count++;
                                        pt->_qproc = NULL;
                                        /* found something, stop busy polling */
                                        busy_flag = 0;
                                        can_busy_loop = false;
                                }
                        }
                }
                /*
                 * All waiters have already been registered, so don't provide
                 * a poll_table->_qproc to them on the next loop iteration.
                 */
                pt->_qproc = NULL;
                if (!count) {
                        // Nothing ready: surface a queueing error or a pending signal
                        count = wait->error;
                        if (signal_pending(current))
                                count = -EINTR;
                }
                if (count || timed_out)
                        break;

                /* only if found POLL_BUSY_LOOP sockets && not out of time */
                if (can_busy_loop && !need_resched()) {
                        if (!busy_start) {
                                busy_start = busy_loop_current_time();
                                continue;
                        }
                        if (!busy_loop_timeout(busy_start))
                                continue;
                }
                busy_flag = 0;

                /*
                 * If this is the first loop and we have a timeout
                 * given, then we convert to ktime_t and set the to
                 * pointer to the expiry value.
                 */
                if (end_time && !to) {
                        expire = timespec64_to_ktime(*end_time);
                        to = &expire;
                }

                if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack))  // Schedule until timeout
                        timed_out = 1;
        }
        return count;
}


This function is well-documented with many comments.

  1. can_busy_loop is related to the CONFIG_NET_RX_BUSY_POLL configuration and is not part of the general case, so it's ignored for now.
  2. count is the return value, which increments when do_pollfd returns a matching mask, representing the number of ready file descriptors. When no file descriptors are ready, it returns the error code from the wait queue.
  3. pt->_qproc is the function used for poll operations on the file descriptor. Setting it to NULL means there is no need to register additional waiters again. This is explained in detail in another article on select(2).
/*
 * Fish for pollable events on the pollfd->fd file descriptor. We're only
 * interested in events matching the pollfd->events mask, and the result
 * matching that mask is both recorded in pollfd->revents and returned. The
 * pwait poll_table will be used by the fd-provided poll handler for waiting,
 * if pwait->_qproc is non-NULL.
 */
static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
                                     bool *can_busy_poll,
                                     __poll_t busy_flag)
{
        __poll_t mask;
        int fd;

        mask = 0;
        fd = pollfd->fd;
        if (fd >= 0) {  // Negative fds are skipped: mask stays 0, revents becomes 0
                struct fd f = fdget(fd);
                mask = EPOLLNVAL;  // 0x20, reported when fd is not an open file
                if (f.file) {
                        /* userland u16 ->events contains POLL... bitmap */
                        // Events to watch; EPOLLERR/EPOLLHUP are always included
                        __poll_t filter = demangle_poll(pollfd->events) |
                                                EPOLLERR | EPOLLHUP;
                        mask = DEFAULT_POLLMASK;  // (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
                        if (f.file->f_op->poll) {
                                pwait->_key = filter;
                                pwait->_key |= busy_flag;  // key used in wake-up function
                                mask = f.file->f_op->poll(f.file, pwait);  // Get ready events mask
                                if (mask & busy_flag)
                                        *can_busy_poll = true;
                        }
                        /* Mask out unneeded events. */
                        mask &= filter;  // Keep only the requested (and implicit) events
                        fdput(f);
                }
        }
        /* ... and so does ->revents */
        pollfd->revents = mangle_poll(mask);  // Set ready events mask

        return mask;
}


In the absence of errors, poll(2) returns the count of entries with non-zero revents. In do_pollfd(), a non-zero mask increments the count. mask can end up zero in two ways:

  1. The `mask &= filter` step clears every bit, because none of the events the file reported were among those requested (plus the always-included EPOLLERR and EPOLLHUP).
  2. fd < 0, in which case do_pollfd() skips the entry entirely and revents is set to 0.

For known file descriptors: eventfd and regular file poll functions return:

  • EPOLLIN or EPOLLOUT or both
  • (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)

If the events are not among these, it may return zero, and the count will not increase.

struct pollfd fds[n];
rn = poll(fds, n, 0);
for (int i = 0; i < rn; ++i)
        if (fds[i].revents ...)


Such an approach is incorrect: the ready descriptors are not necessarily the first rn entries of fds, so the loop must iterate over all n entries and test each revents field, not just the first rn.

demangle_poll() and mangle_poll() translate event masks

demangle_poll() (shown below) converts the user-space u16 events bitmap into the kernel's __poll_t representation; mangle_poll(), used in do_pollfd() to set revents, performs the reverse mapping. The __MAP helper shifts each flag between the two bit layouts by multiplying or dividing by the ratio of the bit values. In Linux 4.17 the POLLIN and EPOLLIN macros have the same value, so for most flags the mapping is the identity.

/*
 * Move the flag bits selected by "from" in v to their position in the "to"
 * layout: mask the flag out, then scale it up or down by the ratio of the
 * two bit values. When from == to the factor is 1 and the mapping is a
 * no-op.
 */
#define __MAP(v, from, to) \
        (from < to ? (v & from) * (to/from) : (v & from) / (from/to))

/* Convert the user-space u16 events bitmap into the kernel __poll_t mask. */
static inline __poll_t demangle_poll(u16 val) {
    return (__force __poll_t)__MAP(val, POLLIN, (__force __u16)EPOLLIN) |
           (__force __poll_t)__MAP(val, POLLOUT, (__force __u16)EPOLLOUT) |
           (__force __poll_t)__MAP(val, POLLPRI, (__force __u16)EPOLLPRI) |
           (__force __poll_t)__MAP(val, POLLERR, (__force __u16)EPOLLERR) |
           (__force __poll_t)__MAP(val, POLLNVAL, (__force __u16)EPOLLNVAL) |
           (__force __poll_t)__MAP(val, POLLRDNORM,
                                   (__force __u16)EPOLLRDNORM) |
           (__force __poll_t)__MAP(val, POLLRDBAND,
                                   (__force __u16)EPOLLRDBAND) |
           (__force __poll_t)__MAP(val, POLLWRNORM,
                                   (__force __u16)EPOLLWRNORM) |
           (__force __poll_t)__MAP(val, POLLWRBAND,
                                   (__force __u16)EPOLLWRBAND) |
           (__force __poll_t)__MAP(val, POLLHUP, (__force __u16)EPOLLHUP) |
           (__force __poll_t)__MAP(val, POLLRDHUP, (__force __u16)EPOLLRDHUP) |
           (__force __poll_t)__MAP(val, POLLMSG, (__force __u16)EPOLLMSG);
}


References

Analysis of select source code, a previous article discussing select, includes some details about the poll structure and file callbacks.

Related Articles

Understanding Strong and Weak References in Java

Strong References Strong references are the most prevalent type of object referencing in Java. When an object has a strong reference pointing to it, the garbage collector will not reclaim its memory. F...

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.