Fading Coder

One Final Commit for the Last Sprint

Home > Tech > Content

High-Performance Packet Capture with AF_PACKET V3 on Linux

Tech 1

Introduction

While earlier implementations provided foundational packet capture capabilities, modern Linux kernels support AF_PACKET Version 3 (V3), delivering substantial performance improvements over both V1 and V2. (Version 2 primarily enhanced timestamp precision from microseconds to nanoseconds.) Key advantages of V3 include:

  • 15-20% reduction in CPU utilization
  • ~20% improvement in packet capture rates
  • 2x increase in packet density (improved memory efficiency)
  • Support for port aggregation analysis
  • Variable-length frames enabling storage of complete packets regardless of size

This article explores V3 implementation details and optimization techniques, including load distribution strategies for high-throughput environments.

AF_PACKET V3 Architecture

Unlike previous versions, which walk a ring of fixed-size frame slots one at a time, V3 organizes the ring into blocks that each hold multiple variable-length packets. Iteration happens at the block level first, then over the packets within each block. Timestamps keep the nanosecond precision introduced in V2.

Data Structures

The configuration structure tpacket_req3 introduces several V3-specific fields:

/* Ring configuration handed to setsockopt(PACKET_RX_RING) when the socket uses TPACKET_V3. */
struct tpacket_req3 {
    unsigned int tp_block_size;       // Minimum contiguous block size (PAGE_SIZE * 2^n)
    unsigned int tp_block_nr;         // Number of blocks in the ring
    unsigned int tp_frame_size;       // Maximum frame size (upper bound for variable lengths)
    unsigned int tp_frame_nr;         // Total frame slots (frames_per_block * block_count)
    unsigned int tp_retire_blk_tov;   // Block retire timeout in ms. NOTE(review): 0 appears to let the kernel pick a default rather than wait forever — confirm for your kernel version
    unsigned int tp_sizeof_priv;      // Private metadata area per block
    unsigned int tp_feature_req_word; // Feature flags (e.g., TP_FT_REQ_FILL_RXHASH)
};

/*
 * Per-packet header stored in front of every packet inside a V3 block.
 * Packets in a block are chained: adding tp_next_offset to the address of
 * one header yields the next header.
 */
struct tpacket3_hdr {
    __u32 tp_next_offset;  // Offset to next packet within block
    __u32 tp_sec;          // Timestamp seconds
    __u32 tp_nsec;         // Timestamp nanoseconds
    __u32 tp_snaplen;      // Captured bytes
    __u32 tp_len;          // Original packet length
    __u32 tp_status;       // Packet state
    __u16 tp_mac;          // MAC header offset (relative to the start of this header)
    __u16 tp_net;          // Network layer offset (relative to the start of this header)
    union {
        struct tpacket_hdr_variant1 hv1;  // VLAN and hash metadata (tp_rxhash lives here)
    };
    __u8 tp_padding[8];    // Explicit trailing padding
};

Implementation Example

The following demonstrates kernel-optimized capture using memory-mapped ring buffers:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <poll.h>
#include <unistd.h>
#include <signal.h>
#include <inttypes.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <linux/ip.h>

/*
 * Branch-prediction hints built on __builtin_expect. Each macro is only
 * defined when no earlier definition exists. The !! normalises any scalar
 * condition to 0 or 1 before it is compared with the expected value.
 */
#ifndef likely
#define likely(x)    __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x)  __builtin_expect(!!(x), 0)
#endif

/*
 * Local view of the descriptor found at the start of every ring block.
 * NOTE(review): presumably mirrors the layout of the kernel's
 * struct tpacket_block_desc — confirm against <linux/if_packet.h>.
 */
struct block_header {
    uint32_t version;
    uint32_t offset_to_priv;
    struct tpacket_hdr_v1 info;   /* block_status, num_pkts, offset_to_first_pkt */
};

/* Everything needed to walk and later release the memory-mapped RX ring. */
struct capture_ring {
    struct iovec *vectors;        /* one entry per block: base address and length */
    uint8_t *base_addr;           /* start of the mmap()ed ring */
    struct tpacket_req3 params;   /* ring geometry passed to PACKET_RX_RING */
};

/* Shutdown flag: raised by the signal handler, polled by the capture loop. */
static volatile sig_atomic_t terminate = 0;

/* Running totals accumulated while blocks are scanned. */
static uint64_t pkt_count = 0;
static uint64_t byte_count = 0;

/*
 * Signal handler: request an orderly stop of the capture loop.
 * Only writes a sig_atomic_t flag; the signal number itself is not needed.
 */
static void signal_handler(int sig)
{
    (void)sig;
    terminate = 1;
}

/*
 * Open an AF_PACKET socket, switch it to TPACKET_V3, attach a memory-mapped
 * RX ring and bind the socket to the interface named by `dev`.
 *
 * ring: receives the ring parameters, the base address of the mapping and a
 *       heap-allocated iovec table with one entry per block.
 * dev:  interface name (e.g. "eth0").
 *
 * Returns the socket descriptor. Every failure prints a diagnostic and ends
 * the process with EXIT_FAILURE, so the function never returns an error.
 */
static int create_packet_socket(struct capture_ring *ring, const char *dev)
{
    int sock, err;
    int ver = TPACKET_V3;
    unsigned int idx, ifindex;
    size_t ring_len;
    struct sockaddr_ll addr;
    unsigned int blk_size = 1 << 22;   // 4MB blocks
    unsigned int frm_size = 1 << 11;   // 2KB frames
    unsigned int blk_cnt = 64;

    sock = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (sock < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    /* The ring format must be selected before the ring is requested. */
    err = setsockopt(sock, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
    if (err < 0) {
        perror("setsockopt version");
        exit(EXIT_FAILURE);
    }

    memset(&ring->params, 0, sizeof(ring->params));
    ring->params.tp_block_size = blk_size;
    ring->params.tp_frame_size = frm_size;
    ring->params.tp_block_nr = blk_cnt;
    /* frames-per-block times block count; avoids a 32-bit overflow of blk_size * blk_cnt */
    ring->params.tp_frame_nr = (blk_size / frm_size) * blk_cnt;
    ring->params.tp_retire_blk_tov = 60;
    ring->params.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH;

    err = setsockopt(sock, SOL_PACKET, PACKET_RX_RING, &ring->params, sizeof(ring->params));
    if (err < 0) {
        perror("setsockopt rx_ring");
        exit(EXIT_FAILURE);
    }

    /* Compute the mapping length in size_t so larger rings cannot wrap. */
    ring_len = (size_t)ring->params.tp_block_size * ring->params.tp_block_nr;
    ring->base_addr = mmap(NULL, ring_len,
                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, sock, 0);
    if (ring->base_addr == MAP_FAILED) {
        perror("mmap");
        exit(EXIT_FAILURE);
    }

    /* Explicit check instead of assert(): assert disappears under NDEBUG. */
    ring->vectors = calloc(ring->params.tp_block_nr, sizeof(*ring->vectors));
    if (ring->vectors == NULL) {
        perror("calloc");
        exit(EXIT_FAILURE);
    }

    for (idx = 0; idx < ring->params.tp_block_nr; idx++) {
        ring->vectors[idx].iov_base = ring->base_addr + ((size_t)idx * ring->params.tp_block_size);
        ring->vectors[idx].iov_len = ring->params.tp_block_size;
    }

    /*
     * if_nametoindex() returns 0 for an unknown interface. Binding with
     * sll_ifindex == 0 would silently capture from every interface, so
     * treat it as a hard error.
     */
    ifindex = if_nametoindex(dev);
    if (ifindex == 0) {
        perror("if_nametoindex");
        exit(EXIT_FAILURE);
    }

    memset(&addr, 0, sizeof(addr));
    addr.sll_family = PF_PACKET;
    addr.sll_protocol = htons(ETH_P_ALL);
    addr.sll_ifindex = (int)ifindex;

    err = bind(sock, (struct sockaddr *)&addr, sizeof(addr));
    if (err < 0) {
        perror("bind");
        exit(EXIT_FAILURE);
    }

    return sock;
}

/*
 * Print one captured packet: "src -> dst, " for IPv4 frames, followed by the
 * receive hash stored in the packet header.
 *
 * hdr: packet header inside a ring block; tp_mac is the offset from this
 *      header to the Ethernet header.
 *
 * The IPv4 addresses are only read when the captured length covers both the
 * Ethernet header and a minimal IPv4 header. A truncated capture would
 * otherwise print whatever bytes happen to follow in the ring.
 */
static void show_packet(struct tpacket3_hdr *hdr)
{
    struct ethhdr *eth = (struct ethhdr *)((uint8_t *)hdr + hdr->tp_mac);
    struct iphdr *ip = (struct iphdr *)((uint8_t *)eth + ETH_HLEN);

    /* Length check first: h_proto itself is only valid if it was captured. */
    if (hdr->tp_snaplen >= ETH_HLEN + sizeof(*ip) &&
        eth->h_proto == htons(ETH_P_IP)) {
        char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];
        inet_ntop(AF_INET, &ip->saddr, src, sizeof(src));
        inet_ntop(AF_INET, &ip->daddr, dst, sizeof(dst));
        printf("%s -> %s, ", src, dst);
    }
    printf("rxhash: 0x%x\n", hdr->hv1.tp_rxhash);
}

/*
 * Walk every packet in one ring block, print it, and add the block's packet
 * and captured-byte totals to the global counters.
 *
 * blk:     block descriptor at the start of the block.
 * blk_idx: index of the block in the ring (currently unused).
 */
static void scan_block(struct block_header *blk, int blk_idx)
{
    const int total = blk->info.num_pkts;
    uint8_t *cursor = (uint8_t *)blk + blk->info.offset_to_first_pkt;
    unsigned long captured = 0;
    int left;

    (void)blk_idx;

    for (left = total; left > 0; left--) {
        struct tpacket3_hdr *cur = (struct tpacket3_hdr *)cursor;

        captured += cur->tp_snaplen;
        show_packet(cur);
        /* Each header stores the distance to the next packet in the block. */
        cursor += cur->tp_next_offset;
    }

    pkt_count += total;
    byte_count += captured;
}

/* Give a processed block back to the kernel by resetting its status word. */
static void return_block(struct block_header *blk)
{
    struct tpacket_hdr_v1 *desc = &blk->info;

    desc->block_status = TP_STATUS_KERNEL;
}

/*
 * Release everything acquired for the capture: the ring mapping, the
 * per-block iovec table, and finally the socket itself.
 */
static void teardown(struct capture_ring *ring, int sock)
{
    const struct tpacket_req3 *cfg = &ring->params;

    munmap(ring->base_addr, cfg->tp_block_size * cfg->tp_block_nr);
    free(ring->vectors);
    close(sock);
}

/*
 * Capture packets from the interface given on the command line until SIGINT
 * arrives, then print the socket statistics and release the ring.
 *
 * Usage: <program> <interface>
 * Returns 0 on a normal shutdown, EXIT_FAILURE on a usage error.
 */
int main(int argc, char **argv)
{
    int sock;
    socklen_t len;
    struct capture_ring ring;
    struct pollfd pfd;
    unsigned int curr_blk = 0;
    struct block_header *blk;
    struct tpacket_stats_v3 stats;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <interface>\n", argv[0]);
        return EXIT_FAILURE;
    }

    signal(SIGINT, signal_handler);
    memset(&ring, 0, sizeof(ring));
    sock = create_packet_socket(&ring, argv[1]);

    memset(&pfd, 0, sizeof(pfd));
    pfd.fd = sock;
    pfd.events = POLLIN | POLLERR;

    while (likely(!terminate)) {
        blk = (struct block_header *)ring.vectors[curr_blk].iov_base;

        /* Block still owned by the kernel: sleep until something is ready. */
        if ((blk->info.block_status & TP_STATUS_USER) == 0) {
            /*
             * A poll() interrupted by SIGINT leaves `terminate` set and the
             * loop ends normally. Any other failure would otherwise spin
             * forever, so report it and stop.
             */
            if (poll(&pfd, 1, -1) < 0 && !terminate) {
                perror("poll");
                break;
            }
            continue;
        }

        scan_block(blk, curr_blk);
        return_block(blk);
        /* Wrap using the configured block count instead of a duplicated constant. */
        curr_blk = (curr_blk + 1) % ring.params.tp_block_nr;
    }

    /* Zero the stats first so a failed query prints zeros, not garbage. */
    memset(&stats, 0, sizeof(stats));
    len = sizeof(stats);
    if (getsockopt(sock, SOL_PACKET, PACKET_STATISTICS, &stats, &len) < 0)
        perror("getsockopt statistics");

    /* byte_count is uint64_t; PRIu64 is correct on both 32- and 64-bit targets. */
    printf("\nReceived %u packets, %" PRIu64 " bytes, %u dropped\n",
           stats.tp_packets, byte_count, stats.tp_drops);

    teardown(&ring, sock);
    return 0;
}

Key Implementation Details

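Block Timeout Mechanism The tp_retire_blk_tov field specifies a timeout in milliseconds. When it expires, the kernel retires the current block — it closes the block and hands it to user space by setting its status to TP_STATUS_USER — even if the block is not completely filled. This prevents packets from sitting unseen in a partially filled block in low-traffic scenarios. After processing the block, the application returns it to the kernel by setting its status back to TP_STATUS_KERNEL.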

Receive Hash Population Setting tp_feature_req_word to TP_FT_REQ_FILL_RXHASH requests the kernel to populate the receive hash (RSS hash) in each packet header. This facilitates hardware-accelerated flow classification and load balancing across processing threads.

Memory Locking The MAP_LOCKED flag in mmap() prevents ring buffer pages from being swapped to disk, ensuring consistent low-latency access essential for high-speed capture.

Hardware Timestamping To enable hardware timestamps:

/*
 * Ask for raw hardware timestamps on packets delivered through this socket.
 * NOTE(review): SOF_TIMESTAMPING_RAW_HARDWARE is expected to come from
 * <linux/net_tstamp.h>, which the capture example's include list does not
 * contain — confirm and add it before using this snippet.
 */
int flags = SOF_TIMESTAMPING_RAW_HARDWARE;
setsockopt(sock, SOL_PACKET, PACKET_TIMESTAMP, &flags, sizeof(flags));

Processing Flow The main loop implements a circular buffer over blocks. Within each block, packets form a linked list via tp_next_offset, allowing variable-length frames while maintaining cache-friendly contiguous memory access.

Multi-Core Scaling with Socket Fanout

While V3 improves single-thread efficiency, capturing multi-gigabit traffic requires parallel processing. Since kernel 3.1, AF_PACKET supports fanout groups, allowing multiple sockets to capture from the same interface with kernel-level load distribution.

A fanout group supports up to 65,536 sockets. Members join via setsockopt() with PACKET_FANOUT. The group persists until all member sockets close, with the kernel distributing packets according to configurable algorithms.

Distribution Algorithms

Mode Description Characteristics
PACKET_FANOUT_HASH Hash on 5-tuple (IP addresses, ports) Flow-aware, preserves session affinity (default)
PACKET_FANOUT_LB Round-robin distribution Simple load balancing, stateless
PACKET_FANOUT_RND Pseudo-random selection Stateless processing
PACKET_FANOUT_CPU Bind to receiving CPU NUMA-aware architectures
PACKET_FANOUT_ROLLOVER Failover on congestion High-reliability capture
PACKET_FANOUT_QM Map hardware queue to socket (3.14+) RSS queue affinity

The PACKET_FANOUT_HASH mode ensures all packets belonging to the same flow arrive at the same socket, preserving flow state for stateful analysis.

Fanout Implementation

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>

/* Settings chosen in main() and read by every worker when it creates its socket. */
static const char *interface;   /* interface name from the command line */
static int fanout_alg;          /* selected PACKET_FANOUT_* mode */
static int group_ident;         /* fanout group ID shared by all workers */

/* Fallback definitions for headers that do not declare the fanout constants. */
#ifndef PACKET_FANOUT
#define PACKET_FANOUT      18
#define PACKET_FANOUT_HASH 0
#define PACKET_FANOUT_LB   1
#endif

/*
 * Create a raw AF_PACKET socket for IPv4 traffic, bind it to the globally
 * selected interface and join the fanout group described by `group_ident`
 * and `fanout_alg`.
 *
 * Returns the socket descriptor on success, or -1 after printing a
 * diagnostic. On failure the socket is closed, so no descriptor leaks.
 */
static int setup_fanout_sock(void)
{
    struct sockaddr_ll addr;
    struct ifreq ifr;
    int config;
    int fd;

    fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP));
    if (fd < 0) {
        perror("socket");
        return -1;
    }

    /* ifr is zeroed first, so copying IFNAMSIZ - 1 bytes keeps it NUL-terminated. */
    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, interface, IFNAMSIZ - 1);
    if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) {
        perror("ioctl");
        goto fail;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sll_family = AF_PACKET;
    addr.sll_ifindex = ifr.ifr_ifindex;
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        goto fail;
    }

    /* High 16 bits: algorithm, Low 16 bits: group ID */
    config = (group_ident & 0xFFFF) | (fanout_alg << 16);
    if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &config, sizeof(config)) < 0) {
        perror("setsockopt fanout");
        goto fail;
    }

    return fd;

fail:
    /* The diagnostic was already printed; only the descriptor needs releasing. */
    close(fd);
    return -1;
}

/*
 * Body of one capture worker: join the fanout group, read a fixed number of
 * packets while reporting progress every 100 packets, then exit the process.
 * Never returns to the caller.
 */
static void worker_proc(void)
{
    enum { CAPTURE_LIMIT = 10000, FRAME_BUF_LEN = 1600, REPORT_EVERY = 100 };
    char frame[FRAME_BUF_LEN];
    int left;
    int sock = setup_fanout_sock();

    if (sock < 0) {
        exit(EXIT_FAILURE);
    }

    /* `left` is the number of packets still to read after the current one. */
    for (left = CAPTURE_LIMIT - 1; left >= 0; left--) {
        ssize_t got = read(sock, frame, sizeof(frame));

        if (got < 0) {
            perror("read");
            exit(EXIT_FAILURE);
        }
        if (left % REPORT_EVERY != 0) {
            continue;
        }
        printf("PID %d: %d packets remaining\n", getpid(), left);
    }

    printf("PID %d completed capture\n", getpid());
    close(sock);
    exit(EXIT_SUCCESS);
}

/*
 * Fanout demo entry point: parse "<interface> {hash|lb}", fork the worker
 * processes that share one fanout group, and wait for all of them.
 *
 * Returns 0 once every worker has been reaped, EXIT_FAILURE on bad arguments.
 */
int main(int argc, char **argv)
{
    enum { WORKER_COUNT = 4 };
    const char *mode;
    int n;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s <interface> {hash|lb}\n", argv[0]);
        return EXIT_FAILURE;
    }

    mode = argv[2];
    if (strcmp(mode, "hash") == 0) {
        fanout_alg = PACKET_FANOUT_HASH;
    } else if (strcmp(mode, "lb") == 0) {
        fanout_alg = PACKET_FANOUT_LB;
    } else {
        fprintf(stderr, "Unknown mode: %s\n", mode);
        return EXIT_FAILURE;
    }

    interface = argv[1];
    /* The parent's PID, truncated to 16 bits, serves as the group ID. */
    group_ident = getpid() & 0xFFFF;

    for (n = 0; n < WORKER_COUNT; n++) {
        pid_t child = fork();

        if (child < 0) {
            perror("fork");
            exit(EXIT_FAILURE);
        }
        if (child == 0) {
            worker_proc();   /* exits the child; does not return */
        }
    }

    /* Reap one child per worker started. */
    n = WORKER_COUNT;
    while (n-- > 0) {
        wait(NULL);
    }

    return 0;
}

The config value combines the 16-bit group identifier with the algorithm selector. All sockets sharing the same group ID receive distributed packets according to the selected algorithm, enabling linear scaling across CPU cores.

References

Related Articles

Understanding Strong and Weak References in Java

Strong References Strong references are the most prevalent type of object referencing in Java. When an object has a strong reference pointing to it, the garbage collector will not reclaim its memory. F...

Comprehensive Guide to SSTI Explained with Payload Bypass Techniques

Introduction Server-Side Template Injection (SSTI) is a vulnerability in web applications where user input is improperly handled within the template engine and executed on the server. This exploit can r...

Implement Image Upload Functionality for Django Integrated TinyMCE Editor

Django’s Admin panel is highly user-friendly, and pairing it with TinyMCE, an effective rich text editor, simplifies content management significantly. Combining the two is particularly useful for bloggi...

Leave a Comment

Anonymous

◎Feel free to join the discussion and share your thoughts.