High-Performance Packet Capture with AF_PACKET V3 on Linux
Introduction
While earlier implementations provided foundational packet capture capabilities, modern Linux kernels support AF_PACKET Version 3 (V3), delivering substantial performance improvements over both V1 and V2. (Version 2 primarily enhanced timestamp precision from microseconds to nanoseconds.) Key advantages of V3 include:
- 15-20% reduction in CPU utilization
- ~20% improvement in packet capture rates
- 2x increase in packet density (improved memory efficiency)
- Support for port aggregation analysis
- Variable-length frames enabling storage of complete packets regardless of size
This article explores V3 implementation details and optimization techniques, including load distribution strategies for high-throughput environments.
AF_PACKET V3 Architecture
Unlike previous versions that traversed individual frames sequentially, V3 organizes memory into blocks containing multiple packets. Iteration occurs at the block level first, then frame by frame within each block. The protocol maintains nanosecond timestamp precision.
Data Structures
The configuration structure tpacket_req3 introduces several V3-specific fields:
struct tpacket_req3 {
unsigned int tp_block_size; // Minimum contiguous block size (PAGE_SIZE * 2^n)
unsigned int tp_block_nr; // Number of blocks in the ring
unsigned int tp_frame_size; // Maximum frame size (upper bound for variable lengths)
unsigned int tp_frame_nr; // Total frame slots (frames_per_block * block_count)
unsigned int tp_retire_blk_tov; // Block timeout in ms (0 = infinite)
unsigned int tp_sizeof_priv; // Private metadata area per block
unsigned int tp_feature_req_word; // Feature flags (e.g., TP_FT_REQ_FILL_RXHASH)
};
struct tpacket3_hdr {
__u32 tp_next_offset; // Offset to next packet within block
__u32 tp_sec; // Timestamp seconds
__u32 tp_nsec; // Timestamp nanoseconds
__u32 tp_snaplen; // Captured bytes
__u32 tp_len; // Original packet length
__u32 tp_status; // Packet state
__u16 tp_mac; // MAC header offset
__u16 tp_net; // Network layer offset
union {
struct tpacket_hdr_variant1 hv1; // VLAN and hash metadata
};
__u8 tp_padding[8];
};
Implementation Example
The following demonstrates kernel-optimized capture using memory-mapped ring buffers:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <assert.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <poll.h>
#include <unistd.h>
#include <signal.h>
#include <inttypes.h>
#include <sys/socket.h>
#include <sys/mman.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
/* Branch-prediction hints for the hot capture loop (GCC/Clang builtin);
 * guarded so a prior definition (e.g. from a kernel-style header) wins. */
#ifndef likely
#define likely(x) __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif
/*
 * Userspace view of the header at the start of every ring block.
 * Layout matches the kernel's TPACKET_V3 block descriptor
 * (struct tpacket_block_desc): version word, offset to the per-block
 * private area, then the tpacket_hdr_v1 bookkeeping.
 */
struct block_header {
uint32_t version; /* block descriptor version */
uint32_t offset_to_priv; /* byte offset to the tp_sizeof_priv area */
struct tpacket_hdr_v1 info; /* block_status, num_pkts, offset_to_first_pkt */
};
/*
 * Handle for the mmap'd RX ring: one iovec per block (base + length)
 * plus the tpacket_req3 geometry that sized the mapping.
 */
struct capture_ring {
struct iovec *vectors; /* vectors[i] spans block i of the ring */
uint8_t *base_addr; /* start of the contiguous mmap'd region */
struct tpacket_req3 params; /* geometry passed to PACKET_RX_RING */
};
/* Shutdown flag: set by signal_handler, polled by the main capture loop. */
static volatile sig_atomic_t terminate = 0;
/* Running totals accumulated by scan_block (single-threaded access). */
static uint64_t pkt_count = 0, byte_count = 0;
/* SIGINT handler: request a clean shutdown. Writing a sig_atomic_t flag
 * is the only async-signal-safe action taken here. */
static void signal_handler(int sig)
{
terminate = 1; /* sig unused: one handler serves any hooked signal */
}
/*
 * Create an AF_PACKET socket on interface `dev`, switch it to TPACKET_V3,
 * attach a memory-mapped RX ring, and bind it to that interface only.
 *
 * On success fills in `ring` (mapping base, per-block iovecs, geometry)
 * and returns the socket fd. On any failure the process exits: the
 * capture program is unusable without the ring, so there is nothing
 * to recover.
 */
static int create_packet_socket(struct capture_ring *ring, const char *dev)
{
    int sock, err;
    unsigned int idx;                /* unsigned: compared against tp_block_nr */
    int ver = TPACKET_V3;
    struct sockaddr_ll addr;
    unsigned int blk_size = 1 << 22; /* 4 MiB blocks (must be PAGE_SIZE * 2^n) */
    unsigned int frm_size = 1 << 11; /* 2 KiB frame upper bound (V3 frames are variable) */
    unsigned int blk_cnt = 64;
    unsigned int ifindex;

    /* Resolve the interface first so a bad name fails fast:
     * if_nametoindex() returns 0 for an unknown interface, and binding
     * to sll_ifindex 0 would silently capture on ALL interfaces. */
    ifindex = if_nametoindex(dev);
    if (ifindex == 0) {
        perror("if_nametoindex");
        exit(EXIT_FAILURE);
    }

    sock = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (sock < 0) {
        perror("socket");
        exit(EXIT_FAILURE);
    }

    /* The version must be selected before PACKET_RX_RING is requested. */
    err = setsockopt(sock, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
    if (err < 0) {
        perror("setsockopt version");
        exit(EXIT_FAILURE);
    }

    memset(&ring->params, 0, sizeof(ring->params));
    ring->params.tp_block_size = blk_size;
    ring->params.tp_frame_size = frm_size;
    ring->params.tp_block_nr = blk_cnt;
    ring->params.tp_frame_nr = (blk_size * blk_cnt) / frm_size;
    ring->params.tp_retire_blk_tov = 60;  /* retire part-filled blocks after 60 ms */
    ring->params.tp_feature_req_word = TP_FT_REQ_FILL_RXHASH; /* kernel fills tp_rxhash */

    err = setsockopt(sock, SOL_PACKET, PACKET_RX_RING, &ring->params, sizeof(ring->params));
    if (err < 0) {
        perror("setsockopt rx_ring");
        exit(EXIT_FAILURE);
    }

    /* One contiguous mapping covering every block; MAP_LOCKED pins the
     * pages so the hot path never takes a major page fault. */
    ring->base_addr = mmap(NULL, ring->params.tp_block_size * ring->params.tp_block_nr,
                           PROT_READ | PROT_WRITE, MAP_SHARED | MAP_LOCKED, sock, 0);
    if (ring->base_addr == MAP_FAILED) {
        perror("mmap");
        exit(EXIT_FAILURE);
    }

    /* Real error check instead of assert(): assert() is compiled out
     * under -DNDEBUG, which would turn OOM into a NULL dereference. */
    ring->vectors = malloc(ring->params.tp_block_nr * sizeof(*ring->vectors));
    if (ring->vectors == NULL) {
        fprintf(stderr, "malloc: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (idx = 0; idx < ring->params.tp_block_nr; idx++) {
        ring->vectors[idx].iov_base = ring->base_addr + (idx * ring->params.tp_block_size);
        ring->vectors[idx].iov_len = ring->params.tp_block_size;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sll_family = PF_PACKET;
    addr.sll_protocol = htons(ETH_P_ALL);
    addr.sll_ifindex = ifindex;
    err = bind(sock, (struct sockaddr *)&addr, sizeof(addr));
    if (err < 0) {
        perror("bind");
        exit(EXIT_FAILURE);
    }
    return sock;
}
/*
 * Print a one-line summary of a captured packet: IPv4 "src -> dst" when
 * the frame is IPv4 and the capture is long enough to hold an IP header,
 * followed by the kernel-computed receive hash (populated because the
 * ring was configured with TP_FT_REQ_FILL_RXHASH).
 */
static void show_packet(struct tpacket3_hdr *hdr)
{
    struct ethhdr *eth = (struct ethhdr *)((uint8_t *)hdr + hdr->tp_mac);

    /* Guard against truncated captures: only dereference the IP header
     * when the snapshot actually contains Ethernet + a minimal IPv4
     * header (the original code read iphdr fields unconditionally
     * after the EtherType check). */
    if (eth->h_proto == htons(ETH_P_IP) &&
        hdr->tp_snaplen >= ETH_HLEN + sizeof(struct iphdr)) {
        struct iphdr *ip = (struct iphdr *)((uint8_t *)eth + ETH_HLEN);
        char src[INET_ADDRSTRLEN], dst[INET_ADDRSTRLEN];

        inet_ntop(AF_INET, &ip->saddr, src, sizeof(src));
        inet_ntop(AF_INET, &ip->daddr, dst, sizeof(dst));
        printf("%s -> %s, ", src, dst);
    }
    printf("rxhash: 0x%x\n", hdr->hv1.tp_rxhash);
}
/*
 * Walk every packet in a USER-owned block, printing each one and
 * accumulating the global packet/byte totals. Packets within a block
 * are chained by tp_next_offset.
 */
static void scan_block(struct block_header *blk, int blk_idx)
{
    int total = blk->info.num_pkts;
    unsigned long seen = 0;
    struct tpacket3_hdr *cur =
        (struct tpacket3_hdr *)((uint8_t *)blk + blk->info.offset_to_first_pkt);
    int n;

    (void)blk_idx; /* retained for interface compatibility; unused */

    for (n = 0; n < total; n++) {
        seen += cur->tp_snaplen;
        show_packet(cur);
        cur = (struct tpacket3_hdr *)((uint8_t *)cur + cur->tp_next_offset);
    }
    pkt_count += total;
    byte_count += seen;
}
/* Hand a fully-consumed block back to the kernel so it can be refilled. */
static void return_block(struct block_header *blk)
{
blk->info.block_status = TP_STATUS_KERNEL;
}
/*
 * Release all capture resources: unmap the RX ring, free the iovec
 * table, and close the socket. Freed pointers are cleared so an
 * accidental second call cannot double-free or re-unmap.
 */
static void teardown(struct capture_ring *ring, int sock)
{
    munmap(ring->base_addr, ring->params.tp_block_size * ring->params.tp_block_nr);
    ring->base_addr = NULL;
    free(ring->vectors);
    ring->vectors = NULL;
    close(sock);
}
/*
 * Capture loop: rotate through the ring's blocks in order, sleeping in
 * poll() until the kernel hands the next block to userspace, then scan
 * and return it. SIGINT requests shutdown; final statistics come from
 * PACKET_STATISTICS.
 */
int main(int argc, char **argv)
{
    int sock, err;
    socklen_t len;
    struct capture_ring ring;
    struct pollfd pfd;
    unsigned int curr_blk = 0;
    struct block_header *blk;
    struct tpacket_stats_v3 stats;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s <interface>\n", argv[0]);
        return EXIT_FAILURE;
    }
    signal(SIGINT, signal_handler);

    memset(&ring, 0, sizeof(ring));
    sock = create_packet_socket(&ring, argv[1]);

    memset(&pfd, 0, sizeof(pfd));
    pfd.fd = sock;
    pfd.events = POLLIN | POLLERR;

    while (likely(!terminate)) {
        blk = (struct block_header *)ring.vectors[curr_blk].iov_base;
        if ((blk->info.block_status & TP_STATUS_USER) == 0) {
            /* Block still kernel-owned: wait for data. SIGINT interrupts
             * poll() and the loop condition handles the exit. */
            poll(&pfd, 1, -1);
            continue;
        }
        scan_block(blk, curr_blk);
        return_block(blk);
        /* Use the ring's real geometry instead of a duplicated constant
         * (previously a hard-coded 64 that had to match blk_cnt inside
         * create_packet_socket). */
        curr_blk = (curr_blk + 1) % ring.params.tp_block_nr;
    }

    len = sizeof(stats);
    err = getsockopt(sock, SOL_PACKET, PACKET_STATISTICS, &stats, &len);
    if (err < 0) {
        perror("getsockopt statistics");
    } else {
        /* PRIu64 keeps the format portable: %lu is wrong for uint64_t
         * on 32-bit targets (inttypes.h is already included). */
        printf("\nReceived %u packets, %" PRIu64 " bytes, %u dropped\n",
               stats.tp_packets, byte_count, stats.tp_drops);
    }
    teardown(&ring, sock);
    return 0;
}
Key Implementation Details
Block Timeout Mechanism
The tp_retire_blk_tov field specifies a timeout in milliseconds. When triggered, the kernel transitions the block status from TP_STATUS_KERNEL to TP_STATUS_USER, handing the block to userspace even if it is not completely filled. This prevents stalls in low-traffic scenarios.
Receive Hash Population
Setting tp_feature_req_word to TP_FT_REQ_FILL_RXHASH requests the kernel to populate the receive hash (RSS hash) in each packet header. This facilitates hardware-accelerated flow classification and load balancing across processing threads.
Memory Locking
The MAP_LOCKED flag in mmap() prevents ring buffer pages from being swapped to disk, ensuring consistent low-latency access essential for high-speed capture.
Hardware Timestamping
To enable hardware timestamps:
int flags = SOF_TIMESTAMPING_RAW_HARDWARE;
setsockopt(sock, SOL_PACKET, PACKET_TIMESTAMP, &flags, sizeof(flags));
Processing Flow
The main loop implements a circular buffer over blocks. Within each block, packets form a linked list via tp_next_offset, allowing variable-length frames while maintaining cache-friendly contiguous memory access.
Multi-Core Scaling with Socket Fanout
While V3 improves single-thread efficiency, capturing multi-gigabit traffic requires parallel processing. Since kernel 3.1, AF_PACKET supports fanout groups, allowing multiple sockets to capture from the same interface with kernel-level load distribution.
A fanout group is identified by a 16-bit ID, allowing up to 65,536 distinct groups; the kernel limits a single group to 256 member sockets. Members join via setsockopt() with PACKET_FANOUT. The group persists until all member sockets close, with the kernel distributing packets according to configurable algorithms.
Distribution Algorithms
| Mode | Description | Characteristics |
|---|---|---|
PACKET_FANOUT_HASH |
Hash on 5-tuple (IP addresses, ports) | Flow-aware, preserves session affinity (default) |
PACKET_FANOUT_LB |
Round-robin distribution | Simple load balancing, stateless |
PACKET_FANOUT_RND |
Pseudo-random selection | Stateless processing |
PACKET_FANOUT_CPU |
Bind to receiving CPU | NUMA-aware architectures |
PACKET_FANOUT_ROLLOVER |
Failover on congestion | High-reliability capture |
PACKET_FANOUT_QM |
Map hardware queue to socket (3.14+) | RSS queue affinity |
The PACKET_FANOUT_HASH mode ensures all packets belonging to the same flow arrive at the same socket, preserving flow state for stateful analysis.
Fanout Implementation
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <net/if.h>
/* Capture interface name (argv[1]); inherited by forked workers. */
static const char *interface;
/* PACKET_FANOUT algorithm selector (PACKET_FANOUT_HASH or _LB). */
static int fanout_alg;
/* 16-bit fanout group ID; every worker joins the same group. */
static int group_ident;
/* Fallback definitions for toolchains whose headers predate fanout. */
#ifndef PACKET_FANOUT
#define PACKET_FANOUT 18
#define PACKET_FANOUT_HASH 0
#define PACKET_FANOUT_LB 1
#endif
/*
 * Open a raw AF_PACKET socket bound to `interface` and join the shared
 * fanout group (`group_ident`) using the selected algorithm
 * (`fanout_alg`).
 *
 * Returns the socket fd on success, or -1 on error. The fd is now
 * closed on every failure path (the original leaked it after a failed
 * ioctl/bind/setsockopt).
 */
static int setup_fanout_sock(void)
{
    int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_IP));
    struct sockaddr_ll addr;
    struct ifreq ifr;
    int config;

    if (fd < 0) {
        perror("socket");
        return -1;
    }

    memset(&ifr, 0, sizeof(ifr));
    strncpy(ifr.ifr_name, interface, IFNAMSIZ - 1);
    if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) {
        perror("ioctl");
        goto fail;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sll_family = AF_PACKET;
    addr.sll_ifindex = ifr.ifr_ifindex;
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        goto fail;
    }

    /* High 16 bits: algorithm, Low 16 bits: group ID */
    config = (group_ident & 0xFFFF) | (fanout_alg << 16);
    if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &config, sizeof(config)) < 0) {
        perror("setsockopt fanout");
        goto fail;
    }
    return fd;

fail:
    close(fd);
    return -1;
}
/*
 * Child-process body: open a fanout member socket, consume exactly
 * 10000 packets (reporting progress every 100), then exit. Never
 * returns to the caller.
 */
static void worker_proc(void)
{
    char frame[1600];
    int left = 10000;
    int fd = setup_fanout_sock();

    if (fd < 0)
        exit(EXIT_FAILURE);

    while (left > 0) {
        left--;
        if (read(fd, frame, sizeof(frame)) < 0) {
            perror("read");
            exit(EXIT_FAILURE);
        }
        if ((left % 100) == 0)
            printf("PID %d: %d packets remaining\n", getpid(), left);
    }

    printf("PID %d completed capture\n", getpid());
    close(fd);
    exit(EXIT_SUCCESS);
}
/*
 * Parse the interface and distribution mode, spawn four worker
 * processes sharing one fanout group, and wait for all of them.
 */
int main(int argc, char **argv)
{
    int w;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s <interface> {hash|lb}\n", argv[0]);
        return EXIT_FAILURE;
    }

    if (strcmp(argv[2], "hash") == 0) {
        fanout_alg = PACKET_FANOUT_HASH;
    } else if (strcmp(argv[2], "lb") == 0) {
        fanout_alg = PACKET_FANOUT_LB;
    } else {
        fprintf(stderr, "Unknown mode: %s\n", argv[2]);
        return EXIT_FAILURE;
    }

    interface = argv[1];
    group_ident = getpid() & 0xFFFF; /* PID-derived ID avoids clashing groups */

    for (w = 0; w < 4; w++) {
        pid_t child = fork();
        if (child == 0) {
            worker_proc(); /* child: never returns */
        } else if (child < 0) {
            perror("fork");
            exit(EXIT_FAILURE);
        }
    }

    for (w = 0; w < 4; w++)
        wait(NULL);
    return 0;
}
The config value combines the 16-bit group identifier with the algorithm selector. All sockets sharing the same group ID receive distributed packets according to the selected algorithm, enabling linear scaling across CPU cores.