为了更高的性能,需要将 XDP 程序下沉到网卡驱动里去运行。
因为服务器使用的物理网卡是 Mellanox,所以就研究一下 Mellanox 驱动里是怎么运行 XDP 程序的。
直接在内核源代码的 drivers/net/ethernet/mellanox 目录下搜索 XDP_REDIRECT,就能找到如下代码片段:
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
/*
 * Run the XDP program on one received buffer and act on the returned action.
 *
 * @rq:   receive queue; its xdpsq, netdev, flags and stats are used here
 * @page: page passed to the XDP_TX transmit helper and DMA-unmapped on redirect
 * @prog: XDP program to run
 * @xdp:  buffer descriptor handed to the program
 *
 * Returns true if the packet was consumed by XDP (every action other than
 * XDP_PASS, including failures), false for XDP_PASS.
 */
bool mlx5e_xdp_handle(struct mlx5e_rq *rq, struct page *page,
struct bpf_prog *prog, struct xdp_buff *xdp)
{
u32 act;
int err;
/* The program's return value is the action code dispatched on below. */
act = bpf_prog_run_xdp(prog, xdp);
switch (act) {
case XDP_PASS:
return false;
case XDP_TX:
/* A failed transmit takes the abort path: traced, then counted as a drop. */
if (unlikely(!mlx5e_xmit_xdp_buff(rq->xdpsq, rq, page, xdp)))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags); /* non-atomic */
return true;
case XDP_REDIRECT:
/* When XDP enabled then page-refcnt==1 here */
err = xdp_do_redirect(rq->netdev, xdp, prog);
if (unlikely(err))
goto xdp_abort;
__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
__set_bit(MLX5E_RQ_FLAG_XDP_REDIRECT, rq->flags);
/* Buffers whose memory type is an XSK buffer pool are not unmapped here. */
if (xdp->rxq->mem.type != MEM_TYPE_XSK_BUFF_POOL)
mlx5e_page_dma_unmap(rq, page);
rq->stats->xdp_redirect++;
return true;
default:
/* Unrecognized action code: warn, then treat it like XDP_ABORTED. */
bpf_warn_invalid_xdp_action(rq->netdev, prog, act);
fallthrough;
case XDP_ABORTED:
/* Also the target of the goto on XDP_TX / XDP_REDIRECT failure. */
xdp_abort:
trace_xdp_exception(rq->netdev, prog, act);
fallthrough;
case XDP_DROP:
/* Aborted and unrecognized actions are counted as drops as well. */
rq->stats->xdp_drop++;
return true;
}
}
bpf_prog_run_xdp()
就是真实运行 XDP 程序的函数。
如果 XDP 程序返回 XDP_PASS,将该网络包放行到内核,Mellanox 网卡驱动还做了哪些处理呢?
有 3 个地方调了 mlx5e_xdp_handle()
函数。
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
// 第 1 个
/*
 * Excerpt (elided parts are marked with "// ..."): turn a received packet into
 * an skb, running the queue's XDP program first when one is attached.
 *
 * Returns NULL when mlx5e_xdp_handle() reports the packet as consumed;
 * otherwise the skb produced by mlx5e_build_linear_skb() in the code shown.
 */
static struct sk_buff *
mlx5e_skb_from_cqe_linear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt)
{
// ...
net_prefetch(data);
prog = rcu_dereference(rq->xdp_prog);
/* No program attached: skip straight to building the skb. */
if (prog) {
struct xdp_buff xdp;
net_prefetchw(va); /* xdp_frame data area */
mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
if (mlx5e_xdp_handle(rq, au->page, prog, &xdp))
return NULL; /* page/packet was consumed by XDP */
/*
 * XDP_PASS: recompute headroom, metadata size and packet length from the
 * xdp_buff pointers instead of reusing the values from before the program ran.
 */
rx_headroom = xdp.data - xdp.data_hard_start;
metasize = xdp.data - xdp.data_meta;
cqe_bcnt = xdp.data_end - xdp.data;
}
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
// ...
return skb;
}
// 第 2 个
/*
 * Excerpt (elided parts are marked with "// ..."): second call site of
 * mlx5e_xdp_handle(). The xdp_buff and head_wi are set up in the elided part.
 *
 * Returns NULL when XDP consumed the packet; otherwise the skb built from the
 * xdp_buff pointers in the code shown.
 */
static struct sk_buff *
mlx5e_skb_from_cqe_nonlinear(struct mlx5e_rq *rq, struct mlx5e_wqe_frag_info *wi,
u32 cqe_bcnt)
{
// ...
prog = rcu_dereference(rq->xdp_prog);
if (prog && mlx5e_xdp_handle(rq, au->page, prog, &xdp)) {
/*
 * Consumed with the XDP_XMIT flag set: call mlx5e_put_rx_frag() on every
 * fragment from index (wi - head_wi) up to rq->wqe.info.num_frags.
 */
if (test_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags)) {
int i;
for (i = wi - head_wi; i < rq->wqe.info.num_frags; i++)
mlx5e_put_rx_frag(rq, &head_wi[i], true);
}
return NULL; /* page/packet was consumed by XDP */
}
/* Headroom, length and metadata size are all derived from the xdp_buff pointers. */
skb = mlx5e_build_linear_skb(rq, xdp.data_hard_start, rq->buff.frame0_sz,
xdp.data - xdp.data_hard_start,
xdp.data_end - xdp.data,
xdp.data - xdp.data_meta);
// ...
return skb;
}
// 第 3 个
/*
 * Excerpt (elided parts are marked with "// ..."): third call site of
 * mlx5e_xdp_handle(), same shape as mlx5e_skb_from_cqe_linear() apart from the
 * bookkeeping done when XDP consumes the packet.
 *
 * Returns NULL when XDP consumed the packet; otherwise the skb produced by
 * mlx5e_build_linear_skb() in the code shown.
 */
static struct sk_buff *
mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
u16 cqe_bcnt, u32 head_offset, u32 page_idx)
{
// ...
net_prefetch(data);
prog = rcu_dereference(rq->xdp_prog);
if (prog) {
struct xdp_buff xdp;
net_prefetchw(va); /* xdp_frame data area */
mlx5e_fill_xdp_buff(rq, va, rx_headroom, cqe_bcnt, &xdp);
if (mlx5e_xdp_handle(rq, au->page, prog, &xdp)) {
/*
 * If the handler set the XDP_XMIT flag, clear it again and record this
 * page index in wi->xdp_xmit_bitmap instead.
 */
if (__test_and_clear_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags))
__set_bit(page_idx, wi->xdp_xmit_bitmap); /* non-atomic */
return NULL; /* page/packet was consumed by XDP */
}
/* XDP_PASS: recompute headroom, metadata size and length from the xdp_buff pointers. */
rx_headroom = xdp.data - xdp.data_hard_start;
metasize = xdp.data - xdp.data_meta;
cqe_bcnt = xdp.data_end - xdp.data;
}
frag_size = MLX5_SKB_FRAG_SZ(rx_headroom + cqe_bcnt);
skb = mlx5e_build_linear_skb(rq, va, frag_size, rx_headroom, cqe_bcnt, metasize);
// ...
return skb;
}
/*
 * Build an skb on top of an existing buffer.
 *
 * @rq:        receive queue; only its buff_alloc_err counter is touched here
 * @va:        start of the buffer passed to build_skb()
 * @frag_size: buffer size passed to build_skb()
 * @headroom:  bytes reserved in front of the packet data
 * @cqe_bcnt:  packet length in bytes
 * @metasize:  metadata length; 0 means no metadata is recorded on the skb
 *
 * Returns the skb, or NULL if build_skb() fails (buff_alloc_err is incremented).
 */
static inline
struct sk_buff *mlx5e_build_linear_skb(struct mlx5e_rq *rq, void *va,
u32 frag_size, u16 headroom,
u32 cqe_bcnt, u32 metasize)
{
struct sk_buff *skb = build_skb(va, frag_size);
if (unlikely(!skb)) {
rq->stats->buff_alloc_err++;
return NULL;
}
/* Apply headroom first, then the packet length, then the optional metadata size. */
skb_reserve(skb, headroom);
skb_put(skb, cqe_bcnt);
if (metasize)
skb_metadata_set(skb, metasize);
return skb;
}
从上面的代码片段可以看出,XDP 程序返回 XDP_PASS 之后,这 3 处最终都调用 mlx5e_build_linear_skb() 函数来构建 skb。
如果 XDP 程序返回 XDP_TX,将该网络包发送出去,Mellanox 网卡驱动还做了哪些处理呢?
// ${KERNEL}/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
mlx5e_xmit_xdp_buff()
|-->mlx5e_xdpi_fifo_push()
网络包直接在驱动内部通过该接收队列关联的 XDP 发送队列(rq->xdpsq)发送出去了;mlx5e_xdpi_fifo_push() 则是把该包的信息压入 xdpi FIFO,供发送完成时处理使用。
如果 XDP 程序返回 XDP_REDIRECT,转发该网络包,Mellanox 网卡驱动还做了哪些处理呢?
驱动里调 xdp_do_redirect() 进行了转发处理。
// ${KERNEL}/net/core/filter.c
/*
 * Excerpt (elided parts are marked with "// ..."): in the code shown, the
 * xdp_buff is converted to an xdp_frame and handed to
 * __xdp_do_redirect_frame(), whose return value is returned unchanged.
 * `ri` is set up in the elided part.
 */
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog)
{
// ...
return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp),
xdp_prog);
}
/*
 * Excerpt (elided parts are marked with "// ..."): only the
 * BPF_MAP_TYPE_UNSPEC case of a switch is shown; `map_id`, `fwd` and `err`
 * are declared in the elided part.
 */
static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
struct net_device *dev,
struct xdp_frame *xdpf,
struct bpf_prog *xdp_prog)
{
// ...
case BPF_MAP_TYPE_UNSPEC:
/* NOTE(review): map_id == INT_MAX presumably marks a redirect by ifindex rather than via a map - confirm. */
if (map_id == INT_MAX) {
/* Look up the target device by ri->tgt_index in the network namespace of dev. */
fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
if (unlikely(!fwd)) {
err = -EINVAL;
break;
}
/* dev is passed along as the receiving device. */
err = dev_xdp_enqueue(fwd, xdpf, dev);
break;
}
fallthrough;
// ...
}
// ${KERNEL}/kernel/bpf/devmap.c
/*
 * Enqueue an xdp_frame for transmission on dev.
 * Thin wrapper around __xdp_enqueue() that passes NULL as the XDP program.
 *
 * @dev:    target device
 * @xdpf:   frame to enqueue
 * @dev_rx: device passed through as the receiving device
 *
 * Returns the result of __xdp_enqueue().
 */
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
/*
 * Validate the target device and queue the frame with bq_enqueue().
 *
 * Returns 0 on success, -EOPNOTSUPP if the target device has no ndo_xdp_xmit
 * callback, or the error returned by xdp_ok_fwd_dev().
 */
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx,
struct bpf_prog *xdp_prog)
{
int err;
/* The target driver must implement ndo_xdp_xmit to accept redirected frames. */
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
/* The check is given the target device and the frame length. */
err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
if (unlikely(err))
return err;
bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
// ${KERNEL}/include/net/xdp.h
/*
 * Fill an xdp_frame from the current state of an xdp_buff.
 *
 * @xdp:       source buffer descriptor
 * @xdp_frame: frame structure to fill
 *
 * Returns 0 on success, or -ENOSPC when the headroom left after the metadata
 * is smaller than struct xdp_frame, or when data_end lies beyond
 * xdp_data_hard_end().
 */
static inline
int xdp_update_frame_from_buff(struct xdp_buff *xdp,
struct xdp_frame *xdp_frame)
{
int metasize, headroom;
/* Assure headroom is available for storing info */
headroom = xdp->data - xdp->data_hard_start;
metasize = xdp->data - xdp->data_meta;
/* A negative difference is treated as "no metadata". */
metasize = metasize > 0 ? metasize : 0;
/*
 * NOTE(review): the left-hand side is int and is compared against a size_t,
 * so a negative difference would be converted to unsigned - confirm this
 * cannot occur.
 */
if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
return -ENOSPC;
/* Catch if driver didn't reserve tailroom for skb_shared_info */
if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) {
XDP_WARN("Driver BUG: missing reserved tailroom");
return -ENOSPC;
}
xdp_frame->data = xdp->data;
xdp_frame->len = xdp->data_end - xdp->data;
/* The recorded headroom excludes the bytes taken by struct xdp_frame itself. */
xdp_frame->headroom = headroom - sizeof(*xdp_frame);
xdp_frame->metasize = metasize;
xdp_frame->frame_sz = xdp->frame_sz;
xdp_frame->flags = xdp->flags;
return 0;
}
/*
 * Convert xdp_buff to xdp_frame.
 *
 * The xdp_frame is placed at xdp->data_hard_start, i.e. inside the buffer's
 * own headroom. Buffers whose memory type is MEM_TYPE_XSK_BUFF_POOL are
 * handed to xdp_convert_zc_to_xdp_frame() instead.
 *
 * Returns the frame, or NULL if xdp_update_frame_from_buff() fails.
 */
static inline
struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp)
{
struct xdp_frame *xdp_frame;
if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
return xdp_convert_zc_to_xdp_frame(xdp);
/* Store info in top of packet */
xdp_frame = xdp->data_hard_start;
if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0))
return NULL;
/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
xdp_frame->mem = xdp->rxq->mem;
return xdp_frame;
}
以上代码片段的主要处理逻辑:
1. 将 xdp_buff 转为 xdp_frame。
2. 经 bq_enqueue() 入队后,由目标设备的 ndo_xdp_xmit() 函数将 xdp_frame 发送出去。
关于 XDP_REDIRECT 的更多讲解,请看:
对于 XDP_DROP 和 XDP_ABORTED,Mellanox 驱动没有复杂的处理逻辑:
rq->stats->xdp_drop++;
只是递增了 xdp_drop 统计(XDP_ABORTED 还会先调用 trace_xdp_exception() 记录一次异常)。
bpf_xdp_adjust_head()
// ${KERNEL}/net/core/filter.c
/*
 * BPF helper: move xdp->data by offset bytes, together with any metadata in
 * front of it.
 *
 * Returns 0 on success, -EINVAL if the new data pointer falls below the
 * lowest allowed start or leaves fewer than ETH_HLEN bytes before data_end.
 */
BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
{
/* The first sizeof(struct xdp_frame) bytes of the buffer are kept out of reach. */
void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
unsigned long metalen = xdp_get_metalen(xdp);
/* Lowest allowed data pointer: it must still leave room for the metadata. */
void *data_start = xdp_frame_end + metalen;
void *data = xdp->data + offset;
if (unlikely(data < data_start ||
data > xdp->data_end - ETH_HLEN))
return -EINVAL;
/* Move the metadata bytes by the same offset; memmove tolerates overlap. */
if (metalen)
memmove(xdp->data_meta + offset,
xdp->data_meta, metalen);
xdp->data_meta += offset;
xdp->data = data;
return 0;
}
注意其中一个细节:如果有 metadata,metadata 会随 data 指针一起被 memmove 到新位置(仍然紧贴在新的 data 之前),而不会被覆盖。
bpf_xdp_adjust_meta()
// ${KERNEL}/net/core/filter.c
/*
 * BPF helper: move xdp->data_meta by offset bytes; only data_meta is changed.
 *
 * Returns 0 on success, -ENOTSUPP if xdp_data_meta_unsupported() is true,
 * -EINVAL if the new pointer falls outside [xdp_frame_end, xdp->data],
 * -EACCES if xdp_metalen_invalid() rejects the resulting length.
 */
BPF_CALL_2(bpf_xdp_adjust_meta, struct xdp_buff *, xdp, int, offset)
{
/* The first sizeof(struct xdp_frame) bytes of the buffer are kept out of reach. */
void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
void *meta = xdp->data_meta + offset;
/* Metadata length that would result from the move. */
unsigned long metalen = xdp->data - meta;
if (xdp_data_meta_unsupported(xdp))
return -ENOTSUPP;
if (unlikely(meta < xdp_frame_end ||
meta > xdp->data))
return -EINVAL;
if (unlikely(xdp_metalen_invalid(metalen)))
return -EACCES;
xdp->data_meta = meta;
return 0;
}
更详细的讲解请看:
bpf_xdp_adjust_tail()
// ${KERNEL}/net/core/filter.c
/*
 * Excerpt (elided parts are marked with "// ..."): BPF helper that moves
 * xdp->data_end by offset bytes. The bounds checks for the linear case are in
 * the elided part.
 *
 * For a buffer with fragments the work is delegated to
 * bpf_xdp_frags_shrink_tail() / bpf_xdp_frags_increase_tail() and their
 * result is returned; the linear path shown returns 0.
 */
BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset)
{
void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */
void *data_end = xdp->data_end + offset;
if (unlikely(xdp_buff_has_frags(xdp))) { /* non-linear xdp buff */
/* Negative offset shrinks (passed on as a positive length), otherwise grow. */
if (offset < 0)
return bpf_xdp_frags_shrink_tail(xdp, -offset);
return bpf_xdp_frags_increase_tail(xdp, offset);
}
// ...
/* Clear memory area on grow, can contain uninit kernel memory */
if (offset > 0)
memset(xdp->data_end, 0, offset);
xdp->data_end = data_end;
return 0;
}
Q:经过 XDP adjust 后的网络包,能否 PASS 到内核?
A:可以。回头看 XDP_PASS on Mellanox 的处理逻辑,在调 mlx5e_build_linear_skb() 构建 skb 时便处理好了 head、meta 和 tail。
Q:经过 XDP adjust 后的网络包,在 REDIRECT 时会失去 meta 吗?
A:不会。以 veth 虚拟设备为例,veth 网卡驱动在将 xdp_frame 转为 skb 时,会通过 __xdp_build_skb_from_frame() 调 skb_metadata_set() 设置 meta 信息(意即,meta 信息可以跨设备传递):
// ${KERNEL}/net/core/xdp.c
/*
 * Excerpt (elided parts are marked with "// ..."): build an skb around the
 * memory of an xdp_frame. `headroom`, `frame_size` and `hard_start` are
 * declared in the elided part.
 *
 * Returns the skb, or NULL if build_skb_around() fails.
 */
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *skb,
struct net_device *dev)
{
// ...
/* Part of headroom was reserved to xdpf */
headroom = sizeof(*xdpf) + xdpf->headroom;
/* Memory size backing xdp_frame data already have reserved
 * room for build_skb to place skb_shared_info in tailroom.
 */
frame_size = xdpf->frame_sz;
/* Start of the buffer: the full headroom (including xdpf itself) before the data. */
hard_start = xdpf->data - headroom;
skb = build_skb_around(skb, hard_start, frame_size);
if (unlikely(!skb))
return NULL;
skb_reserve(skb, headroom);
__skb_put(skb, xdpf->len);
/* Carry the frame's metadata length over to the skb when it is non-zero. */
if (xdpf->metasize)
skb_metadata_set(skb, xdpf->metasize);
// ...
return skb;
}
将 XDP on Mellanox 研究透彻后,就不再害怕将 XDP 程序下沉到 Mellanox 驱动去运行时遇到的各种 corner case 了。