From 75fb890c73a77110a87fd346d53012aaa0d7b3e1 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Wed, 25 Feb 2026 15:21:07 +0100 Subject: [PATCH 01/11] libeth: add Tx buffer completion helpers jira KERNEL-168 commit-author Alexander Lobakin commit 080d72f471c86f8906845bc822051f5790d0a90d Software-side Tx buffers for storing DMA, frame size, skb pointers etc. are pretty much generic and every driver defines them the same way. The same can be said for software Tx completions -- same napi_consume_skb()s and all that... Add a couple simple wrappers for doing that to stop repeating the old tale at least within the Intel code. Drivers are free to use 'priv' member at the end of the structure. Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen (cherry picked from commit 080d72f471c86f8906845bc822051f5790d0a90d) Signed-off-by: Roxana Nicolescu --- include/net/libeth/tx.h | 129 +++++++++++++++++++++++++++++++++++++ include/net/libeth/types.h | 25 +++++++ 2 files changed, 154 insertions(+) create mode 100644 include/net/libeth/tx.h create mode 100644 include/net/libeth/types.h diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h new file mode 100644 index 0000000000000..35614f9523f60 --- /dev/null +++ b/include/net/libeth/tx.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_TX_H +#define __LIBETH_TX_H + +#include <linux/skbuff.h> + +#include <net/libeth/types.h> + +/* Tx buffer completion */ + +/** + * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion + * @LIBETH_SQE_EMPTY: unused/empty, no action required + * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required + * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() + * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA + * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + */ +enum libeth_sqe_type { + LIBETH_SQE_EMPTY = 0U, + LIBETH_SQE_CTX, + LIBETH_SQE_SLAB, 
LIBETH_SQE_FRAG, + LIBETH_SQE_SKB, +}; + +/** + * struct libeth_sqe - represents a Send Queue Element / Tx buffer + * @type: type of the buffer, see the enum above + * @rs_idx: index of the last buffer from the batch this one was sent in + * @raw: slab buffer to free via kfree() + * @skb: &sk_buff to consume + * @dma: DMA address to unmap + * @len: length of the mapped region to unmap + * @nr_frags: number of frags in the frame this buffer belongs to + * @packets: number of physical packets sent for this frame + * @bytes: number of physical bytes sent for this frame + * @priv: driver-private scratchpad + */ +struct libeth_sqe { + enum libeth_sqe_type type:32; + u32 rs_idx; + + union { + void *raw; + struct sk_buff *skb; + }; + + DEFINE_DMA_UNMAP_ADDR(dma); + DEFINE_DMA_UNMAP_LEN(len); + + u32 nr_frags; + u32 packets; + u32 bytes; + + unsigned long priv; +} __aligned_largest; + +/** + * LIBETH_SQE_CHECK_PRIV - check the driver's private SQE data + * @p: type or name of the object the driver wants to fit into &libeth_sqe + * + * Make sure the driver's private data fits into libeth_sqe::priv. To be used + * right after its declaration. + */ +#define LIBETH_SQE_CHECK_PRIV(p) \ + static_assert(sizeof(p) <= sizeof_field(struct libeth_sqe, priv)) + +/** + * struct libeth_cq_pp - completion queue poll params + * @dev: &device to perform DMA unmapping + * @ss: onstack NAPI stats to fill + * @napi: whether it's called from the NAPI context + * + * libeth uses this structure to access objects needed for performing full + * Tx complete operation without passing lots of arguments and change the + * prototypes each time a new one is added. + */ +struct libeth_cq_pp { + struct device *dev; + struct libeth_sq_napi_stats *ss; + + bool napi; +}; + +/** + * libeth_tx_complete - perform Tx completion for one SQE + * @sqe: SQE to complete + * @cp: poll params + * + * Do Tx complete for all the types of buffers, incl. freeing, unmapping, + * updating the stats etc. 
+ */ +static inline void libeth_tx_complete(struct libeth_sqe *sqe, + const struct libeth_cq_pp *cp) +{ + switch (sqe->type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_SKB: + case LIBETH_SQE_FRAG: + case LIBETH_SQE_SLAB: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (sqe->type) { + case LIBETH_SQE_SKB: + cp->ss->packets += sqe->packets; + cp->ss->bytes += sqe->bytes; + + napi_consume_skb(sqe->skb, cp->napi); + break; + case LIBETH_SQE_SLAB: + kfree(sqe->raw); + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +#endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h new file mode 100644 index 0000000000000..603825e451339 --- /dev/null +++ b/include/net/libeth/types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2024 Intel Corporation */ + +#ifndef __LIBETH_TYPES_H +#define __LIBETH_TYPES_H + +#include <linux/types.h> + +/** + * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @raw: alias to access all the fields as an array + */ +struct libeth_sq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +#endif /* __LIBETH_TYPES_H */ From 78d98f864d04bf73b70bcf3f5d19fd9ea2399181 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 4 Sep 2024 17:47:44 +0200 Subject: [PATCH 02/11] idpf: convert to libeth Tx buffer completion jira KERNEL-168 commit-author Alexander Lobakin commit d9028db618a63e4bbe63eb56c0b0db2b4cb924bc upstream-diff | adjusted context due to missing #include <net/libeth/rx.h> introduced in commit 1b1b26208515 ("idpf: reuse libeth's definitions of parsed ptype structures") part of "convert RX to libeth" patchset. &idpf_tx_buffer is almost identical to the previous generations, as well as the way it's handled. 
Moreover, relying on dma_unmap_addr() and !!buf->skb instead of explicit defining of buffer's type was never good. Use the newly added libeth helpers to do it properly and reduce the copy-paste around the Tx code. Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen (cherry picked from commit d9028db618a63e4bbe63eb56c0b0db2b4cb924bc) Signed-off-by: Roxana Nicolescu Signed-off-by: Roxana Nicolescu --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 83 +++---- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 206 +++++++----------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 50 +---- 3 files changed, 107 insertions(+), 232 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 3cf493a51b8d0..82fbe926c12bf 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2023 Intel Corporation */ +#include <net/libeth/tx.h> + #include "idpf.h" /** @@ -222,6 +224,7 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); dma_unmap_addr_set(tx_buf, dma, dma); + tx_buf->type = LIBETH_SQE_FRAG; /* align size to end of page */ max_data += -dma & (IDPF_TX_MAX_READ_REQ_SIZE - 1); @@ -243,6 +246,8 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, i = 0; } + tx_q->tx_buf[i].type = LIBETH_SQE_EMPTY; + dma += max_data; size -= max_data; @@ -280,13 +285,13 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets, size, td_tag); - IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i); + first->type = LIBETH_SQE_SKB; + first->rs_idx = i; - /* set next_to_watch value indicating a packet is present */ - first->next_to_watch = tx_desc; + IDPF_SINGLEQ_BUMP_RING_IDX(tx_q, i); nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); 
- netdev_tx_sent_queue(nq, first->bytecount); + netdev_tx_sent_queue(nq, first->bytes); idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more()); } @@ -304,8 +309,7 @@ idpf_tx_singleq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_base_tx_ctx_desc *ctx_desc; int ntu = txq->next_to_use; - memset(&txq->tx_buf[ntu], 0, sizeof(struct idpf_tx_buf)); - txq->tx_buf[ntu].ctx_entry = true; + txq->tx_buf[ntu].type = LIBETH_SQE_CTX; ctx_desc = &txq->base_ctx[ntu]; @@ -399,11 +403,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, first->skb = skb; if (tso) { - first->gso_segs = offload.tso_segs; - first->bytecount = skb->len + ((first->gso_segs - 1) * offload.tso_hdr_len); + first->packets = offload.tso_segs; + first->bytes = skb->len + ((first->packets - 1) * offload.tso_hdr_len); } else { - first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); - first->gso_segs = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + first->packets = 1; } idpf_tx_singleq_map(tx_q, first, &offload); @@ -423,10 +427,15 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, int *cleaned) { - unsigned int total_bytes = 0, total_pkts = 0; + struct libeth_sq_napi_stats ss = { }; struct idpf_base_tx_desc *tx_desc; u32 budget = tx_q->clean_budget; s16 ntc = tx_q->next_to_clean; + struct libeth_cq_pp cp = { + .dev = tx_q->dev, + .ss = &ss, + .napi = napi_budget, + }; struct idpf_netdev_priv *np; struct idpf_tx_buf *tx_buf; struct netdev_queue *nq; @@ -444,47 +453,23 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, * such. We can skip this descriptor since there is no buffer * to clean. */ - if (tx_buf->ctx_entry) { - /* Clear this flag here to avoid stale flag values when - * this buffer is used for actual data in the future. - * There are cases where the tx_buf struct / the flags - * field will not be cleared before being reused. 
- */ - tx_buf->ctx_entry = false; + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) { + tx_buf->type = LIBETH_SQE_EMPTY; goto fetch_next_txq_desc; } - /* if next_to_watch is not set then no work pending */ - eop_desc = (struct idpf_base_tx_desc *)tx_buf->next_to_watch; - if (!eop_desc) - break; - - /* prevent any other reads prior to eop_desc */ + /* prevent any other reads prior to type */ smp_rmb(); + eop_desc = &tx_q->base_tx[tx_buf->rs_idx]; + /* if the descriptor isn't done, no work yet to do */ if (!(eop_desc->qw1 & cpu_to_le64(IDPF_TX_DESC_DTYPE_DESC_DONE))) break; - /* clear next_to_watch to prevent false hangs */ - tx_buf->next_to_watch = NULL; - /* update the statistics for this packet */ - total_bytes += tx_buf->bytecount; - total_pkts += tx_buf->gso_segs; - - napi_consume_skb(tx_buf->skb, napi_budget); - - /* unmap skb header data */ - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - - /* clear tx_buf data */ - tx_buf->skb = NULL; - dma_unmap_len_set(tx_buf, len, 0); + libeth_tx_complete(tx_buf, &cp); /* unmap remaining buffers */ while (tx_desc != eop_desc) { @@ -498,13 +483,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, } /* unmap any remaining paged data */ - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } + libeth_tx_complete(tx_buf, &cp); } /* update budget only if we did something */ @@ -524,11 +503,11 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, ntc += tx_q->desc_count; tx_q->next_to_clean = ntc; - *cleaned += total_pkts; + *cleaned += ss.packets; u64_stats_update_begin(&tx_q->stats_sync); - u64_stats_add(&tx_q->q_stats.packets, total_pkts); - u64_stats_add(&tx_q->q_stats.bytes, total_bytes); + u64_stats_add(&tx_q->q_stats.packets, ss.packets); + u64_stats_add(&tx_q->q_stats.bytes, 
ss.bytes); u64_stats_update_end(&tx_q->stats_sync); np = netdev_priv(tx_q->netdev); @@ -536,7 +515,7 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, dont_wake = np->state != __IDPF_VPORT_UP || !netif_carrier_ok(tx_q->netdev); - __netif_txq_completed_wake(nq, total_pkts, total_bytes, + __netif_txq_completed_wake(nq, ss.packets, ss.bytes, IDPF_DESC_UNUSED(tx_q), IDPF_TX_WAKE_THRESH, dont_wake); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index e163e54d1c31e..65cb09beb35e8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -1,9 +1,19 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2023 Intel Corporation */ +#include <net/libeth/tx.h> + #include "idpf.h" #include "idpf_virtchnl.h" +struct idpf_tx_stash { + struct hlist_node hlist; + struct libeth_sqe buf; +}; + +#define idpf_tx_buf_compl_tag(buf) (*(int *)&(buf)->priv) +LIBETH_SQE_CHECK_PRIV(int); + static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs, unsigned int count); @@ -58,41 +68,18 @@ void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue) } } -/** - * idpf_tx_buf_rel - Release a Tx buffer - * @tx_q: the queue that owns the buffer - * @tx_buf: the buffer to free - */ -static void idpf_tx_buf_rel(struct idpf_tx_queue *tx_q, - struct idpf_tx_buf *tx_buf) -{ - if (tx_buf->skb) { - if (dma_unmap_len(tx_buf, len)) - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dev_kfree_skb_any(tx_buf->skb); - } else if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - } - - tx_buf->next_to_watch = NULL; - tx_buf->skb = NULL; - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; - dma_unmap_len_set(tx_buf, len, 0); -} - /** * idpf_tx_buf_rel_all - Free any empty Tx buffers * @txq: queue to be cleaned */ 
static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) { + struct libeth_sq_napi_stats ss = { }; struct idpf_buf_lifo *buf_stack; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; u16 i; /* Buffers already cleared, nothing to do */ @@ -101,7 +88,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) /* Free all the Tx buffer sk_buffs */ for (i = 0; i < txq->desc_count; i++) - idpf_tx_buf_rel(txq, &txq->tx_buf[i]); + libeth_tx_complete(&txq->tx_buf[i], &cp); kfree(txq->tx_buf); txq->tx_buf = NULL; @@ -201,10 +188,6 @@ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) if (!tx_q->tx_buf) return -ENOMEM; - /* Initialize tx_bufs with invalid completion tags */ - for (i = 0; i < tx_q->desc_count; i++) - tx_q->tx_buf[i].compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; - if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) return 0; @@ -1651,37 +1634,6 @@ static void idpf_tx_handle_sw_marker(struct idpf_tx_queue *tx_q) wake_up(&vport->sw_marker_wq); } -/** - * idpf_tx_splitq_clean_hdr - Clean TX buffer resources for header portion of - * packet - * @tx_q: tx queue to clean buffer from - * @tx_buf: buffer to be cleaned - * @cleaned: pointer to stats struct to track cleaned packets/bytes - * @napi_budget: Used to determine if we are in netpoll - */ -static void idpf_tx_splitq_clean_hdr(struct idpf_tx_queue *tx_q, - struct idpf_tx_buf *tx_buf, - struct idpf_cleaned_stats *cleaned, - int napi_budget) -{ - napi_consume_skb(tx_buf->skb, napi_budget); - - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_single(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - - dma_unmap_len_set(tx_buf, len, 0); - } - - /* clear tx_buf data */ - tx_buf->skb = NULL; - - cleaned->bytes += tx_buf->bytecount; - cleaned->packets += tx_buf->gso_segs; -} - /** * idpf_tx_clean_stashed_bufs - clean bufs that were stored for * out of order completions @@ -1692,28 +1644,25 @@ static void idpf_tx_splitq_clean_hdr(struct idpf_tx_queue *tx_q, */ 
static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq, u16 compl_tag, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, int budget) { struct idpf_tx_stash *stash; struct hlist_node *tmp_buf; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = cleaned, + .napi = budget, + }; /* Buffer completion */ hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf, hlist, compl_tag) { - if (unlikely(stash->buf.compl_tag != (int)compl_tag)) + if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != + (int)compl_tag)) continue; - if (stash->buf.skb) { - idpf_tx_splitq_clean_hdr(txq, &stash->buf, cleaned, - budget); - } else if (dma_unmap_len(&stash->buf, len)) { - dma_unmap_page(txq->dev, - dma_unmap_addr(&stash->buf, dma), - dma_unmap_len(&stash->buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(&stash->buf, len, 0); - } + libeth_tx_complete(&stash->buf, &cp); /* Push shadow buf back onto stack */ idpf_buf_lifo_push(&txq->stash->buf_stack, stash); @@ -1733,8 +1682,7 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, { struct idpf_tx_stash *stash; - if (unlikely(!dma_unmap_addr(tx_buf, dma) && - !dma_unmap_len(tx_buf, len))) + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) return 0; stash = idpf_buf_lifo_pop(&txq->stash->buf_stack); @@ -1747,20 +1695,18 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, /* Store buffer params in shadow buffer */ stash->buf.skb = tx_buf->skb; - stash->buf.bytecount = tx_buf->bytecount; - stash->buf.gso_segs = tx_buf->gso_segs; + stash->buf.bytes = tx_buf->bytes; + stash->buf.packets = tx_buf->packets; + stash->buf.type = tx_buf->type; dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma)); dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len)); - stash->buf.compl_tag = tx_buf->compl_tag; + idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf); /* Add buffer to buf_hash table to be freed later */ 
hash_add(txq->stash->sched_buf_hash, &stash->hlist, - stash->buf.compl_tag); - - memset(tx_buf, 0, sizeof(struct idpf_tx_buf)); + idpf_tx_buf_compl_tag(&stash->buf)); - /* Reinitialize buf_id portion of tag */ - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; + tx_buf->type = LIBETH_SQE_EMPTY; return 0; } @@ -1796,12 +1742,17 @@ do { \ */ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, int napi_budget, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, bool descs_only) { union idpf_tx_flex_desc *next_pending_desc = NULL; union idpf_tx_flex_desc *tx_desc; s16 ntc = tx_q->next_to_clean; + struct libeth_cq_pp cp = { + .dev = tx_q->dev, + .ss = cleaned, + .napi = napi_budget, + }; struct idpf_tx_buf *tx_buf; tx_desc = &tx_q->flex_tx[ntc]; @@ -1817,13 +1768,10 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, * invalid completion tag since no buffer was used. We can * skip this descriptor since there is no buffer to clean. 
*/ - if (unlikely(tx_buf->compl_tag == IDPF_SPLITQ_TX_INVAL_COMPL_TAG)) + if (tx_buf->type <= LIBETH_SQE_CTX) goto fetch_next_txq_desc; - eop_desc = (union idpf_tx_flex_desc *)tx_buf->next_to_watch; - - /* clear next_to_watch to prevent false hangs */ - tx_buf->next_to_watch = NULL; + eop_desc = &tx_q->flex_tx[tx_buf->rs_idx]; if (descs_only) { if (idpf_stash_flow_sch_buffers(tx_q, tx_buf)) @@ -1840,8 +1788,7 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, } } } else { - idpf_tx_splitq_clean_hdr(tx_q, tx_buf, cleaned, - napi_budget); + libeth_tx_complete(tx_buf, &cp); /* unmap remaining buffers */ while (tx_desc != eop_desc) { @@ -1849,13 +1796,7 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, tx_desc, tx_buf); /* unmap any remaining paged data */ - if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(tx_q->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } + libeth_tx_complete(tx_buf, &cp); } } @@ -1891,30 +1832,26 @@ do { \ * this completion tag. 
*/ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, int budget) { u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; u16 ntc = txq->next_to_clean; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = cleaned, + .napi = budget, + }; u16 num_descs_cleaned = 0; u16 orig_idx = idx; tx_buf = &txq->tx_buf[idx]; + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) + return false; - while (tx_buf->compl_tag == (int)compl_tag) { - if (tx_buf->skb) { - idpf_tx_splitq_clean_hdr(txq, tx_buf, cleaned, budget); - } else if (dma_unmap_len(tx_buf, len)) { - dma_unmap_page(txq->dev, - dma_unmap_addr(tx_buf, dma), - dma_unmap_len(tx_buf, len), - DMA_TO_DEVICE); - dma_unmap_len_set(tx_buf, len, 0); - } - - memset(tx_buf, 0, sizeof(struct idpf_tx_buf)); - tx_buf->compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; + while (idpf_tx_buf_compl_tag(tx_buf) == (int)compl_tag) { + libeth_tx_complete(tx_buf, &cp); num_descs_cleaned++; idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); @@ -1961,7 +1898,7 @@ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, */ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, struct idpf_splitq_tx_compl_desc *desc, - struct idpf_cleaned_stats *cleaned, + struct libeth_sq_napi_stats *cleaned, int budget) { u16 compl_tag; @@ -2004,7 +1941,7 @@ static bool idpf_tx_clean_complq(struct idpf_compl_queue *complq, int budget, ntc -= complq->desc_count; do { - struct idpf_cleaned_stats cleaned_stats = { }; + struct libeth_sq_napi_stats cleaned_stats = { }; struct idpf_tx_queue *tx_q; int rel_tx_qid; u16 hw_head; @@ -2275,6 +2212,12 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, struct idpf_tx_buf *first, u16 idx) { + struct libeth_sq_napi_stats ss = { }; + struct libeth_cq_pp cp = { + .dev = txq->dev, + 
.ss = &ss, + }; + u64_stats_update_begin(&txq->stats_sync); u64_stats_inc(&txq->q_stats.dma_map_errs); u64_stats_update_end(&txq->stats_sync); @@ -2284,7 +2227,7 @@ void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, struct idpf_tx_buf *tx_buf; tx_buf = &txq->tx_buf[idx]; - idpf_tx_buf_rel(txq, tx_buf); + libeth_tx_complete(tx_buf, &cp); if (tx_buf == first) break; if (idx == 0) @@ -2373,7 +2316,8 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, if (dma_mapping_error(tx_q->dev, dma)) return idpf_tx_dma_map_error(tx_q, skb, first, i); - tx_buf->compl_tag = params->compl_tag; + idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; + tx_buf->type = LIBETH_SQE_FRAG; /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); @@ -2447,8 +2391,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, * simply pass over these holes and finish cleaning the * rest of the packet. */ - memset(&tx_q->tx_buf[i], 0, sizeof(struct idpf_tx_buf)); - tx_q->tx_buf[i].compl_tag = params->compl_tag; + tx_q->tx_buf[i].type = LIBETH_SQE_EMPTY; /* Adjust the DMA offset and the remaining size of the * fragment. 
On the first iteration of this loop, @@ -2493,19 +2436,19 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, /* record SW timestamp if HW timestamp is not available */ skb_tx_timestamp(skb); + first->type = LIBETH_SQE_SKB; + /* write last descriptor with RS and EOP bits */ + first->rs_idx = i; td_cmd |= params->eop_cmd; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); i = idpf_tx_splitq_bump_ntu(tx_q, i); - /* set next_to_watch value indicating a packet is present */ - first->next_to_watch = tx_desc; - tx_q->txq_grp->num_completions_pending++; /* record bytecount for BQL */ nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - netdev_tx_sent_queue(nq, first->bytecount); + netdev_tx_sent_queue(nq, first->bytes); idpf_tx_buf_hw_update(tx_q, i, netdev_xmit_more()); } @@ -2705,8 +2648,7 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_flex_tx_ctx_desc *desc; int i = txq->next_to_use; - memset(&txq->tx_buf[i], 0, sizeof(struct idpf_tx_buf)); - txq->tx_buf[i].compl_tag = IDPF_SPLITQ_TX_INVAL_COMPL_TAG; + txq->tx_buf[i].type = LIBETH_SQE_CTX; /* grab the next descriptor */ desc = &txq->flex_ctx[i]; @@ -2790,12 +2732,12 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, first->skb = skb; if (tso) { - first->gso_segs = tx_params.offload.tso_segs; - first->bytecount = skb->len + - ((first->gso_segs - 1) * tx_params.offload.tso_hdr_len); + first->packets = tx_params.offload.tso_segs; + first->bytes = skb->len + + ((first->packets - 1) * tx_params.offload.tso_hdr_len); } else { - first->gso_segs = 1; - first->bytecount = max_t(unsigned int, skb->len, ETH_ZLEN); + first->packets = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); } if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index f119f240d21cd..9ead6a55f9cbc 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ 
-137,7 +137,6 @@ do { \ (txq)->num_completions_pending - (txq)->complq->num_completions) #define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 -#define IDPF_SPLITQ_TX_INVAL_COMPL_TAG -1 /* Adjust the generation for the completion tag and wrap if necessary */ #define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \ ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \ @@ -155,47 +154,7 @@ union idpf_tx_flex_desc { struct idpf_flex_tx_sched_desc flow; /* flow based scheduling */ }; -/** - * struct idpf_tx_buf - * @next_to_watch: Next descriptor to clean - * @skb: Pointer to the skb - * @dma: DMA address - * @len: DMA length - * @bytecount: Number of bytes - * @gso_segs: Number of GSO segments - * @compl_tag: Splitq only, unique identifier for a buffer. Used to compare - * with completion tag returned in buffer completion event. - * Because the completion tag is expected to be the same in all - * data descriptors for a given packet, and a single packet can - * span multiple buffers, we need this field to track all - * buffers associated with this completion tag independently of - * the buf_id. The tag consists of a N bit buf_id and M upper - * order "generation bits". See compl_tag_bufid_m and - * compl_tag_gen_s in struct idpf_queue. We'll use a value of -1 - * to indicate the tag is not valid. - * @ctx_entry: Singleq only. Used to indicate the corresponding entry - * in the descriptor ring was used for a context descriptor and - * this buffer entry should be skipped. 
- */ -struct idpf_tx_buf { - void *next_to_watch; - struct sk_buff *skb; - DEFINE_DMA_UNMAP_ADDR(dma); - DEFINE_DMA_UNMAP_LEN(len); - unsigned int bytecount; - unsigned short gso_segs; - - union { - int compl_tag; - - bool ctx_entry; - }; -}; - -struct idpf_tx_stash { - struct hlist_node hlist; - struct idpf_tx_buf buf; -}; +#define idpf_tx_buf libeth_sqe /** * struct idpf_buf_lifo - LIFO for managing OOO completions @@ -589,11 +548,6 @@ struct idpf_tx_queue_stats { u64_stats_t dma_map_errs; }; -struct idpf_cleaned_stats { - u32 packets; - u32 bytes; -}; - #define IDPF_ITR_DYNAMIC 1 #define IDPF_ITR_MAX 0x1FE0 #define IDPF_ITR_20K 0x0032 @@ -771,7 +725,7 @@ struct idpf_tx_queue { void *desc_ring; }; - struct idpf_tx_buf *tx_buf; + struct libeth_sqe *tx_buf; struct idpf_txq_group *txq_grp; struct device *dev; void __iomem *tail; From 704f59f71153ee127ca10a8f9bd3a213aa4bd28d Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Wed, 25 Feb 2026 15:37:35 +0100 Subject: [PATCH 03/11] netdevice: add netdev_tx_reset_subqueue() shorthand jira KERNEL-168 commit-author Alexander Lobakin commit 3dc95a3edd0a86b4a59670b3fafcc64c7d83e2e7 Add a shorthand similar to other net*_subqueue() helpers for resetting the queue by its index w/o obtaining &netdev_tx_queue beforehand manually. 
Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen (cherry picked from commit 3dc95a3edd0a86b4a59670b3fafcc64c7d83e2e7) Signed-off-by: Roxana Nicolescu --- include/linux/netdevice.h | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cef072e7cfbbc..de45924a74c49 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3699,6 +3699,17 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) #endif } +/** + * netdev_tx_reset_subqueue - reset the BQL stats and state of a netdev queue + * @dev: network device + * @qid: stack index of the queue to reset + */ +static inline void netdev_tx_reset_subqueue(const struct net_device *dev, + u32 qid) +{ + netdev_tx_reset_queue(netdev_get_tx_queue(dev, qid)); +} + /** * netdev_reset_queue - reset the packets and bytes count of a network device * @dev_queue: network device @@ -3708,7 +3719,7 @@ static inline void netdev_tx_reset_queue(struct netdev_queue *q) */ static inline void netdev_reset_queue(struct net_device *dev_queue) { - netdev_tx_reset_queue(netdev_get_tx_queue(dev_queue, 0)); + netdev_tx_reset_subqueue(dev_queue, 0); } /** From 4a623ccb132f0fd8e16150bba1f9a1df0323de64 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Wed, 4 Sep 2024 17:47:46 +0200 Subject: [PATCH 04/11] idpf: refactor Tx completion routines jira KERNEL-168 commit-author Joshua Hay commit 24eb35b15152ed6a2473019413b86b8f1c9714be upstream-diff | adjusted context in .h file around struct idpf_compl_queue due to missing commit 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue vector structures") Add a mechanism to guard against stashing partial packets into the hash table to make the driver more robust, with more efficient decision making when cleaning. Don't stash partial packets. 
This can happen when an RE (Report Event) completion is received in flow scheduling mode, or when an out of order RS (Report Status) completion is received. The first buffer with the skb is stashed, but some or all of its frags are not because the stack is out of reserve buffers. This leaves the ring in a weird state since the frags are still on the ring. Use the field libeth_sqe::nr_frags to track the number of fragments/tx_bufs representing the packet. The clean routines check to make sure there are enough reserve buffers on the stack before stashing any part of the packet. If there are not, next_to_clean is left pointing to the first buffer of the packet that failed to be stashed. This leaves the whole packet on the ring, and the next time around, cleaning will start from this packet. An RS completion is still expected for this packet in either case. So instead of being cleaned from the hash table, it will be cleaned from the ring directly. This should all still be fine since the DESC_UNUSED and BUFS_UNUSED will reflect the state of the ring. If we ever fall below the thresholds, the TxQ will still be stopped, giving the completion queue time to catch up. This may lead to stopping the queue more frequently, but it guarantees the Tx ring will always be in a good state. Also, always use the idpf_tx_splitq_clean function to clean descriptors, i.e. use it from clean_buf_ring as well. This way we avoid duplicating the logic and make sure we're using the same reserve buffers guard rail. This does require a switch from the s16 next_to_clean overflow descriptor ring wrap calculation to u16 and the normal ring size check. 
Signed-off-by: Joshua Hay Reviewed-by: Przemek Kitszel Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen (cherry picked from commit 24eb35b15152ed6a2473019413b86b8f1c9714be) Signed-off-by: Roxana Nicolescu Signed-off-by: Roxana Nicolescu --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 24 +-- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 168 +++++++++++------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 6 +- 3 files changed, 122 insertions(+), 76 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 82fbe926c12bf..762c8af258522 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -238,15 +238,16 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, offsets, max_data, td_tag); - tx_desc++; - i++; - - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = &tx_q->tx_buf[0]; tx_desc = &tx_q->base_tx[0]; i = 0; + } else { + tx_buf++; + tx_desc++; } - tx_q->tx_buf[i].type = LIBETH_SQE_EMPTY; + tx_buf->type = LIBETH_SQE_EMPTY; dma += max_data; size -= max_data; @@ -260,12 +261,14 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, tx_desc->qw1 = idpf_tx_singleq_build_ctob(td_cmd, offsets, size, td_tag); - tx_desc++; - i++; - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = &tx_q->tx_buf[0]; tx_desc = &tx_q->base_tx[0]; i = 0; + } else { + tx_buf++; + tx_desc++; } size = skb_frag_size(frag); @@ -273,8 +276,6 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, dma = skb_frag_dma_map(tx_q->dev, frag, 0, size, DMA_TO_DEVICE); - - tx_buf = &tx_q->tx_buf[i]; } skb_tx_timestamp(first->skb); @@ -458,6 +459,9 @@ static bool idpf_tx_singleq_clean(struct idpf_tx_queue *tx_q, int napi_budget, goto fetch_next_txq_desc; } + if (unlikely(tx_buf->type != LIBETH_SQE_SKB)) + break; + /* prevent any other reads prior to type */ 
smp_rmb(); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 65cb09beb35e8..e5ea2e40d9215 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -11,8 +11,8 @@ struct idpf_tx_stash { struct libeth_sqe buf; }; -#define idpf_tx_buf_compl_tag(buf) (*(int *)&(buf)->priv) -LIBETH_SQE_CHECK_PRIV(int); +#define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv) +LIBETH_SQE_CHECK_PRIV(u32); static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs, unsigned int count); @@ -76,11 +76,13 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) { struct libeth_sq_napi_stats ss = { }; struct idpf_buf_lifo *buf_stack; + struct idpf_tx_stash *stash; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = &ss, }; - u16 i; + struct hlist_node *tmp; + u32 i, tag; /* Buffers already cleared, nothing to do */ if (!txq->tx_buf) @@ -100,6 +102,20 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) if (!buf_stack->bufs) return; + /* + * If a Tx timeout occurred, there are potentially still bufs in the + * hash table, free them here. 
+ */ + hash_for_each_safe(txq->stash->sched_buf_hash, tag, tmp, stash, + hlist) { + if (!stash) + continue; + + libeth_tx_complete(&stash->buf, &cp); + hash_del(&stash->hlist); + idpf_buf_lifo_push(buf_stack, stash); + } + for (i = 0; i < buf_stack->size; i++) kfree(buf_stack->bufs[i]); @@ -116,6 +132,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) static void idpf_tx_desc_rel(struct idpf_tx_queue *txq) { idpf_tx_buf_rel_all(txq); + netdev_tx_reset_subqueue(txq->netdev, txq->idx); if (!txq->desc_ring) return; @@ -1658,16 +1675,14 @@ static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq, /* Buffer completion */ hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf, hlist, compl_tag) { - if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != - (int)compl_tag)) + if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != compl_tag)) continue; + hash_del(&stash->hlist); libeth_tx_complete(&stash->buf, &cp); /* Push shadow buf back onto stack */ idpf_buf_lifo_push(&txq->stash->buf_stack, stash); - - hash_del(&stash->hlist); } } @@ -1698,6 +1713,7 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, stash->buf.bytes = tx_buf->bytes; stash->buf.packets = tx_buf->packets; stash->buf.type = tx_buf->type; + stash->buf.nr_frags = tx_buf->nr_frags; dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma)); dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len)); idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf); @@ -1713,9 +1729,8 @@ static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, #define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf) \ do { \ - (ntc)++; \ - if (unlikely(!(ntc))) { \ - ntc -= (txq)->desc_count; \ + if (unlikely(++(ntc) == (txq)->desc_count)) { \ + ntc = 0; \ buf = (txq)->tx_buf; \ desc = &(txq)->flex_tx[0]; \ } else { \ @@ -1739,59 +1754,66 @@ do { \ * Separate packet completion events will be reported on the completion queue, * and the buffers will be 
cleaned separately. The stats are not updated from * this function when using flow-based scheduling. + * + * Furthermore, in flow scheduling mode, check to make sure there are enough + * reserve buffers to stash the packet. If there are not, return early, which + * will leave next_to_clean pointing to the packet that failed to be stashed. + * + * Return: false in the scenario above, true otherwise. */ -static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, +static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, int napi_budget, struct libeth_sq_napi_stats *cleaned, bool descs_only) { union idpf_tx_flex_desc *next_pending_desc = NULL; union idpf_tx_flex_desc *tx_desc; - s16 ntc = tx_q->next_to_clean; + u32 ntc = tx_q->next_to_clean; struct libeth_cq_pp cp = { .dev = tx_q->dev, .ss = cleaned, .napi = napi_budget, }; struct idpf_tx_buf *tx_buf; + bool clean_complete = true; tx_desc = &tx_q->flex_tx[ntc]; next_pending_desc = &tx_q->flex_tx[end]; tx_buf = &tx_q->tx_buf[ntc]; - ntc -= tx_q->desc_count; while (tx_desc != next_pending_desc) { - union idpf_tx_flex_desc *eop_desc; + u32 eop_idx; /* If this entry in the ring was used as a context descriptor, - * it's corresponding entry in the buffer ring will have an - * invalid completion tag since no buffer was used. We can - * skip this descriptor since there is no buffer to clean. + * its corresponding entry in the buffer ring is reserved. We + * can skip this descriptor since there is no buffer to clean. 
*/ if (tx_buf->type <= LIBETH_SQE_CTX) goto fetch_next_txq_desc; - eop_desc = &tx_q->flex_tx[tx_buf->rs_idx]; + if (unlikely(tx_buf->type != LIBETH_SQE_SKB)) + break; + + eop_idx = tx_buf->rs_idx; if (descs_only) { - if (idpf_stash_flow_sch_buffers(tx_q, tx_buf)) + if (IDPF_TX_BUF_RSV_UNUSED(tx_q) < tx_buf->nr_frags) { + clean_complete = false; goto tx_splitq_clean_out; + } + + idpf_stash_flow_sch_buffers(tx_q, tx_buf); - while (tx_desc != eop_desc) { + while (ntc != eop_idx) { idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf); - - if (dma_unmap_len(tx_buf, len)) { - if (idpf_stash_flow_sch_buffers(tx_q, - tx_buf)) - goto tx_splitq_clean_out; - } + idpf_stash_flow_sch_buffers(tx_q, tx_buf); } } else { libeth_tx_complete(tx_buf, &cp); /* unmap remaining buffers */ - while (tx_desc != eop_desc) { + while (ntc != eop_idx) { idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf); @@ -1805,8 +1827,9 @@ static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, } tx_splitq_clean_out: - ntc += tx_q->desc_count; tx_q->next_to_clean = ntc; + + return clean_complete; } #define idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf) \ @@ -1837,48 +1860,63 @@ static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, { u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; - u16 ntc = txq->next_to_clean; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = cleaned, .napi = budget, }; - u16 num_descs_cleaned = 0; - u16 orig_idx = idx; + u16 ntc, orig_idx = idx; tx_buf = &txq->tx_buf[idx]; - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) + + if (unlikely(tx_buf->type <= LIBETH_SQE_CTX || + idpf_tx_buf_compl_tag(tx_buf) != compl_tag)) return false; - while (idpf_tx_buf_compl_tag(tx_buf) == (int)compl_tag) { + if (tx_buf->type == LIBETH_SQE_SKB) libeth_tx_complete(tx_buf, &cp); - num_descs_cleaned++; + idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + + while (idpf_tx_buf_compl_tag(tx_buf) == compl_tag) { + 
libeth_tx_complete(tx_buf, &cp); idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); } - /* If we didn't clean anything on the ring for this completion, there's - * nothing more to do. - */ - if (unlikely(!num_descs_cleaned)) - return false; - - /* Otherwise, if we did clean a packet on the ring directly, it's safe - * to assume that the descriptors starting from the original - * next_to_clean up until the previously cleaned packet can be reused. - * Therefore, we will go back in the ring and stash any buffers still - * in the ring into the hash table to be cleaned later. + /* + * It's possible the packet we just cleaned was an out of order + * completion, which means we can stash the buffers starting from + * the original next_to_clean and reuse the descriptors. We need + * to compare the descriptor ring next_to_clean packet's "first" buffer + * to the "first" buffer of the packet we just cleaned to determine if + * this is the case. However, next_to_clean can point to either a + * reserved buffer that corresponds to a context descriptor used for the + * next_to_clean packet (TSO packet) or the "first" buffer (single + * packet). The orig_idx from the packet we just cleaned will always + * point to the "first" buffer. If next_to_clean points to a reserved + * buffer, let's bump ntc once and start the comparison from there. */ ntc = txq->next_to_clean; tx_buf = &txq->tx_buf[ntc]; - while (tx_buf != &txq->tx_buf[orig_idx]) { - idpf_stash_flow_sch_buffers(txq, tx_buf); + + if (tx_buf->type == LIBETH_SQE_CTX) idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf); - } - /* Finally, update next_to_clean to reflect the work that was just done - * on the ring, if any. If the packet was only cleaned from the hash - * table, the ring will not be impacted, therefore we should not touch - * next_to_clean. The updated idx is used here + /* + * If ntc still points to a different "first" buffer, clean the + * descriptor ring and stash all of the buffers for later cleaning. 
If + * we cannot stash all of the buffers, next_to_clean will point to the + * "first" buffer of the packet that could not be stashed and cleaning + * will start there next time. + */ + if (unlikely(tx_buf != &txq->tx_buf[orig_idx] && + !idpf_tx_splitq_clean(txq, orig_idx, budget, cleaned, + true))) + return true; + + /* + * Otherwise, update next_to_clean to reflect the cleaning that was + * done above. */ txq->next_to_clean = idx; @@ -1906,7 +1944,8 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, if (!idpf_queue_has(FLOW_SCH_EN, txq)) { u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head); - return idpf_tx_splitq_clean(txq, head, budget, cleaned, false); + idpf_tx_splitq_clean(txq, head, budget, cleaned, false); + return; } compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); @@ -2306,6 +2345,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, dma = dma_map_single(tx_q->dev, skb->data, size, DMA_TO_DEVICE); tx_buf = first; + first->nr_frags = 0; params->compl_tag = (tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i; @@ -2316,6 +2356,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, if (dma_mapping_error(tx_q->dev, dma)) return idpf_tx_dma_map_error(tx_q, skb, first, i); + first->nr_frags++; idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; tx_buf->type = LIBETH_SQE_FRAG; @@ -2371,14 +2412,15 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, max_data); - tx_desc++; - i++; - - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); + } else { + tx_buf++; + tx_desc++; } /* Since this packet has a buffer that is going to span @@ -2391,7 +2433,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, * simply pass over these holes and finish cleaning the * rest of the packet. 
*/ - tx_q->tx_buf[i].type = LIBETH_SQE_EMPTY; + tx_buf->type = LIBETH_SQE_EMPTY; /* Adjust the DMA offset and the remaining size of the * fragment. On the first iteration of this loop, @@ -2415,13 +2457,15 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, break; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); - tx_desc++; - i++; - if (i == tx_q->desc_count) { + if (unlikely(++i == tx_q->desc_count)) { + tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); + } else { + tx_buf++; + tx_desc++; } size = skb_frag_size(frag); @@ -2429,8 +2473,6 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, dma = skb_frag_dma_map(tx_q->dev, frag, 0, size, DMA_TO_DEVICE); - - tx_buf = &tx_q->tx_buf[i]; } /* record SW timestamp if HW timestamp is not available */ diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 9ead6a55f9cbc..4d4086aad6203 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -133,7 +133,7 @@ do { \ */ #define IDPF_TX_COMPLQ_PENDING(txq) \ (((txq)->num_completions_pending >= (txq)->complq->num_completions ? 
\ - 0 : U64_MAX) + \ + 0 : U32_MAX) + \ (txq)->num_completions_pending - (txq)->complq->num_completions) #define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 @@ -847,7 +847,7 @@ struct idpf_compl_queue { struct net_device *netdev; u32 clean_budget; - u32 num_completions; + aligned_u64 num_completions; /* Slowpath */ u32 q_id; @@ -968,7 +968,7 @@ struct idpf_txq_group { struct idpf_compl_queue *complq; - u32 num_completions_pending; + aligned_u64 num_completions_pending; }; /** From 2a57ec7100ff4c40f5d50a3f1de4d6977f828dad Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Wed, 25 Feb 2026 15:46:51 +0100 Subject: [PATCH 05/11] idpf: set completion tag for "empty" bufs associated with a packet jira KERNEL-168 commit-author Joshua Hay commit 4c69c77aafe74cf755af55070584b643e5c4e4d8 Commit d9028db618a6 ("idpf: convert to libeth Tx buffer completion") inadvertently removed code that was necessary for the tx buffer cleaning routine to iterate over all buffers associated with a packet. When a frag is too large for a single data descriptor, it will be split across multiple data descriptors. This means the frag will span multiple buffers in the buffer ring in order to keep the descriptor and buffer ring indexes aligned. The buffer entries in the ring are technically empty and no cleaning actions need to be performed. These empty buffers can precede other frags associated with the same packet. I.e. a single packet on the buffer ring can look like: buf[0]=skb0.frag0 buf[1]=skb0.frag1 buf[2]=empty buf[3]=skb0.frag2 The cleaning routine iterates through these buffers based on a matching completion tag. If the completion tag is not set for buf2, the loop will end prematurely. Frag2 will be left uncleaned and next_to_clean will be left pointing to the end of packet, which will break the cleaning logic for subsequent cleans. This consequently leads to tx timeouts. 
Assign the empty bufs the same completion tag for the packet to ensure the cleaning routine iterates over all of the buffers associated with the packet. Fixes: d9028db618a6 ("idpf: convert to libeth Tx buffer completion") Signed-off-by: Joshua Hay Acked-by: Alexander Lobakin Reviewed-by: Madhu chittim Reviewed-by: Simon Horman Tested-by: Krishneil Singh Signed-off-by: Tony Nguyen (cherry picked from commit 4c69c77aafe74cf755af55070584b643e5c4e4d8) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index e5ea2e40d9215..5c2824200cb70 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2434,6 +2434,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, * rest of the packet. */ tx_buf->type = LIBETH_SQE_EMPTY; + idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; /* Adjust the DMA offset and the remaining size of the * fragment. On the first iteration of this loop, From 3d7487afc6560f81ebbbdacf1685f7e9d2418276 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Mon, 23 Feb 2026 15:16:09 +0100 Subject: [PATCH 06/11] idpf: add support for Tx refillqs in flow scheduling mode jira KERNEL-168 commit-author Joshua Hay commit cb83b559bea39f207ee214ee2972657e8576ed18 upstream-diff | 1. adjusted context around idpf_rx_splitq_clean function due to missing - 74d1412ac8f3 ("idpf: use libeth Rx buffer management for payload buffer") - 6ad5ff6e7282 ("libeth: convert to netmem") 2. adjusted context around struct idpf_tx_queue members and docstring and did not include the libeth_cacheline_set_assert changes due to missing: - 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue vector structures") 3. 
fix compilation issue (std=89) in idpf_tx_desc_alloc due to for loop var declaration In certain production environments, it is possible for completion tags to collide, meaning N packets with the same completion tag are in flight at the same time. In this environment, any given Tx queue is effectively used to send both slower traffic and higher throughput traffic simultaneously. This is the result of a customer's specific configuration in the device pipeline, the details of which Intel cannot provide. This configuration results in a small number of out-of-order completions, i.e., a small number of packets in flight. The existing guardrails in the driver only protect against a large number of packets in flight. The slower flow completions are delayed which causes the out-of-order completions. The fast flow will continue sending traffic and generating tags. Because tags are generated on the fly, the fast flow eventually uses the same tag for a packet that is still in flight from the slower flow. The driver has no idea which packet it should clean when it processes the completion with that tag, but it will look for the packet on the buffer ring before the hash table. If the slower flow packet completion is processed first, it will end up cleaning the fast flow packet on the ring prematurely. This leaves the descriptor ring in a bad state resulting in a crash or Tx timeout. In summary, generating a tag when a packet is sent can lead to the same tag being associated with multiple packets. This can lead to resource leaks, crashes, and/or Tx timeouts. Before we can replace the tag generation, we need a new mechanism for the send path to know what tag to use next. The driver will allocate and initialize a refillq for each TxQ with all of the possible free tag values. During send, the driver grabs the next free tag from the refillq from next_to_clean. 
While cleaning the packet, the clean routine posts the tag back to the refillq's next_to_use to indicate that it is now free to use. This mechanism works exactly the same way as the existing Rx refill queues, which post the cleaned buffer IDs back to the buffer queue to be reposted to HW. Since we're using the refillqs for both Rx and Tx now, genericize some of the existing refillq support. Note: the refillqs will not be used yet. This is only demonstrating how they will be used to pass free tags back to the send path. Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit cb83b559bea39f207ee214ee2972657e8576ed18) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 94 +++++++++++++++++++-- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 6 +- 2 files changed, 91 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 5c2824200cb70..eff094d2f5059 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -137,6 +137,9 @@ static void idpf_tx_desc_rel(struct idpf_tx_queue *txq) if (!txq->desc_ring) return; + if (txq->refillq) + kfree(txq->refillq->ring); + dmam_free_coherent(txq->dev, txq->size, txq->desc_ring, txq->dma); txq->desc_ring = NULL; txq->next_to_use = 0; @@ -242,7 +245,9 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, struct idpf_tx_queue *tx_q) { struct device *dev = tx_q->dev; + struct idpf_sw_queue *refillq; int err; + unsigned int i = 0; err = idpf_tx_buf_alloc_all(tx_q); if (err) @@ -265,6 +270,29 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, tx_q->next_to_clean = 0; idpf_queue_set(GEN_CHK, tx_q); + if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) + return 0; + + refillq = tx_q->refillq; + refillq->desc_count = tx_q->desc_count; + refillq->ring = kcalloc(refillq->desc_count, sizeof(u32), + 
GFP_KERNEL); + if (!refillq->ring) { + err = -ENOMEM; + goto err_alloc; + } + + for (i = 0; i < refillq->desc_count; i++) + refillq->ring[i] = + FIELD_PREP(IDPF_RFL_BI_BUFID_M, i) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, + idpf_queue_has(GEN_CHK, refillq)); + + /* Go ahead and flip the GEN bit since this counts as filling + * up the ring, i.e. we already ring wrapped. + */ + idpf_queue_change(GEN_CHK, refillq); + return 0; err_alloc: @@ -586,18 +614,18 @@ static int idpf_rx_hdr_buf_alloc_all(struct idpf_buf_queue *bufq) } /** - * idpf_rx_post_buf_refill - Post buffer id to refill queue + * idpf_post_buf_refill - Post buffer id to refill queue * @refillq: refill queue to post to * @buf_id: buffer id to post */ -static void idpf_rx_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) +static void idpf_post_buf_refill(struct idpf_sw_queue *refillq, u16 buf_id) { u32 nta = refillq->next_to_use; /* store the buffer ID and the SW maintained GEN bit to the refillq */ refillq->ring[nta] = - FIELD_PREP(IDPF_RX_BI_BUFID_M, buf_id) | - FIELD_PREP(IDPF_RX_BI_GEN_M, + FIELD_PREP(IDPF_RFL_BI_BUFID_M, buf_id) | + FIELD_PREP(IDPF_RFL_BI_GEN_M, idpf_queue_has(GEN_CHK, refillq)); if (unlikely(++nta == refillq->desc_count)) { @@ -977,6 +1005,11 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) struct idpf_txq_group *txq_grp = &vport->txq_grps[i]; for (j = 0; j < txq_grp->num_txq; j++) { + if (flow_sch_en) { + kfree(txq_grp->txqs[j]->refillq); + txq_grp->txqs[j]->refillq = NULL; + } + kfree(txq_grp->txqs[j]); txq_grp->txqs[j] = NULL; } @@ -1394,6 +1427,13 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) } idpf_queue_set(FLOW_SCH_EN, q); + + q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL); + if (!q->refillq) + goto err_alloc; + + idpf_queue_set(GEN_CHK, q->refillq); + idpf_queue_set(RFL_GEN_CHK, q->refillq); } if (!split) @@ -1950,6 +1990,8 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, compl_tag = 
le16_to_cpu(desc->q_head_compl_tag.compl_tag); + idpf_post_buf_refill(txq->refillq, compl_tag); + /* If we didn't clean anything on the ring, this packet must be * in the hash table. Go clean it there. */ @@ -2309,6 +2351,37 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) return ntu; } +/** + * idpf_tx_get_free_buf_id - get a free buffer ID from the refill queue + * @refillq: refill queue to get buffer ID from + * @buf_id: return buffer ID + * + * Return: true if a buffer ID was found, false if not + */ +static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, + u16 *buf_id) +{ + u32 ntc = refillq->next_to_clean; + u32 refill_desc; + + refill_desc = refillq->ring[ntc]; + + if (unlikely(idpf_queue_has(RFL_GEN_CHK, refillq) != + !!(refill_desc & IDPF_RFL_BI_GEN_M))) + return false; + + *buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); + + if (unlikely(++ntc == refillq->desc_count)) { + idpf_queue_change(RFL_GEN_CHK, refillq); + ntc = 0; + } + + refillq->next_to_clean = ntc; + + return true; +} + /** * idpf_tx_splitq_map - Build the Tx flex descriptor * @tx_q: queue to send buffer on @@ -2784,6 +2857,13 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, } if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &tx_params.compl_tag))) { + u64_stats_update_begin(&tx_q->stats_sync); + u64_stats_inc(&tx_q->q_stats.q_busy); + u64_stats_update_end(&tx_q->stats_sync); + } + tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; /* Set the RE bit to catch any packets that may have not been @@ -3360,7 +3440,7 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget) if (!skb) break; - idpf_rx_post_buf_refill(refillq, buf_id); + idpf_post_buf_refill(refillq, buf_id); IDPF_RX_BUMP_NTC(rxq, ntc); /* skip if it is non EOP desc */ @@ -3458,10 +3538,10 @@ static void idpf_rx_clean_refillq(struct idpf_buf_queue *bufq, 
bool failure; if (idpf_queue_has(RFL_GEN_CHK, refillq) != - !!(refill_desc & IDPF_RX_BI_GEN_M)) + !!(refill_desc & IDPF_RFL_BI_GEN_M)) break; - buf_id = FIELD_GET(IDPF_RX_BI_BUFID_M, refill_desc); + buf_id = FIELD_GET(IDPF_RFL_BI_BUFID_M, refill_desc); failure = idpf_rx_update_bufq_desc(bufq, buf_id, buf_desc); if (failure) break; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 4d4086aad6203..647093b2d3dd4 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -113,8 +113,8 @@ do { \ */ #define IDPF_TX_SPLITQ_RE_MIN_GAP 64 -#define IDPF_RX_BI_GEN_M BIT(16) -#define IDPF_RX_BI_BUFID_M GENMASK(15, 0) +#define IDPF_RFL_BI_GEN_M BIT(16) +#define IDPF_RFL_BI_BUFID_M GENMASK(15, 0) #define IDPF_RXD_EOF_SPLITQ VIRTCHNL2_RX_FLEX_DESC_ADV_STATUS0_EOF_M #define IDPF_RXD_EOF_SINGLEQ VIRTCHNL2_RX_BASE_DESC_STATUS_EOF_M @@ -685,6 +685,7 @@ struct idpf_rx_queue { * @cleaned_pkts: Number of packets cleaned for the above said case * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @tx_min_pkt_len: Min supported packet length + * @refillq: Pointer to refill queue * @compl_tag_bufid_m: Completion tag buffer id mask * @compl_tag_gen_s: Completion tag generation bit * The format of the completion tag will change based on the TXQ @@ -746,6 +747,7 @@ struct idpf_tx_queue { u16 tx_max_bufs; u16 tx_min_pkt_len; + struct idpf_sw_queue *refillq; u16 compl_tag_bufid_m; u16 compl_tag_gen_s; From 2c34100e46378e1df8b052053be27979006dcd62 Mon Sep 17 00:00:00 2001 From: Roxana Nicolescu Date: Mon, 23 Feb 2026 15:55:30 +0100 Subject: [PATCH 07/11] idpf: improve when to set RE bit logic jira KERNEL-168 commit-author Joshua Hay commit f2d18e16479cac7a708d77cbfb4220a9114a71fc upstream-diff | adjusted context in struct idpf_tx_queue because the order of the fields is different due to missing - 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue 
vector structures") Track the gap between next_to_use and the last RE index. Set RE again if the gap is large enough to ensure RE bit is set frequently. This is critical before removing the stashing mechanisms because the opportunistic descriptor ring cleaning from the out-of-order completions will go away. Previously the descriptors would be "cleaned" by both the descriptor (RE) completion and the out-of-order completions. Without the latter, we must ensure the RE bit is set more frequently. Otherwise, it's theoretically possible for the descriptor ring next_to_clean to never advance. The previous implementation was dependent on the start of a packet falling on a 64th index in the descriptor ring, which is not guaranteed with large packets. Signed-off-by: Luigi Rizzo Signed-off-by: Brian Vazquez Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit f2d18e16479cac7a708d77cbfb4220a9114a71fc) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 20 +++++++++++++++++++- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 6 ++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index eff094d2f5059..8493070cbb5f7 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -293,6 +293,8 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, */ idpf_queue_change(GEN_CHK, refillq); + tx_q->last_re = tx_q->desc_count - IDPF_TX_SPLITQ_RE_MIN_GAP; + return 0; err_alloc: @@ -2791,6 +2793,21 @@ netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb) return NETDEV_TX_OK; } +/** + * idpf_tx_splitq_need_re - check whether RE bit needs to be set + * @tx_q: pointer to Tx queue + * + * Return: true if RE bit needs to be set, false otherwise + */ +static bool idpf_tx_splitq_need_re(struct idpf_tx_queue 
*tx_q) +{ + int gap = tx_q->next_to_use - tx_q->last_re; + + gap += (gap < 0) ? tx_q->desc_count : 0; + + return gap >= IDPF_TX_SPLITQ_RE_MIN_GAP; +} + /** * idpf_tx_splitq_frame - Sends buffer on Tx ring using flex descriptors * @skb: send buffer @@ -2871,9 +2888,10 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, * MIN_RING size to ensure it will be set at least once each * time around the ring. */ - if (!(tx_q->next_to_use % IDPF_TX_SPLITQ_RE_MIN_GAP)) { + if (idpf_tx_splitq_need_re(tx_q)) { tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE; tx_q->txq_grp->num_completions_pending++; + tx_q->last_re = tx_q->next_to_use; } if (skb->ip_summed == CHECKSUM_PARTIAL) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 647093b2d3dd4..35c8f5c4de14a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -672,6 +672,8 @@ struct idpf_rx_queue { * @desc_count: Number of descriptors * @next_to_use: Next descriptor to use * @next_to_clean: Next descriptor to clean + * @last_re: last descriptor index that RE bit was set + * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @netdev: &net_device corresponding to this queue * @cleaned_bytes: Splitq only, TXQ only: When a TX completion is received on * the TX completion queue, it can be for any TXQ associated @@ -683,7 +685,6 @@ struct idpf_rx_queue { * only once at the end of the cleaning routine. 
* @clean_budget: singleq only, queue cleaning budget * @cleaned_pkts: Number of packets cleaned for the above said case - * @tx_max_bufs: Max buffers that can be transmitted with scatter-gather * @tx_min_pkt_len: Min supported packet length * @refillq: Pointer to refill queue * @compl_tag_bufid_m: Completion tag buffer id mask @@ -736,6 +737,8 @@ struct idpf_tx_queue { u16 desc_count; u16 next_to_use; u16 next_to_clean; + u16 last_re; + u16 tx_max_bufs; struct net_device *netdev; @@ -745,7 +748,6 @@ struct idpf_tx_queue { }; u16 cleaned_pkts; - u16 tx_max_bufs; u16 tx_min_pkt_len; struct idpf_sw_queue *refillq; From 916485ec68a4ad4607a89b4b6b13958b5022977d Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:20 -0700 Subject: [PATCH 08/11] idpf: simplify and fix splitq Tx packet rollback error path jira KERNEL-168 commit-author Joshua Hay commit b61dfa9bc4430ad82b96d3a7c1c485350f91b467 upstream-diff | adjusted context in 2 places: - when removing func idpf_tx_dma_map_error due to different memset call that uses the hardcoded struct type; - in func idpf_tx_splitq_frame due to missing expected union idpf_flex_tx_ctx_desc *ctx_desc; both differences were introduced in commit 1a49cf814fe1e ("idpf: add Tx timestamp flows"). Move (and rename) the existing rollback logic to singleq.c since that will be the only consumer. Create a simplified splitq specific rollback function to loop through and unmap tx_bufs based on the completion tag. This is critical before replacing the Tx buffer ring with the buffer pool since the previous rollback indexing will not work to unmap the chained buffers from the pool. Cache the next_to_use index before any portion of the packet is put on the descriptor ring. In case of an error, the rollback will bump tail to the correct next_to_use value. 
Because the splitq path now supports different types of context descriptors (and potentially multiple in the future), this will take care of rolling back any and all context descriptors encoded on the ring for the erroneous packet. The previous rollback logic was broken for PTP packets since it would not account for the PTP context descriptor. Fixes: 1a49cf814fe1 ("idpf: add Tx timestamp flows") Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit b61dfa9bc4430ad82b96d3a7c1c485350f91b467) Signed-off-by: Roxana Nicolescu --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 57 +++++++++++- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 91 ++++++++----------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 5 +- 3 files changed, 95 insertions(+), 58 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 762c8af258522..0163d4488ff05 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -178,6 +178,58 @@ static int idpf_tx_singleq_csum(struct sk_buff *skb, return 1; } +/** + * idpf_tx_singleq_dma_map_error - handle TX DMA map errors + * @txq: queue to send buffer on + * @skb: send buffer + * @first: original first buffer info buffer for packet + * @idx: starting point on ring to unwind + */ +static void idpf_tx_singleq_dma_map_error(struct idpf_tx_queue *txq, + struct sk_buff *skb, + struct idpf_tx_buf *first, u16 idx) +{ + struct libeth_sq_napi_stats ss = { }; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + /* clear dma mappings for failed tx_buf map */ + for (;;) { + struct idpf_tx_buf *tx_buf; + + tx_buf = &txq->tx_buf[idx]; + libeth_tx_complete(tx_buf, &cp); + if (tx_buf == first) + break; + 
if (idx == 0) + idx = txq->desc_count; + idx--; + } + + if (skb_is_gso(skb)) { + union idpf_tx_flex_desc *tx_desc; + + /* If we failed a DMA mapping for a TSO packet, we will have + * used one additional descriptor for a context + * descriptor. Reset that here. + */ + tx_desc = &txq->flex_tx[idx]; + memset(tx_desc, 0, sizeof(*tx_desc)); + if (idx == 0) + idx = txq->desc_count; + idx--; + } + + /* Update tail in case netdev_xmit_more was previously true */ + idpf_tx_buf_hw_update(txq, idx, false); +} + /** * idpf_tx_singleq_map - Build the Tx base descriptor * @tx_q: queue to send buffer on @@ -218,8 +270,9 @@ static void idpf_tx_singleq_map(struct idpf_tx_queue *tx_q, for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) + return idpf_tx_singleq_dma_map_error(tx_q, skb, + first, i); /* record length, and DMA address */ dma_unmap_len_set(tx_buf, len, size); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 8493070cbb5f7..7eba4f151bdbf 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2285,57 +2285,6 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, return count; } -/** - * idpf_tx_dma_map_error - handle TX DMA map errors - * @txq: queue to send buffer on - * @skb: send buffer - * @first: original first buffer info buffer for packet - * @idx: starting point on ring to unwind - */ -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct idpf_tx_buf *first, u16 idx) -{ - struct libeth_sq_napi_stats ss = { }; - struct libeth_cq_pp cp = { - .dev = txq->dev, - .ss = &ss, - }; - - u64_stats_update_begin(&txq->stats_sync); - u64_stats_inc(&txq->q_stats.dma_map_errs); - u64_stats_update_end(&txq->stats_sync); - - /* 
clear dma mappings for failed tx_buf map */ - for (;;) { - struct idpf_tx_buf *tx_buf; - - tx_buf = &txq->tx_buf[idx]; - libeth_tx_complete(tx_buf, &cp); - if (tx_buf == first) - break; - if (idx == 0) - idx = txq->desc_count; - idx--; - } - - if (skb_is_gso(skb)) { - union idpf_tx_flex_desc *tx_desc; - - /* If we failed a DMA mapping for a TSO packet, we will have - * used one additional descriptor for a context - * descriptor. Reset that here. - */ - tx_desc = &txq->flex_tx[idx]; - memset(tx_desc, 0, sizeof(struct idpf_flex_tx_ctx_desc)); - if (idx == 0) - idx = txq->desc_count; - idx--; - } - - /* Update tail in case netdev_xmit_more was previously true */ - idpf_tx_buf_hw_update(txq, idx, false); -} - /** * idpf_tx_splitq_bump_ntu - adjust NTU and generation * @txq: the tx ring to wrap @@ -2384,6 +2333,37 @@ static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, return true; } +/** + * idpf_tx_splitq_pkt_err_unmap - Unmap buffers and bump tail in case of error + * @txq: Tx queue to unwind + * @params: pointer to splitq params struct + * @first: starting buffer for packet to unmap + */ +static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq, + struct idpf_tx_splitq_params *params, + struct idpf_tx_buf *first) +{ + struct libeth_sq_napi_stats ss = { }; + struct idpf_tx_buf *tx_buf = first; + struct libeth_cq_pp cp = { + .dev = txq->dev, + .ss = &ss, + }; + u32 idx = 0; + + u64_stats_update_begin(&txq->stats_sync); + u64_stats_inc(&txq->q_stats.dma_map_errs); + u64_stats_update_end(&txq->stats_sync); + + do { + libeth_tx_complete(tx_buf, &cp); + idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + } while (idpf_tx_buf_compl_tag(tx_buf) == params->compl_tag); + + /* Update tail in case netdev_xmit_more was previously true. 
*/ + idpf_tx_buf_hw_update(txq, params->prev_ntu, false); +} + /** * idpf_tx_splitq_map - Build the Tx flex descriptor * @tx_q: queue to send buffer on @@ -2428,8 +2408,9 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (dma_mapping_error(tx_q->dev, dma)) - return idpf_tx_dma_map_error(tx_q, skb, first, i); + if (unlikely(dma_mapping_error(tx_q->dev, dma))) + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); first->nr_frags++; idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; @@ -2818,7 +2799,9 @@ static bool idpf_tx_splitq_need_re(struct idpf_tx_queue *tx_q) static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q) { - struct idpf_tx_splitq_params tx_params = { }; + struct idpf_tx_splitq_params tx_params = { + .prev_ntu = tx_q->next_to_use, + }; struct idpf_tx_buf *first; unsigned int count; int tso; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 35c8f5c4de14a..e3f9183232268 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -200,6 +200,7 @@ struct idpf_tx_offload_params { * @compl_tag: Associated tag for completion * @td_tag: Descriptor tunneling tag * @offload: Offload parameters + * @prev_ntu: stored TxQ next_to_use in case of rollback */ struct idpf_tx_splitq_params { enum idpf_tx_desc_dtype_value dtype; @@ -210,6 +211,8 @@ struct idpf_tx_splitq_params { }; struct idpf_tx_offload_params offload; + + u16 prev_ntu; }; enum idpf_tx_ctx_desc_eipt_offload { @@ -1131,8 +1134,6 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, bool xmit_more); unsigned int idpf_size_to_txd_count(unsigned int size); netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb); -void idpf_tx_dma_map_error(struct idpf_tx_queue *txq, struct sk_buff *skb, - struct 
idpf_tx_buf *first, u16 ring_idx); unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, struct sk_buff *skb); void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue); From 1bf91468a70d209ceda526ad49c54315003436f3 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:21 -0700 Subject: [PATCH 09/11] idpf: replace flow scheduling buffer ring with buffer pool jira KERNEL-168 commit-author Joshua Hay commit 5f417d551324d2894168b362f2429d120ab06243 upstream-diff | adjusted context in: - idpf_tx_splitq_frame and idpf_tx_clean_bufs; - struct idpf_tx_queue due to some elements missing from the struct; both due to missing commit - 1a49cf814fe1e ("idpf: add Tx timestamp flows"). and did not include the cacheline assert changes due to missing - 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue vector structures") Replace the TxQ buffer ring with one large pool/array of buffers (only for flow scheduling). This eliminates the tag generation and makes it impossible for a tag to be associated with more than one packet. The completion tag passed to HW through the descriptor is the index into the array. That same completion tag is posted back to the driver in the completion descriptor, and used to index into the array to quickly retrieve the buffer during cleaning. In this way, the tags are treated as a fixed-size resource. If all tags are in use, no more packets can be sent on that particular queue (until some are freed up). The tag pool size is 64K since the completion tag width is 16 bits. For each packet, the driver pulls a free tag from the refillq to get the next free buffer index. When cleaning is complete, the tag is posted back to the refillq. A multi-frag packet spans multiple buffers in the driver, therefore it uses multiple buffer indexes/tags from the pool. Each frag pulls from the refillq to get the next free buffer index.
These are tracked in a next_buf field that replaces the completion tag field in the buffer struct. This chains the buffers together so that the packet can be cleaned from the starting completion tag taken from the completion descriptor, then from the next_buf field for each subsequent buffer. In case a dma_mapping_error occurs or the refillq runs out of free buf_ids, the packet will execute the rollback error path. This unmaps any buffers previously mapped for the packet. Since several free buf_ids could have already been pulled from the refillq, we need to restore its original state as well. Otherwise, the buf_ids/tags will be leaked and not used again until the queue is reallocated. Descriptor completions only advance the descriptor ring index to "clean" the descriptors. The packet completions only clean the buffers associated with the given packet completion tag and do not update the descriptor ring index. When operating in queue-based scheduling mode, the array still acts as a ring and will only have TxQ descriptor count entries. The tx_bufs are still associated 1:1 with the descriptor ring entries and we can use the conventional indexing mechanisms.
Fixes: c2d548cad150 ("idpf: add TX splitq napi poll support") Signed-off-by: Luigi Rizzo Signed-off-by: Brian Vazquez Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Reviewed-by: Aleksandr Loktionov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit 5f417d551324d2894168b362f2429d120ab06243) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 207 +++++++++----------- drivers/net/ethernet/intel/idpf/idpf_txrx.h | 8 + 2 files changed, 104 insertions(+), 111 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 7eba4f151bdbf..ff0693c8e1e4b 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -11,6 +11,7 @@ struct idpf_tx_stash { struct libeth_sqe buf; }; +#define idpf_tx_buf_next(buf) (*(u32 *)&(buf)->priv) #define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv) LIBETH_SQE_CHECK_PRIV(u32); @@ -89,7 +90,7 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) return; /* Free all the Tx buffer sk_buffs */ - for (i = 0; i < txq->desc_count; i++) + for (i = 0; i < txq->buf_pool_size; i++) libeth_tx_complete(&txq->tx_buf[i], &cp); kfree(txq->tx_buf); @@ -197,14 +198,17 @@ static void idpf_tx_desc_rel_all(struct idpf_vport *vport) static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) { struct idpf_buf_lifo *buf_stack; - int buf_size; int i; /* Allocate book keeping buffers only. 
Buffers to be supplied to HW * are allocated by kernel network stack and received as part of skb */ - buf_size = sizeof(struct idpf_tx_buf) * tx_q->desc_count; - tx_q->tx_buf = kzalloc(buf_size, GFP_KERNEL); + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) + tx_q->buf_pool_size = U16_MAX; + else + tx_q->buf_pool_size = tx_q->desc_count; + tx_q->tx_buf = kcalloc(tx_q->buf_pool_size, sizeof(*tx_q->tx_buf), + GFP_KERNEL); if (!tx_q->tx_buf) return -ENOMEM; @@ -274,7 +278,7 @@ static int idpf_tx_desc_alloc(const struct idpf_vport *vport, return 0; refillq = tx_q->refillq; - refillq->desc_count = tx_q->desc_count; + refillq->desc_count = tx_q->buf_pool_size; refillq->ring = kcalloc(refillq->desc_count, sizeof(u32), GFP_KERNEL); if (!refillq->ring) { @@ -1819,6 +1823,12 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, struct idpf_tx_buf *tx_buf; bool clean_complete = true; + if (descs_only) { + /* Bump ring index to mark as cleaned. */ + tx_q->next_to_clean = end; + return true; + } + tx_desc = &tx_q->flex_tx[ntc]; next_pending_desc = &tx_q->flex_tx[end]; tx_buf = &tx_q->tx_buf[ntc]; @@ -1885,83 +1895,40 @@ do { \ } while (0) /** - * idpf_tx_clean_buf_ring - clean flow scheduling TX queue buffers + * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) + * @buf_id: packet's starting buffer ID, from completion descriptor * @cleaned: pointer to stats struct to track cleaned packets/bytes * @budget: Used to determine if we are in netpoll * - * Cleans all buffers associated with the input completion tag either from the - * TX buffer ring or from the hash table if the buffers were previously - * stashed. Returns the byte/segment count for the cleaned packet associated - * this completion tag. + * Clean all buffers associated with the packet starting at buf_id. Returns the + * byte/segment count for the cleaned packet. 
*/ -static bool idpf_tx_clean_buf_ring(struct idpf_tx_queue *txq, u16 compl_tag, - struct libeth_sq_napi_stats *cleaned, - int budget) +static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, + struct libeth_sq_napi_stats *cleaned, + int budget) { - u16 idx = compl_tag & txq->compl_tag_bufid_m; struct idpf_tx_buf *tx_buf = NULL; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = cleaned, .napi = budget, }; - u16 ntc, orig_idx = idx; - - tx_buf = &txq->tx_buf[idx]; - - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX || - idpf_tx_buf_compl_tag(tx_buf) != compl_tag)) - return false; - if (tx_buf->type == LIBETH_SQE_SKB) + tx_buf = &txq->tx_buf[buf_id]; + if (tx_buf->type == LIBETH_SQE_SKB) { libeth_tx_complete(tx_buf, &cp); + idpf_post_buf_refill(txq->refillq, buf_id); + } - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + buf_id = idpf_tx_buf_next(tx_buf); - while (idpf_tx_buf_compl_tag(tx_buf) == compl_tag) { + tx_buf = &txq->tx_buf[buf_id]; libeth_tx_complete(tx_buf, &cp); - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); + idpf_post_buf_refill(txq->refillq, buf_id); } - /* - * It's possible the packet we just cleaned was an out of order - * completion, which means we can stash the buffers starting from - * the original next_to_clean and reuse the descriptors. We need - * to compare the descriptor ring next_to_clean packet's "first" buffer - * to the "first" buffer of the packet we just cleaned to determine if - * this is the case. Howevever, next_to_clean can point to either a - * reserved buffer that corresponds to a context descriptor used for the - * next_to_clean packet (TSO packet) or the "first" buffer (single - * packet). The orig_idx from the packet we just cleaned will always - * point to the "first" buffer. If next_to_clean points to a reserved - * buffer, let's bump ntc once and start the comparison from there. 
- */ - ntc = txq->next_to_clean; - tx_buf = &txq->tx_buf[ntc]; - - if (tx_buf->type == LIBETH_SQE_CTX) - idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, tx_buf); - - /* - * If ntc still points to a different "first" buffer, clean the - * descriptor ring and stash all of the buffers for later cleaning. If - * we cannot stash all of the buffers, next_to_clean will point to the - * "first" buffer of the packet that could not be stashed and cleaning - * will start there next time. - */ - if (unlikely(tx_buf != &txq->tx_buf[orig_idx] && - !idpf_tx_splitq_clean(txq, orig_idx, budget, cleaned, - true))) - return true; - - /* - * Otherwise, update next_to_clean to reflect the cleaning that was - * done above. - */ - txq->next_to_clean = idx; - return true; } @@ -1992,12 +1959,10 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); - idpf_post_buf_refill(txq->refillq, compl_tag); - /* If we didn't clean anything on the ring, this packet must be * in the hash table. Go clean it there. 
*/ - if (!idpf_tx_clean_buf_ring(txq, compl_tag, cleaned, budget)) + if (!idpf_tx_clean_bufs(txq, compl_tag, cleaned, budget)) idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget); } @@ -2310,7 +2275,7 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) * Return: true if a buffer ID was found, false if not */ static bool idpf_tx_get_free_buf_id(struct idpf_sw_queue *refillq, - u16 *buf_id) + u32 *buf_id) { u32 ntc = refillq->next_to_clean; u32 refill_desc; @@ -2343,25 +2308,34 @@ static void idpf_tx_splitq_pkt_err_unmap(struct idpf_tx_queue *txq, struct idpf_tx_splitq_params *params, struct idpf_tx_buf *first) { + struct idpf_sw_queue *refillq = txq->refillq; struct libeth_sq_napi_stats ss = { }; struct idpf_tx_buf *tx_buf = first; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = &ss, }; - u32 idx = 0; u64_stats_update_begin(&txq->stats_sync); u64_stats_inc(&txq->q_stats.dma_map_errs); u64_stats_update_end(&txq->stats_sync); - do { + libeth_tx_complete(tx_buf, &cp); + while (idpf_tx_buf_next(tx_buf) != IDPF_TXBUF_NULL) { + tx_buf = &txq->tx_buf[idpf_tx_buf_next(tx_buf)]; libeth_tx_complete(tx_buf, &cp); - idpf_tx_clean_buf_ring_bump_ntc(txq, idx, tx_buf); - } while (idpf_tx_buf_compl_tag(tx_buf) == params->compl_tag); + } /* Update tail in case netdev_xmit_more was previously true. */ idpf_tx_buf_hw_update(txq, params->prev_ntu, false); + + if (!refillq) + return; + + /* Restore refillq state to avoid leaking tags. 
*/ + if (params->prev_refill_gen != idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = params->prev_refill_ntc; } /** @@ -2385,6 +2359,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, struct netdev_queue *nq; struct sk_buff *skb; skb_frag_t *frag; + u32 next_buf_id; u16 td_cmd = 0; dma_addr_t dma; @@ -2402,18 +2377,16 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, tx_buf = first; first->nr_frags = 0; - params->compl_tag = - (tx_q->compl_tag_cur_gen << tx_q->compl_tag_gen_s) | i; - for (frag = &skb_shinfo(skb)->frags[0];; frag++) { unsigned int max_data = IDPF_TX_MAX_DESC_DATA_ALIGNED; - if (unlikely(dma_mapping_error(tx_q->dev, dma))) + if (unlikely(dma_mapping_error(tx_q->dev, dma))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; return idpf_tx_splitq_pkt_err_unmap(tx_q, params, first); + } first->nr_frags++; - idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; tx_buf->type = LIBETH_SQE_FRAG; /* record length, and DMA address */ @@ -2469,29 +2442,14 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, max_data); if (unlikely(++i == tx_q->desc_count)) { - tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { - tx_buf++; tx_desc++; } - /* Since this packet has a buffer that is going to span - * multiple descriptors, it's going to leave holes in - * to the TX buffer ring. To ensure these holes do not - * cause issues in the cleaning routines, we will clear - * them of any stale data and assign them the same - * completion tag as the current packet. Then when the - * packet is being cleaned, the cleaning routines will - * simply pass over these holes and finish cleaning the - * rest of the packet. - */ - tx_buf->type = LIBETH_SQE_EMPTY; - idpf_tx_buf_compl_tag(tx_buf) = params->compl_tag; - /* Adjust the DMA offset and the remaining size of the * fragment. 
On the first iteration of this loop, * max_data will be >= 12K and <= 16K-1. On any @@ -2516,15 +2474,26 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); if (unlikely(++i == tx_q->desc_count)) { - tx_buf = tx_q->tx_buf; tx_desc = &tx_q->flex_tx[0]; i = 0; tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { - tx_buf++; tx_desc++; } + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, + &next_buf_id))) { + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; + return idpf_tx_splitq_pkt_err_unmap(tx_q, params, + first); + } + } else { + next_buf_id = i; + } + idpf_tx_buf_next(tx_buf) = next_buf_id; + tx_buf = &tx_q->tx_buf[next_buf_id]; + size = skb_frag_size(frag); data_len -= size; @@ -2539,6 +2508,7 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, /* write last descriptor with RS and EOP bits */ first->rs_idx = i; + idpf_tx_buf_next(tx_buf) = IDPF_TXBUF_NULL; td_cmd |= params->eop_cmd; idpf_tx_splitq_build_desc(tx_desc, params, td_cmd, size); i = idpf_tx_splitq_bump_ntu(tx_q, i); @@ -2747,8 +2717,6 @@ idpf_tx_splitq_get_ctx_desc(struct idpf_tx_queue *txq) struct idpf_flex_tx_ctx_desc *desc; int i = txq->next_to_use; - txq->tx_buf[i].type = LIBETH_SQE_CTX; - /* grab the next descriptor */ desc = &txq->flex_ctx[i]; txq->next_to_use = idpf_tx_splitq_bump_ntu(txq, i); @@ -2805,6 +2773,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, struct idpf_tx_buf *first; unsigned int count; int tso; + u32 buf_id; count = idpf_tx_desc_count_required(tx_q, skb); if (unlikely(!count)) @@ -2843,26 +2812,28 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, u64_stats_update_end(&tx_q->stats_sync); } - /* record the location of the first descriptor for this packet */ - first = &tx_q->tx_buf[tx_q->next_to_use]; - first->skb = skb; + if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { + struct idpf_sw_queue *refillq = tx_q->refillq; - 
if (tso) { - first->packets = tx_params.offload.tso_segs; - first->bytes = skb->len + - ((first->packets - 1) * tx_params.offload.tso_hdr_len); - } else { - first->packets = 1; - first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); - } + /* Save refillq state in case of a packet rollback. Otherwise, + * the tags will be leaked since they will be popped from the + * refillq but never reposted during cleaning. + */ + tx_params.prev_refill_gen = + idpf_queue_has(RFL_GEN_CHK, refillq); + tx_params.prev_refill_ntc = refillq->next_to_clean; - if (idpf_queue_has(FLOW_SCH_EN, tx_q)) { if (unlikely(!idpf_tx_get_free_buf_id(tx_q->refillq, - &tx_params.compl_tag))) { - u64_stats_update_begin(&tx_q->stats_sync); - u64_stats_inc(&tx_q->q_stats.q_busy); - u64_stats_update_end(&tx_q->stats_sync); + &buf_id))) { + if (tx_params.prev_refill_gen != + idpf_queue_has(RFL_GEN_CHK, refillq)) + idpf_queue_change(RFL_GEN_CHK, refillq); + refillq->next_to_clean = tx_params.prev_refill_ntc; + + tx_q->next_to_use = tx_params.prev_ntu; + return idpf_tx_drop_skb(tx_q, skb); } + tx_params.compl_tag = buf_id; tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; @@ -2881,6 +2852,8 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.offload.td_cmd |= IDPF_TXD_FLEX_FLOW_CMD_CS_EN; } else { + buf_id = tx_q->next_to_use; + tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2; tx_params.eop_cmd = IDPF_TXD_LAST_DESC_CMD; @@ -2888,6 +2861,18 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.offload.td_cmd |= IDPF_TX_FLEX_DESC_CMD_CS_EN; } + first = &tx_q->tx_buf[buf_id]; + first->skb = skb; + + if (tso) { + first->packets = tx_params.offload.tso_segs; + first->bytes = skb->len + + ((first->packets - 1) * tx_params.offload.tso_hdr_len); + } else { + first->packets = 1; + first->bytes = max_t(unsigned int, skb->len, ETH_ZLEN); + } + idpf_tx_splitq_map(tx_q, &tx_params, first); return NETDEV_TX_OK; diff 
--git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index e3f9183232268..5ee0cac22baaf 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -142,6 +142,8 @@ do { \ ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? \ 0 : (txq)->compl_tag_cur_gen) +#define IDPF_TXBUF_NULL U32_MAX + #define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS) #define IDPF_TX_FLAGS_TSO BIT(0) @@ -201,6 +203,8 @@ struct idpf_tx_offload_params { * @td_tag: Descriptor tunneling tag * @offload: Offload parameters * @prev_ntu: stored TxQ next_to_use in case of rollback + * @prev_refill_ntc: stored refillq next_to_clean in case of packet rollback + * @prev_refill_gen: stored refillq generation bit in case of packet rollback */ struct idpf_tx_splitq_params { enum idpf_tx_desc_dtype_value dtype; @@ -213,6 +217,8 @@ struct idpf_tx_splitq_params { struct idpf_tx_offload_params offload; u16 prev_ntu; + u16 prev_refill_ntc; + bool prev_refill_gen; }; enum idpf_tx_ctx_desc_eipt_offload { @@ -720,6 +726,7 @@ struct idpf_rx_queue { * @size: Length of descriptor ring in bytes * @dma: Physical address of ring * @q_vector: Backreference to associated vector + * @buf_pool_size: Total number of idpf_tx_buf */ struct idpf_tx_queue { union { @@ -771,6 +778,7 @@ struct idpf_tx_queue { dma_addr_t dma; struct idpf_q_vector *q_vector; + u32 buf_pool_size; } ____cacheline_aligned; /** From 6ed9b40baa8eeaa9800d7a8eaa1da2b4caef8180 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:22 -0700 Subject: [PATCH 10/11] idpf: stop Tx if there are insufficient buffer resources jira KERNEL-168 commit-author Joshua Hay commit 0c3f135e840d4a2ba4253e15d530ec61bc30718e upstream-diff | adjusted conflict in idpf_tx_splitq_frame func due to missing 1a49cf814fe1e ("idpf: add Tx timestamp flows"). 
The Tx refillq logic will cause packets to be silently dropped if there are not enough buffer resources available to send a packet in flow scheduling mode. Instead, determine how many buffers are needed along with number of descriptors. Make sure there are enough of both resources to send the packet, and stop the queue if not. Fixes: 7292af042bcf ("idpf: fix a race in txq wakeup") Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit 0c3f135e840d4a2ba4253e15d530ec61bc30718e) Signed-off-by: Roxana Nicolescu --- .../ethernet/intel/idpf/idpf_singleq_txrx.c | 4 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 47 +++++++++++++------ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 15 +++++- 3 files changed, 47 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 0163d4488ff05..e761f12d8901d 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -414,11 +414,11 @@ netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, { struct idpf_tx_offload_params offload = { }; struct idpf_tx_buf *first; + u32 count, buf_count = 1; int csum, tso, needed; - unsigned int count; __be16 protocol; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index ff0693c8e1e4b..2a441e0e39040 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -2138,15 +2138,22 @@ void idpf_tx_splitq_build_flow_desc(union idpf_tx_flex_desc *desc, desc->flow.qw1.compl_tag = cpu_to_le16(params->compl_tag); } -/* Global conditions to tell whether the txq (and related resources) - * has room to allow the 
use of "size" descriptors. +/** + * idpf_tx_splitq_has_room - check if enough Tx splitq resources are available + * @tx_q: the queue to be checked + * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of Tx buffers required for this packet + * + * Return: 0 if no room available, 1 otherwise */ -static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) +static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed, + u32 bufs_needed) { - if (IDPF_DESC_UNUSED(tx_q) < size || + if (IDPF_DESC_UNUSED(tx_q) < descs_needed || IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) || - IDPF_TX_BUF_RSV_LOW(tx_q)) + IDPF_TX_BUF_RSV_LOW(tx_q) || + idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed) return 0; return 1; } @@ -2155,14 +2162,21 @@ static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 size) * idpf_tx_maybe_stop_splitq - 1st level check for Tx splitq stop conditions * @tx_q: the queue to be checked * @descs_needed: number of descriptors required for this packet + * @bufs_needed: number of buffers needed for this packet * - * Returns 0 if stop is not needed + * Return: 0 if stop is not needed */ static int idpf_tx_maybe_stop_splitq(struct idpf_tx_queue *tx_q, - unsigned int descs_needed) + u32 descs_needed, + u32 bufs_needed) { + /* Since we have multiple resources to check for splitq, our + * start,stop_thrs becomes a boolean check instead of a count + * threshold. 
+ */ if (netif_subqueue_maybe_stop(tx_q->netdev, tx_q->idx, - idpf_txq_has_room(tx_q, descs_needed), + idpf_txq_has_room(tx_q, descs_needed, + bufs_needed), 1, 1)) return 0; @@ -2204,14 +2218,16 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, } /** - * idpf_tx_desc_count_required - calculate number of Tx descriptors needed + * idpf_tx_res_count_required - get number of Tx resources needed for this pkt * @txq: queue to send buffer on * @skb: send buffer + * @bufs_needed: (output) number of buffers needed for this skb. * - * Returns number of data descriptors needed for this skb. + * Return: number of data descriptors and buffers needed for this skb. */ -unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, - struct sk_buff *skb) +unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq, + struct sk_buff *skb, + u32 *bufs_needed) { const struct skb_shared_info *shinfo; unsigned int count = 0, i; @@ -2222,6 +2238,7 @@ unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, return count; shinfo = skb_shinfo(skb); + *bufs_needed += shinfo->nr_frags; for (i = 0; i < shinfo->nr_frags; i++) { unsigned int size; @@ -2771,11 +2788,11 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, .prev_ntu = tx_q->next_to_use, }; struct idpf_tx_buf *first; - unsigned int count; + u32 count, buf_count = 1; int tso; u32 buf_id; - count = idpf_tx_desc_count_required(tx_q, skb); + count = idpf_tx_res_count_required(tx_q, skb, &buf_count); if (unlikely(!count)) return idpf_tx_drop_skb(tx_q, skb); @@ -2785,7 +2802,7 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, /* Check for splitq specific TX resources */ count += (IDPF_TX_DESCS_PER_CACHE_LINE + tso); - if (idpf_tx_maybe_stop_splitq(tx_q, count)) { + if (idpf_tx_maybe_stop_splitq(tx_q, count, buf_count)) { idpf_tx_buf_hw_update(tx_q, tx_q->next_to_use, false); return NETDEV_TX_BUSY; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h 
b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index 5ee0cac22baaf..cfa6aa5b3bb8f 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -1112,6 +1112,17 @@ static inline void idpf_vport_intr_set_wb_on_itr(struct idpf_q_vector *q_vector) reg->dyn_ctl); } +/** + * idpf_tx_splitq_get_free_bufs - get number of free buf_ids in refillq + * @refillq: pointer to refillq containing buf_ids + */ +static inline u32 idpf_tx_splitq_get_free_bufs(struct idpf_sw_queue *refillq) +{ + return (refillq->next_to_use > refillq->next_to_clean ? + 0 : refillq->desc_count) + + refillq->next_to_use - refillq->next_to_clean - 1; +} + int idpf_vport_singleq_napi_poll(struct napi_struct *napi, int budget); void idpf_vport_init_num_qs(struct idpf_vport *vport, struct virtchnl2_create_vport *vport_msg); @@ -1142,8 +1153,8 @@ void idpf_tx_buf_hw_update(struct idpf_tx_queue *tx_q, u32 val, bool xmit_more); unsigned int idpf_size_to_txd_count(unsigned int size); netdev_tx_t idpf_tx_drop_skb(struct idpf_tx_queue *tx_q, struct sk_buff *skb); -unsigned int idpf_tx_desc_count_required(struct idpf_tx_queue *txq, - struct sk_buff *skb); +unsigned int idpf_tx_res_count_required(struct idpf_tx_queue *txq, + struct sk_buff *skb, u32 *buf_count); void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue); netdev_tx_t idpf_tx_singleq_frame(struct sk_buff *skb, struct idpf_tx_queue *tx_q); From fe33d5dfce01f5b77d965bea5c6899b435fbb1a6 Mon Sep 17 00:00:00 2001 From: Joshua Hay Date: Fri, 25 Jul 2025 11:42:23 -0700 Subject: [PATCH 11/11] idpf: remove obsolete stashing code jira KERNEL-168 commit-author Joshua Hay commit 6c4e68480238274f84aa50d54da0d9e262df6284 upstream-diff | - adjusted context in .h due to different order in the struct idpf_tx_queue - adjusted context due to missing idpf_tx_read_tstamp func; both are due to missing 1a49cf814fe1e ("idpf: add Tx timestamp flows"). 
- did not include libeth_cacheline_set_assert for struct idpf_tx_queue due to missing 5a816aae2d46 ("idpf: strictly assert cachelines of queue and queue vector structures") With the new Tx buffer management scheme, there is no need for all of the stashing mechanisms, the hash table, the reserve buffer stack, etc. Remove all of that. Signed-off-by: Joshua Hay Reviewed-by: Madhu Chittim Reviewed-by: Aleksandr Loktionov Tested-by: Samuel Salin Signed-off-by: Tony Nguyen (cherry picked from commit 6c4e68480238274f84aa50d54da0d9e262df6284) Signed-off-by: Roxana Nicolescu --- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 309 ++------------------ drivers/net/ethernet/intel/idpf/idpf_txrx.h | 44 --- 2 files changed, 21 insertions(+), 332 deletions(-) diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 2a441e0e39040..24b255145f57c 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -6,48 +6,12 @@ #include "idpf.h" #include "idpf_virtchnl.h" -struct idpf_tx_stash { - struct hlist_node hlist; - struct libeth_sqe buf; -}; - #define idpf_tx_buf_next(buf) (*(u32 *)&(buf)->priv) -#define idpf_tx_buf_compl_tag(buf) (*(u32 *)&(buf)->priv) LIBETH_SQE_CHECK_PRIV(u32); static bool idpf_chk_linearize(struct sk_buff *skb, unsigned int max_bufs, unsigned int count); -/** - * idpf_buf_lifo_push - push a buffer pointer onto stack - * @stack: pointer to stack struct - * @buf: pointer to buf to push - * - * Returns 0 on success, negative on failure - **/ -static int idpf_buf_lifo_push(struct idpf_buf_lifo *stack, - struct idpf_tx_stash *buf) -{ - if (unlikely(stack->top == stack->size)) - return -ENOSPC; - - stack->bufs[stack->top++] = buf; - - return 0; -} - -/** - * idpf_buf_lifo_pop - pop a buffer pointer from stack - * @stack: pointer to stack struct - **/ -static struct idpf_tx_stash *idpf_buf_lifo_pop(struct idpf_buf_lifo *stack) -{ - if (unlikely(!stack->top)) - return 
NULL; - - return stack->bufs[--stack->top]; -} - /** * idpf_tx_timeout - Respond to a Tx Hang * @netdev: network interface device structure @@ -76,14 +40,11 @@ void idpf_tx_timeout(struct net_device *netdev, unsigned int txqueue) static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) { struct libeth_sq_napi_stats ss = { }; - struct idpf_buf_lifo *buf_stack; - struct idpf_tx_stash *stash; struct libeth_cq_pp cp = { .dev = txq->dev, .ss = &ss, }; - struct hlist_node *tmp; - u32 i, tag; + u32 i; /* Buffers already cleared, nothing to do */ if (!txq->tx_buf) @@ -95,33 +56,6 @@ static void idpf_tx_buf_rel_all(struct idpf_tx_queue *txq) kfree(txq->tx_buf); txq->tx_buf = NULL; - - if (!idpf_queue_has(FLOW_SCH_EN, txq)) - return; - - buf_stack = &txq->stash->buf_stack; - if (!buf_stack->bufs) - return; - - /* - * If a Tx timeout occurred, there are potentially still bufs in the - * hash table, free them here. - */ - hash_for_each_safe(txq->stash->sched_buf_hash, tag, tmp, stash, - hlist) { - if (!stash) - continue; - - libeth_tx_complete(&stash->buf, &cp); - hash_del(&stash->hlist); - idpf_buf_lifo_push(buf_stack, stash); - } - - for (i = 0; i < buf_stack->size; i++) - kfree(buf_stack->bufs[i]); - - kfree(buf_stack->bufs); - buf_stack->bufs = NULL; } /** @@ -197,9 +131,6 @@ static void idpf_tx_desc_rel_all(struct idpf_vport *vport) */ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) { - struct idpf_buf_lifo *buf_stack; - int i; - /* Allocate book keeping buffers only. 
Buffers to be supplied to HW * are allocated by kernel network stack and received as part of skb */ @@ -212,29 +143,6 @@ static int idpf_tx_buf_alloc_all(struct idpf_tx_queue *tx_q) if (!tx_q->tx_buf) return -ENOMEM; - if (!idpf_queue_has(FLOW_SCH_EN, tx_q)) - return 0; - - buf_stack = &tx_q->stash->buf_stack; - - /* Initialize tx buf stack for out-of-order completions if - * flow scheduling offload is enabled - */ - buf_stack->bufs = kcalloc(tx_q->desc_count, sizeof(*buf_stack->bufs), - GFP_KERNEL); - if (!buf_stack->bufs) - return -ENOMEM; - - buf_stack->size = tx_q->desc_count; - buf_stack->top = tx_q->desc_count; - - for (i = 0; i < tx_q->desc_count; i++) { - buf_stack->bufs[i] = kzalloc(sizeof(*buf_stack->bufs[i]), - GFP_KERNEL); - if (!buf_stack->bufs[i]) - return -ENOMEM; - } - return 0; } @@ -349,8 +257,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport) for (i = 0; i < vport->num_txq_grp; i++) { for (j = 0; j < vport->txq_grps[i].num_txq; j++) { struct idpf_tx_queue *txq = vport->txq_grps[i].txqs[j]; - u8 gen_bits = 0; - u16 bufidx_mask; err = idpf_tx_desc_alloc(vport, txq); if (err) { @@ -359,34 +265,6 @@ static int idpf_tx_desc_alloc_all(struct idpf_vport *vport) i); goto err_out; } - - if (!idpf_is_queue_model_split(vport->txq_model)) - continue; - - txq->compl_tag_cur_gen = 0; - - /* Determine the number of bits in the bufid - * mask and add one to get the start of the - * generation bits - */ - bufidx_mask = txq->desc_count - 1; - while (bufidx_mask >> 1) { - txq->compl_tag_gen_s++; - bufidx_mask = bufidx_mask >> 1; - } - txq->compl_tag_gen_s++; - - gen_bits = IDPF_TX_SPLITQ_COMPL_TAG_WIDTH - - txq->compl_tag_gen_s; - txq->compl_tag_gen_max = GETMAXVAL(gen_bits); - - /* Set bufid mask based on location of first - * gen bit; it cannot simply be the descriptor - * ring size-1 since we can have size values - * where not all of those bits are set. 
- */ - txq->compl_tag_bufid_m = - GETMAXVAL(txq->compl_tag_gen_s); } if (!idpf_is_queue_model_split(vport->txq_model)) @@ -1025,9 +903,6 @@ static void idpf_txq_group_rel(struct idpf_vport *vport) kfree(txq_grp->complq); txq_grp->complq = NULL; - - if (flow_sch_en) - kfree(txq_grp->stashes); } kfree(vport->txq_grps); vport->txq_grps = NULL; @@ -1386,7 +1261,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) for (i = 0; i < vport->num_txq_grp; i++) { struct idpf_txq_group *tx_qgrp = &vport->txq_grps[i]; struct idpf_adapter *adapter = vport->adapter; - struct idpf_txq_stash *stashes; int j; tx_qgrp->vport = vport; @@ -1399,15 +1273,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) goto err_alloc; } - if (split && flow_sch_en) { - stashes = kcalloc(num_txq, sizeof(*stashes), - GFP_KERNEL); - if (!stashes) - goto err_alloc; - - tx_qgrp->stashes = stashes; - } - for (j = 0; j < tx_qgrp->num_txq; j++) { struct idpf_tx_queue *q = tx_qgrp->txqs[j]; @@ -1427,11 +1292,6 @@ static int idpf_txq_group_alloc(struct idpf_vport *vport, u16 num_txq) if (!flow_sch_en) continue; - if (split) { - q->stash = &stashes[j]; - hash_init(q->stash->sched_buf_hash); - } - idpf_queue_set(FLOW_SCH_EN, q); q->refillq = kzalloc(sizeof(*q->refillq), GFP_KERNEL); @@ -1697,82 +1557,6 @@ static void idpf_tx_handle_sw_marker(struct idpf_tx_queue *tx_q) wake_up(&vport->sw_marker_wq); } -/** - * idpf_tx_clean_stashed_bufs - clean bufs that were stored for - * out of order completions - * @txq: queue to clean - * @compl_tag: completion tag of packet to clean (from completion descriptor) - * @cleaned: pointer to stats struct to track cleaned packets/bytes - * @budget: Used to determine if we are in netpoll - */ -static void idpf_tx_clean_stashed_bufs(struct idpf_tx_queue *txq, - u16 compl_tag, - struct libeth_sq_napi_stats *cleaned, - int budget) -{ - struct idpf_tx_stash *stash; - struct hlist_node *tmp_buf; - struct libeth_cq_pp cp = { - .dev = 
txq->dev, - .ss = cleaned, - .napi = budget, - }; - - /* Buffer completion */ - hash_for_each_possible_safe(txq->stash->sched_buf_hash, stash, tmp_buf, - hlist, compl_tag) { - if (unlikely(idpf_tx_buf_compl_tag(&stash->buf) != compl_tag)) - continue; - - hash_del(&stash->hlist); - libeth_tx_complete(&stash->buf, &cp); - - /* Push shadow buf back onto stack */ - idpf_buf_lifo_push(&txq->stash->buf_stack, stash); - } -} - -/** - * idpf_stash_flow_sch_buffers - store buffer parameters info to be freed at a - * later time (only relevant for flow scheduling mode) - * @txq: Tx queue to clean - * @tx_buf: buffer to store - */ -static int idpf_stash_flow_sch_buffers(struct idpf_tx_queue *txq, - struct idpf_tx_buf *tx_buf) -{ - struct idpf_tx_stash *stash; - - if (unlikely(tx_buf->type <= LIBETH_SQE_CTX)) - return 0; - - stash = idpf_buf_lifo_pop(&txq->stash->buf_stack); - if (unlikely(!stash)) { - net_err_ratelimited("%s: No out-of-order TX buffers left!\n", - netdev_name(txq->netdev)); - - return -ENOMEM; - } - - /* Store buffer params in shadow buffer */ - stash->buf.skb = tx_buf->skb; - stash->buf.bytes = tx_buf->bytes; - stash->buf.packets = tx_buf->packets; - stash->buf.type = tx_buf->type; - stash->buf.nr_frags = tx_buf->nr_frags; - dma_unmap_addr_set(&stash->buf, dma, dma_unmap_addr(tx_buf, dma)); - dma_unmap_len_set(&stash->buf, len, dma_unmap_len(tx_buf, len)); - idpf_tx_buf_compl_tag(&stash->buf) = idpf_tx_buf_compl_tag(tx_buf); - - /* Add buffer to buf_hash table to be freed later */ - hash_add(txq->stash->sched_buf_hash, &stash->hlist, - idpf_tx_buf_compl_tag(&stash->buf)); - - tx_buf->type = LIBETH_SQE_EMPTY; - - return 0; -} - #define idpf_tx_splitq_clean_bump_ntc(txq, ntc, desc, buf) \ do { \ if (unlikely(++(ntc) == (txq)->desc_count)) { \ @@ -1800,14 +1584,8 @@ do { \ * Separate packet completion events will be reported on the completion queue, * and the buffers will be cleaned separately. 
The stats are not updated from * this function when using flow-based scheduling. - * - * Furthermore, in flow scheduling mode, check to make sure there are enough - * reserve buffers to stash the packet. If there are not, return early, which - * will leave next_to_clean pointing to the packet that failed to be stashed. - * - * Return: false in the scenario above, true otherwise. */ -static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, +static void idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, int napi_budget, struct libeth_sq_napi_stats *cleaned, bool descs_only) @@ -1821,12 +1599,11 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, .napi = napi_budget, }; struct idpf_tx_buf *tx_buf; - bool clean_complete = true; if (descs_only) { /* Bump ring index to mark as cleaned. */ tx_q->next_to_clean = end; - return true; + return; } tx_desc = &tx_q->flex_tx[ntc]; @@ -1847,53 +1624,24 @@ static bool idpf_tx_splitq_clean(struct idpf_tx_queue *tx_q, u16 end, break; eop_idx = tx_buf->rs_idx; + libeth_tx_complete(tx_buf, &cp); - if (descs_only) { - if (IDPF_TX_BUF_RSV_UNUSED(tx_q) < tx_buf->nr_frags) { - clean_complete = false; - goto tx_splitq_clean_out; - } - - idpf_stash_flow_sch_buffers(tx_q, tx_buf); + /* unmap remaining buffers */ + while (ntc != eop_idx) { + idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, + tx_desc, tx_buf); - while (ntc != eop_idx) { - idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, - tx_desc, tx_buf); - idpf_stash_flow_sch_buffers(tx_q, tx_buf); - } - } else { + /* unmap any remaining paged data */ libeth_tx_complete(tx_buf, &cp); - - /* unmap remaining buffers */ - while (ntc != eop_idx) { - idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, - tx_desc, tx_buf); - - /* unmap any remaining paged data */ - libeth_tx_complete(tx_buf, &cp); - } } fetch_next_txq_desc: idpf_tx_splitq_clean_bump_ntc(tx_q, ntc, tx_desc, tx_buf); } -tx_splitq_clean_out: tx_q->next_to_clean = ntc; - - return clean_complete; } -#define 
idpf_tx_clean_buf_ring_bump_ntc(txq, ntc, buf) \ -do { \ - (buf)++; \ - (ntc)++; \ - if (unlikely((ntc) == (txq)->desc_count)) { \ - buf = (txq)->tx_buf; \ - ntc = 0; \ - } \ -} while (0) - /** * idpf_tx_clean_bufs - clean flow scheduling TX queue buffers * @txq: queue to clean @@ -1904,7 +1652,7 @@ do { \ * Clean all buffers associated with the packet starting at buf_id. Returns the * byte/segment count for the cleaned packet. */ -static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, +static void idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, struct libeth_sq_napi_stats *cleaned, int budget) { @@ -1928,8 +1676,6 @@ static bool idpf_tx_clean_bufs(struct idpf_tx_queue *txq, u32 buf_id, libeth_tx_complete(tx_buf, &cp); idpf_post_buf_refill(txq->refillq, buf_id); } - - return true; } /** @@ -1948,22 +1694,17 @@ static void idpf_tx_handle_rs_completion(struct idpf_tx_queue *txq, struct libeth_sq_napi_stats *cleaned, int budget) { - u16 compl_tag; + /* RS completion contains queue head for queue based scheduling or + * completion tag for flow based scheduling. + */ + u16 rs_compl_val = le16_to_cpu(desc->q_head_compl_tag.q_head); if (!idpf_queue_has(FLOW_SCH_EN, txq)) { - u16 head = le16_to_cpu(desc->q_head_compl_tag.q_head); - - idpf_tx_splitq_clean(txq, head, budget, cleaned, false); + idpf_tx_splitq_clean(txq, rs_compl_val, budget, cleaned, false); return; } - compl_tag = le16_to_cpu(desc->q_head_compl_tag.compl_tag); - - /* If we didn't clean anything on the ring, this packet must be - * in the hash table. Go clean it there. 
- */ - if (!idpf_tx_clean_bufs(txq, compl_tag, cleaned, budget)) - idpf_tx_clean_stashed_bufs(txq, compl_tag, cleaned, budget); + idpf_tx_clean_bufs(txq, rs_compl_val, cleaned, budget); } /** @@ -2080,8 +1821,7 @@ static bool idpf_tx_clean_complq(struct idpf_compl_queue *complq, int budget, /* Update BQL */ nq = netdev_get_tx_queue(tx_q->netdev, tx_q->idx); - dont_wake = !complq_ok || IDPF_TX_BUF_RSV_LOW(tx_q) || - np->state != __IDPF_VPORT_UP || + dont_wake = !complq_ok || np->state != __IDPF_VPORT_UP || !netif_carrier_ok(tx_q->netdev); /* Check if the TXQ needs to and can be restarted */ __netif_txq_completed_wake(nq, tx_q->cleaned_pkts, tx_q->cleaned_bytes, @@ -2152,7 +1892,6 @@ static int idpf_txq_has_room(struct idpf_tx_queue *tx_q, u32 descs_needed, if (IDPF_DESC_UNUSED(tx_q) < descs_needed || IDPF_TX_COMPLQ_PENDING(tx_q->txq_grp) > IDPF_TX_COMPLQ_OVERFLOW_THRESH(tx_q->txq_grp->complq) || - IDPF_TX_BUF_RSV_LOW(tx_q) || idpf_tx_splitq_get_free_bufs(tx_q->refillq) < bufs_needed) return 0; return 1; @@ -2276,10 +2015,8 @@ static unsigned int idpf_tx_splitq_bump_ntu(struct idpf_tx_queue *txq, u16 ntu) { ntu++; - if (ntu == txq->desc_count) { + if (ntu == txq->desc_count) ntu = 0; - txq->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(txq); - } return ntu; } @@ -2461,8 +2198,6 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, if (unlikely(++i == tx_q->desc_count)) { tx_desc = &tx_q->flex_tx[0]; i = 0; - tx_q->compl_tag_cur_gen = - IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { tx_desc++; } @@ -2493,7 +2228,6 @@ static void idpf_tx_splitq_map(struct idpf_tx_queue *tx_q, if (unlikely(++i == tx_q->desc_count)) { tx_desc = &tx_q->flex_tx[0]; i = 0; - tx_q->compl_tag_cur_gen = IDPF_TX_ADJ_COMPL_TAG_GEN(tx_q); } else { tx_desc++; } @@ -2854,10 +2588,9 @@ static netdev_tx_t idpf_tx_splitq_frame(struct sk_buff *skb, tx_params.dtype = IDPF_TX_DESC_DTYPE_FLEX_FLOW_SCHE; tx_params.eop_cmd = IDPF_TXD_FLEX_FLOW_CMD_EOP; - /* Set the RE bit to catch any packets that may 
have not been - * stashed during RS completion cleaning. MIN_GAP is set to - * MIN_RING size to ensure it will be set at least once each - * time around the ring. + /* Set the RE bit to periodically "clean" the descriptor ring. + * MIN_GAP is set to MIN_RING size to ensure it will be set at + * least once each time around the ring. */ if (idpf_tx_splitq_need_re(tx_q)) { tx_params.eop_cmd |= IDPF_TXD_FLEX_FLOW_CMD_RE; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h index cfa6aa5b3bb8f..49e95dbdd543a 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h @@ -123,10 +123,6 @@ do { \ ((((txq)->next_to_clean > (txq)->next_to_use) ? 0 : (txq)->desc_count) + \ (txq)->next_to_clean - (txq)->next_to_use - 1) -#define IDPF_TX_BUF_RSV_UNUSED(txq) ((txq)->stash->buf_stack.top) -#define IDPF_TX_BUF_RSV_LOW(txq) (IDPF_TX_BUF_RSV_UNUSED(txq) < \ - (txq)->desc_count >> 2) - #define IDPF_TX_COMPLQ_OVERFLOW_THRESH(txcq) ((txcq)->desc_count >> 1) /* Determine the absolute number of completions pending, i.e. the number of * completions that are expected to arrive on the TX completion queue. @@ -136,12 +132,6 @@ do { \ 0 : U32_MAX) + \ (txq)->num_completions_pending - (txq)->complq->num_completions) -#define IDPF_TX_SPLITQ_COMPL_TAG_WIDTH 16 -/* Adjust the generation for the completion tag and wrap if necessary */ -#define IDPF_TX_ADJ_COMPL_TAG_GEN(txq) \ - ((++(txq)->compl_tag_cur_gen) >= (txq)->compl_tag_gen_max ? 
\ - 0 : (txq)->compl_tag_cur_gen) - #define IDPF_TXBUF_NULL U32_MAX #define IDPF_TXD_LAST_DESC_CMD (IDPF_TX_DESC_CMD_EOP | IDPF_TX_DESC_CMD_RS) @@ -158,18 +148,6 @@ union idpf_tx_flex_desc { #define idpf_tx_buf libeth_sqe -/** - * struct idpf_buf_lifo - LIFO for managing OOO completions - * @top: Used to know how many buffers are left - * @size: Total size of LIFO - * @bufs: Backing array - */ -struct idpf_buf_lifo { - u16 top; - u16 size; - struct idpf_tx_stash **bufs; -}; - /** * struct idpf_tx_offload_params - Offload parameters for a given packet * @tx_flags: Feature flags enabled for this packet @@ -573,17 +551,6 @@ struct idpf_tx_queue_stats { #define IDPF_ITR_IDX_SPACING(spacing, dflt) (spacing ? spacing : dflt) #define IDPF_DIM_DEFAULT_PROFILE_IX 1 -/** - * struct idpf_txq_stash - Tx buffer stash for Flow-based scheduling mode - * @buf_stack: Stack of empty buffers to store buffer info for out of order - * buffer completions. See struct idpf_buf_lifo - * @sched_buf_hash: Hash table to store buffers - */ -struct idpf_txq_stash { - struct idpf_buf_lifo buf_stack; - DECLARE_HASHTABLE(sched_buf_hash, 12); -} ____cacheline_aligned; - /** * struct idpf_rx_queue - software structure representing a receive queue * @rx: universal receive descriptor array @@ -696,7 +663,6 @@ struct idpf_rx_queue { * @cleaned_pkts: Number of packets cleaned for the above said case * @tx_min_pkt_len: Min supported packet length * @refillq: Pointer to refill queue - * @compl_tag_bufid_m: Completion tag buffer id mask * @compl_tag_gen_s: Completion tag generation bit * The format of the completion tag will change based on the TXQ * descriptor ring size so that we can maintain roughly the same level @@ -717,9 +683,6 @@ struct idpf_rx_queue { * -------------------------------- * * This gives us 8*8160 = 65280 possible unique values. 
- * @compl_tag_cur_gen: Used to keep track of current completion tag generation - * @compl_tag_gen_max: To determine when compl_tag_cur_gen should be reset - * @stash: Tx buffer stash for Flow-based scheduling mode * @stats_sync: See struct u64_stats_sync * @q_stats: See union idpf_tx_queue_stats * @q_id: Queue id @@ -761,13 +724,8 @@ struct idpf_tx_queue { u16 tx_min_pkt_len; struct idpf_sw_queue *refillq; - u16 compl_tag_bufid_m; u16 compl_tag_gen_s; - u16 compl_tag_cur_gen; - u16 compl_tag_gen_max; - - struct idpf_txq_stash *stash; struct u64_stats_sync stats_sync; struct idpf_tx_queue_stats q_stats; @@ -964,7 +922,6 @@ struct idpf_rxq_group { * @vport: Vport back pointer * @num_txq: Number of TX queues associated * @txqs: Array of TX queue pointers - * @stashes: array of OOO stashes for the queues * @complq: Associated completion queue pointer, split queue only * @num_completions_pending: Total number of completions pending for the * completion queue, acculumated for all TX queues @@ -979,7 +936,6 @@ struct idpf_txq_group { u16 num_txq; struct idpf_tx_queue *txqs[IDPF_LARGE_MAX_Q]; - struct idpf_txq_stash *stashes; struct idpf_compl_queue *complq;