mirror of https://github.com/optim-enterprises-bv/openwrt-ipq.git (synced 2025-11-04)
	generic: 6.6: replace (broken) downstream patch with upstream solution
Our downstream patch "net/core: add optional threading for backlog processing"
has been broken with the switch to Linux 6.6. Replace it by backporting the
now available upstream solution.

Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Link: https://github.com/openwrt/openwrt/pull/15592
Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
committed by Christian Marangi
parent f3080677f5
commit a5c095c453
			@@ -0,0 +1,75 @@
From 56364c910691f6d10ba88c964c9041b9ab777bd6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Mar 2024 08:40:28 +0100
Subject: [PATCH 1/4] net: Remove conditional threaded-NAPI wakeup based on
 task state.

A NAPI thread is scheduled by first setting the NAPI_STATE_SCHED bit. If
successful (the bit was not yet set) then NAPI_STATE_SCHED_THREADED is
set, but only if the thread's state is not TASK_INTERRUPTIBLE (i.e. it is
TASK_RUNNING), followed by a task wakeup.

If the task is idle (TASK_INTERRUPTIBLE) then the NAPI_STATE_SCHED_THREADED
bit is not set. The thread is then not relying on the bit but always leaves
the wait-loop after returning from schedule(), because there must have been
a wakeup.

The smpboot-threads implementation for per-CPU threads requires an
explicit condition and does not support "if we get out of schedule()
then there must be something to do".

Removing this optimisation simplifies the following integration.

Set NAPI_STATE_SCHED_THREADED unconditionally on wakeup and rely on it
in the wait path by removing the `woken' condition.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/core/dev.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

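The wakeup protocol this patch simplifies can be pictured with a small
userspace analogy (pthreads instead of kthreads; all names below are invented
for illustration and are not part of the patch): the kicker always sets a
"scheduled" flag before waking the worker, and the worker leaves its wait loop
only on that explicit condition, which is the property the smpboot-based
backlog threads in the next patch rely on.

/*
 * Userspace analogy (not kernel code) of the simplified wakeup protocol:
 * the kicking side always sets the "scheduled" flag before waking the
 * worker, and the worker only leaves its wait loop when the flag is set.
 * All names here are invented for illustration.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool sched_threaded;	/* plays the role of NAPI_STATE_SCHED_THREADED */
static bool stop;

static void *worker(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!stop) {
		/* Wait on the explicit condition, not on "we were woken". */
		while (!sched_threaded && !stop)
			pthread_cond_wait(&cond, &lock);
		if (stop)
			break;
		sched_threaded = false;
		pthread_mutex_unlock(&lock);
		puts("poll once");	/* stands in for __napi_poll() */
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void kick(void)
{
	pthread_mutex_lock(&lock);
	sched_threaded = true;	/* set unconditionally, as in the patch */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	kick();
	kick();
	pthread_mutex_lock(&lock);
	stop = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}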
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -4473,13 +4473,7 @@ static inline void ____napi_schedule(str
 | 
			
		||||
 		 */
 | 
			
		||||
 		thread = READ_ONCE(napi->thread);
 | 
			
		||||
 		if (thread) {
 | 
			
		||||
-			/* Avoid doing set_bit() if the thread is in
 | 
			
		||||
-			 * INTERRUPTIBLE state, cause napi_thread_wait()
 | 
			
		||||
-			 * makes sure to proceed with napi polling
 | 
			
		||||
-			 * if the thread is explicitly woken from here.
 | 
			
		||||
-			 */
 | 
			
		||||
-			if (READ_ONCE(thread->__state) != TASK_INTERRUPTIBLE)
 | 
			
		||||
-				set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 | 
			
		||||
+			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 | 
			
		||||
 			wake_up_process(thread);
 | 
			
		||||
 			return;
 | 
			
		||||
 		}
 | 
			
		||||
@@ -6635,8 +6629,6 @@ static int napi_poll(struct napi_struct
 | 
			
		||||
 
 | 
			
		||||
 static int napi_thread_wait(struct napi_struct *napi)
 | 
			
		||||
 {
 | 
			
		||||
-	bool woken = false;
 | 
			
		||||
-
 | 
			
		||||
 	set_current_state(TASK_INTERRUPTIBLE);
 | 
			
		||||
 
 | 
			
		||||
 	while (!kthread_should_stop()) {
 | 
			
		||||
@@ -6645,15 +6637,13 @@ static int napi_thread_wait(struct napi_
 | 
			
		||||
 		 * Testing SCHED bit is not enough because SCHED bit might be
 | 
			
		||||
 		 * set by some other busy poll thread or by napi_disable().
 | 
			
		||||
 		 */
 | 
			
		||||
-		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
 | 
			
		||||
+		if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) {
 | 
			
		||||
 			WARN_ON(!list_empty(&napi->poll_list));
 | 
			
		||||
 			__set_current_state(TASK_RUNNING);
 | 
			
		||||
 			return 0;
 | 
			
		||||
 		}
 | 
			
		||||
 
 | 
			
		||||
 		schedule();
 | 
			
		||||
-		/* woken being true indicates this thread owns this napi. */
 | 
			
		||||
-		woken = true;
 | 
			
		||||
 		set_current_state(TASK_INTERRUPTIBLE);
 | 
			
		||||
 	}
 | 
			
		||||
 	__set_current_state(TASK_RUNNING);
 | 
			
		||||
@@ -0,0 +1,330 @@
From dad6b97702639fba27a2bd3e986982ad6f0db3a7 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Mar 2024 08:40:29 +0100
Subject: [PATCH 2/4] net: Allow to use SMP threads for backlog NAPI.

Backlog NAPI is a per-CPU NAPI struct only (with no device behind it)
used by drivers which don't do NAPI themselves, by RPS, and by parts of
the stack which need to avoid recursive deadlocks while processing a
packet.

Non-NAPI drivers use the CPU-local backlog NAPI. If RPS is enabled then
a flow for the skb is computed and, based on the flow, the skb can be
enqueued on a remote CPU. Scheduling/raising the softirq (for backlog's
NAPI) on the remote CPU isn't trivial because the softirq is only
scheduled on the local CPU and performed after the hardirq is done.
In order to schedule a softirq on the remote CPU, an IPI is sent to the
remote CPU which schedules the backlog-NAPI on the then local CPU.

On PREEMPT_RT interrupts are force-threaded. The soft interrupts are
raised within the interrupt thread and processed after the interrupt
handler has completed, still within the context of the interrupt thread.
The softirq is handled in the context where it originated.

With force-threaded interrupts enabled, ksoftirqd is woken up if a
softirq is raised from hardirq context. This is the case if it is raised
from an IPI. Additionally there is a warning on PREEMPT_RT if the
softirq is raised from the idle thread.
This was done for two reasons:
- With threaded interrupts the processing should happen in thread
  context (where it originated) and ksoftirqd is the only thread for
  this context if raised from hardirq. Using the currently running task
  instead would "punish" a random task.
- Once ksoftirqd is active it consumes all further softirqs until it
  stops running. This changed recently and is no longer the case.

Instead of keeping the backlog NAPI in ksoftirqd (in force-threaded/
PREEMPT_RT setups) I am proposing NAPI-threads for backlog.
The "proper" setup with threaded-NAPI is not doable because the threads
are not pinned to an individual CPU and can be modified by the user.
Additionally a dummy network device would have to be assigned. Also
CPU-hotplug has to be considered if additional CPUs show up.
All this can probably be done/solved but the smpboot-threads already
provide this infrastructure.

Sending UDP packets over loopback expects that the packet is processed
within the call. Delaying it by handing it over to the thread hurts
performance. It is not beneficial to the outcome if the context switch
happens immediately after enqueue or after a while to process a few
packets in a batch.
There is no need to always use the thread if the backlog NAPI is
requested on the local CPU. This restores the loopback throughput. The
performance drops mostly to the same value after enabling RPS on the
loopback, comparing the IPI and the thread result.

Create NAPI-threads for backlog if requested during boot. The thread
runs the inner loop from napi_threaded_poll(); the wait part is
different. It checks for NAPI_STATE_SCHED (the backlog NAPI can not be
disabled).

The NAPI threads for backlog are optional; they have to be enabled via
the boot argument "thread_backlog_napi". This is mandatory for
PREEMPT_RT to avoid the wakeup of ksoftirqd from the IPI.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/core/dev.c | 148 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 113 insertions(+), 35 deletions(-)

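To use the new mechanism, "thread_backlog_napi" has to be appended to the
kernel command line. Whether the per-CPU threads were actually created can be
verified from userspace by looking for kernel threads whose comm matches the
"backlog_napi/%u" pattern registered below; a minimal sketch of such a check
(a standalone program with an invented name, not part of the patch):

/*
 * Minimal userspace check (not part of the patch): scan /proc for kernel
 * threads whose comm starts with "backlog_napi/", i.e. the per-CPU smpboot
 * threads registered by this patch. Prints each match.
 */
#include <ctype.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	DIR *proc = opendir("/proc");
	struct dirent *de;
	int found = 0;

	if (!proc) {
		perror("opendir /proc");
		return 1;
	}
	while ((de = readdir(proc)) != NULL) {
		char path[64], comm[64];
		FILE *f;

		if (!isdigit((unsigned char)de->d_name[0]))
			continue;
		snprintf(path, sizeof(path), "/proc/%s/comm", de->d_name);
		f = fopen(path, "r");
		if (!f)
			continue;
		if (fgets(comm, sizeof(comm), f) &&
		    strncmp(comm, "backlog_napi/", 13) == 0) {
			printf("pid %s: %s", de->d_name, comm);
			found++;
		}
		fclose(f);
	}
	closedir(proc);
	printf("%d backlog NAPI thread(s) found\n", found);
	return found ? 0 : 1;
}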
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -78,6 +78,7 @@
 | 
			
		||||
 #include <linux/slab.h>
 | 
			
		||||
 #include <linux/sched.h>
 | 
			
		||||
 #include <linux/sched/mm.h>
 | 
			
		||||
+#include <linux/smpboot.h>
 | 
			
		||||
 #include <linux/mutex.h>
 | 
			
		||||
 #include <linux/rwsem.h>
 | 
			
		||||
 #include <linux/string.h>
 | 
			
		||||
@@ -217,6 +218,31 @@ static inline struct hlist_head *dev_ind
 | 
			
		||||
 	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
+#ifndef CONFIG_PREEMPT_RT
 | 
			
		||||
+
 | 
			
		||||
+static DEFINE_STATIC_KEY_FALSE(use_backlog_threads_key);
 | 
			
		||||
+
 | 
			
		||||
+static int __init setup_backlog_napi_threads(char *arg)
 | 
			
		||||
+{
 | 
			
		||||
+	static_branch_enable(&use_backlog_threads_key);
 | 
			
		||||
+	return 0;
 | 
			
		||||
+}
 | 
			
		||||
+early_param("thread_backlog_napi", setup_backlog_napi_threads);
 | 
			
		||||
+
 | 
			
		||||
+static bool use_backlog_threads(void)
 | 
			
		||||
+{
 | 
			
		||||
+	return static_branch_unlikely(&use_backlog_threads_key);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+#else
 | 
			
		||||
+
 | 
			
		||||
+static bool use_backlog_threads(void)
 | 
			
		||||
+{
 | 
			
		||||
+	return true;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+#endif
 | 
			
		||||
+
 | 
			
		||||
 static inline void rps_lock_irqsave(struct softnet_data *sd,
 | 
			
		||||
 				    unsigned long *flags)
 | 
			
		||||
 {
 | 
			
		||||
@@ -4441,6 +4467,7 @@ EXPORT_SYMBOL(__dev_direct_xmit);
 | 
			
		||||
 /*************************************************************************
 | 
			
		||||
  *			Receiver routines
 | 
			
		||||
  *************************************************************************/
 | 
			
		||||
+static DEFINE_PER_CPU(struct task_struct *, backlog_napi);
 | 
			
		||||
 
 | 
			
		||||
 int netdev_max_backlog __read_mostly = 1000;
 | 
			
		||||
 EXPORT_SYMBOL(netdev_max_backlog);
 | 
			
		||||
@@ -4473,12 +4500,16 @@ static inline void ____napi_schedule(str
 | 
			
		||||
 		 */
 | 
			
		||||
 		thread = READ_ONCE(napi->thread);
 | 
			
		||||
 		if (thread) {
 | 
			
		||||
+			if (use_backlog_threads() && thread == raw_cpu_read(backlog_napi))
 | 
			
		||||
+				goto use_local_napi;
 | 
			
		||||
+
 | 
			
		||||
 			set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 | 
			
		||||
 			wake_up_process(thread);
 | 
			
		||||
 			return;
 | 
			
		||||
 		}
 | 
			
		||||
 	}
 | 
			
		||||
 
 | 
			
		||||
+use_local_napi:
 | 
			
		||||
 	list_add_tail(&napi->poll_list, &sd->poll_list);
 | 
			
		||||
 	WRITE_ONCE(napi->list_owner, smp_processor_id());
 | 
			
		||||
 	/* If not called from net_rx_action()
 | 
			
		||||
@@ -4724,6 +4755,11 @@ static void napi_schedule_rps(struct sof
 | 
			
		||||
 
 | 
			
		||||
 #ifdef CONFIG_RPS
 | 
			
		||||
 	if (sd != mysd) {
 | 
			
		||||
+		if (use_backlog_threads()) {
 | 
			
		||||
+			__napi_schedule_irqoff(&sd->backlog);
 | 
			
		||||
+			return;
 | 
			
		||||
+		}
 | 
			
		||||
+
 | 
			
		||||
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 | 
			
		||||
 		mysd->rps_ipi_list = sd;
 | 
			
		||||
 
 | 
			
		||||
@@ -5947,7 +5983,7 @@ static void net_rps_action_and_irq_enabl
 | 
			
		||||
 #ifdef CONFIG_RPS
 | 
			
		||||
 	struct softnet_data *remsd = sd->rps_ipi_list;
 | 
			
		||||
 
 | 
			
		||||
-	if (remsd) {
 | 
			
		||||
+	if (!use_backlog_threads() && remsd) {
 | 
			
		||||
 		sd->rps_ipi_list = NULL;
 | 
			
		||||
 
 | 
			
		||||
 		local_irq_enable();
 | 
			
		||||
@@ -5962,7 +5998,7 @@ static void net_rps_action_and_irq_enabl
 | 
			
		||||
 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
 | 
			
		||||
 {
 | 
			
		||||
 #ifdef CONFIG_RPS
 | 
			
		||||
-	return sd->rps_ipi_list != NULL;
 | 
			
		||||
+	return !use_backlog_threads() && sd->rps_ipi_list;
 | 
			
		||||
 #else
 | 
			
		||||
 	return false;
 | 
			
		||||
 #endif
 | 
			
		||||
@@ -6006,7 +6042,7 @@ static int process_backlog(struct napi_s
 | 
			
		||||
 			 * We can use a plain write instead of clear_bit(),
 | 
			
		||||
 			 * and we dont need an smp_mb() memory barrier.
 | 
			
		||||
 			 */
 | 
			
		||||
-			napi->state = 0;
 | 
			
		||||
+			napi->state &= NAPIF_STATE_THREADED;
 | 
			
		||||
 			again = false;
 | 
			
		||||
 		} else {
 | 
			
		||||
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 | 
			
		||||
@@ -6672,43 +6708,48 @@ static void skb_defer_free_flush(struct
 | 
			
		||||
 	}
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
-static int napi_threaded_poll(void *data)
 | 
			
		||||
+static void napi_threaded_poll_loop(struct napi_struct *napi)
 | 
			
		||||
 {
 | 
			
		||||
-	struct napi_struct *napi = data;
 | 
			
		||||
 	struct softnet_data *sd;
 | 
			
		||||
-	void *have;
 | 
			
		||||
+	unsigned long last_qs = jiffies;
 | 
			
		||||
 
 | 
			
		||||
-	while (!napi_thread_wait(napi)) {
 | 
			
		||||
-		unsigned long last_qs = jiffies;
 | 
			
		||||
+	for (;;) {
 | 
			
		||||
+		bool repoll = false;
 | 
			
		||||
+		void *have;
 | 
			
		||||
 
 | 
			
		||||
-		for (;;) {
 | 
			
		||||
-			bool repoll = false;
 | 
			
		||||
+		local_bh_disable();
 | 
			
		||||
+		sd = this_cpu_ptr(&softnet_data);
 | 
			
		||||
+		sd->in_napi_threaded_poll = true;
 | 
			
		||||
 
 | 
			
		||||
-			local_bh_disable();
 | 
			
		||||
-			sd = this_cpu_ptr(&softnet_data);
 | 
			
		||||
-			sd->in_napi_threaded_poll = true;
 | 
			
		||||
-
 | 
			
		||||
-			have = netpoll_poll_lock(napi);
 | 
			
		||||
-			__napi_poll(napi, &repoll);
 | 
			
		||||
-			netpoll_poll_unlock(have);
 | 
			
		||||
-
 | 
			
		||||
-			sd->in_napi_threaded_poll = false;
 | 
			
		||||
-			barrier();
 | 
			
		||||
-
 | 
			
		||||
-			if (sd_has_rps_ipi_waiting(sd)) {
 | 
			
		||||
-				local_irq_disable();
 | 
			
		||||
-				net_rps_action_and_irq_enable(sd);
 | 
			
		||||
-			}
 | 
			
		||||
-			skb_defer_free_flush(sd);
 | 
			
		||||
-			local_bh_enable();
 | 
			
		||||
+		have = netpoll_poll_lock(napi);
 | 
			
		||||
+		__napi_poll(napi, &repoll);
 | 
			
		||||
+		netpoll_poll_unlock(have);
 | 
			
		||||
+
 | 
			
		||||
+		sd->in_napi_threaded_poll = false;
 | 
			
		||||
+		barrier();
 | 
			
		||||
+
 | 
			
		||||
+		if (sd_has_rps_ipi_waiting(sd)) {
 | 
			
		||||
+			local_irq_disable();
 | 
			
		||||
+			net_rps_action_and_irq_enable(sd);
 | 
			
		||||
+		}
 | 
			
		||||
+		skb_defer_free_flush(sd);
 | 
			
		||||
+		local_bh_enable();
 | 
			
		||||
 
 | 
			
		||||
-			if (!repoll)
 | 
			
		||||
-				break;
 | 
			
		||||
+		if (!repoll)
 | 
			
		||||
+			break;
 | 
			
		||||
 
 | 
			
		||||
-			rcu_softirq_qs_periodic(last_qs);
 | 
			
		||||
-			cond_resched();
 | 
			
		||||
-		}
 | 
			
		||||
+		rcu_softirq_qs_periodic(last_qs);
 | 
			
		||||
+		cond_resched();
 | 
			
		||||
 	}
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static int napi_threaded_poll(void *data)
 | 
			
		||||
+{
 | 
			
		||||
+	struct napi_struct *napi = data;
 | 
			
		||||
+
 | 
			
		||||
+	while (!napi_thread_wait(napi))
 | 
			
		||||
+		napi_threaded_poll_loop(napi);
 | 
			
		||||
+
 | 
			
		||||
 	return 0;
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
@@ -11288,7 +11329,7 @@ static int dev_cpu_dead(unsigned int old
 | 
			
		||||
 
 | 
			
		||||
 		list_del_init(&napi->poll_list);
 | 
			
		||||
 		if (napi->poll == process_backlog)
 | 
			
		||||
-			napi->state = 0;
 | 
			
		||||
+			napi->state &= NAPIF_STATE_THREADED;
 | 
			
		||||
 		else
 | 
			
		||||
 			____napi_schedule(sd, napi);
 | 
			
		||||
 	}
 | 
			
		||||
@@ -11296,12 +11337,14 @@ static int dev_cpu_dead(unsigned int old
 | 
			
		||||
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 | 
			
		||||
 	local_irq_enable();
 | 
			
		||||
 
 | 
			
		||||
+	if (!use_backlog_threads()) {
 | 
			
		||||
 #ifdef CONFIG_RPS
 | 
			
		||||
-	remsd = oldsd->rps_ipi_list;
 | 
			
		||||
-	oldsd->rps_ipi_list = NULL;
 | 
			
		||||
+		remsd = oldsd->rps_ipi_list;
 | 
			
		||||
+		oldsd->rps_ipi_list = NULL;
 | 
			
		||||
 #endif
 | 
			
		||||
-	/* send out pending IPI's on offline CPU */
 | 
			
		||||
-	net_rps_send_ipi(remsd);
 | 
			
		||||
+		/* send out pending IPI's on offline CPU */
 | 
			
		||||
+		net_rps_send_ipi(remsd);
 | 
			
		||||
+	}
 | 
			
		||||
 
 | 
			
		||||
 	/* Process offline CPU's input_pkt_queue */
 | 
			
		||||
 	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
 | 
			
		||||
@@ -11564,6 +11607,38 @@ static struct pernet_operations __net_in
 | 
			
		||||
  *
 | 
			
		||||
  */
 | 
			
		||||
 
 | 
			
		||||
+static int backlog_napi_should_run(unsigned int cpu)
 | 
			
		||||
+{
 | 
			
		||||
+	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 | 
			
		||||
+	struct napi_struct *napi = &sd->backlog;
 | 
			
		||||
+
 | 
			
		||||
+	return test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static void run_backlog_napi(unsigned int cpu)
 | 
			
		||||
+{
 | 
			
		||||
+	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 | 
			
		||||
+
 | 
			
		||||
+	napi_threaded_poll_loop(&sd->backlog);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static void backlog_napi_setup(unsigned int cpu)
 | 
			
		||||
+{
 | 
			
		||||
+	struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 | 
			
		||||
+	struct napi_struct *napi = &sd->backlog;
 | 
			
		||||
+
 | 
			
		||||
+	napi->thread = this_cpu_read(backlog_napi);
 | 
			
		||||
+	set_bit(NAPI_STATE_THREADED, &napi->state);
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
+static struct smp_hotplug_thread backlog_threads = {
 | 
			
		||||
+	.store			= &backlog_napi,
 | 
			
		||||
+	.thread_should_run	= backlog_napi_should_run,
 | 
			
		||||
+	.thread_fn		= run_backlog_napi,
 | 
			
		||||
+	.thread_comm		= "backlog_napi/%u",
 | 
			
		||||
+	.setup			= backlog_napi_setup,
 | 
			
		||||
+};
 | 
			
		||||
+
 | 
			
		||||
 /*
 | 
			
		||||
  *       This is called single threaded during boot, so no need
 | 
			
		||||
  *       to take the rtnl semaphore.
 | 
			
		||||
@@ -11614,7 +11689,10 @@ static int __init net_dev_init(void)
 | 
			
		||||
 		init_gro_hash(&sd->backlog);
 | 
			
		||||
 		sd->backlog.poll = process_backlog;
 | 
			
		||||
 		sd->backlog.weight = weight_p;
 | 
			
		||||
+		INIT_LIST_HEAD(&sd->backlog.poll_list);
 | 
			
		||||
 	}
 | 
			
		||||
+	if (use_backlog_threads())
 | 
			
		||||
+		smpboot_register_percpu_thread(&backlog_threads);
 | 
			
		||||
 
 | 
			
		||||
 	dev_boot_phase = 0;
 | 
			
		||||
 
 | 
			
		||||
@@ -0,0 +1,121 @@
From 80d2eefcb4c84aa9018b2a997ab3a4c567bc821a Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Mar 2024 08:40:30 +0100
Subject: [PATCH 3/4] net: Use backlog-NAPI to clean up the defer_list.

The defer_list is a per-CPU list which is used to free skbs outside of
the socket lock and on the CPU on which they have been allocated.
The list is processed during NAPI callbacks, so ideally the list is
cleaned up there.
Should the number of skbs on the list exceed a certain watermark then
the softirq is triggered remotely on the target CPU by invoking a remote
function call. Raising the softirq via a remote function call leads to
waking ksoftirqd on PREEMPT_RT, which is undesired.
The backlog-NAPI threads already provide the infrastructure which can be
utilized to perform the cleanup of the defer_list.

The NAPI state is updated with the input_pkt_queue.lock acquired. In
order not to break the state, it is necessary to also wake the
backlog-NAPI thread with the lock held. This requires acquiring the lock
in rps_lock_irq*() if the backlog-NAPI threads are used, even with RPS
disabled.

Move the logic of remotely starting softirqs to clean up the defer_list
into kick_defer_list_purge(). Make sure a lock is held in
rps_lock_irq*() if backlog-NAPI threads are used. Schedule backlog-NAPI
for defer_list cleanup if backlog-NAPI is available.

Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/linux/netdevice.h |  1 +
 net/core/dev.c            | 25 +++++++++++++++++++++----
 net/core/skbuff.c         |  4 ++--
 3 files changed, 24 insertions(+), 6 deletions(-)

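The non-threaded branch of kick_defer_list_purge() keeps the existing "at most
one kick in flight" idiom: cmpxchg() flips defer_ipi_scheduled from 0 to 1 and
only the caller that wins the race sends the function call. A userspace C11
sketch of that idiom, with invented names and purely for illustration:

/*
 * Illustration (not kernel code) of the cmpxchg-based "kick at most once"
 * idiom used for defer_ipi_scheduled: only the caller that flips the flag
 * from 0 to 1 issues the (here simulated) kick; everyone else sees it is
 * already pending and does nothing.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int kick_scheduled;	/* plays the role of defer_ipi_scheduled */

static void send_kick(void)
{
	puts("kick sent");	/* stands in for smp_call_function_single_async() */
}

static void maybe_kick(void)
{
	int expected = 0;

	/* Succeeds for exactly one caller until the flag is cleared again. */
	if (atomic_compare_exchange_strong(&kick_scheduled, &expected, 1))
		send_kick();
	else
		puts("kick already pending");
}

static void kick_done(void)
{
	/* The handler clears the flag once it has run. */
	atomic_store(&kick_scheduled, 0);
}

int main(void)
{
	maybe_kick();	/* sends */
	maybe_kick();	/* already pending */
	kick_done();
	maybe_kick();	/* sends again */
	return 0;
}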
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -3300,6 +3300,7 @@ static inline void dev_xmit_recursion_de
 | 
			
		||||
 	__this_cpu_dec(softnet_data.xmit.recursion);
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
 | 
			
		||||
 void __netif_schedule(struct Qdisc *q);
 | 
			
		||||
 void netif_schedule_queue(struct netdev_queue *txq);
 | 
			
		||||
 
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -246,7 +246,7 @@ static bool use_backlog_threads(void)
 | 
			
		||||
 static inline void rps_lock_irqsave(struct softnet_data *sd,
 | 
			
		||||
 				    unsigned long *flags)
 | 
			
		||||
 {
 | 
			
		||||
-	if (IS_ENABLED(CONFIG_RPS))
 | 
			
		||||
+	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 | 
			
		||||
 		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
 | 
			
		||||
 	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 | 
			
		||||
 		local_irq_save(*flags);
 | 
			
		||||
@@ -254,7 +254,7 @@ static inline void rps_lock_irqsave(stru
 | 
			
		||||
 
 | 
			
		||||
 static inline void rps_lock_irq_disable(struct softnet_data *sd)
 | 
			
		||||
 {
 | 
			
		||||
-	if (IS_ENABLED(CONFIG_RPS))
 | 
			
		||||
+	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 | 
			
		||||
 		spin_lock_irq(&sd->input_pkt_queue.lock);
 | 
			
		||||
 	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 | 
			
		||||
 		local_irq_disable();
 | 
			
		||||
@@ -263,7 +263,7 @@ static inline void rps_lock_irq_disable(
 | 
			
		||||
 static inline void rps_unlock_irq_restore(struct softnet_data *sd,
 | 
			
		||||
 					  unsigned long *flags)
 | 
			
		||||
 {
 | 
			
		||||
-	if (IS_ENABLED(CONFIG_RPS))
 | 
			
		||||
+	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 | 
			
		||||
 		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
 | 
			
		||||
 	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 | 
			
		||||
 		local_irq_restore(*flags);
 | 
			
		||||
@@ -271,7 +271,7 @@ static inline void rps_unlock_irq_restor
 | 
			
		||||
 
 | 
			
		||||
 static inline void rps_unlock_irq_enable(struct softnet_data *sd)
 | 
			
		||||
 {
 | 
			
		||||
-	if (IS_ENABLED(CONFIG_RPS))
 | 
			
		||||
+	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 | 
			
		||||
 		spin_unlock_irq(&sd->input_pkt_queue.lock);
 | 
			
		||||
 	else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 | 
			
		||||
 		local_irq_enable();
 | 
			
		||||
@@ -4774,6 +4774,23 @@ static void napi_schedule_rps(struct sof
 | 
			
		||||
 	__napi_schedule_irqoff(&mysd->backlog);
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
+void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
 | 
			
		||||
+{
 | 
			
		||||
+	unsigned long flags;
 | 
			
		||||
+
 | 
			
		||||
+	if (use_backlog_threads()) {
 | 
			
		||||
+		rps_lock_irqsave(sd, &flags);
 | 
			
		||||
+
 | 
			
		||||
+		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
 | 
			
		||||
+			__napi_schedule_irqoff(&sd->backlog);
 | 
			
		||||
+
 | 
			
		||||
+		rps_unlock_irq_restore(sd, &flags);
 | 
			
		||||
+
 | 
			
		||||
+	} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
 | 
			
		||||
+		smp_call_function_single_async(cpu, &sd->defer_csd);
 | 
			
		||||
+	}
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
 #ifdef CONFIG_NET_FLOW_LIMIT
 | 
			
		||||
 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
 | 
			
		||||
 #endif
 | 
			
		||||
--- a/net/core/skbuff.c
 | 
			
		||||
+++ b/net/core/skbuff.c
 | 
			
		||||
@@ -6863,8 +6863,8 @@ nodefer:	__kfree_skb(skb);
 | 
			
		||||
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 | 
			
		||||
 	 * if we are unlucky enough (this seems very unlikely).
 | 
			
		||||
 	 */
 | 
			
		||||
-	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 | 
			
		||||
-		smp_call_function_single_async(cpu, &sd->defer_csd);
 | 
			
		||||
+	if (unlikely(kick))
 | 
			
		||||
+		kick_defer_list_purge(sd, cpu);
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
 static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
 | 
			
		||||
@@ -0,0 +1,164 @@
From 765b11f8f4e20b7433e4ba4a3e9106a0d59501ed Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 Mar 2024 08:40:31 +0100
Subject: [PATCH 4/4] net: Rename rps_lock to backlog_lock.

The rps_lock.*() functions use the inner lock of a sk_buff_head for
locking. This lock is used if RPS is enabled, otherwise the list is
accessed lockless and disabling interrupts is enough for the
synchronisation because it is only accessed CPU-locally. Not only is the
list protected, but also the NAPI state.
With the addition of backlog threads, the lock is also needed because of
the cross-CPU access even without RPS. The cleanup of the defer_list is
also done via backlog threads (if enabled).

It has been suggested to rename the locking functions since they are no
longer just about RPS.

Rename the rps_lock*() functions to backlog_lock*().

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 net/core/dev.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -243,8 +243,8 @@ static bool use_backlog_threads(void)
 | 
			
		||||
 
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
-static inline void rps_lock_irqsave(struct softnet_data *sd,
 | 
			
		||||
-				    unsigned long *flags)
 | 
			
		||||
+static inline void backlog_lock_irq_save(struct softnet_data *sd,
 | 
			
		||||
+					 unsigned long *flags)
 | 
			
		||||
 {
 | 
			
		||||
 	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 | 
			
		||||
 		spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
 | 
			
		||||
@@ -252,7 +252,7 @@ static inline void rps_lock_irqsave(stru
 | 
			
		||||
 		local_irq_save(*flags);
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
-static inline void rps_lock_irq_disable(struct softnet_data *sd)
 | 
			
		||||
+static inline void backlog_lock_irq_disable(struct softnet_data *sd)
 | 
			
		||||
 {
 | 
			
		||||
 	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 | 
			
		||||
 		spin_lock_irq(&sd->input_pkt_queue.lock);
 | 
			
		||||
@@ -260,8 +260,8 @@ static inline void rps_lock_irq_disable(
 | 
			
		||||
 		local_irq_disable();
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
-static inline void rps_unlock_irq_restore(struct softnet_data *sd,
 | 
			
		||||
-					  unsigned long *flags)
 | 
			
		||||
+static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
 | 
			
		||||
+					      unsigned long *flags)
 | 
			
		||||
 {
 | 
			
		||||
 	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 | 
			
		||||
 		spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
 | 
			
		||||
@@ -269,7 +269,7 @@ static inline void rps_unlock_irq_restor
 | 
			
		||||
 		local_irq_restore(*flags);
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
-static inline void rps_unlock_irq_enable(struct softnet_data *sd)
 | 
			
		||||
+static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
 | 
			
		||||
 {
 | 
			
		||||
 	if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
 | 
			
		||||
 		spin_unlock_irq(&sd->input_pkt_queue.lock);
 | 
			
		||||
@@ -4779,12 +4779,12 @@ void kick_defer_list_purge(struct softne
 | 
			
		||||
 	unsigned long flags;
 | 
			
		||||
 
 | 
			
		||||
 	if (use_backlog_threads()) {
 | 
			
		||||
-		rps_lock_irqsave(sd, &flags);
 | 
			
		||||
+		backlog_lock_irq_save(sd, &flags);
 | 
			
		||||
 
 | 
			
		||||
 		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
 | 
			
		||||
 			__napi_schedule_irqoff(&sd->backlog);
 | 
			
		||||
 
 | 
			
		||||
-		rps_unlock_irq_restore(sd, &flags);
 | 
			
		||||
+		backlog_unlock_irq_restore(sd, &flags);
 | 
			
		||||
 
 | 
			
		||||
 	} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
 | 
			
		||||
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 | 
			
		||||
@@ -4846,7 +4846,7 @@ static int enqueue_to_backlog(struct sk_
 | 
			
		||||
 	reason = SKB_DROP_REASON_NOT_SPECIFIED;
 | 
			
		||||
 	sd = &per_cpu(softnet_data, cpu);
 | 
			
		||||
 
 | 
			
		||||
-	rps_lock_irqsave(sd, &flags);
 | 
			
		||||
+	backlog_lock_irq_save(sd, &flags);
 | 
			
		||||
 	if (!netif_running(skb->dev))
 | 
			
		||||
 		goto drop;
 | 
			
		||||
 	qlen = skb_queue_len(&sd->input_pkt_queue);
 | 
			
		||||
@@ -4855,7 +4855,7 @@ static int enqueue_to_backlog(struct sk_
 | 
			
		||||
 enqueue:
 | 
			
		||||
 			__skb_queue_tail(&sd->input_pkt_queue, skb);
 | 
			
		||||
 			input_queue_tail_incr_save(sd, qtail);
 | 
			
		||||
-			rps_unlock_irq_restore(sd, &flags);
 | 
			
		||||
+			backlog_unlock_irq_restore(sd, &flags);
 | 
			
		||||
 			return NET_RX_SUCCESS;
 | 
			
		||||
 		}
 | 
			
		||||
 
 | 
			
		||||
@@ -4870,7 +4870,7 @@ enqueue:
 | 
			
		||||
 
 | 
			
		||||
 drop:
 | 
			
		||||
 	sd->dropped++;
 | 
			
		||||
-	rps_unlock_irq_restore(sd, &flags);
 | 
			
		||||
+	backlog_unlock_irq_restore(sd, &flags);
 | 
			
		||||
 
 | 
			
		||||
 	dev_core_stats_rx_dropped_inc(skb->dev);
 | 
			
		||||
 	kfree_skb_reason(skb, reason);
 | 
			
		||||
@@ -5901,7 +5901,7 @@ static void flush_backlog(struct work_st
 | 
			
		||||
 	local_bh_disable();
 | 
			
		||||
 	sd = this_cpu_ptr(&softnet_data);
 | 
			
		||||
 
 | 
			
		||||
-	rps_lock_irq_disable(sd);
 | 
			
		||||
+	backlog_lock_irq_disable(sd);
 | 
			
		||||
 	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
 | 
			
		||||
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 | 
			
		||||
 			__skb_unlink(skb, &sd->input_pkt_queue);
 | 
			
		||||
@@ -5909,7 +5909,7 @@ static void flush_backlog(struct work_st
 | 
			
		||||
 			input_queue_head_incr(sd);
 | 
			
		||||
 		}
 | 
			
		||||
 	}
 | 
			
		||||
-	rps_unlock_irq_enable(sd);
 | 
			
		||||
+	backlog_unlock_irq_enable(sd);
 | 
			
		||||
 
 | 
			
		||||
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 | 
			
		||||
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 | 
			
		||||
@@ -5927,14 +5927,14 @@ static bool flush_required(int cpu)
 | 
			
		||||
 	struct softnet_data *sd = &per_cpu(softnet_data, cpu);
 | 
			
		||||
 	bool do_flush;
 | 
			
		||||
 
 | 
			
		||||
-	rps_lock_irq_disable(sd);
 | 
			
		||||
+	backlog_lock_irq_disable(sd);
 | 
			
		||||
 
 | 
			
		||||
 	/* as insertion into process_queue happens with the rps lock held,
 | 
			
		||||
 	 * process_queue access may race only with dequeue
 | 
			
		||||
 	 */
 | 
			
		||||
 	do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
 | 
			
		||||
 		   !skb_queue_empty_lockless(&sd->process_queue);
 | 
			
		||||
-	rps_unlock_irq_enable(sd);
 | 
			
		||||
+	backlog_unlock_irq_enable(sd);
 | 
			
		||||
 
 | 
			
		||||
 	return do_flush;
 | 
			
		||||
 #endif
 | 
			
		||||
@@ -6049,7 +6049,7 @@ static int process_backlog(struct napi_s
 | 
			
		||||
 
 | 
			
		||||
 		}
 | 
			
		||||
 
 | 
			
		||||
-		rps_lock_irq_disable(sd);
 | 
			
		||||
+		backlog_lock_irq_disable(sd);
 | 
			
		||||
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 | 
			
		||||
 			/*
 | 
			
		||||
 			 * Inline a custom version of __napi_complete().
 | 
			
		||||
@@ -6065,7 +6065,7 @@ static int process_backlog(struct napi_s
 | 
			
		||||
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 | 
			
		||||
 						   &sd->process_queue);
 | 
			
		||||
 		}
 | 
			
		||||
-		rps_unlock_irq_enable(sd);
 | 
			
		||||
+		backlog_unlock_irq_enable(sd);
 | 
			
		||||
 	}
 | 
			
		||||
 
 | 
			
		||||
 	return work;
 | 
			
		||||
@@ -85,7 +85,7 @@ Signed-off-by: Paolo Abeni <pabeni@redhat.com>
 | 
			
		||||
 /**
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -6555,7 +6555,7 @@ static int __napi_poll(struct napi_struc
 | 
			
		||||
@@ -6602,7 +6602,7 @@ static int __napi_poll(struct napi_struc
 | 
			
		||||
 	 * accidentally calling ->poll() when NAPI is not scheduled.
 | 
			
		||||
 	 */
 | 
			
		||||
 	work = 0;
 | 
			
		||||
 
 | 
			
		||||
@@ -19,7 +19,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 | 
			
		||||
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -1759,6 +1759,7 @@ enum netdev_priv_flags {
 | 
			
		||||
@@ -1758,6 +1758,7 @@ enum netdev_priv_flags {
 | 
			
		||||
 	IFF_TX_SKB_NO_LINEAR		= BIT_ULL(31),
 | 
			
		||||
 	IFF_CHANGE_PROTO_DOWN		= BIT_ULL(32),
 | 
			
		||||
 	IFF_SEE_ALL_HWTSTAMP_REQUESTS	= BIT_ULL(33),
 | 
			
		||||
@@ -27,7 +27,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 };
 | 
			
		||||
 
 | 
			
		||||
 #define IFF_802_1Q_VLAN			IFF_802_1Q_VLAN
 | 
			
		||||
@@ -1792,6 +1793,7 @@ enum netdev_priv_flags {
 | 
			
		||||
@@ -1791,6 +1792,7 @@ enum netdev_priv_flags {
 | 
			
		||||
 #define IFF_FAILOVER_SLAVE		IFF_FAILOVER_SLAVE
 | 
			
		||||
 #define IFF_L3MDEV_RX_HANDLER		IFF_L3MDEV_RX_HANDLER
 | 
			
		||||
 #define IFF_TX_SKB_NO_LINEAR		IFF_TX_SKB_NO_LINEAR
 | 
			
		||||
@@ -35,7 +35,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 
 | 
			
		||||
 /* Specifies the type of the struct net_device::ml_priv pointer */
 | 
			
		||||
 enum netdev_ml_priv_type {
 | 
			
		||||
@@ -2184,6 +2186,11 @@ struct net_device {
 | 
			
		||||
@@ -2183,6 +2185,11 @@ struct net_device {
 | 
			
		||||
 	const struct tlsdev_ops *tlsdev_ops;
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
@@ -47,7 +47,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 	const struct header_ops *header_ops;
 | 
			
		||||
 
 | 
			
		||||
 	unsigned char		operstate;
 | 
			
		||||
@@ -2257,6 +2264,10 @@ struct net_device {
 | 
			
		||||
@@ -2256,6 +2263,10 @@ struct net_device {
 | 
			
		||||
 	struct mctp_dev __rcu	*mctp_ptr;
 | 
			
		||||
 #endif
 | 
			
		||||
 
 | 
			
		||||
@@ -105,7 +105,7 @@ Signed-off-by: Felix Fietkau <nbd@nbd.name>
 | 
			
		||||
 	help
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -3571,6 +3571,11 @@ static int xmit_one(struct sk_buff *skb,
 | 
			
		||||
@@ -3597,6 +3597,11 @@ static int xmit_one(struct sk_buff *skb,
 | 
			
		||||
 	if (dev_nit_active(dev))
 | 
			
		||||
 		dev_queue_xmit_nit(skb, dev);
 | 
			
		||||
 
 | 
			
		||||
 
 | 
			
		||||
@@ -1,227 +0,0 @@
From: Felix Fietkau <nbd@nbd.name>
Date: Thu, 16 Feb 2023 18:39:04 +0100
Subject: [PATCH] net/core: add optional threading for backlog processing

When dealing with few flows or an imbalance on CPU utilization, static RPS
CPU assignment can be too inflexible. Add support for enabling threaded NAPI
for backlog processing in order to allow the scheduler to better balance
processing. This helps better spread the load across idle CPUs.

Signed-off-by: Felix Fietkau <nbd@nbd.name>
---

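For reference, this removed downstream patch exposed a net.core.backlog_threaded
sysctl (see the sysctl_net_core.c hunk further down in this removed file),
while the upstream replacement is controlled by the "thread_backlog_napi" boot
argument only. A hypothetical migration check, not part of any patch, could
look like the following sketch:

/*
 * Migration helper sketch (not part of any patch): detect which mechanism
 * the running kernel offers. The old downstream patch exposed
 * /proc/sys/net/core/backlog_threaded; the upstream replacement only has
 * the "thread_backlog_napi" boot parameter (check /proc/cmdline).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	char cmdline[4096] = "";
	FILE *f = fopen("/proc/cmdline", "r");

	if (f) {
		if (!fgets(cmdline, sizeof(cmdline), f))
			cmdline[0] = '\0';
		fclose(f);
	}

	if (access("/proc/sys/net/core/backlog_threaded", F_OK) == 0)
		puts("old downstream sysctl net.core.backlog_threaded present");
	else if (strstr(cmdline, "thread_backlog_napi"))
		puts("backlog NAPI threads requested via boot argument");
	else
		puts("backlog processing uses the default softirq path");
	return 0;
}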
--- a/include/linux/netdevice.h
 | 
			
		||||
+++ b/include/linux/netdevice.h
 | 
			
		||||
@@ -558,6 +558,7 @@ static inline bool napi_complete(struct
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
 int dev_set_threaded(struct net_device *dev, bool threaded);
 | 
			
		||||
+int backlog_set_threaded(bool threaded);
 | 
			
		||||
 
 | 
			
		||||
 /**
 | 
			
		||||
  *	napi_disable - prevent NAPI from scheduling
 | 
			
		||||
@@ -3236,6 +3237,7 @@ struct softnet_data {
 | 
			
		||||
 	/* stats */
 | 
			
		||||
 	unsigned int		processed;
 | 
			
		||||
 	unsigned int		time_squeeze;
 | 
			
		||||
+	unsigned int		process_queue_empty;
 | 
			
		||||
 #ifdef CONFIG_RPS
 | 
			
		||||
 	struct softnet_data	*rps_ipi_list;
 | 
			
		||||
 #endif
 | 
			
		||||
--- a/net/core/dev.c
 | 
			
		||||
+++ b/net/core/dev.c
 | 
			
		||||
@@ -4729,7 +4729,7 @@ static void napi_schedule_rps(struct sof
 | 
			
		||||
 	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 | 
			
		||||
 
 | 
			
		||||
 #ifdef CONFIG_RPS
 | 
			
		||||
-	if (sd != mysd) {
 | 
			
		||||
+	if (sd != mysd && !test_bit(NAPI_STATE_THREADED, &sd->backlog.state)) {
 | 
			
		||||
 		sd->rps_ipi_next = mysd->rps_ipi_list;
 | 
			
		||||
 		mysd->rps_ipi_list = sd;
 | 
			
		||||
 
 | 
			
		||||
@@ -5848,6 +5848,8 @@ static DEFINE_PER_CPU(struct work_struct
 | 
			
		||||
 /* Network device is going away, flush any packets still pending */
 | 
			
		||||
 static void flush_backlog(struct work_struct *work)
 | 
			
		||||
 {
 | 
			
		||||
+	unsigned int process_queue_empty;
 | 
			
		||||
+	bool threaded, flush_processq;
 | 
			
		||||
 	struct sk_buff *skb, *tmp;
 | 
			
		||||
 	struct softnet_data *sd;
 | 
			
		||||
 
 | 
			
		||||
@@ -5862,8 +5864,17 @@ static void flush_backlog(struct work_st
 | 
			
		||||
 			input_queue_head_incr(sd);
 | 
			
		||||
 		}
 | 
			
		||||
 	}
 | 
			
		||||
+
 | 
			
		||||
+	threaded = test_bit(NAPI_STATE_THREADED, &sd->backlog.state);
 | 
			
		||||
+	flush_processq = threaded &&
 | 
			
		||||
+			 !skb_queue_empty_lockless(&sd->process_queue);
 | 
			
		||||
+	if (flush_processq)
 | 
			
		||||
+		process_queue_empty = sd->process_queue_empty;
 | 
			
		||||
 	rps_unlock_irq_enable(sd);
 | 
			
		||||
 
 | 
			
		||||
+	if (threaded)
 | 
			
		||||
+		goto out;
 | 
			
		||||
+
 | 
			
		||||
 	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
 | 
			
		||||
 		if (skb->dev->reg_state == NETREG_UNREGISTERING) {
 | 
			
		||||
 			__skb_unlink(skb, &sd->process_queue);
 | 
			
		||||
@@ -5871,7 +5882,16 @@ static void flush_backlog(struct work_st
 | 
			
		||||
 			input_queue_head_incr(sd);
 | 
			
		||||
 		}
 | 
			
		||||
 	}
 | 
			
		||||
+
 | 
			
		||||
+out:
 | 
			
		||||
 	local_bh_enable();
 | 
			
		||||
+
 | 
			
		||||
+	while (flush_processq) {
 | 
			
		||||
+		msleep(1);
 | 
			
		||||
+		rps_lock_irq_disable(sd);
 | 
			
		||||
+		flush_processq = process_queue_empty == sd->process_queue_empty;
 | 
			
		||||
+		rps_unlock_irq_enable(sd);
 | 
			
		||||
+	}
 | 
			
		||||
 }
 | 
			
		||||
 
 | 
			
		||||
 static bool flush_required(int cpu)
 | 
			
		||||
@@ -6003,6 +6023,7 @@ static int process_backlog(struct napi_s
 | 
			
		||||
 		}
 | 
			
		||||
 
 | 
			
		||||
 		rps_lock_irq_disable(sd);
 | 
			
		||||
+		sd->process_queue_empty++;
 | 
			
		||||
 		if (skb_queue_empty(&sd->input_pkt_queue)) {
 | 
			
		||||
 			/*
 | 
			
		||||
 			 * Inline a custom version of __napi_complete().
 | 
			
		||||
@@ -6012,7 +6033,8 @@ static int process_backlog(struct napi_s
 | 
			
		||||
 			 * We can use a plain write instead of clear_bit(),
 | 
			
		||||
 			 * and we dont need an smp_mb() memory barrier.
 | 
			
		||||
 			 */
 | 
			
		||||
-			napi->state = 0;
 | 
			
		||||
+			napi->state &= ~(NAPIF_STATE_SCHED |
 | 
			
		||||
+					 NAPIF_STATE_SCHED_THREADED);
 | 
			
		||||
 			again = false;
 | 
			
		||||
 		} else {
 | 
			
		||||
 			skb_queue_splice_tail_init(&sd->input_pkt_queue,
 | 
			
		||||
@@ -6426,6 +6448,55 @@ int dev_set_threaded(struct net_device *
 | 
			
		||||
 }
 | 
			
		||||
 EXPORT_SYMBOL(dev_set_threaded);
 | 
			
		||||
 
 | 
			
		||||
+int backlog_set_threaded(bool threaded)
 | 
			
		||||
+{
 | 
			
		||||
+	static bool backlog_threaded;
 | 
			
		||||
+	int err = 0;
 | 
			
		||||
+	int i;
 | 
			
		||||
+
 | 
			
		||||
+	if (backlog_threaded == threaded)
 | 
			
		||||
+		return 0;
 | 
			
		||||
+
 | 
			
		||||
+	for_each_possible_cpu(i) {
 | 
			
		||||
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
 | 
			
		||||
+		struct napi_struct *n = &sd->backlog;
 | 
			
		||||
+
 | 
			
		||||
+		if (n->thread)
 | 
			
		||||
+			continue;
 | 
			
		||||
+		n->thread = kthread_run(napi_threaded_poll, n, "napi/backlog-%d", i);
 | 
			
		||||
+		if (IS_ERR(n->thread)) {
 | 
			
		||||
+			err = PTR_ERR(n->thread);
 | 
			
		||||
+			pr_err("kthread_run failed with err %d\n", err);
 | 
			
		||||
+			n->thread = NULL;
 | 
			
		||||
+			threaded = false;
 | 
			
		||||
+			break;
 | 
			
		||||
+		}
 | 
			
		||||
+
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	backlog_threaded = threaded;
 | 
			
		||||
+
 | 
			
		||||
+	/* Make sure kthread is created before THREADED bit
 | 
			
		||||
+	 * is set.
 | 
			
		||||
+	 */
 | 
			
		||||
+	smp_mb__before_atomic();
 | 
			
		||||
+
 | 
			
		||||
+	for_each_possible_cpu(i) {
 | 
			
		||||
+		struct softnet_data *sd = &per_cpu(softnet_data, i);
 | 
			
		||||
+		struct napi_struct *n = &sd->backlog;
 | 
			
		||||
+		unsigned long flags;
 | 
			
		||||
+
 | 
			
		||||
+		rps_lock_irqsave(sd, &flags);
 | 
			
		||||
+		if (threaded)
 | 
			
		||||
+			n->state |= NAPIF_STATE_THREADED;
 | 
			
		||||
+		else
 | 
			
		||||
+			n->state &= ~NAPIF_STATE_THREADED;
 | 
			
		||||
+		rps_unlock_irq_restore(sd, &flags);
 | 
			
		||||
+	}
 | 
			
		||||
+
 | 
			
		||||
+	return err;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
 void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
 | 
			
		||||
 			   int (*poll)(struct napi_struct *, int), int weight)
 | 
			
		||||
 {
 | 
			
		||||
@@ -11307,6 +11378,9 @@ static int dev_cpu_dead(unsigned int old
 | 
			
		||||
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
 | 
			
		||||
 	local_irq_enable();
 | 
			
		||||
 
 | 
			
		||||
+	if (test_bit(NAPI_STATE_THREADED, &oldsd->backlog.state))
 | 
			
		||||
+		return 0;
 | 
			
		||||
+
 | 
			
		||||
 #ifdef CONFIG_RPS
 | 
			
		||||
 	remsd = oldsd->rps_ipi_list;
 | 
			
		||||
 	oldsd->rps_ipi_list = NULL;
 | 
			
		||||
@@ -11622,6 +11696,7 @@ static int __init net_dev_init(void)
 | 
			
		||||
 		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
 | 
			
		||||
 		spin_lock_init(&sd->defer_lock);
 | 
			
		||||
 
 | 
			
		||||
+		INIT_LIST_HEAD(&sd->backlog.poll_list);
 | 
			
		||||
 		init_gro_hash(&sd->backlog);
 | 
			
		||||
 		sd->backlog.poll = process_backlog;
 | 
			
		||||
 		sd->backlog.weight = weight_p;
 | 
			
		||||
--- a/net/core/sysctl_net_core.c
 | 
			
		||||
+++ b/net/core/sysctl_net_core.c
 | 
			
		||||
@@ -30,6 +30,7 @@ static int int_3600 = 3600;
 | 
			
		||||
 static int min_sndbuf = SOCK_MIN_SNDBUF;
 | 
			
		||||
 static int min_rcvbuf = SOCK_MIN_RCVBUF;
 | 
			
		||||
 static int max_skb_frags = MAX_SKB_FRAGS;
 | 
			
		||||
+static int backlog_threaded;
 | 
			
		||||
 static int min_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE;
 | 
			
		||||
 
 | 
			
		||||
 static int net_msg_warn;	/* Unused, but still a sysctl */
 | 
			
		||||
@@ -189,6 +190,23 @@ static int rps_sock_flow_sysctl(struct c
 | 
			
		||||
 }
 | 
			
		||||
 #endif /* CONFIG_RPS */
 | 
			
		||||
 
 | 
			
		||||
+static int backlog_threaded_sysctl(struct ctl_table *table, int write,
 | 
			
		||||
+			       void *buffer, size_t *lenp, loff_t *ppos)
 | 
			
		||||
+{
 | 
			
		||||
+	static DEFINE_MUTEX(backlog_threaded_mutex);
 | 
			
		||||
+	int ret;
 | 
			
		||||
+
 | 
			
		||||
+	mutex_lock(&backlog_threaded_mutex);
 | 
			
		||||
+
 | 
			
		||||
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 | 
			
		||||
+	if (write && !ret)
 | 
			
		||||
+		ret = backlog_set_threaded(backlog_threaded);
 | 
			
		||||
+
 | 
			
		||||
+	mutex_unlock(&backlog_threaded_mutex);
 | 
			
		||||
+
 | 
			
		||||
+	return ret;
 | 
			
		||||
+}
 | 
			
		||||
+
 | 
			
		||||
 #ifdef CONFIG_NET_FLOW_LIMIT
 | 
			
		||||
 static DEFINE_MUTEX(flow_limit_update_mutex);
 | 
			
		||||
 
 | 
			
		||||
@@ -541,6 +559,15 @@ static struct ctl_table net_core_table[]
 | 
			
		||||
 		.proc_handler	= rps_sock_flow_sysctl
 | 
			
		||||
 	},
 | 
			
		||||
 #endif
 | 
			
		||||
+	{
 | 
			
		||||
+		.procname	= "backlog_threaded",
 | 
			
		||||
+		.data		= &backlog_threaded,
 | 
			
		||||
+		.maxlen		= sizeof(unsigned int),
 | 
			
		||||
+		.mode		= 0644,
 | 
			
		||||
+		.proc_handler	= backlog_threaded_sysctl,
 | 
			
		||||
+		.extra1		= SYSCTL_ZERO,
 | 
			
		||||
+		.extra2		= SYSCTL_ONE
 | 
			
		||||
+	},
 | 
			
		||||
 #ifdef CONFIG_NET_FLOW_LIMIT
 | 
			
		||||
 	{
 | 
			
		||||
 		.procname	= "flow_limit_cpu_bitmap",
 | 
			
		||||